treat 1.0.4 → 1.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +0 -1
- data/files/INFO +1 -1
- data/lib/treat/entities/abilities/buildable.rb +2 -6
- data/lib/treat/entities/abilities/checkable.rb +2 -2
- data/lib/treat/entities/abilities/delegatable.rb +2 -2
- data/lib/treat/entities/abilities/doable.rb +6 -1
- data/lib/treat/entities/abilities/iterable.rb +8 -0
- data/lib/treat/entities/abilities/magical.rb +1 -1
- data/lib/treat/extractors.rb +1 -1
- data/lib/treat/formatters/visualizers/standoff.rb +1 -1
- data/lib/treat/groupable.rb +4 -0
- data/lib/treat/installer.rb +33 -13
- data/lib/treat/kernel.rb +0 -4
- data/lib/treat/languages/arabic.rb +1 -1
- data/lib/treat/languages/chinese.rb +1 -1
- data/lib/treat/languages/dutch.rb +1 -1
- data/lib/treat/languages/english.rb +1 -1
- data/lib/treat/languages/french.rb +4 -4
- data/lib/treat/languages/german.rb +3 -3
- data/lib/treat/languages/italian.rb +1 -1
- data/lib/treat/{linguistics/categories.rb → languages/language.rb} +3 -4
- data/lib/treat/languages/polish.rb +1 -1
- data/lib/treat/languages/portuguese.rb +1 -1
- data/lib/treat/languages/russian.rb +1 -1
- data/lib/treat/languages/spanish.rb +1 -1
- data/lib/treat/languages/swedish.rb +1 -1
- data/lib/treat/lexicalizers/categorizers/from_tag.rb +14 -10
- data/lib/treat/lexicalizers/taggers/brill.rb +1 -1
- data/lib/treat/lexicalizers/taggers/stanford.rb +5 -2
- data/lib/treat/lexicalizers.rb +2 -1
- data/lib/treat/processors/parsers/enju.rb +2 -2
- data/lib/treat/processors/parsers/stanford.rb +17 -11
- data/lib/treat/processors/segmenters/punkt.rb +5 -2
- data/lib/treat/processors/segmenters/tactful.rb +5 -1
- data/lib/treat/processors/tokenizers/ptb.rb +11 -3
- data/lib/treat/processors/tokenizers/punkt.rb +0 -3
- data/lib/treat/processors/tokenizers/tactful.rb +3 -0
- data/lib/treat/universalisation/encodings.rb +12 -0
- data/lib/treat/{linguistics → universalisation}/tags.rb +77 -46
- data/lib/treat/universalisation.rb +9 -0
- data/lib/treat.rb +2 -2
- metadata +6 -6
- data/lib/treat/linguistics.rb +0 -9
- data/lib/treat/processors/tokenizers/perl.rb +0 -132
data/LICENSE
CHANGED
@@ -20,7 +20,6 @@ Non-trivial amount of code has been incorporated and modified from other librari
|
|
20
20
|
- formatters/readers/odt.rb - Mark Watson (GPL license)
|
21
21
|
- processors/tokenizers/macintyre.rb - Utiyama Masao (Ruby License)
|
22
22
|
- processors/tokenizers/tactful.rb - Matthew Bunday (GPL license)
|
23
|
-
- processors/tokenizers/perl.rb - Todd A. Fisher (MIT License)
|
24
23
|
- processors/tokenizers/punkt.rb - Steven Bird Edward Loper and Joel Nothman (Apache 2.0 license)
|
25
24
|
- extractors/topics/reuters.rb - Mark Watson (GPL license)
|
26
25
|
- inflectors/declensions/english.rb - Thomas Sawyer (MIT license)
|
data/files/INFO
CHANGED
@@ -1 +1 @@
|
|
1
|
-
This is a folder containing the files downloaded by Treat.
|
1
|
+
This is a folder containing the files downloaded by Treat from the internet.
|
data/lib/treat/entities/abilities/buildable.rb
CHANGED
@@ -4,12 +4,11 @@
|
|
4
4
|
# is pretty much self-explanatory.
|
5
5
|
module Treat::Entities::Abilities::Buildable
|
6
6
|
|
7
|
-
require 'treat/helpers/decimal_point_escaper'
|
8
7
|
require 'fileutils'
|
9
8
|
|
10
9
|
# Simple regexps to match common entities.
|
11
10
|
WordRegexp = /^[[:alpha:]\-']+$/
|
12
|
-
NumberRegexp = /^#?([0-9]+)(
|
11
|
+
NumberRegexp = /^#?([0-9]+)(\.[0-9]+)?$/
|
13
12
|
PunctRegexp = /^[[:punct:]\$]+$/
|
14
13
|
UriRegexp = /^(http|https):\/\/[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(([0-9]{1,5})?\/.*)?$/ix
|
15
14
|
EmailRegexp = /.+\@.+\..+/
|
@@ -57,8 +56,6 @@ module Treat::Entities::Abilities::Buildable
|
|
57
56
|
# instead of from_string directly).
|
58
57
|
def from_string(string, enforce_type = false)
|
59
58
|
|
60
|
-
Treat::Helpers::DecimalPointEscaper.escape!(string)
|
61
|
-
|
62
59
|
enforce_type = true if caller_method == :build
|
63
60
|
|
64
61
|
unless self == Treat::Entities::Entity
|
@@ -74,6 +71,7 @@ module Treat::Entities::Abilities::Buildable
|
|
74
71
|
end
|
75
72
|
|
76
73
|
e
|
74
|
+
|
77
75
|
end
|
78
76
|
|
79
77
|
# Build a document from an URL.
|
@@ -116,7 +114,6 @@ module Treat::Entities::Abilities::Buildable
|
|
116
114
|
"a numeric object."
|
117
115
|
end
|
118
116
|
n = numeric.to_s
|
119
|
-
Treat::Helpers::DecimalPointEscaper.unescape!(n)
|
120
117
|
Treat::Entities::Number.new(n)
|
121
118
|
end
|
122
119
|
|
@@ -319,7 +316,6 @@ module Treat::Entities::Abilities::Buildable
|
|
319
316
|
end
|
320
317
|
|
321
318
|
def create_collection(fv)
|
322
|
-
debug("Creating new collection in directory #{fv}.")
|
323
319
|
FileUtils.mkdir(fv)
|
324
320
|
Treat::Entities::Collection.new(fv)
|
325
321
|
end
|
@@ -24,8 +24,8 @@ module Treat::Entities::Abilities::Checkable
|
|
24
24
|
return unless has_children?
|
25
25
|
raise Treat::Exception,
|
26
26
|
"Warning: can't #{caller_method(2)} "+
|
27
|
-
"
|
28
|
-
"
|
27
|
+
"the text \"#{short_value}\", because it " +
|
28
|
+
"already has children."
|
29
29
|
end
|
30
30
|
|
31
31
|
end
|
@@ -104,9 +104,9 @@ module Treat::Entities::Abilities::Delegatable
|
|
104
104
|
if !klass[g] || !klass[g][0]
|
105
105
|
d = ucc(cl(group))
|
106
106
|
d.gsub!('_', ' ')
|
107
|
-
d =
|
107
|
+
d = d[0..-2]
|
108
108
|
raise Treat::Exception, "No #{d}" +
|
109
|
-
"
|
109
|
+
" is available for the " +
|
110
110
|
"#{lang.to_s.capitalize} language."
|
111
111
|
end
|
112
112
|
return klass[g][0]
|
@@ -37,8 +37,13 @@ module Treat::Entities::Abilities::Doable
|
|
37
37
|
end
|
38
38
|
if f || entity_types.include?(:entity)
|
39
39
|
send(task, worker, options)
|
40
|
+
if group.recursive
|
41
|
+
each do |entity|
|
42
|
+
entity.do_task(task, worker, options, group)
|
43
|
+
end
|
44
|
+
end
|
40
45
|
else
|
41
|
-
|
46
|
+
each do |entity|
|
42
47
|
entity.do_task(task, worker, options, group)
|
43
48
|
end
|
44
49
|
unless entity_types.include?(type)
|
@@ -95,6 +95,14 @@ module Treat::Entities::Abilities::Iterable
|
|
95
95
|
as
|
96
96
|
end
|
97
97
|
|
98
|
+
# Returns the first ancestor that has a feature
|
99
|
+
# with the given name, otherwise nil.
|
100
|
+
def ancestor_with_feature(type, feature)
|
101
|
+
each_ancestor(type) do |ancestor|
|
102
|
+
return ancestor if ancestor.has?(feature)
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
98
106
|
alias :ancestors_with_type :ancestors_with_types
|
99
107
|
|
100
108
|
# Number of children that have a given feature.
|
@@ -25,7 +25,7 @@ module Treat::Entities::Abilities::Magical
|
|
25
25
|
def magic(sym, *args)
|
26
26
|
|
27
27
|
@@entities_regexp ||= "(#{Treat::Entities.list.join('|')})"
|
28
|
-
@@cats_regexp ||= "(#{Treat::
|
28
|
+
@@cats_regexp ||= "(#{Treat::Languages::Language::WordCategories.join('|')})"
|
29
29
|
|
30
30
|
method = sym.to_s =~ /entities/ ?
|
31
31
|
sym.to_s.gsub('entities', 'entitys') :
|
data/lib/treat/extractors.rb
CHANGED
data/lib/treat/groupable.rb
CHANGED
@@ -95,8 +95,12 @@ module Treat::Groupable
|
|
95
95
|
attr_accessor :presets
|
96
96
|
# The preset option to use with preset functions.
|
97
97
|
attr_accessor :preset_option
|
98
|
+
# Whether to recurse within multiple targets or not.
|
99
|
+
attr_accessor :recursive
|
98
100
|
end
|
99
101
|
|
102
|
+
self.recursive = false
|
103
|
+
|
100
104
|
# Return the method corresponding to the group.
|
101
105
|
# This method resolves the name of the method
|
102
106
|
# that a group should provide based on the name
|
data/lib/treat/installer.rb
CHANGED
@@ -82,9 +82,13 @@ module Treat::Installer
|
|
82
82
|
begin
|
83
83
|
Gem::Specification.find_by_name('punkt-segmenter')
|
84
84
|
title "Downloading model for the Punkt segmenter for the #{l}."
|
85
|
-
|
85
|
+
# Need fix
|
86
|
+
download_punkt_models([language.to_s])
|
86
87
|
rescue Gem::LoadError; end
|
87
|
-
|
88
|
+
|
89
|
+
# Download reuters models always
|
90
|
+
download_reuters_models
|
91
|
+
|
88
92
|
# If stanford is installed, download models.
|
89
93
|
begin
|
90
94
|
Gem::Specification.find_by_name('stanford-core-nlp')
|
@@ -92,7 +96,10 @@ module Treat::Installer
|
|
92
96
|
"model files for the the #{l}.\n\n"
|
93
97
|
package = (language == :english) ? :english : :all
|
94
98
|
download_stanford(package)
|
95
|
-
rescue Gem::LoadError
|
99
|
+
rescue Gem::LoadError
|
100
|
+
puts 'Stanford-core-nlp gem not installed.'
|
101
|
+
puts 'Skipping download of Stanford models.'
|
102
|
+
end
|
96
103
|
|
97
104
|
title "Install external binary libraries " +
|
98
105
|
"(requires port, apt-get or win-get).\n"
|
@@ -124,7 +131,7 @@ module Treat::Installer
|
|
124
131
|
install_dependencies(false)
|
125
132
|
install_language_dependencies(dep, false)
|
126
133
|
download_stanford(:minimal)
|
127
|
-
download_punkt_models(:english)
|
134
|
+
download_punkt_models([:english])
|
128
135
|
end
|
129
136
|
|
130
137
|
def self.install_dependencies(optionally)
|
@@ -166,7 +173,7 @@ module Treat::Installer
|
|
166
173
|
unless man
|
167
174
|
puts "Skipping installation of the "+
|
168
175
|
"following binaries:\n\n"
|
169
|
-
|
176
|
+
Binary.each do |binary, purpose|
|
170
177
|
puts "- #{binary} to #{purpose}"
|
171
178
|
end
|
172
179
|
return
|
@@ -227,22 +234,35 @@ module Treat::Installer
|
|
227
234
|
|
228
235
|
end
|
229
236
|
|
230
|
-
def self.download_punkt_models(
|
237
|
+
def self.download_punkt_models(languages)
|
238
|
+
languages.map! { |l| "#{l}.yaml" }
|
239
|
+
download_models 'punkt', languages
|
240
|
+
end
|
241
|
+
|
242
|
+
def self.download_reuters_models
|
243
|
+
files = ["industry.xml", "region.xml", "topics.xml"]
|
244
|
+
download_models 'reuters', files
|
245
|
+
end
|
231
246
|
|
232
|
-
|
233
|
-
dest = "#{Treat.models}punkt/"
|
247
|
+
def self.download_models(directory, files)
|
234
248
|
|
235
|
-
|
236
|
-
'http', Server, 'treat/punkt', f, Treat.tmp)
|
249
|
+
dest = "#{Treat.models}#{directory}/"
|
237
250
|
|
238
251
|
unless File.readable?(dest)
|
239
|
-
puts "- Creating directory models
|
252
|
+
puts "- Creating directory models/#{directory} ..."
|
240
253
|
FileUtils.mkdir_p(File.absolute_path(dest))
|
241
254
|
end
|
242
255
|
|
243
|
-
puts "- Copying model file to models/punkt ..."
|
244
|
-
FileUtils.cp(loc, File.join(Paths[:models], 'punkt', f))
|
245
256
|
|
257
|
+
files.each do |file|
|
258
|
+
puts "- Downloading #{file} ..."
|
259
|
+
loc = Treat::Downloader.download(
|
260
|
+
'http', Server, "treat/#{directory}", file, Treat.tmp)
|
261
|
+
puts "- Copying file to models/#{directory} ..."
|
262
|
+
FileUtils.cp(loc, File.join(Paths[:models], directory, file))
|
263
|
+
end
|
264
|
+
|
265
|
+
|
246
266
|
puts "- Cleaning up..."
|
247
267
|
FileUtils.rm_rf(Paths[:tmp] + Server)
|
248
268
|
|
data/lib/treat/kernel.rb
CHANGED
data/lib/treat/languages/english.rb
CHANGED
@@ -31,7 +31,7 @@ class Treat::Languages::English
|
|
31
31
|
:chunkers => [:txt],
|
32
32
|
:parsers => [:stanford, :enju],
|
33
33
|
:segmenters => [:tactful, :punkt, :stanford],
|
34
|
-
:tokenizers => [:
|
34
|
+
:tokenizers => [:ptb, :stanford, :tactful, :punkt]
|
35
35
|
}
|
36
36
|
|
37
37
|
Retrievers = {
|
@@ -6,14 +6,14 @@ class Treat::Languages::French
|
|
6
6
|
Extractors = {}
|
7
7
|
Inflectors = {}
|
8
8
|
Lexicalizers = {
|
9
|
-
:
|
10
|
-
:
|
9
|
+
:taggers => [:stanford],
|
10
|
+
:categorizers => [:from_tag]
|
11
11
|
}
|
12
12
|
Processors = {
|
13
13
|
:chunkers => [:txt],
|
14
14
|
:parsers => [:stanford],
|
15
|
-
:segmenters => [:
|
16
|
-
:tokenizers => [:
|
15
|
+
:segmenters => [:tactful],
|
16
|
+
:tokenizers => [:tactful]
|
17
17
|
}
|
18
18
|
Retrievers = {}
|
19
19
|
|
@@ -6,14 +6,14 @@ class Treat::Languages::German
|
|
6
6
|
Extractors = {}
|
7
7
|
Inflectors = {}
|
8
8
|
Lexicalizers = {
|
9
|
-
:
|
10
|
-
:
|
9
|
+
:taggers => [:stanford],
|
10
|
+
:categorizers => [:from_tag]
|
11
11
|
}
|
12
12
|
Processors = {
|
13
13
|
:chunkers => [:txt],
|
14
14
|
:parsers => [:stanford],
|
15
15
|
:segmenters => [:punkt],
|
16
|
-
:tokenizers => [:
|
16
|
+
:tokenizers => [:tactful]
|
17
17
|
}
|
18
18
|
Retrievers = {}
|
19
19
|
|
@@ -1,11 +1,10 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
# A list of all possible word categories.
|
1
|
+
class Treat::Languages::Language
|
2
|
+
|
4
3
|
WordCategories = [
|
5
4
|
:adjective, :adverb, :noun, :verb, :interjection,
|
6
5
|
:clitic, :coverb, :conjunction, :determiner, :particle,
|
7
6
|
:preposition, :pronoun, :number, :symbol, :punctuation,
|
8
7
|
:complementizer
|
9
8
|
]
|
10
|
-
|
9
|
+
|
11
10
|
end
|
@@ -3,17 +3,19 @@
|
|
3
3
|
# from its tag (e.g. 'S', 'NP', 'VBZ', 'ADV', etc.).
|
4
4
|
class Treat::Lexicalizers::Categorizers::FromTag
|
5
5
|
|
6
|
-
Pttc = Treat::
|
7
|
-
Wttc = Treat::
|
8
|
-
Ptc = Treat::
|
6
|
+
Pttc = Treat::Universalisation::Tags::PhraseTagToCategory
|
7
|
+
Wttc = Treat::Universalisation::Tags::WordTagToCategory
|
8
|
+
Ptc = Treat::Universalisation::Tags::PunctuationToCategory
|
9
9
|
|
10
10
|
# Find the category of the entity from its tag.
|
11
11
|
def self.category(entity, options = {})
|
12
12
|
|
13
13
|
tag = entity.check_has(:tag)
|
14
|
+
|
14
15
|
return :unknown if tag.nil? || tag == '' || entity.type == :symbol
|
15
16
|
return :sentence if tag == 'S' || entity.type == :sentence
|
16
17
|
return :number if entity.type == :number
|
18
|
+
|
17
19
|
return Ptc[entity.to_s] if entity.type == :punctuation
|
18
20
|
|
19
21
|
if entity.is_a?(Treat::Entities::Phrase)
|
@@ -29,15 +31,17 @@ class Treat::Lexicalizers::Categorizers::FromTag
|
|
29
31
|
|
30
32
|
if entity.has?(:tag_set)
|
31
33
|
ts = entity.get(:tag_set)
|
32
|
-
elsif entity.parent_phrase &&
|
33
|
-
entity.parent_phrase.has?(:tag_set)
|
34
|
-
ts = entity.parent_phrase.get(:tag_set)
|
35
34
|
else
|
36
|
-
|
37
|
-
|
38
|
-
|
35
|
+
a = entity.ancestor_with_feature(:phrase, :tag_set)
|
36
|
+
if a
|
37
|
+
ts = a.get(:tag_set)
|
38
|
+
else
|
39
|
+
raise Treat::Exception,
|
40
|
+
"No information can be found regarding "+
|
41
|
+
"which tag set to use."
|
42
|
+
end
|
39
43
|
end
|
40
|
-
|
44
|
+
|
41
45
|
if cat[ts]
|
42
46
|
return cat[ts]
|
43
47
|
else
|
@@ -38,11 +38,14 @@ class Treat::Lexicalizers::Taggers::Stanford
|
|
38
38
|
end
|
39
39
|
|
40
40
|
# Handle tags for sentences and phrases.
|
41
|
-
|
42
41
|
if entity.is_a?(Treat::Entities::Sentence) ||
|
43
42
|
(entity.is_a?(Treat::Entities::Phrase) &&
|
44
43
|
!entity.parent_sentence)
|
45
|
-
|
44
|
+
|
45
|
+
tag_set = Treat::Universalisation::Tags::
|
46
|
+
StanfordTagSetForLanguage[
|
47
|
+
Treat::Languages.describe(lang)]
|
48
|
+
entity.set :tag_set, tag_set
|
46
49
|
end
|
47
50
|
|
48
51
|
if entity.is_a?(Treat::Entities::Sentence)
|
data/lib/treat/lexicalizers.rb
CHANGED
data/lib/treat/processors/parsers/enju.rb
CHANGED
@@ -23,10 +23,10 @@ module Treat::Processors::Parsers::Enju
|
|
23
23
|
@@parser = nil
|
24
24
|
|
25
25
|
# A hash of Enju cat tags mapped to word categories.
|
26
|
-
Ectc = Treat::
|
26
|
+
Ectc = Treat::Universalisation::Tags::EnjuCatToCategory
|
27
27
|
|
28
28
|
# A hash of Enju cat/xcat pairs mapped to PTB tags.
|
29
|
-
Ecxtp = Treat::
|
29
|
+
Ecxtp = Treat::Universalisation::Tags::EnjuCatXcatToPTB
|
30
30
|
|
31
31
|
# Parse the entity into its syntactical
|
32
32
|
# phrases using Enju.
|
@@ -27,6 +27,10 @@ class Treat::Processors::Parsers::Stanford
|
|
27
27
|
lang = entity.language
|
28
28
|
init(lang, options)
|
29
29
|
|
30
|
+
tag_set = Treat::Universalisation::Tags::
|
31
|
+
StanfordTagSetForLanguage[
|
32
|
+
Treat::Languages.describe(lang)]
|
33
|
+
|
30
34
|
text = ::StanfordCoreNLP::Text.new(val)
|
31
35
|
@@parsers[lang].annotate(text)
|
32
36
|
|
@@ -37,17 +41,18 @@ class Treat::Processors::Parsers::Stanford
|
|
37
41
|
tag = s.get(:category).to_s
|
38
42
|
tag_s, tag_opt = *tag.split('-')
|
39
43
|
tag_s ||= 'S'
|
40
|
-
entity.set :tag_set, :penn
|
41
44
|
entity.set :tag, tag_s
|
42
45
|
entity.set :tag_opt, tag_opt if tag_opt
|
43
|
-
recurse(s.get(:tree).children[0], entity)
|
44
|
-
break
|
46
|
+
recurse(s.get(:tree).children[0], entity, tag_set)
|
47
|
+
break #######
|
45
48
|
else
|
46
49
|
recurse(s.get(:tree), entity)
|
47
50
|
end
|
48
51
|
|
49
52
|
end
|
50
53
|
|
54
|
+
entity.set :tag_set, tag_set
|
55
|
+
|
51
56
|
end
|
52
57
|
|
53
58
|
def self.init(lang, options)
|
@@ -76,7 +81,7 @@ class Treat::Processors::Parsers::Stanford
|
|
76
81
|
|
77
82
|
# Helper method which recurses the tree supplied by
|
78
83
|
# the Stanford parser.
|
79
|
-
def self.recurse(java_node, ruby_node, additional_tags = [])
|
84
|
+
def self.recurse(java_node, ruby_node, tag_set, additional_tags = [])
|
80
85
|
|
81
86
|
if java_node.num_children == 0
|
82
87
|
|
@@ -85,10 +90,8 @@ class Treat::Processors::Parsers::Stanford
|
|
85
90
|
tag_s, tag_opt = *tag.split('-')
|
86
91
|
tag_s ||= ''
|
87
92
|
ruby_node.value = java_node.value.to_s.strip
|
88
|
-
ruby_node.set :tag_set, :penn
|
89
93
|
ruby_node.set :tag, tag_s
|
90
94
|
ruby_node.set :tag_opt, tag_opt if tag_opt
|
91
|
-
ruby_node.set :tag_set, :penn
|
92
95
|
ruby_node.set :lemma, label.get(:lemma).to_s
|
93
96
|
|
94
97
|
additional_tags.each do |t|
|
@@ -103,33 +106,35 @@ class Treat::Processors::Parsers::Stanford
|
|
103
106
|
if java_node.num_children == 1 &&
|
104
107
|
java_node.children[0].num_children == 0
|
105
108
|
recurse(java_node.children[0],
|
106
|
-
ruby_node, additional_tags)
|
109
|
+
ruby_node, tag_set, additional_tags)
|
107
110
|
return
|
108
111
|
end
|
109
112
|
|
110
113
|
java_node.children.each do |java_child|
|
114
|
+
|
111
115
|
label = java_child.label
|
112
116
|
tag = label.get(:category).to_s
|
113
117
|
tag_s, tag_opt = *tag.split('-')
|
114
118
|
tag_s ||= ''
|
115
|
-
|
116
|
-
if Treat::
|
119
|
+
|
120
|
+
if Treat::Universalisation::Tags::PhraseTagToCategory[tag_s] &&
|
121
|
+
Treat::Universalisation::Tags::PhraseTagToCategory[tag_s][tag_set]
|
117
122
|
ruby_child = Treat::Entities::Phrase.new
|
118
123
|
else
|
119
124
|
l = java_child.children[0].to_s
|
120
125
|
v = java_child.children[0].value.to_s.strip
|
126
|
+
|
121
127
|
# Mhmhmhmhmhm
|
122
128
|
val = (l == v) ? v : l.split(' ')[-1].gsub(')', '')
|
123
129
|
ruby_child = Treat::Entities::Token.from_string(val)
|
124
130
|
end
|
125
131
|
|
126
|
-
ruby_child.set :tag_set, :penn
|
127
132
|
ruby_child.set :tag, tag_s
|
128
133
|
ruby_child.set :tag_opt, tag_opt if tag_opt
|
129
134
|
ruby_node << ruby_child
|
130
135
|
|
131
136
|
unless java_child.children.empty?
|
132
|
-
recurse(java_child, ruby_child, additional_tags)
|
137
|
+
recurse(java_child, ruby_child, tag_set, additional_tags)
|
133
138
|
end
|
134
139
|
|
135
140
|
end
|
@@ -137,4 +142,5 @@ class Treat::Processors::Parsers::Stanford
|
|
137
142
|
end
|
138
143
|
|
139
144
|
end
|
145
|
+
|
140
146
|
end
|
@@ -38,15 +38,18 @@ module Treat::Processors::Segmenters::Punkt
|
|
38
38
|
|
39
39
|
s = entity.to_s
|
40
40
|
|
41
|
-
# Replace
|
41
|
+
# Replace the point in all floating-point numbers
|
42
|
+
# by ^^; this is a fix since Punkt trips on decimal
|
43
|
+
# numbers.
|
42
44
|
Treat::Helpers::DecimalPointEscaper.escape!(s)
|
43
|
-
s.gsub!(/([^\.\?!]\.|\!|\?)([^\s])/) { $1 + ' ' + $2 }
|
45
|
+
s.gsub!(/([^\.\?!]\.|\!|\?)([^\s"'])/) { $1 + ' ' + $2 }
|
44
46
|
|
45
47
|
result = @@segmenters[lang].
|
46
48
|
sentences_from_text(s,
|
47
49
|
:output => :sentences_text)
|
48
50
|
|
49
51
|
result.each do |sentence|
|
52
|
+
# Unescape the sentence.
|
50
53
|
Treat::Helpers::DecimalPointEscaper.
|
51
54
|
unescape!(sentence)
|
52
55
|
entity << Treat::Entities::Phrase.
|
@@ -29,17 +29,21 @@ module Treat::Processors::Segmenters::Tactful
|
|
29
29
|
entity.check_hasnt_children
|
30
30
|
|
31
31
|
s = entity.to_s
|
32
|
+
|
32
33
|
Treat::Helpers::DecimalPointEscaper.escape!(s)
|
33
34
|
|
34
|
-
s.gsub!(/([^\.\?!]\.|\!|\?)([^\s])/) { $1 + ' ' + $2 }
|
35
|
+
s.gsub!(/([^\.\?!]\.|\!|\?)([^\s"'])/) { $1 + ' ' + $2 }
|
35
36
|
|
36
37
|
@@segmenter ||= TactfulTokenizer::Model.new
|
37
38
|
|
38
39
|
sentences = @@segmenter.tokenize_text(s)
|
40
|
+
|
39
41
|
sentences.each do |sentence|
|
40
42
|
Treat::Helpers::DecimalPointEscaper.unescape!(sentence)
|
43
|
+
puts sentence.to_s if sentence.to_s.include?('staff')
|
41
44
|
entity << Treat::Entities::Phrase.from_string(sentence)
|
42
45
|
end
|
46
|
+
|
43
47
|
end
|
44
48
|
|
45
49
|
end
|
@@ -1,3 +1,4 @@
|
|
1
|
+
# encoding: utf-8
|
1
2
|
# A native rule-based tokenizer based on the one
|
2
3
|
# developed by Robert MacIntyre in 1995 for the Penn
|
3
4
|
# Treebank project. This tokenizer follows the
|
@@ -11,8 +12,6 @@
|
|
11
12
|
# you can redistribute it and/or modify it under the
|
12
13
|
# same terms as Ruby itself.
|
13
14
|
module Treat::Processors::Tokenizers::PTB
|
14
|
-
|
15
|
-
require 'treat/helpers/decimal_point_escaper'
|
16
15
|
|
17
16
|
# Tokenize the entity using a native rule-based algorithm.
|
18
17
|
def self.tokenize(entity, options = {})
|
@@ -33,8 +32,17 @@ module Treat::Processors::Tokenizers::PTB
|
|
33
32
|
|
34
33
|
# Helper method to split the string into tokens.
|
35
34
|
def self.split(string)
|
35
|
+
|
36
36
|
s = " " + string + " "
|
37
|
-
|
37
|
+
|
38
|
+
# Translate some common extended ascii
|
39
|
+
# characters to quotes
|
40
|
+
s.gsub!(/‘/,'`')
|
41
|
+
s.gsub!(/’/,"'")
|
42
|
+
s.gsub!(/“/,"``")
|
43
|
+
s.gsub!(/”/,"''")
|
44
|
+
|
45
|
+
|
38
46
|
s.gsub!(/\s+/," ")
|
39
47
|
s.gsub!(/(\s+)''/,'\1"')
|
40
48
|
s.gsub!(/(\s+)``/,'\1"')
|
@@ -14,8 +14,6 @@
|
|
14
14
|
# Project website: https://github.com/lfcipriani/punkt-segmenter
|
15
15
|
class Treat::Processors::Tokenizers::Punkt
|
16
16
|
|
17
|
-
require 'treat/helpers/decimal_point_escaper'
|
18
|
-
|
19
17
|
SentEndChars = ['.', '?', '!']
|
20
18
|
ReSentEndChars = /[.?!]/
|
21
19
|
InternalPunctuation = [',', ':', ';']
|
@@ -35,7 +33,6 @@ class Treat::Processors::Tokenizers::Punkt
|
|
35
33
|
entity.check_hasnt_children
|
36
34
|
|
37
35
|
s = entity.to_s
|
38
|
-
Treat::Helpers::DecimalPointEscaper.escape!(s)
|
39
36
|
|
40
37
|
s.scan(ReWordTokenizer).each do |token|
|
41
38
|
if SentEndChars.include?(token[-1])
|
@@ -51,6 +51,7 @@ class Treat::Processors::Tokenizers::Tactful
|
|
51
51
|
entity.check_hasnt_children
|
52
52
|
|
53
53
|
s = entity.to_s
|
54
|
+
|
54
55
|
Treat::Helpers::DecimalPointEscaper.escape!(s)
|
55
56
|
|
56
57
|
ReTokenize.each do |rules|
|
@@ -58,6 +59,8 @@ class Treat::Processors::Tokenizers::Tactful
|
|
58
59
|
end
|
59
60
|
|
60
61
|
s.split(' ').each do |token|
|
62
|
+
|
63
|
+
Treat::Helpers::DecimalPointEscaper.unescape!(token)
|
61
64
|
entity << Treat::Entities::Token.
|
62
65
|
from_string(token)
|
63
66
|
end
|
@@ -1,14 +1,20 @@
|
|
1
|
-
module Treat::
|
1
|
+
module Treat::Universalisation::Tags
|
2
2
|
|
3
3
|
ClawsC5 = 0
|
4
4
|
Brown = 1
|
5
5
|
Penn = 2
|
6
|
-
|
6
|
+
Stuttgart = 3
|
7
7
|
PennChinese = 4
|
8
|
-
|
8
|
+
Paris7 = 5
|
9
|
+
|
10
|
+
StanfordTagSetForLanguage = {
|
11
|
+
:french => :paris7,
|
12
|
+
:english => :penn,
|
13
|
+
:german => :stuttgart
|
14
|
+
}
|
9
15
|
|
10
16
|
PTBClauseTagDescription = [
|
11
|
-
['S', '
|
17
|
+
['S', 'Paris7 declarative clause'],
|
12
18
|
['SBAR', 'Clause introduced by a (possibly empty) subordinating conjunction'],
|
13
19
|
['SBARQ', 'Direct question introduced by a wh-word or a wh-phrase'],
|
14
20
|
['SINV', 'Inverted declarative sentence'],
|
@@ -26,30 +32,33 @@ module Treat::Linguistics::Tags
|
|
26
32
|
|
27
33
|
AlignedPhraseTags =
|
28
34
|
[
|
29
|
-
'
|
30
|
-
'
|
31
|
-
'Conjunction phrase', ['', '', 'CONJP'],
|
32
|
-
'Fragment', ['', '', 'FRAG'],
|
33
|
-
'
|
34
|
-
'List marker', ['', '', 'LST'],
|
35
|
-
'Not a phrase', ['', '', 'NAC'],
|
36
|
-
'Noun phrase', ['', '', 'NP'],
|
37
|
-
'
|
38
|
-
'
|
39
|
-
'
|
40
|
-
'
|
41
|
-
'
|
42
|
-
'
|
43
|
-
'
|
44
|
-
'
|
45
|
-
'
|
46
|
-
'
|
47
|
-
'
|
48
|
-
'Wh
|
49
|
-
'
|
50
|
-
'
|
51
|
-
'
|
52
|
-
'
|
35
|
+
'Adjectival phrase', ['', '', 'ADJP', '', '', 'AP'],
|
36
|
+
'Adverbial phrase', ['', '', 'ADVP', '', '', 'AdP'],
|
37
|
+
'Conjunction phrase', ['', '', 'CONJP', '', '', 'Ssub'],
|
38
|
+
'Fragment', ['', '', 'FRAG', '', '', ''],
|
39
|
+
'Interjectional phrase', ['', '', 'INTJ', '', '', ''],
|
40
|
+
'List marker', ['', '', 'LST', '', '', ''],
|
41
|
+
'Not a phrase', ['', '', 'NAC', '', '', ''],
|
42
|
+
'Noun phrase', ['', '', 'NP', '', '', 'NP'],
|
43
|
+
'Verbal nucleus', ['', '', '', '', '', 'VN'],
|
44
|
+
'Head of noun phrase', ['', '', 'NX', '', '', ''],
|
45
|
+
'Prepositional phrase', ['', '', 'PP', '', '', 'PP'],
|
46
|
+
'Parenthetical', ['', '', 'PRN', '', '', ''],
|
47
|
+
'Particle', ['', '', 'PRT', '', '', ''],
|
48
|
+
'Participial phrase', ['', '', '', '', '', 'VPart'],
|
49
|
+
'Quantifier phrase', ['', '', 'QP', '', '', ''],
|
50
|
+
'Relative clause', ['', '', 'RRC', '', '', 'Srel'],
|
51
|
+
'Coordinated phrase', ['', '', 'UCP', '', '', 'COORD'],
|
52
|
+
'Infinitival phrase', ['', '', '', '', '', 'VPinf'],
|
53
|
+
'Verb phrase', ['', '', 'VP', '', '', ''],
|
54
|
+
'Wh adjective phrase', ['', '', 'WHADJP', '', '', ''],
|
55
|
+
'Wh adverb phrase', ['', '', 'WHAVP', '', '', ''],
|
56
|
+
'Wh noun phrase', ['', '', 'WHNP', '', '', ''],
|
57
|
+
'Wh prepositional phrase', ['', '', 'WHPP', '', '', ''],
|
58
|
+
'Unknown', ['', '', 'X', '', '', ''],
|
59
|
+
'Phrase', ['', '', 'P', '', '', 'Sint'],
|
60
|
+
'Sentence', ['', '', 'S', '', '', 'SENT'],
|
61
|
+
'Phrase', ['', '', 'SBAR', '', '', ''] # Fix
|
53
62
|
]
|
54
63
|
|
55
64
|
# A description of Enju categories.
|
@@ -139,12 +148,12 @@ module Treat::Linguistics::Tags
|
|
139
148
|
# JRS?
|
140
149
|
|
141
150
|
|
142
|
-
|
151
|
+
Paris7WordTagToCategory = {
|
143
152
|
'C' => :complementizer,
|
144
153
|
'PN' => :punctuation,
|
145
154
|
'SC' => :conjunction
|
146
155
|
}
|
147
|
-
|
156
|
+
|
148
157
|
PunctuationToCategory = {
|
149
158
|
'.' => :period,
|
150
159
|
',' => :comma,
|
@@ -152,9 +161,8 @@ module Treat::Linguistics::Tags
|
|
152
161
|
':' => :colon,
|
153
162
|
'!' => :exclamation,
|
154
163
|
'?' => :interrogation,
|
155
|
-
'"' => :
|
156
|
-
"'" => :
|
157
|
-
|
164
|
+
'"' => :double_quote,
|
165
|
+
"'" => :single_quote,
|
158
166
|
'$' => :dollar,
|
159
167
|
'%' => :percent,
|
160
168
|
'#' => :hash,
|
@@ -227,7 +235,7 @@ module Treat::Linguistics::Tags
|
|
227
235
|
'Determiner, possessive, second', ['DPS', 'PP$', 'PRPS', '', '', 'D'],
|
228
236
|
'Determiner, question', ['DTQ', 'WDT', 'WDT', '', 'DT', 'D'],
|
229
237
|
'Determiner, possessive & question', ['DTQ', 'WP$', 'WP$', '', '', 'D'],
|
230
|
-
|
238
|
+
'Interjection', ['', '', '', '', '', 'I'],
|
231
239
|
'Localizer', ['', '', '', '', 'LC'],
|
232
240
|
|
233
241
|
'Measure word', ['', '', '', '', 'M'],
|
@@ -366,11 +374,25 @@ module Treat::Linguistics::Tags
|
|
366
374
|
'Verb, ? as main verb', ['', '', '', '', 'VE'], # ?
|
367
375
|
'Verb, ????', ['', '', '', '', 'VC'] # ?
|
368
376
|
]
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
377
|
+
|
378
|
+
# Paris7 Treebank functional tags
|
379
|
+
=begin
|
380
|
+
SUJ (subject)
|
381
|
+
OBJ (direct object)
|
382
|
+
ATS (predicative complement of a subject)
|
383
|
+
ATO (predicative complement of a direct object)
|
384
|
+
MOD (modifier or adjunct)
|
385
|
+
A-OBJ (indirect complement introduced by à)
|
386
|
+
DE-OBJ (indirect complement introduced by de)
|
387
|
+
P-OBJ (indirect complement introduced by another preposition)
|
388
|
+
=end
|
389
|
+
|
390
|
+
# !! Extremely ugly code follows.
|
391
|
+
|
392
|
+
# Generate word tag -> category hash.
|
393
|
+
wttc = {}
|
394
|
+
|
395
|
+
Treat::Universalisation::Tags::AlignedWordTags.each_slice(2) do |desc, tags|
|
374
396
|
|
375
397
|
category = desc.gsub(',', ' ,').
|
376
398
|
split(' ')[0].downcase.intern
|
@@ -378,32 +400,41 @@ module Treat::Linguistics::Tags
|
|
378
400
|
wttc[tags[ClawsC5]] ||= {}
|
379
401
|
wttc[tags[Brown]] ||= {}
|
380
402
|
wttc[tags[Penn]] ||= {}
|
381
|
-
wttc[tags[
|
403
|
+
wttc[tags[Stuttgart]] ||= {}
|
382
404
|
wttc[tags[PennChinese]] ||= {}
|
383
|
-
wttc[tags[
|
405
|
+
wttc[tags[Paris7]] ||= {}
|
384
406
|
|
385
407
|
wttc[tags[ClawsC5]][:claws_5] = category
|
386
408
|
wttc[tags[Brown]][:brown] = category
|
387
409
|
wttc[tags[Penn]][:penn] = category
|
388
|
-
wttc[tags[
|
410
|
+
wttc[tags[Stuttgart]][:stuttgart] = category if tags[Stuttgart]
|
389
411
|
wttc[tags[PennChinese]][:penn_chinese] = category if tags[PennChinese]
|
390
|
-
wttc[tags[
|
412
|
+
wttc[tags[Paris7]][:paris7] = category if tags[Paris7]
|
391
413
|
|
392
414
|
end
|
415
|
+
|
393
416
|
# A hash converting word tags to word categories.
|
394
417
|
WordTagToCategory = wttc
|
395
418
|
|
396
419
|
# A hash converting phrase tag to categories.
|
397
420
|
pttc = {}
|
398
|
-
|
421
|
+
|
422
|
+
Treat::Universalisation::Tags::AlignedPhraseTags.each_slice(2) do |desc, tags|
|
423
|
+
|
399
424
|
category = desc.gsub(',', ' ,').gsub(' ', '_').downcase.intern
|
425
|
+
|
400
426
|
pttc[tags[Penn]] ||= {};
|
427
|
+
pttc[tags[Paris7]] ||= {};
|
428
|
+
|
429
|
+
pttc[tags[Penn]][:penn] = category
|
430
|
+
pttc[tags[Paris7]][:paris7] = category
|
431
|
+
|
401
432
|
# Not yet for other tag sets.
|
402
433
|
#pttc[tags[0]][:claws_5] = category
|
403
434
|
#pttc[tags[1]][:brown] = category
|
404
|
-
|
435
|
+
|
405
436
|
end
|
406
|
-
|
437
|
+
|
407
438
|
# A hash converting phrase tags to phrase categories.
|
408
439
|
PhraseTagToCategory = pttc
|
409
440
|
|
data/lib/treat.rb
CHANGED
@@ -10,7 +10,7 @@ module Treat
|
|
10
10
|
end
|
11
11
|
|
12
12
|
# The current version of Treat.
|
13
|
-
VERSION = "1.0.
|
13
|
+
VERSION = "1.0.5"
|
14
14
|
|
15
15
|
# Add methods to handle syntactic sugar,
|
16
16
|
# language configuration options, and paths.
|
@@ -44,7 +44,7 @@ module Treat
|
|
44
44
|
require 'treat/kernel'
|
45
45
|
require 'treat/downloader'
|
46
46
|
require 'treat/languages'
|
47
|
-
require 'treat/
|
47
|
+
require 'treat/universalisation'
|
48
48
|
require 'treat/entities'
|
49
49
|
require 'treat/categories'
|
50
50
|
require 'treat/data_set'
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: treat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-05-17 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rubyzip
|
@@ -161,6 +161,7 @@ files:
|
|
161
161
|
- lib/treat/languages/german.rb
|
162
162
|
- lib/treat/languages/greek.rb
|
163
163
|
- lib/treat/languages/italian.rb
|
164
|
+
- lib/treat/languages/language.rb
|
164
165
|
- lib/treat/languages/list.txt
|
165
166
|
- lib/treat/languages/polish.rb
|
166
167
|
- lib/treat/languages/portuguese.rb
|
@@ -176,9 +177,6 @@ files:
|
|
176
177
|
- lib/treat/lexicalizers/taggers/lingua.rb
|
177
178
|
- lib/treat/lexicalizers/taggers/stanford.rb
|
178
179
|
- lib/treat/lexicalizers.rb
|
179
|
-
- lib/treat/linguistics/categories.rb
|
180
|
-
- lib/treat/linguistics/tags.rb
|
181
|
-
- lib/treat/linguistics.rb
|
182
180
|
- lib/treat/loaders/linguistics.rb
|
183
181
|
- lib/treat/loaders/stanford.rb
|
184
182
|
- lib/treat/object.rb
|
@@ -190,7 +188,6 @@ files:
|
|
190
188
|
- lib/treat/processors/segmenters/punkt.rb
|
191
189
|
- lib/treat/processors/segmenters/stanford.rb
|
192
190
|
- lib/treat/processors/segmenters/tactful.rb
|
193
|
-
- lib/treat/processors/tokenizers/perl.rb
|
194
191
|
- lib/treat/processors/tokenizers/ptb.rb
|
195
192
|
- lib/treat/processors/tokenizers/punkt.rb
|
196
193
|
- lib/treat/processors/tokenizers/stanford.rb
|
@@ -202,6 +199,9 @@ files:
|
|
202
199
|
- lib/treat/retrievers.rb
|
203
200
|
- lib/treat/server.rb
|
204
201
|
- lib/treat/tree.rb
|
202
|
+
- lib/treat/universalisation/encodings.rb
|
203
|
+
- lib/treat/universalisation/tags.rb
|
204
|
+
- lib/treat/universalisation.rb
|
205
205
|
- lib/treat.rb
|
206
206
|
- spec/collection.rb
|
207
207
|
- spec/document.rb
|
data/lib/treat/linguistics.rb
DELETED
@@ -1,132 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
#
|
3
|
-
# Tokenize the entity using a native rule-based
|
4
|
-
# algorithm. This tokenizer is a port from an
|
5
|
-
# unknown Perl module, which I have lifted from
|
6
|
-
# the 'rbtagger' gem.
|
7
|
-
#
|
8
|
-
# Author: Todd A. Fisher
|
9
|
-
#
|
10
|
-
# This code is free to use under the terms of
|
11
|
-
# the MIT license.
|
12
|
-
#
|
13
|
-
# Original project website:
|
14
|
-
#
|
15
|
-
# https://github.com/taf2/rb-brill-tagger
|
16
|
-
module Treat::Processors::Tokenizers::Perl
|
17
|
-
|
18
|
-
require 'treat/helpers/decimal_point_escaper'
|
19
|
-
|
20
|
-
# Tokenize the entity using a rule-based algorithm
|
21
|
-
# ported from Perl by Todd A. Fisher.
|
22
|
-
#
|
23
|
-
# Options: none.
|
24
|
-
def self.tokenize(entity, options = {})
|
25
|
-
|
26
|
-
entity.check_hasnt_children
|
27
|
-
s = entity.to_s
|
28
|
-
|
29
|
-
tokens = get_tokens(entity.to_s)
|
30
|
-
tokens[1..-1].each do |token|
|
31
|
-
next if token =~ /^\s*$/
|
32
|
-
entity << Treat::Entities::Token.
|
33
|
-
from_string(token)
|
34
|
-
end
|
35
|
-
|
36
|
-
end
|
37
|
-
|
38
|
-
# Helper method to perform the tokenization.
|
39
|
-
def self.get_tokens(string)
|
40
|
-
|
41
|
-
# Normalize all whitespace
|
42
|
-
text = string.gsub(/\s+/,' ')
|
43
|
-
|
44
|
-
# Replace all decimal points by ^^
|
45
|
-
Treat::Helpers::DecimalPointEscaper.escape!(text)
|
46
|
-
|
47
|
-
=begin
|
48
|
-
|
49
|
-
# Translate some common extended ascii
|
50
|
-
# characters to quotes
|
51
|
-
text.gsub!(/‘/,'`')
|
52
|
-
text.gsub!(/’/,"'")
|
53
|
-
text.gsub!(/“/,"``")
|
54
|
-
text.gsub!(/”/,"''")
|
55
|
-
|
56
|
-
# Attempt to get correct directional quotes
|
57
|
-
# s{\"\b} { `` }g;
|
58
|
-
text.gsub!(/\"\b/,' `` ')
|
59
|
-
# s{\b\"} { '' }g;
|
60
|
-
text.gsub!(/\b\"/," '' ")
|
61
|
-
#s{\"(?=\s)} { '' }g;
|
62
|
-
text.gsub!(/\"(?=\s)/," '' ")
|
63
|
-
#s{\"} { `` }g;
|
64
|
-
text.gsub!(/\"(?=\s)/," `` ")
|
65
|
-
=end
|
66
|
-
|
67
|
-
# Isolate ellipses
|
68
|
-
# s{\.\.\.} { ... }g;
|
69
|
-
text.gsub!(/\.\.\./,' ... ')
|
70
|
-
# Isolate any embedded punctuation chars
|
71
|
-
# s{([,;:\@\#\$\%&])} { $1 }g;
|
72
|
-
text.gsub!(/([,;:\@\#\$\%&])/, ' \1 ')
|
73
|
-
|
74
|
-
# Assume sentence tokenization has been
|
75
|
-
# done first, so split FINAL
|
76
|
-
# periods only.
|
77
|
-
# s/ ([^.]) \. ([\]\)\}\>\"\']*)
|
78
|
-
# [ \t]* $ /$1 .$2 /gx;
|
79
|
-
text.gsub!(/ ([^.]) \. ([\]\)\}\>\"\']*) [ \t]* $ /x, '\1 .\2 ')
|
80
|
-
# however, we may as well split ALL
|
81
|
-
# question marks and exclamation points,
|
82
|
-
# since they shouldn't have the abbrev.
|
83
|
-
# -marker ambiguity problem
|
84
|
-
#s{([?!])} { $1 }g;
|
85
|
-
text.gsub!(/([?!])/, ' \1 ')
|
86
|
-
# parentheses, brackets, etc.
|
87
|
-
#s{([\]\[\(\)\{\}\<\>])} { $1 }g;
|
88
|
-
text.gsub!(/([\]\[\(\)\{\}\<\>])/,' \1 ')
|
89
|
-
#s/(-{2,})/ $1 /g;
|
90
|
-
text.gsub!(/(-{2,})/,' \1 ')
|
91
|
-
|
92
|
-
# Add a space to the beginning and end of
|
93
|
-
# each line, to reduce # of regexps below.
|
94
|
-
#s/$/ /;
|
95
|
-
text.gsub!(/$/," ")
|
96
|
-
#s/^/ /;
|
97
|
-
text.gsub!(/^/," ")
|
98
|
-
|
99
|
-
# possessive or close-single-quote
|
100
|
-
#s/\([^\']\)\' /$1 \' /g;
|
101
|
-
text.gsub!(/\([^\']\)\' /,%q(\1 ' ))
|
102
|
-
# as in it's, I'm, we'd
|
103
|
-
#s/\'([smd]) / \'$1 /ig;
|
104
|
-
text.gsub!(/\'([smd]) /i,%q( '\1 ))
|
105
|
-
#s/\'(ll|re|ve) / \'$1 /ig;
|
106
|
-
text.gsub!(/\'(ll|re|ve) /i,%q( '\1 ))
|
107
|
-
#s/n\'t / n\'t /ig;
|
108
|
-
text.gsub!(/n\'t /i," n't ")
|
109
|
-
|
110
|
-
#s/ (can)(not) / $1 $2 /ig;
|
111
|
-
text.gsub!(/ (can)(not) /i,' \1 \2 ')
|
112
|
-
#s/ (d\')(ye) / $1 $2 /ig;
|
113
|
-
text.gsub!(/ (d\')(ye) /i,' \1 \2 ')
|
114
|
-
#s/ (gim)(me) / $1 $2 /ig;
|
115
|
-
text.gsub!(/ (gim)(me) /i,' \1 \2 ')
|
116
|
-
#s/ (gon)(na) / $1 $2 /ig;
|
117
|
-
text.gsub!(/ (gon)(na) /i,' \1 \2 ')
|
118
|
-
#s/ (got)(ta) / $1 $2 /ig;
|
119
|
-
text.gsub!(/ (got)(ta) /i,' \1 \2 ')
|
120
|
-
#s/ (lem)(me) / $1 $2 /ig;
|
121
|
-
text.gsub!(/ (lem)(me) /i,' \1 \2 ')
|
122
|
-
#s/ (more)(\'n) / $1 $2 /ig;
|
123
|
-
text.gsub!(/ (more)(\'n) /i,' \1 \2 ')
|
124
|
-
#s/ (\'t)(is|was) / $1 $2 /ig;
|
125
|
-
text.gsub!(/ (\'t)(is|was) /i,' \1 \2 ')
|
126
|
-
#s/ (wan)(na) / $1 $2 /ig;
|
127
|
-
text.gsub!(/ (wan)(na) /i,' \1 \2 ')
|
128
|
-
text.split(/\s/)
|
129
|
-
|
130
|
-
end
|
131
|
-
|
132
|
-
end
|