treat 1.0.4 → 1.0.5
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +0 -1
- data/files/INFO +1 -1
- data/lib/treat/entities/abilities/buildable.rb +2 -6
- data/lib/treat/entities/abilities/checkable.rb +2 -2
- data/lib/treat/entities/abilities/delegatable.rb +2 -2
- data/lib/treat/entities/abilities/doable.rb +6 -1
- data/lib/treat/entities/abilities/iterable.rb +8 -0
- data/lib/treat/entities/abilities/magical.rb +1 -1
- data/lib/treat/extractors.rb +1 -1
- data/lib/treat/formatters/visualizers/standoff.rb +1 -1
- data/lib/treat/groupable.rb +4 -0
- data/lib/treat/installer.rb +33 -13
- data/lib/treat/kernel.rb +0 -4
- data/lib/treat/languages/arabic.rb +1 -1
- data/lib/treat/languages/chinese.rb +1 -1
- data/lib/treat/languages/dutch.rb +1 -1
- data/lib/treat/languages/english.rb +1 -1
- data/lib/treat/languages/french.rb +4 -4
- data/lib/treat/languages/german.rb +3 -3
- data/lib/treat/languages/italian.rb +1 -1
- data/lib/treat/{linguistics/categories.rb → languages/language.rb} +3 -4
- data/lib/treat/languages/polish.rb +1 -1
- data/lib/treat/languages/portuguese.rb +1 -1
- data/lib/treat/languages/russian.rb +1 -1
- data/lib/treat/languages/spanish.rb +1 -1
- data/lib/treat/languages/swedish.rb +1 -1
- data/lib/treat/lexicalizers/categorizers/from_tag.rb +14 -10
- data/lib/treat/lexicalizers/taggers/brill.rb +1 -1
- data/lib/treat/lexicalizers/taggers/stanford.rb +5 -2
- data/lib/treat/lexicalizers.rb +2 -1
- data/lib/treat/processors/parsers/enju.rb +2 -2
- data/lib/treat/processors/parsers/stanford.rb +17 -11
- data/lib/treat/processors/segmenters/punkt.rb +5 -2
- data/lib/treat/processors/segmenters/tactful.rb +5 -1
- data/lib/treat/processors/tokenizers/ptb.rb +11 -3
- data/lib/treat/processors/tokenizers/punkt.rb +0 -3
- data/lib/treat/processors/tokenizers/tactful.rb +3 -0
- data/lib/treat/universalisation/encodings.rb +12 -0
- data/lib/treat/{linguistics → universalisation}/tags.rb +77 -46
- data/lib/treat/universalisation.rb +9 -0
- data/lib/treat.rb +2 -2
- metadata +6 -6
- data/lib/treat/linguistics.rb +0 -9
- data/lib/treat/processors/tokenizers/perl.rb +0 -132
data/LICENSE
CHANGED
@@ -20,7 +20,6 @@ Non-trivial amount of code has been incorporated and modified from other librari
|
|
20
20
|
- formatters/readers/odt.rb - Mark Watson (GPL license)
|
21
21
|
- processors/tokenizers/macintyre.rb - Utiyama Masao (Ruby License)
|
22
22
|
- processors/tokenizers/tactful.rb - Matthew Bunday (GPL license)
|
23
|
-
- processors/tokenizers/perl.rb - Todd A. Fisher (MIT License)
|
24
23
|
- processors/tokenizers/punkt.rb - Steven Bird Edward Loper and Joel Nothman (Apache 2.0 license)
|
25
24
|
- extractors/topics/reuters.rb - Mark Watson (GPL license)
|
26
25
|
- inflectors/declensions/english.rb - Thomas Sawyer (MIT license)
|
data/files/INFO
CHANGED
@@ -1 +1 @@
|
|
1
|
-
This is a folder containing the files downloaded by Treat.
|
1
|
+
This is a folder containing the files downloaded by Treat from the internet.
|
@@ -4,12 +4,11 @@
|
|
4
4
|
# is pretty much self-explanatory.
|
5
5
|
module Treat::Entities::Abilities::Buildable
|
6
6
|
|
7
|
-
require 'treat/helpers/decimal_point_escaper'
|
8
7
|
require 'fileutils'
|
9
8
|
|
10
9
|
# Simple regexps to match common entities.
|
11
10
|
WordRegexp = /^[[:alpha:]\-']+$/
|
12
|
-
NumberRegexp = /^#?([0-9]+)(
|
11
|
+
NumberRegexp = /^#?([0-9]+)(\.[0-9]+)?$/
|
13
12
|
PunctRegexp = /^[[:punct:]\$]+$/
|
14
13
|
UriRegexp = /^(http|https):\/\/[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(([0-9]{1,5})?\/.*)?$/ix
|
15
14
|
EmailRegexp = /.+\@.+\..+/
|
@@ -57,8 +56,6 @@ module Treat::Entities::Abilities::Buildable
|
|
57
56
|
# instead of from_string directly).
|
58
57
|
def from_string(string, enforce_type = false)
|
59
58
|
|
60
|
-
Treat::Helpers::DecimalPointEscaper.escape!(string)
|
61
|
-
|
62
59
|
enforce_type = true if caller_method == :build
|
63
60
|
|
64
61
|
unless self == Treat::Entities::Entity
|
@@ -74,6 +71,7 @@ module Treat::Entities::Abilities::Buildable
|
|
74
71
|
end
|
75
72
|
|
76
73
|
e
|
74
|
+
|
77
75
|
end
|
78
76
|
|
79
77
|
# Build a document from an URL.
|
@@ -116,7 +114,6 @@ module Treat::Entities::Abilities::Buildable
|
|
116
114
|
"a numeric object."
|
117
115
|
end
|
118
116
|
n = numeric.to_s
|
119
|
-
Treat::Helpers::DecimalPointEscaper.unescape!(n)
|
120
117
|
Treat::Entities::Number.new(n)
|
121
118
|
end
|
122
119
|
|
@@ -319,7 +316,6 @@ module Treat::Entities::Abilities::Buildable
|
|
319
316
|
end
|
320
317
|
|
321
318
|
def create_collection(fv)
|
322
|
-
debug("Creating new collection in directory #{fv}.")
|
323
319
|
FileUtils.mkdir(fv)
|
324
320
|
Treat::Entities::Collection.new(fv)
|
325
321
|
end
|
@@ -24,8 +24,8 @@ module Treat::Entities::Abilities::Checkable
|
|
24
24
|
return unless has_children?
|
25
25
|
raise Treat::Exception,
|
26
26
|
"Warning: can't #{caller_method(2)} "+
|
27
|
-
"
|
28
|
-
"
|
27
|
+
"the text \"#{short_value}\", because it " +
|
28
|
+
"already has children."
|
29
29
|
end
|
30
30
|
|
31
31
|
end
|
@@ -104,9 +104,9 @@ module Treat::Entities::Abilities::Delegatable
|
|
104
104
|
if !klass[g] || !klass[g][0]
|
105
105
|
d = ucc(cl(group))
|
106
106
|
d.gsub!('_', ' ')
|
107
|
-
d =
|
107
|
+
d = d[0..-2]
|
108
108
|
raise Treat::Exception, "No #{d}" +
|
109
|
-
"
|
109
|
+
" is available for the " +
|
110
110
|
"#{lang.to_s.capitalize} language."
|
111
111
|
end
|
112
112
|
return klass[g][0]
|
@@ -37,8 +37,13 @@ module Treat::Entities::Abilities::Doable
|
|
37
37
|
end
|
38
38
|
if f || entity_types.include?(:entity)
|
39
39
|
send(task, worker, options)
|
40
|
+
if group.recursive
|
41
|
+
each do |entity|
|
42
|
+
entity.do_task(task, worker, options, group)
|
43
|
+
end
|
44
|
+
end
|
40
45
|
else
|
41
|
-
|
46
|
+
each do |entity|
|
42
47
|
entity.do_task(task, worker, options, group)
|
43
48
|
end
|
44
49
|
unless entity_types.include?(type)
|
@@ -95,6 +95,14 @@ module Treat::Entities::Abilities::Iterable
|
|
95
95
|
as
|
96
96
|
end
|
97
97
|
|
98
|
+
# Returns the first ancestor that has a feature
|
99
|
+
# with the given name, otherwise nil.
|
100
|
+
def ancestor_with_feature(type, feature)
|
101
|
+
each_ancestor(type) do |ancestor|
|
102
|
+
return ancestor if ancestor.has?(feature)
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
98
106
|
alias :ancestors_with_type :ancestors_with_types
|
99
107
|
|
100
108
|
# Number of children that have a given feature.
|
@@ -25,7 +25,7 @@ module Treat::Entities::Abilities::Magical
|
|
25
25
|
def magic(sym, *args)
|
26
26
|
|
27
27
|
@@entities_regexp ||= "(#{Treat::Entities.list.join('|')})"
|
28
|
-
@@cats_regexp ||= "(#{Treat::
|
28
|
+
@@cats_regexp ||= "(#{Treat::Languages::Language::WordCategories.join('|')})"
|
29
29
|
|
30
30
|
method = sym.to_s =~ /entities/ ?
|
31
31
|
sym.to_s.gsub('entities', 'entitys') :
|
data/lib/treat/extractors.rb
CHANGED
data/lib/treat/groupable.rb
CHANGED
@@ -95,8 +95,12 @@ module Treat::Groupable
|
|
95
95
|
attr_accessor :presets
|
96
96
|
# The preset option to use with preset functions.
|
97
97
|
attr_accessor :preset_option
|
98
|
+
# Whether to recurse within multiple targets or not.
|
99
|
+
attr_accessor :recursive
|
98
100
|
end
|
99
101
|
|
102
|
+
self.recursive = false
|
103
|
+
|
100
104
|
# Return the method corresponding to the group.
|
101
105
|
# This method resolves the name of the method
|
102
106
|
# that a group should provide based on the name
|
data/lib/treat/installer.rb
CHANGED
@@ -82,9 +82,13 @@ module Treat::Installer
|
|
82
82
|
begin
|
83
83
|
Gem::Specification.find_by_name('punkt-segmenter')
|
84
84
|
title "Downloading model for the Punkt segmenter for the #{l}."
|
85
|
-
|
85
|
+
# Need fix
|
86
|
+
download_punkt_models([language.to_s])
|
86
87
|
rescue Gem::LoadError; end
|
87
|
-
|
88
|
+
|
89
|
+
# Download reuters models always
|
90
|
+
download_reuters_models
|
91
|
+
|
88
92
|
# If stanford is installed, download models.
|
89
93
|
begin
|
90
94
|
Gem::Specification.find_by_name('stanford-core-nlp')
|
@@ -92,7 +96,10 @@ module Treat::Installer
|
|
92
96
|
"model files for the the #{l}.\n\n"
|
93
97
|
package = (language == :english) ? :english : :all
|
94
98
|
download_stanford(package)
|
95
|
-
rescue Gem::LoadError
|
99
|
+
rescue Gem::LoadError
|
100
|
+
puts 'Stanford-core-nlp gem not installed.'
|
101
|
+
puts 'Skipping download of Stanford models.'
|
102
|
+
end
|
96
103
|
|
97
104
|
title "Install external binary libraries " +
|
98
105
|
"(requires port, apt-get or win-get).\n"
|
@@ -124,7 +131,7 @@ module Treat::Installer
|
|
124
131
|
install_dependencies(false)
|
125
132
|
install_language_dependencies(dep, false)
|
126
133
|
download_stanford(:minimal)
|
127
|
-
download_punkt_models(:english)
|
134
|
+
download_punkt_models([:english])
|
128
135
|
end
|
129
136
|
|
130
137
|
def self.install_dependencies(optionally)
|
@@ -166,7 +173,7 @@ module Treat::Installer
|
|
166
173
|
unless man
|
167
174
|
puts "Skipping installation of the "+
|
168
175
|
"following binaries:\n\n"
|
169
|
-
|
176
|
+
Binary.each do |binary, purpose|
|
170
177
|
puts "- #{binary} to #{purpose}"
|
171
178
|
end
|
172
179
|
return
|
@@ -227,22 +234,35 @@ module Treat::Installer
|
|
227
234
|
|
228
235
|
end
|
229
236
|
|
230
|
-
def self.download_punkt_models(
|
237
|
+
def self.download_punkt_models(languages)
|
238
|
+
languages.map! { |l| "#{l}.yaml" }
|
239
|
+
download_models 'punkt', languages
|
240
|
+
end
|
241
|
+
|
242
|
+
def self.download_reuters_models
|
243
|
+
files = ["industry.xml", "region.xml", "topics.xml"]
|
244
|
+
download_models 'reuters', files
|
245
|
+
end
|
231
246
|
|
232
|
-
|
233
|
-
dest = "#{Treat.models}punkt/"
|
247
|
+
def self.download_models(directory, files)
|
234
248
|
|
235
|
-
|
236
|
-
'http', Server, 'treat/punkt', f, Treat.tmp)
|
249
|
+
dest = "#{Treat.models}#{directory}/"
|
237
250
|
|
238
251
|
unless File.readable?(dest)
|
239
|
-
puts "- Creating directory models
|
252
|
+
puts "- Creating directory models/#{directory} ..."
|
240
253
|
FileUtils.mkdir_p(File.absolute_path(dest))
|
241
254
|
end
|
242
255
|
|
243
|
-
puts "- Copying model file to models/punkt ..."
|
244
|
-
FileUtils.cp(loc, File.join(Paths[:models], 'punkt', f))
|
245
256
|
|
257
|
+
files.each do |file|
|
258
|
+
puts "- Downloading #{file} ..."
|
259
|
+
loc = Treat::Downloader.download(
|
260
|
+
'http', Server, "treat/#{directory}", file, Treat.tmp)
|
261
|
+
puts "- Copying file to models/#{directory} ..."
|
262
|
+
FileUtils.cp(loc, File.join(Paths[:models], directory, file))
|
263
|
+
end
|
264
|
+
|
265
|
+
|
246
266
|
puts "- Cleaning up..."
|
247
267
|
FileUtils.rm_rf(Paths[:tmp] + Server)
|
248
268
|
|
data/lib/treat/kernel.rb
CHANGED
@@ -31,7 +31,7 @@ class Treat::Languages::English
|
|
31
31
|
:chunkers => [:txt],
|
32
32
|
:parsers => [:stanford, :enju],
|
33
33
|
:segmenters => [:tactful, :punkt, :stanford],
|
34
|
-
:tokenizers => [:
|
34
|
+
:tokenizers => [:ptb, :stanford, :tactful, :punkt]
|
35
35
|
}
|
36
36
|
|
37
37
|
Retrievers = {
|
@@ -6,14 +6,14 @@ class Treat::Languages::French
|
|
6
6
|
Extractors = {}
|
7
7
|
Inflectors = {}
|
8
8
|
Lexicalizers = {
|
9
|
-
:
|
10
|
-
:
|
9
|
+
:taggers => [:stanford],
|
10
|
+
:categorizers => [:from_tag]
|
11
11
|
}
|
12
12
|
Processors = {
|
13
13
|
:chunkers => [:txt],
|
14
14
|
:parsers => [:stanford],
|
15
|
-
:segmenters => [:
|
16
|
-
:tokenizers => [:
|
15
|
+
:segmenters => [:tactful],
|
16
|
+
:tokenizers => [:tactful]
|
17
17
|
}
|
18
18
|
Retrievers = {}
|
19
19
|
|
@@ -6,14 +6,14 @@ class Treat::Languages::German
|
|
6
6
|
Extractors = {}
|
7
7
|
Inflectors = {}
|
8
8
|
Lexicalizers = {
|
9
|
-
:
|
10
|
-
:
|
9
|
+
:taggers => [:stanford],
|
10
|
+
:categorizers => [:from_tag]
|
11
11
|
}
|
12
12
|
Processors = {
|
13
13
|
:chunkers => [:txt],
|
14
14
|
:parsers => [:stanford],
|
15
15
|
:segmenters => [:punkt],
|
16
|
-
:tokenizers => [:
|
16
|
+
:tokenizers => [:tactful]
|
17
17
|
}
|
18
18
|
Retrievers = {}
|
19
19
|
|
@@ -1,11 +1,10 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
# A list of all possible word categories.
|
1
|
+
class Treat::Languages::Language
|
2
|
+
|
4
3
|
WordCategories = [
|
5
4
|
:adjective, :adverb, :noun, :verb, :interjection,
|
6
5
|
:clitic, :coverb, :conjunction, :determiner, :particle,
|
7
6
|
:preposition, :pronoun, :number, :symbol, :punctuation,
|
8
7
|
:complementizer
|
9
8
|
]
|
10
|
-
|
9
|
+
|
11
10
|
end
|
@@ -3,17 +3,19 @@
|
|
3
3
|
# from its tag (e.g. 'S', 'NP', 'VBZ', 'ADV', etc.).
|
4
4
|
class Treat::Lexicalizers::Categorizers::FromTag
|
5
5
|
|
6
|
-
Pttc = Treat::
|
7
|
-
Wttc = Treat::
|
8
|
-
Ptc = Treat::
|
6
|
+
Pttc = Treat::Universalisation::Tags::PhraseTagToCategory
|
7
|
+
Wttc = Treat::Universalisation::Tags::WordTagToCategory
|
8
|
+
Ptc = Treat::Universalisation::Tags::PunctuationToCategory
|
9
9
|
|
10
10
|
# Find the category of the entity from its tag.
|
11
11
|
def self.category(entity, options = {})
|
12
12
|
|
13
13
|
tag = entity.check_has(:tag)
|
14
|
+
|
14
15
|
return :unknown if tag.nil? || tag == '' || entity.type == :symbol
|
15
16
|
return :sentence if tag == 'S' || entity.type == :sentence
|
16
17
|
return :number if entity.type == :number
|
18
|
+
|
17
19
|
return Ptc[entity.to_s] if entity.type == :punctuation
|
18
20
|
|
19
21
|
if entity.is_a?(Treat::Entities::Phrase)
|
@@ -29,15 +31,17 @@ class Treat::Lexicalizers::Categorizers::FromTag
|
|
29
31
|
|
30
32
|
if entity.has?(:tag_set)
|
31
33
|
ts = entity.get(:tag_set)
|
32
|
-
elsif entity.parent_phrase &&
|
33
|
-
entity.parent_phrase.has?(:tag_set)
|
34
|
-
ts = entity.parent_phrase.get(:tag_set)
|
35
34
|
else
|
36
|
-
|
37
|
-
|
38
|
-
|
35
|
+
a = entity.ancestor_with_feature(:phrase, :tag_set)
|
36
|
+
if a
|
37
|
+
ts = a.get(:tag_set)
|
38
|
+
else
|
39
|
+
raise Treat::Exception,
|
40
|
+
"No information can be found regarding "+
|
41
|
+
"which tag set to use."
|
42
|
+
end
|
39
43
|
end
|
40
|
-
|
44
|
+
|
41
45
|
if cat[ts]
|
42
46
|
return cat[ts]
|
43
47
|
else
|
@@ -38,11 +38,14 @@ class Treat::Lexicalizers::Taggers::Stanford
|
|
38
38
|
end
|
39
39
|
|
40
40
|
# Handle tags for sentences and phrases.
|
41
|
-
|
42
41
|
if entity.is_a?(Treat::Entities::Sentence) ||
|
43
42
|
(entity.is_a?(Treat::Entities::Phrase) &&
|
44
43
|
!entity.parent_sentence)
|
45
|
-
|
44
|
+
|
45
|
+
tag_set = Treat::Universalisation::Tags::
|
46
|
+
StanfordTagSetForLanguage[
|
47
|
+
Treat::Languages.describe(lang)]
|
48
|
+
entity.set :tag_set, tag_set
|
46
49
|
end
|
47
50
|
|
48
51
|
if entity.is_a?(Treat::Entities::Sentence)
|
data/lib/treat/lexicalizers.rb
CHANGED
@@ -23,10 +23,10 @@ module Treat::Processors::Parsers::Enju
|
|
23
23
|
@@parser = nil
|
24
24
|
|
25
25
|
# A hash of Enju cat tags mapped to word categories.
|
26
|
-
Ectc = Treat::
|
26
|
+
Ectc = Treat::Universalisation::Tags::EnjuCatToCategory
|
27
27
|
|
28
28
|
# A hash of Enju cat/xcat pairs mapped to PTB tags.
|
29
|
-
Ecxtp = Treat::
|
29
|
+
Ecxtp = Treat::Universalisation::Tags::EnjuCatXcatToPTB
|
30
30
|
|
31
31
|
# Parse the entity into its syntactical
|
32
32
|
# phrases using Enju.
|
@@ -27,6 +27,10 @@ class Treat::Processors::Parsers::Stanford
|
|
27
27
|
lang = entity.language
|
28
28
|
init(lang, options)
|
29
29
|
|
30
|
+
tag_set = Treat::Universalisation::Tags::
|
31
|
+
StanfordTagSetForLanguage[
|
32
|
+
Treat::Languages.describe(lang)]
|
33
|
+
|
30
34
|
text = ::StanfordCoreNLP::Text.new(val)
|
31
35
|
@@parsers[lang].annotate(text)
|
32
36
|
|
@@ -37,17 +41,18 @@ class Treat::Processors::Parsers::Stanford
|
|
37
41
|
tag = s.get(:category).to_s
|
38
42
|
tag_s, tag_opt = *tag.split('-')
|
39
43
|
tag_s ||= 'S'
|
40
|
-
entity.set :tag_set, :penn
|
41
44
|
entity.set :tag, tag_s
|
42
45
|
entity.set :tag_opt, tag_opt if tag_opt
|
43
|
-
recurse(s.get(:tree).children[0], entity)
|
44
|
-
break
|
46
|
+
recurse(s.get(:tree).children[0], entity, tag_set)
|
47
|
+
break #######
|
45
48
|
else
|
46
49
|
recurse(s.get(:tree), entity)
|
47
50
|
end
|
48
51
|
|
49
52
|
end
|
50
53
|
|
54
|
+
entity.set :tag_set, tag_set
|
55
|
+
|
51
56
|
end
|
52
57
|
|
53
58
|
def self.init(lang, options)
|
@@ -76,7 +81,7 @@ class Treat::Processors::Parsers::Stanford
|
|
76
81
|
|
77
82
|
# Helper method which recurses the tree supplied by
|
78
83
|
# the Stanford parser.
|
79
|
-
def self.recurse(java_node, ruby_node, additional_tags = [])
|
84
|
+
def self.recurse(java_node, ruby_node, tag_set, additional_tags = [])
|
80
85
|
|
81
86
|
if java_node.num_children == 0
|
82
87
|
|
@@ -85,10 +90,8 @@ class Treat::Processors::Parsers::Stanford
|
|
85
90
|
tag_s, tag_opt = *tag.split('-')
|
86
91
|
tag_s ||= ''
|
87
92
|
ruby_node.value = java_node.value.to_s.strip
|
88
|
-
ruby_node.set :tag_set, :penn
|
89
93
|
ruby_node.set :tag, tag_s
|
90
94
|
ruby_node.set :tag_opt, tag_opt if tag_opt
|
91
|
-
ruby_node.set :tag_set, :penn
|
92
95
|
ruby_node.set :lemma, label.get(:lemma).to_s
|
93
96
|
|
94
97
|
additional_tags.each do |t|
|
@@ -103,33 +106,35 @@ class Treat::Processors::Parsers::Stanford
|
|
103
106
|
if java_node.num_children == 1 &&
|
104
107
|
java_node.children[0].num_children == 0
|
105
108
|
recurse(java_node.children[0],
|
106
|
-
ruby_node, additional_tags)
|
109
|
+
ruby_node, tag_set, additional_tags)
|
107
110
|
return
|
108
111
|
end
|
109
112
|
|
110
113
|
java_node.children.each do |java_child|
|
114
|
+
|
111
115
|
label = java_child.label
|
112
116
|
tag = label.get(:category).to_s
|
113
117
|
tag_s, tag_opt = *tag.split('-')
|
114
118
|
tag_s ||= ''
|
115
|
-
|
116
|
-
if Treat::
|
119
|
+
|
120
|
+
if Treat::Universalisation::Tags::PhraseTagToCategory[tag_s] &&
|
121
|
+
Treat::Universalisation::Tags::PhraseTagToCategory[tag_s][tag_set]
|
117
122
|
ruby_child = Treat::Entities::Phrase.new
|
118
123
|
else
|
119
124
|
l = java_child.children[0].to_s
|
120
125
|
v = java_child.children[0].value.to_s.strip
|
126
|
+
|
121
127
|
# Mhmhmhmhmhm
|
122
128
|
val = (l == v) ? v : l.split(' ')[-1].gsub(')', '')
|
123
129
|
ruby_child = Treat::Entities::Token.from_string(val)
|
124
130
|
end
|
125
131
|
|
126
|
-
ruby_child.set :tag_set, :penn
|
127
132
|
ruby_child.set :tag, tag_s
|
128
133
|
ruby_child.set :tag_opt, tag_opt if tag_opt
|
129
134
|
ruby_node << ruby_child
|
130
135
|
|
131
136
|
unless java_child.children.empty?
|
132
|
-
recurse(java_child, ruby_child, additional_tags)
|
137
|
+
recurse(java_child, ruby_child, tag_set, additional_tags)
|
133
138
|
end
|
134
139
|
|
135
140
|
end
|
@@ -137,4 +142,5 @@ class Treat::Processors::Parsers::Stanford
|
|
137
142
|
end
|
138
143
|
|
139
144
|
end
|
145
|
+
|
140
146
|
end
|
@@ -38,15 +38,18 @@ module Treat::Processors::Segmenters::Punkt
|
|
38
38
|
|
39
39
|
s = entity.to_s
|
40
40
|
|
41
|
-
# Replace
|
41
|
+
# Replace the point in all floating-point numbers
|
42
|
+
# by ^^; this is a fix since Punkt trips on decimal
|
43
|
+
# numbers.
|
42
44
|
Treat::Helpers::DecimalPointEscaper.escape!(s)
|
43
|
-
s.gsub!(/([^\.\?!]\.|\!|\?)([^\s])/) { $1 + ' ' + $2 }
|
45
|
+
s.gsub!(/([^\.\?!]\.|\!|\?)([^\s"'])/) { $1 + ' ' + $2 }
|
44
46
|
|
45
47
|
result = @@segmenters[lang].
|
46
48
|
sentences_from_text(s,
|
47
49
|
:output => :sentences_text)
|
48
50
|
|
49
51
|
result.each do |sentence|
|
52
|
+
# Unescape the sentence.
|
50
53
|
Treat::Helpers::DecimalPointEscaper.
|
51
54
|
unescape!(sentence)
|
52
55
|
entity << Treat::Entities::Phrase.
|
@@ -29,17 +29,21 @@ module Treat::Processors::Segmenters::Tactful
|
|
29
29
|
entity.check_hasnt_children
|
30
30
|
|
31
31
|
s = entity.to_s
|
32
|
+
|
32
33
|
Treat::Helpers::DecimalPointEscaper.escape!(s)
|
33
34
|
|
34
|
-
s.gsub!(/([^\.\?!]\.|\!|\?)([^\s])/) { $1 + ' ' + $2 }
|
35
|
+
s.gsub!(/([^\.\?!]\.|\!|\?)([^\s"'])/) { $1 + ' ' + $2 }
|
35
36
|
|
36
37
|
@@segmenter ||= TactfulTokenizer::Model.new
|
37
38
|
|
38
39
|
sentences = @@segmenter.tokenize_text(s)
|
40
|
+
|
39
41
|
sentences.each do |sentence|
|
40
42
|
Treat::Helpers::DecimalPointEscaper.unescape!(sentence)
|
43
|
+
puts sentence.to_s if sentence.to_s.include?('staff')
|
41
44
|
entity << Treat::Entities::Phrase.from_string(sentence)
|
42
45
|
end
|
46
|
+
|
43
47
|
end
|
44
48
|
|
45
49
|
end
|
@@ -1,3 +1,4 @@
|
|
1
|
+
# encoding: utf-8
|
1
2
|
# A native rule-based tokenizer based on the one
|
2
3
|
# developed by Robert MacIntyre in 1995 for the Penn
|
3
4
|
# Treebank project. This tokenizer follows the
|
@@ -11,8 +12,6 @@
|
|
11
12
|
# you can redistribute it and/or modify it under the
|
12
13
|
# same terms as Ruby itself.
|
13
14
|
module Treat::Processors::Tokenizers::PTB
|
14
|
-
|
15
|
-
require 'treat/helpers/decimal_point_escaper'
|
16
15
|
|
17
16
|
# Tokenize the entity using a native rule-based algorithm.
|
18
17
|
def self.tokenize(entity, options = {})
|
@@ -33,8 +32,17 @@ module Treat::Processors::Tokenizers::PTB
|
|
33
32
|
|
34
33
|
# Helper method to split the string into tokens.
|
35
34
|
def self.split(string)
|
35
|
+
|
36
36
|
s = " " + string + " "
|
37
|
-
|
37
|
+
|
38
|
+
# Translate some common extended ascii
|
39
|
+
# characters to quotes
|
40
|
+
s.gsub!(/‘/,'`')
|
41
|
+
s.gsub!(/’/,"'")
|
42
|
+
s.gsub!(/“/,"``")
|
43
|
+
s.gsub!(/”/,"''")
|
44
|
+
|
45
|
+
|
38
46
|
s.gsub!(/\s+/," ")
|
39
47
|
s.gsub!(/(\s+)''/,'\1"')
|
40
48
|
s.gsub!(/(\s+)``/,'\1"')
|
@@ -14,8 +14,6 @@
|
|
14
14
|
# Project website: https://github.com/lfcipriani/punkt-segmenter
|
15
15
|
class Treat::Processors::Tokenizers::Punkt
|
16
16
|
|
17
|
-
require 'treat/helpers/decimal_point_escaper'
|
18
|
-
|
19
17
|
SentEndChars = ['.', '?', '!']
|
20
18
|
ReSentEndChars = /[.?!]/
|
21
19
|
InternalPunctuation = [',', ':', ';']
|
@@ -35,7 +33,6 @@ class Treat::Processors::Tokenizers::Punkt
|
|
35
33
|
entity.check_hasnt_children
|
36
34
|
|
37
35
|
s = entity.to_s
|
38
|
-
Treat::Helpers::DecimalPointEscaper.escape!(s)
|
39
36
|
|
40
37
|
s.scan(ReWordTokenizer).each do |token|
|
41
38
|
if SentEndChars.include?(token[-1])
|
@@ -51,6 +51,7 @@ class Treat::Processors::Tokenizers::Tactful
|
|
51
51
|
entity.check_hasnt_children
|
52
52
|
|
53
53
|
s = entity.to_s
|
54
|
+
|
54
55
|
Treat::Helpers::DecimalPointEscaper.escape!(s)
|
55
56
|
|
56
57
|
ReTokenize.each do |rules|
|
@@ -58,6 +59,8 @@ class Treat::Processors::Tokenizers::Tactful
|
|
58
59
|
end
|
59
60
|
|
60
61
|
s.split(' ').each do |token|
|
62
|
+
|
63
|
+
Treat::Helpers::DecimalPointEscaper.unescape!(token)
|
61
64
|
entity << Treat::Entities::Token.
|
62
65
|
from_string(token)
|
63
66
|
end
|
@@ -1,14 +1,20 @@
|
|
1
|
-
module Treat::
|
1
|
+
module Treat::Universalisation::Tags
|
2
2
|
|
3
3
|
ClawsC5 = 0
|
4
4
|
Brown = 1
|
5
5
|
Penn = 2
|
6
|
-
|
6
|
+
Stuttgart = 3
|
7
7
|
PennChinese = 4
|
8
|
-
|
8
|
+
Paris7 = 5
|
9
|
+
|
10
|
+
StanfordTagSetForLanguage = {
|
11
|
+
:french => :paris7,
|
12
|
+
:english => :penn,
|
13
|
+
:german => :stuttgart
|
14
|
+
}
|
9
15
|
|
10
16
|
PTBClauseTagDescription = [
|
11
|
-
['S', '
|
17
|
+
['S', 'Paris7 declarative clause'],
|
12
18
|
['SBAR', 'Clause introduced by a (possibly empty) subordinating conjunction'],
|
13
19
|
['SBARQ', 'Direct question introduced by a wh-word or a wh-phrase'],
|
14
20
|
['SINV', 'Inverted declarative sentence'],
|
@@ -26,30 +32,33 @@ module Treat::Linguistics::Tags
|
|
26
32
|
|
27
33
|
AlignedPhraseTags =
|
28
34
|
[
|
29
|
-
'
|
30
|
-
'
|
31
|
-
'Conjunction phrase', ['', '', 'CONJP'],
|
32
|
-
'Fragment', ['', '', 'FRAG'],
|
33
|
-
'
|
34
|
-
'List marker', ['', '', 'LST'],
|
35
|
-
'Not a phrase', ['', '', 'NAC'],
|
36
|
-
'Noun phrase', ['', '', 'NP'],
|
37
|
-
'
|
38
|
-
'
|
39
|
-
'
|
40
|
-
'
|
41
|
-
'
|
42
|
-
'
|
43
|
-
'
|
44
|
-
'
|
45
|
-
'
|
46
|
-
'
|
47
|
-
'
|
48
|
-
'Wh
|
49
|
-
'
|
50
|
-
'
|
51
|
-
'
|
52
|
-
'
|
35
|
+
'Adjectival phrase', ['', '', 'ADJP', '', '', 'AP'],
|
36
|
+
'Adverbial phrase', ['', '', 'ADVP', '', '', 'AdP'],
|
37
|
+
'Conjunction phrase', ['', '', 'CONJP', '', '', 'Ssub'],
|
38
|
+
'Fragment', ['', '', 'FRAG', '', '', ''],
|
39
|
+
'Interjectional phrase', ['', '', 'INTJ', '', '', ''],
|
40
|
+
'List marker', ['', '', 'LST', '', '', ''],
|
41
|
+
'Not a phrase', ['', '', 'NAC', '', '', ''],
|
42
|
+
'Noun phrase', ['', '', 'NP', '', '', 'NP'],
|
43
|
+
'Verbal nucleus', ['', '', '', '', '', 'VN'],
|
44
|
+
'Head of noun phrase', ['', '', 'NX', '', '', ''],
|
45
|
+
'Prepositional phrase', ['', '', 'PP', '', '', 'PP'],
|
46
|
+
'Parenthetical', ['', '', 'PRN', '', '', ''],
|
47
|
+
'Particle', ['', '', 'PRT', '', '', ''],
|
48
|
+
'Participial phrase', ['', '', '', '', '', 'VPart'],
|
49
|
+
'Quantifier phrase', ['', '', 'QP', '', '', ''],
|
50
|
+
'Relative clause', ['', '', 'RRC', '', '', 'Srel'],
|
51
|
+
'Coordinated phrase', ['', '', 'UCP', '', '', 'COORD'],
|
52
|
+
'Infinitival phrase', ['', '', '', '', '', 'VPinf'],
|
53
|
+
'Verb phrase', ['', '', 'VP', '', '', ''],
|
54
|
+
'Wh adjective phrase', ['', '', 'WHADJP', '', '', ''],
|
55
|
+
'Wh adverb phrase', ['', '', 'WHAVP', '', '', ''],
|
56
|
+
'Wh noun phrase', ['', '', 'WHNP', '', '', ''],
|
57
|
+
'Wh prepositional phrase', ['', '', 'WHPP', '', '', ''],
|
58
|
+
'Unknown', ['', '', 'X', '', '', ''],
|
59
|
+
'Phrase', ['', '', 'P', '', '', 'Sint'],
|
60
|
+
'Sentence', ['', '', 'S', '', '', 'SENT'],
|
61
|
+
'Phrase', ['', '', 'SBAR', '', '', ''] # Fix
|
53
62
|
]
|
54
63
|
|
55
64
|
# A description of Enju categories.
|
@@ -139,12 +148,12 @@ module Treat::Linguistics::Tags
|
|
139
148
|
# JRS?
|
140
149
|
|
141
150
|
|
142
|
-
|
151
|
+
Paris7WordTagToCategory = {
|
143
152
|
'C' => :complementizer,
|
144
153
|
'PN' => :punctuation,
|
145
154
|
'SC' => :conjunction
|
146
155
|
}
|
147
|
-
|
156
|
+
|
148
157
|
PunctuationToCategory = {
|
149
158
|
'.' => :period,
|
150
159
|
',' => :comma,
|
@@ -152,9 +161,8 @@ module Treat::Linguistics::Tags
|
|
152
161
|
':' => :colon,
|
153
162
|
'!' => :exclamation,
|
154
163
|
'?' => :interrogation,
|
155
|
-
'"' => :
|
156
|
-
"'" => :
|
157
|
-
|
164
|
+
'"' => :double_quote,
|
165
|
+
"'" => :single_quote,
|
158
166
|
'$' => :dollar,
|
159
167
|
'%' => :percent,
|
160
168
|
'#' => :hash,
|
@@ -227,7 +235,7 @@ module Treat::Linguistics::Tags
|
|
227
235
|
'Determiner, possessive, second', ['DPS', 'PP$', 'PRPS', '', '', 'D'],
|
228
236
|
'Determiner, question', ['DTQ', 'WDT', 'WDT', '', 'DT', 'D'],
|
229
237
|
'Determiner, possessive & question', ['DTQ', 'WP$', 'WP$', '', '', 'D'],
|
230
|
-
|
238
|
+
'Interjection', ['', '', '', '', '', 'I'],
|
231
239
|
'Localizer', ['', '', '', '', 'LC'],
|
232
240
|
|
233
241
|
'Measure word', ['', '', '', '', 'M'],
|
@@ -366,11 +374,25 @@ module Treat::Linguistics::Tags
|
|
366
374
|
'Verb, ? as main verb', ['', '', '', '', 'VE'], # ?
|
367
375
|
'Verb, ????', ['', '', '', '', 'VC'] # ?
|
368
376
|
]
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
377
|
+
|
378
|
+
# Paris7 Treebank functional tags
|
379
|
+
=begin
|
380
|
+
SUJ (subject)
|
381
|
+
OBJ (direct object)
|
382
|
+
ATS (predicative complement of a subject)
|
383
|
+
ATO (predicative complement of a direct object)
|
384
|
+
MOD (modifier or adjunct)
|
385
|
+
A-OBJ (indirect complement introduced by à)
|
386
|
+
DE-OBJ (indirect complement introduced by de)
|
387
|
+
P-OBJ (indirect complement introduced by another preposition)
|
388
|
+
=end
|
389
|
+
|
390
|
+
# !! Extremely ugly code follows.
|
391
|
+
|
392
|
+
# Generate word tag -> category hash.
|
393
|
+
wttc = {}
|
394
|
+
|
395
|
+
Treat::Universalisation::Tags::AlignedWordTags.each_slice(2) do |desc, tags|
|
374
396
|
|
375
397
|
category = desc.gsub(',', ' ,').
|
376
398
|
split(' ')[0].downcase.intern
|
@@ -378,32 +400,41 @@ module Treat::Linguistics::Tags
|
|
378
400
|
wttc[tags[ClawsC5]] ||= {}
|
379
401
|
wttc[tags[Brown]] ||= {}
|
380
402
|
wttc[tags[Penn]] ||= {}
|
381
|
-
wttc[tags[
|
403
|
+
wttc[tags[Stuttgart]] ||= {}
|
382
404
|
wttc[tags[PennChinese]] ||= {}
|
383
|
-
wttc[tags[
|
405
|
+
wttc[tags[Paris7]] ||= {}
|
384
406
|
|
385
407
|
wttc[tags[ClawsC5]][:claws_5] = category
|
386
408
|
wttc[tags[Brown]][:brown] = category
|
387
409
|
wttc[tags[Penn]][:penn] = category
|
388
|
-
wttc[tags[
|
410
|
+
wttc[tags[Stuttgart]][:stuttgart] = category if tags[Stuttgart]
|
389
411
|
wttc[tags[PennChinese]][:penn_chinese] = category if tags[PennChinese]
|
390
|
-
wttc[tags[
|
412
|
+
wttc[tags[Paris7]][:paris7] = category if tags[Paris7]
|
391
413
|
|
392
414
|
end
|
415
|
+
|
393
416
|
# A hash converting word tags to word categories.
|
394
417
|
WordTagToCategory = wttc
|
395
418
|
|
396
419
|
# A hash converting phrase tag to categories.
|
397
420
|
pttc = {}
|
398
|
-
|
421
|
+
|
422
|
+
Treat::Universalisation::Tags::AlignedPhraseTags.each_slice(2) do |desc, tags|
|
423
|
+
|
399
424
|
category = desc.gsub(',', ' ,').gsub(' ', '_').downcase.intern
|
425
|
+
|
400
426
|
pttc[tags[Penn]] ||= {};
|
427
|
+
pttc[tags[Paris7]] ||= {};
|
428
|
+
|
429
|
+
pttc[tags[Penn]][:penn] = category
|
430
|
+
pttc[tags[Paris7]][:paris7] = category
|
431
|
+
|
401
432
|
# Not yet for other tag sets.
|
402
433
|
#pttc[tags[0]][:claws_5] = category
|
403
434
|
#pttc[tags[1]][:brown] = category
|
404
|
-
|
435
|
+
|
405
436
|
end
|
406
|
-
|
437
|
+
|
407
438
|
# A hash converting phrase tags to phrase categories.
|
408
439
|
PhraseTagToCategory = pttc
|
409
440
|
|
data/lib/treat.rb
CHANGED
@@ -10,7 +10,7 @@ module Treat
|
|
10
10
|
end
|
11
11
|
|
12
12
|
# The current version of Treat.
|
13
|
-
VERSION = "1.0.
|
13
|
+
VERSION = "1.0.5"
|
14
14
|
|
15
15
|
# Add methods to handle syntactic sugar,
|
16
16
|
# language configuration options, and paths.
|
@@ -44,7 +44,7 @@ module Treat
|
|
44
44
|
require 'treat/kernel'
|
45
45
|
require 'treat/downloader'
|
46
46
|
require 'treat/languages'
|
47
|
-
require 'treat/
|
47
|
+
require 'treat/universalisation'
|
48
48
|
require 'treat/entities'
|
49
49
|
require 'treat/categories'
|
50
50
|
require 'treat/data_set'
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: treat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-05-17 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rubyzip
|
@@ -161,6 +161,7 @@ files:
|
|
161
161
|
- lib/treat/languages/german.rb
|
162
162
|
- lib/treat/languages/greek.rb
|
163
163
|
- lib/treat/languages/italian.rb
|
164
|
+
- lib/treat/languages/language.rb
|
164
165
|
- lib/treat/languages/list.txt
|
165
166
|
- lib/treat/languages/polish.rb
|
166
167
|
- lib/treat/languages/portuguese.rb
|
@@ -176,9 +177,6 @@ files:
|
|
176
177
|
- lib/treat/lexicalizers/taggers/lingua.rb
|
177
178
|
- lib/treat/lexicalizers/taggers/stanford.rb
|
178
179
|
- lib/treat/lexicalizers.rb
|
179
|
-
- lib/treat/linguistics/categories.rb
|
180
|
-
- lib/treat/linguistics/tags.rb
|
181
|
-
- lib/treat/linguistics.rb
|
182
180
|
- lib/treat/loaders/linguistics.rb
|
183
181
|
- lib/treat/loaders/stanford.rb
|
184
182
|
- lib/treat/object.rb
|
@@ -190,7 +188,6 @@ files:
|
|
190
188
|
- lib/treat/processors/segmenters/punkt.rb
|
191
189
|
- lib/treat/processors/segmenters/stanford.rb
|
192
190
|
- lib/treat/processors/segmenters/tactful.rb
|
193
|
-
- lib/treat/processors/tokenizers/perl.rb
|
194
191
|
- lib/treat/processors/tokenizers/ptb.rb
|
195
192
|
- lib/treat/processors/tokenizers/punkt.rb
|
196
193
|
- lib/treat/processors/tokenizers/stanford.rb
|
@@ -202,6 +199,9 @@ files:
|
|
202
199
|
- lib/treat/retrievers.rb
|
203
200
|
- lib/treat/server.rb
|
204
201
|
- lib/treat/tree.rb
|
202
|
+
- lib/treat/universalisation/encodings.rb
|
203
|
+
- lib/treat/universalisation/tags.rb
|
204
|
+
- lib/treat/universalisation.rb
|
205
205
|
- lib/treat.rb
|
206
206
|
- spec/collection.rb
|
207
207
|
- spec/document.rb
|
data/lib/treat/linguistics.rb
DELETED
data/lib/treat/processors/tokenizers/perl.rb
DELETED
@@ -1,132 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
#
|
3
|
-
# Tokenize the entity using a native rule-based
|
4
|
-
# algorithm. This tokenizer is a port from an
|
5
|
-
# unknown Perl module, which I have lifted from
|
6
|
-
# the 'rbtagger' gem.
|
7
|
-
#
|
8
|
-
# Author: Todd A. Fisher
|
9
|
-
#
|
10
|
-
# This code is free to use under the terms of
|
11
|
-
# the MIT license.
|
12
|
-
#
|
13
|
-
# Original project website:
|
14
|
-
#
|
15
|
-
# https://github.com/taf2/rb-brill-tagger
|
16
|
-
module Treat::Processors::Tokenizers::Perl
|
17
|
-
|
18
|
-
require 'treat/helpers/decimal_point_escaper'
|
19
|
-
|
20
|
-
# Tokenize the entity using a rule-based algorithm
|
21
|
-
# ported from Perl by Todd A. Fisher.
|
22
|
-
#
|
23
|
-
# Options: none.
|
24
|
-
def self.tokenize(entity, options = {})
|
25
|
-
|
26
|
-
entity.check_hasnt_children
|
27
|
-
s = entity.to_s
|
28
|
-
|
29
|
-
tokens = get_tokens(entity.to_s)
|
30
|
-
tokens[1..-1].each do |token|
|
31
|
-
next if token =~ /^\s*$/
|
32
|
-
entity << Treat::Entities::Token.
|
33
|
-
from_string(token)
|
34
|
-
end
|
35
|
-
|
36
|
-
end
|
37
|
-
|
38
|
-
# Helper method to perform the tokenization.
|
39
|
-
def self.get_tokens(string)
|
40
|
-
|
41
|
-
# Normalize all whitespace
|
42
|
-
text = string.gsub(/\s+/,' ')
|
43
|
-
|
44
|
-
# Replace all decimal points by ^^
|
45
|
-
Treat::Helpers::DecimalPointEscaper.escape!(text)
|
46
|
-
|
47
|
-
=begin
|
48
|
-
|
49
|
-
# Translate some common extended ascii
|
50
|
-
# characters to quotes
|
51
|
-
text.gsub!(/‘/,'`')
|
52
|
-
text.gsub!(/’/,"'")
|
53
|
-
text.gsub!(/“/,"``")
|
54
|
-
text.gsub!(/”/,"''")
|
55
|
-
|
56
|
-
# Attempt to get correct directional quotes
|
57
|
-
# s{\"\b} { `` }g;
|
58
|
-
text.gsub!(/\"\b/,' `` ')
|
59
|
-
# s{\b\"} { '' }g;
|
60
|
-
text.gsub!(/\b\"/," '' ")
|
61
|
-
#s{\"(?=\s)} { '' }g;
|
62
|
-
text.gsub!(/\"(?=\s)/," '' ")
|
63
|
-
#s{\"} { `` }g;
|
64
|
-
text.gsub!(/\"(?=\s)/," `` ")
|
65
|
-
=end
|
66
|
-
|
67
|
-
# Isolate ellipses
|
68
|
-
# s{\.\.\.} { ... }g;
|
69
|
-
text.gsub!(/\.\.\./,' ... ')
|
70
|
-
# Isolate any embedded punctuation chars
|
71
|
-
# s{([,;:\@\#\$\%&])} { $1 }g;
|
72
|
-
text.gsub!(/([,;:\@\#\$\%&])/, ' \1 ')
|
73
|
-
|
74
|
-
# Assume sentence tokenization has been
|
75
|
-
# done first, so split FINAL
|
76
|
-
# periods only.
|
77
|
-
# s/ ([^.]) \. ([\]\)\}\>\"\']*)
|
78
|
-
# [ \t]* $ /$1 .$2 /gx;
|
79
|
-
text.gsub!(/ ([^.]) \. ([\]\)\}\>\"\']*) [ \t]* $ /x, '\1 .\2 ')
|
80
|
-
# however, we may as well split ALL
|
81
|
-
# question marks and exclamation points,
|
82
|
-
# since they shouldn't have the abbrev.
|
83
|
-
# -marker ambiguity problem
|
84
|
-
#s{([?!])} { $1 }g;
|
85
|
-
text.gsub!(/([?!])/, ' \1 ')
|
86
|
-
# parentheses, brackets, etc.
|
87
|
-
#s{([\]\[\(\)\{\}\<\>])} { $1 }g;
|
88
|
-
text.gsub!(/([\]\[\(\)\{\}\<\>])/,' \1 ')
|
89
|
-
#s/(-{2,})/ $1 /g;
|
90
|
-
text.gsub!(/(-{2,})/,' \1 ')
|
91
|
-
|
92
|
-
# Add a space to the beginning and end of
|
93
|
-
# each line, to reduce # of regexps below.
|
94
|
-
#s/$/ /;
|
95
|
-
text.gsub!(/$/," ")
|
96
|
-
#s/^/ /;
|
97
|
-
text.gsub!(/^/," ")
|
98
|
-
|
99
|
-
# possessive or close-single-quote
|
100
|
-
#s/\([^\']\)\' /$1 \' /g;
|
101
|
-
text.gsub!(/\([^\']\)\' /,%q(\1 ' ))
|
102
|
-
# as in it's, I'm, we'd
|
103
|
-
#s/\'([smd]) / \'$1 /ig;
|
104
|
-
text.gsub!(/\'([smd]) /i,%q( '\1 ))
|
105
|
-
#s/\'(ll|re|ve) / \'$1 /ig;
|
106
|
-
text.gsub!(/\'(ll|re|ve) /i,%q( '\1 ))
|
107
|
-
#s/n\'t / n\'t /ig;
|
108
|
-
text.gsub!(/n\'t /i," n't ")
|
109
|
-
|
110
|
-
#s/ (can)(not) / $1 $2 /ig;
|
111
|
-
text.gsub!(/ (can)(not) /i,' \1 \2 ')
|
112
|
-
#s/ (d\')(ye) / $1 $2 /ig;
|
113
|
-
text.gsub!(/ (d\')(ye) /i,' \1 \2 ')
|
114
|
-
#s/ (gim)(me) / $1 $2 /ig;
|
115
|
-
text.gsub!(/ (gim)(me) /i,' \1 \2 ')
|
116
|
-
#s/ (gon)(na) / $1 $2 /ig;
|
117
|
-
text.gsub!(/ (gon)(na) /i,' \1 \2 ')
|
118
|
-
#s/ (got)(ta) / $1 $2 /ig;
|
119
|
-
text.gsub!(/ (got)(ta) /i,' \1 \2 ')
|
120
|
-
#s/ (lem)(me) / $1 $2 /ig;
|
121
|
-
text.gsub!(/ (lem)(me) /i,' \1 \2 ')
|
122
|
-
#s/ (more)(\'n) / $1 $2 /ig;
|
123
|
-
text.gsub!(/ (more)(\'n) /i,' \1 \2 ')
|
124
|
-
#s/ (\'t)(is|was) / $1 $2 /ig;
|
125
|
-
text.gsub!(/ (\'t)(is|was) /i,' \1 \2 ')
|
126
|
-
#s/ (wan)(na) / $1 $2 /ig;
|
127
|
-
text.gsub!(/ (wan)(na) /i,' \1 \2 ')
|
128
|
-
text.split(/\s/)
|
129
|
-
|
130
|
-
end
|
131
|
-
|
132
|
-
end
|