treat 1.0.6 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +2 -4
- data/README.md +13 -12
- data/bin/MANIFEST +1 -0
- data/bin/stanford/bridge.jar +0 -0
- data/bin/stanford/joda-time.jar +0 -0
- data/bin/stanford/stanford-corenlp.jar +0 -0
- data/bin/stanford/stanford-parser.jar +0 -0
- data/bin/stanford/xom.jar +0 -0
- data/files/{www.economist.com/21552208 → 21552208.html} +86 -89
- data/files/{guides.rubyonrails.org/3_2_release_notes.html → 3_2_release_notes.html} +0 -0
- data/files/{INFO → MANIFEST} +0 -0
- data/files/{www.rubyinside.com/nethttp-cheat-sheet-2940.html → nethttp-cheat-sheet-2940.html} +12 -16
- data/files/weather-central-canada-heat-wave.html +1370 -0
- data/lib/treat/config/core/acronyms.rb +4 -0
- data/lib/treat/config/core/encodings.rb +8 -0
- data/lib/treat/config/core/entities.rb +2 -0
- data/lib/treat/config/core/language.rb +3 -0
- data/lib/treat/config/core/paths.rb +8 -0
- data/lib/treat/config/core/syntax.rb +1 -0
- data/lib/treat/config/core/verbosity.rb +1 -0
- data/lib/treat/config/databases/mongo.rb +3 -0
- data/lib/treat/config/languages/agnostic.rb +34 -0
- data/lib/treat/config/languages/arabic.rb +13 -0
- data/lib/treat/config/languages/chinese.rb +13 -0
- data/lib/treat/config/languages/dutch.rb +12 -0
- data/lib/treat/config/languages/english.rb +60 -0
- data/lib/treat/config/languages/french.rb +18 -0
- data/lib/treat/config/languages/german.rb +18 -0
- data/lib/treat/config/languages/greek.rb +12 -0
- data/lib/treat/config/languages/italian.rb +12 -0
- data/lib/treat/config/languages/polish.rb +12 -0
- data/lib/treat/config/languages/portuguese.rb +12 -0
- data/lib/treat/config/languages/russian.rb +12 -0
- data/lib/treat/config/languages/spanish.rb +12 -0
- data/lib/treat/config/languages/swedish.rb +12 -0
- data/lib/treat/config/libraries/stanford.rb +1 -0
- data/lib/treat/config/linguistics/categories.rb +4 -0
- data/lib/treat/config/linguistics/punctuation.rb +33 -0
- data/lib/treat/config/tags/aligned.rb +221 -0
- data/lib/treat/config/tags/enju.rb +71 -0
- data/lib/treat/config/tags/paris7.rb +17 -0
- data/lib/treat/config/tags/ptb.rb +15 -0
- data/lib/treat/config/workers/extractors.rb +39 -0
- data/lib/treat/config/workers/formatters.rb +20 -0
- data/lib/treat/config/workers/inflectors.rb +27 -0
- data/lib/treat/config/workers/learners.rb +6 -0
- data/lib/treat/config/workers/lexicalizers.rb +18 -0
- data/lib/treat/config/workers/list.rb +1 -0
- data/lib/treat/config/workers/processors.rb +19 -0
- data/lib/treat/config/workers/retrievers.rb +12 -0
- data/lib/treat/config.rb +125 -0
- data/lib/treat/{classification.rb → core/classification.rb} +1 -1
- data/lib/treat/{data_set.rb → core/data_set.rb} +1 -4
- data/lib/treat/{tree.rb → core/node.rb} +5 -5
- data/lib/treat/core/server.rb +3 -0
- data/lib/treat/core.rb +5 -0
- data/lib/treat/entities/abilities/buildable.rb +61 -56
- data/lib/treat/entities/abilities/checkable.rb +2 -2
- data/lib/treat/entities/abilities/comparable.rb +21 -0
- data/lib/treat/entities/abilities/copyable.rb +2 -0
- data/lib/treat/entities/abilities/countable.rb +1 -1
- data/lib/treat/entities/abilities/debuggable.rb +1 -1
- data/lib/treat/entities/abilities/delegatable.rb +42 -36
- data/lib/treat/entities/abilities/doable.rb +2 -2
- data/lib/treat/entities/abilities/exportable.rb +1 -1
- data/lib/treat/entities/abilities/iterable.rb +21 -33
- data/lib/treat/entities/abilities/magical.rb +8 -8
- data/lib/treat/entities/abilities/registrable.rb +0 -38
- data/lib/treat/entities/abilities/stringable.rb +19 -19
- data/lib/treat/entities/collection.rb +31 -0
- data/lib/treat/entities/document.rb +10 -0
- data/lib/treat/entities/entity.rb +18 -13
- data/lib/treat/entities/group.rb +15 -0
- data/lib/treat/entities/section.rb +13 -0
- data/lib/treat/entities/token.rb +35 -0
- data/lib/treat/entities/zone.rb +11 -0
- data/lib/treat/entities.rb +5 -75
- data/lib/treat/helpers/didyoumean.rb +57 -0
- data/lib/treat/helpers/escaping.rb +15 -0
- data/lib/treat/helpers/formatting.rb +41 -0
- data/lib/treat/helpers/platform.rb +15 -0
- data/lib/treat/helpers/reflection.rb +17 -0
- data/lib/treat/helpers/temporary.rb +27 -0
- data/lib/treat/helpers/verbosity.rb +19 -0
- data/lib/treat/helpers.rb +5 -0
- data/lib/treat/installer.rb +46 -165
- data/lib/treat/loaders/linguistics.rb +22 -27
- data/lib/treat/loaders/stanford.rb +23 -41
- data/lib/treat/loaders.rb +10 -0
- data/lib/treat/proxies.rb +73 -24
- data/lib/treat/version.rb +3 -0
- data/lib/treat/{extractors → workers/extractors}/keywords/tf_idf.rb +1 -1
- data/lib/treat/{extractors → workers/extractors}/language/what_language.rb +11 -4
- data/lib/treat/{extractors → workers/extractors}/name_tag/stanford.rb +3 -4
- data/lib/treat/{extractors → workers/extractors}/tf_idf/native.rb +4 -5
- data/lib/treat/{extractors → workers/extractors}/time/chronic.rb +1 -1
- data/lib/treat/{extractors → workers/extractors}/time/nickel.rb +1 -1
- data/lib/treat/{extractors → workers/extractors}/time/ruby.rb +1 -1
- data/lib/treat/{extractors → workers/extractors}/topic_words/lda.rb +1 -1
- data/lib/treat/{extractors → workers/extractors}/topics/reuters.rb +4 -4
- data/lib/treat/{formatters → workers/formatters}/readers/abw.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/autoselect.rb +10 -3
- data/lib/treat/{formatters → workers/formatters}/readers/doc.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/html.rb +4 -4
- data/lib/treat/{formatters → workers/formatters}/readers/image.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/odt.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/pdf.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/txt.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/xml.rb +2 -2
- data/lib/treat/workers/formatters/serializers/mongo.rb +60 -0
- data/lib/treat/{formatters → workers/formatters}/serializers/xml.rb +1 -2
- data/lib/treat/{formatters → workers/formatters}/serializers/yaml.rb +1 -1
- data/lib/treat/{formatters → workers/formatters}/unserializers/autoselect.rb +3 -1
- data/lib/treat/workers/formatters/unserializers/mongo.rb +80 -0
- data/lib/treat/{formatters → workers/formatters}/unserializers/xml.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/unserializers/yaml.rb +1 -1
- data/lib/treat/{formatters → workers/formatters}/visualizers/dot.rb +1 -1
- data/lib/treat/{formatters → workers/formatters}/visualizers/standoff.rb +2 -3
- data/lib/treat/{formatters → workers/formatters}/visualizers/tree.rb +1 -1
- data/lib/treat/{groupable.rb → workers/group.rb} +6 -12
- data/lib/treat/{inflectors → workers/inflectors}/cardinalizers/linguistics.rb +7 -2
- data/lib/treat/{inflectors → workers/inflectors}/conjugators/linguistics.rb +11 -11
- data/lib/treat/{inflectors → workers/inflectors}/declensors/active_support.rb +2 -2
- data/lib/treat/{inflectors → workers/inflectors}/declensors/english/inflect.rb +1 -1
- data/lib/treat/{inflectors → workers/inflectors}/declensors/english.rb +2 -2
- data/lib/treat/{inflectors → workers/inflectors}/declensors/linguistics.rb +4 -4
- data/lib/treat/{inflectors → workers/inflectors}/ordinalizers/linguistics.rb +8 -2
- data/lib/treat/{inflectors → workers/inflectors}/stemmers/porter.rb +2 -2
- data/lib/treat/{inflectors → workers/inflectors}/stemmers/porter_c.rb +1 -1
- data/lib/treat/{inflectors → workers/inflectors}/stemmers/uea.rb +1 -1
- data/lib/treat/{ai → workers/learners}/classifiers/id3.rb +1 -1
- data/lib/treat/{ai → workers/learners}/classifiers/mlp.rb +1 -1
- data/lib/treat/{lexicalizers → workers/lexicalizers}/categorizers/from_tag.rb +9 -9
- data/lib/treat/{lexicalizers → workers/lexicalizers}/sensers/wordnet/synset.rb +2 -2
- data/lib/treat/{lexicalizers → workers/lexicalizers}/sensers/wordnet.rb +4 -4
- data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/brill/patch.rb +2 -2
- data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/brill.rb +2 -8
- data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/lingua.rb +1 -6
- data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/stanford.rb +31 -42
- data/lib/treat/workers/processors/chunkers/autoselect.rb +19 -0
- data/lib/treat/{processors → workers/processors}/chunkers/html.rb +4 -3
- data/lib/treat/workers/processors/chunkers/txt.rb +32 -0
- data/lib/treat/{processors → workers/processors}/parsers/enju.rb +3 -3
- data/lib/treat/{processors → workers/processors}/parsers/stanford.rb +6 -8
- data/lib/treat/{processors → workers/processors}/segmenters/punkt.rb +6 -10
- data/lib/treat/{processors → workers/processors}/segmenters/stanford.rb +2 -2
- data/lib/treat/{processors → workers/processors}/segmenters/tactful.rb +3 -6
- data/lib/treat/{processors → workers/processors}/tokenizers/ptb.rb +6 -5
- data/lib/treat/{processors → workers/processors}/tokenizers/punkt.rb +1 -1
- data/lib/treat/{processors → workers/processors}/tokenizers/stanford.rb +1 -1
- data/lib/treat/{processors → workers/processors}/tokenizers/tactful.rb +3 -5
- data/lib/treat/{retrievers → workers/retrievers}/indexers/ferret.rb +1 -1
- data/lib/treat/{retrievers → workers/retrievers}/searchers/ferret.rb +1 -1
- data/lib/treat/workers.rb +96 -0
- data/lib/treat.rb +23 -49
- data/spec/collection.rb +4 -4
- data/spec/document.rb +5 -5
- data/spec/entity.rb +33 -32
- data/spec/{tree.rb → node.rb} +5 -5
- data/spec/phrase.rb +5 -39
- data/spec/sandbox.rb +212 -6
- data/spec/token.rb +12 -9
- data/spec/treat.rb +12 -9
- data/spec/word.rb +10 -9
- data/spec/zone.rb +6 -2
- data/tmp/{INFO → MANIFEST} +0 -0
- data/tmp/english.yaml +10340 -0
- metadata +149 -139
- data/lib/treat/ai.rb +0 -12
- data/lib/treat/categories.rb +0 -90
- data/lib/treat/categorizable.rb +0 -44
- data/lib/treat/configurable.rb +0 -115
- data/lib/treat/dependencies.rb +0 -25
- data/lib/treat/downloader.rb +0 -87
- data/lib/treat/entities/abilities.rb +0 -10
- data/lib/treat/entities/entities.rb +0 -102
- data/lib/treat/exception.rb +0 -7
- data/lib/treat/extractors.rb +0 -79
- data/lib/treat/formatters/serializers/mongo.rb +0 -64
- data/lib/treat/formatters.rb +0 -41
- data/lib/treat/helpers/decimal_point_escaper.rb +0 -22
- data/lib/treat/inflectors.rb +0 -52
- data/lib/treat/kernel.rb +0 -208
- data/lib/treat/languages/arabic.rb +0 -16
- data/lib/treat/languages/chinese.rb +0 -16
- data/lib/treat/languages/dutch.rb +0 -16
- data/lib/treat/languages/english.rb +0 -63
- data/lib/treat/languages/french.rb +0 -20
- data/lib/treat/languages/german.rb +0 -20
- data/lib/treat/languages/greek.rb +0 -16
- data/lib/treat/languages/italian.rb +0 -17
- data/lib/treat/languages/language.rb +0 -10
- data/lib/treat/languages/list.txt +0 -504
- data/lib/treat/languages/polish.rb +0 -16
- data/lib/treat/languages/portuguese.rb +0 -16
- data/lib/treat/languages/russian.rb +0 -16
- data/lib/treat/languages/spanish.rb +0 -16
- data/lib/treat/languages/swedish.rb +0 -16
- data/lib/treat/languages.rb +0 -132
- data/lib/treat/lexicalizers.rb +0 -37
- data/lib/treat/object.rb +0 -7
- data/lib/treat/processors/chunkers/autoselect.rb +0 -16
- data/lib/treat/processors/chunkers/txt.rb +0 -21
- data/lib/treat/processors.rb +0 -38
- data/lib/treat/retrievers.rb +0 -27
- data/lib/treat/server.rb +0 -26
- data/lib/treat/universalisation/encodings.rb +0 -12
- data/lib/treat/universalisation/tags.rb +0 -453
- data/lib/treat/universalisation.rb +0 -9
- data/spec/languages.rb +0 -25
@@ -1,7 +1,7 @@
|
|
1
1
|
# This class allows the visualization of
|
2
2
|
# an entity in standoff format; for example:
|
3
3
|
# (S (NP John) (VP has (VP come))).
|
4
|
-
class Treat::Formatters::Visualizers::Standoff
|
4
|
+
class Treat::Workers::Formatters::Visualizers::Standoff
|
5
5
|
|
6
6
|
# Start out with an indent of 0.
|
7
7
|
DefaultOptions = { :indent => 0 }
|
@@ -44,8 +44,7 @@ class Treat::Formatters::Visualizers::Standoff
|
|
44
44
|
end
|
45
45
|
|
46
46
|
def self.ptb_escape(val)
|
47
|
-
Treat
|
48
|
-
PTBEscapeCharacters.each do |char, esc|
|
47
|
+
Treat.tags.ptb.escape_characters.each do |char, esc|
|
49
48
|
val.gsub!(char, val)
|
50
49
|
end
|
51
50
|
|
@@ -1,18 +1,18 @@
|
|
1
|
-
module Treat::
|
1
|
+
module Treat::Workers::Group
|
2
2
|
|
3
3
|
# Lazily load the worker classes in the group.
|
4
4
|
def const_missing(const)
|
5
5
|
bits = self.ancestors[0].to_s.split('::')
|
6
6
|
bits.collect! { |bit| ucc(bit) }
|
7
7
|
file = bits.join('/') + "/#{ucc(const)}"
|
8
|
-
if not File.readable?(Treat.lib + "#{file}.rb")
|
8
|
+
if not File.readable?(Treat.paths.lib + "#{file}.rb")
|
9
9
|
raise Treat::Exception,
|
10
10
|
"File '#{file}.rb' corresponding to " +
|
11
11
|
"requested worker #{self}::#{const} " +
|
12
12
|
"does not exist."
|
13
13
|
else
|
14
14
|
require file
|
15
|
-
if not const_defined?(const)
|
15
|
+
if not self.const_defined?(const)
|
16
16
|
raise Treat::Exception,
|
17
17
|
"File #{file} does not define " +
|
18
18
|
"#{self}::#{const}."
|
@@ -29,7 +29,7 @@ module Treat::Groupable
|
|
29
29
|
mod = ucc(cl(self))
|
30
30
|
if @@list[mod].nil?
|
31
31
|
@@list[mod] = []
|
32
|
-
dirs = Dir[Treat.lib + "treat/*/#{mod}/*.rb"]
|
32
|
+
dirs = Dir[Treat.paths.lib + "treat/workers/*/#{mod}/*.rb"]
|
33
33
|
dirs.each do |file|
|
34
34
|
@@list[mod] <<
|
35
35
|
file.split('/')[-1][0..-4].intern
|
@@ -100,6 +100,7 @@ module Treat::Groupable
|
|
100
100
|
end
|
101
101
|
|
102
102
|
self.recursive = false
|
103
|
+
self.list
|
103
104
|
|
104
105
|
# Return the method corresponding to the group.
|
105
106
|
# This method resolves the name of the method
|
@@ -116,11 +117,7 @@ module Treat::Groupable
|
|
116
117
|
m = ucc(cl(self)).dup
|
117
118
|
if m[-4..-1] == 'zers'
|
118
119
|
if type == :annotator
|
119
|
-
|
120
|
-
m[-5..-1] = ''
|
121
|
-
else
|
122
|
-
m[-5..-1] = 'y'
|
123
|
-
end
|
120
|
+
m[-5..-1] = m[-6] == 'l' ? '' : 'y'
|
124
121
|
else
|
125
122
|
m = m[0..-3]
|
126
123
|
end
|
@@ -147,9 +144,6 @@ module Treat::Groupable
|
|
147
144
|
@method = n.intern
|
148
145
|
end
|
149
146
|
|
150
|
-
# Populate the group's list.
|
151
|
-
group.list
|
152
|
-
|
153
147
|
end
|
154
148
|
|
155
149
|
end
|
@@ -3,10 +3,14 @@
|
|
3
3
|
# number in words in cardinal form.
|
4
4
|
#
|
5
5
|
# Project website: http://deveiate.org/projects/Linguistics/
|
6
|
-
module Treat::Inflectors::Cardinalizers::Linguistics
|
6
|
+
module Treat::Workers::Inflectors::Cardinalizers::Linguistics
|
7
7
|
|
8
8
|
require 'treat/loaders/linguistics'
|
9
9
|
|
10
|
+
DefaultOptions = {
|
11
|
+
:language => Treat.core.language.default
|
12
|
+
}
|
13
|
+
|
10
14
|
# Return the description of a cardinal number in words.
|
11
15
|
#
|
12
16
|
# Options:
|
@@ -32,8 +36,9 @@ module Treat::Inflectors::Cardinalizers::Linguistics
|
|
32
36
|
#
|
33
37
|
# More specific options when using :type => :ordinal:
|
34
38
|
def self.cardinal(entity, options = {})
|
39
|
+
options = DefaultOptions.merge(options)
|
35
40
|
Treat::Loaders::Linguistics.
|
36
|
-
load(
|
41
|
+
load(options[:language]).
|
37
42
|
numwords(entity.to_s, options)
|
38
43
|
end
|
39
44
|
|
@@ -2,7 +2,7 @@
|
|
2
2
|
# in the 'linguistics' gem that allow to conjugate verbs.
|
3
3
|
#
|
4
4
|
# Project website: http://deveiate.org/projects/Linguistics/
|
5
|
-
module Treat::Inflectors::Conjugators::Linguistics
|
5
|
+
module Treat::Workers::Inflectors::Conjugators::Linguistics
|
6
6
|
|
7
7
|
require 'treat/loaders/linguistics'
|
8
8
|
|
@@ -12,10 +12,10 @@ module Treat::Inflectors::Conjugators::Linguistics
|
|
12
12
|
|
13
13
|
Forms = {
|
14
14
|
:present_participle =>
|
15
|
-
{:mode =>
|
16
|
-
:infinitive => {:mode =>
|
17
|
-
:plural_verb => {:count =>
|
18
|
-
:singular_verb => {:count =>
|
15
|
+
{:mode => 'participle', :tense => 'present'},
|
16
|
+
:infinitive => {:mode => 'infinitive'},
|
17
|
+
:plural_verb => {:count => 'plural'},
|
18
|
+
:singular_verb => {:count => 'singular'}
|
19
19
|
}
|
20
20
|
|
21
21
|
# Conjugate a verb using ruby linguistics with the specified
|
@@ -33,16 +33,16 @@ module Treat::Inflectors::Conjugators::Linguistics
|
|
33
33
|
|
34
34
|
options = DefaultOptions.merge(options)
|
35
35
|
cat = entity.check_has(:category)
|
36
|
-
return if cat !=
|
37
|
-
|
36
|
+
return if cat != 'verb' && options[:strict]
|
37
|
+
|
38
38
|
options = Forms[options[:form]] if options[:form]
|
39
|
-
|
39
|
+
|
40
40
|
klass = Treat::Loaders::Linguistics.load(entity.language)
|
41
|
-
if options[:mode] ==
|
41
|
+
if options[:mode] == 'infinitive'
|
42
42
|
silence_warnings { klass.infinitive(entity.to_s) }
|
43
|
-
elsif options[:mode] ==
|
43
|
+
elsif options[:mode] == 'participle' && options[:tense] == 'present'
|
44
44
|
silence_warnings { klass.present_participle(entity.to_s) }
|
45
|
-
elsif options[:count] ==
|
45
|
+
elsif options[:count] == 'plural' && options.size == 1
|
46
46
|
silence_warnings { klass.plural_verb(entity.to_s) }
|
47
47
|
else
|
48
48
|
raise Treat::Exception,
|
@@ -1,6 +1,6 @@
|
|
1
1
|
# This class is a wrapper for the ActiveSupport
|
2
2
|
# declension tools.
|
3
|
-
class Treat::Inflectors::Declensors::English
|
3
|
+
class Treat::Workers::Inflectors::Declensors::English
|
4
4
|
|
5
5
|
require 'active_support/inflector/inflections'
|
6
6
|
|
@@ -8,7 +8,7 @@ class Treat::Inflectors::Declensors::English
|
|
8
8
|
def self.declense(entity, options)
|
9
9
|
|
10
10
|
cat = entity.check_has(:category)
|
11
|
-
unless [
|
11
|
+
unless ['noun', 'adjective', 'determiner'].
|
12
12
|
include?(cat)
|
13
13
|
return
|
14
14
|
end
|
@@ -5,7 +5,7 @@
|
|
5
5
|
# Released under the MIT License.
|
6
6
|
#
|
7
7
|
# http://english.rubyforge.org
|
8
|
-
class Treat::Inflectors::Declensors::English
|
8
|
+
class Treat::Workers::Inflectors::Declensors::English
|
9
9
|
|
10
10
|
require 'treat/inflectors/declensors/english/inflect'
|
11
11
|
|
@@ -15,7 +15,7 @@ class Treat::Inflectors::Declensors::English
|
|
15
15
|
def self.declense(entity, options)
|
16
16
|
|
17
17
|
cat = entity.check_has(:category)
|
18
|
-
unless [
|
18
|
+
unless ['noun', 'adjective', 'determiner'].
|
19
19
|
include?(cat)
|
20
20
|
return
|
21
21
|
end
|
@@ -3,7 +3,7 @@
|
|
3
3
|
# declensions of a word.
|
4
4
|
#
|
5
5
|
# Project website: http://deveiate.org/projects/Linguistics/
|
6
|
-
class Treat::Inflectors::Declensors::Linguistics
|
6
|
+
class Treat::Workers::Inflectors::Declensors::Linguistics
|
7
7
|
|
8
8
|
require 'treat/loaders/linguistics'
|
9
9
|
|
@@ -15,7 +15,7 @@ class Treat::Inflectors::Declensors::Linguistics
|
|
15
15
|
def self.declense(entity, options = {})
|
16
16
|
|
17
17
|
cat = entity.check_has(:category)
|
18
|
-
unless [
|
18
|
+
unless ['noun', 'adjective', 'determiner'].
|
19
19
|
include?(cat)
|
20
20
|
return
|
21
21
|
end
|
@@ -28,10 +28,10 @@ class Treat::Inflectors::Declensors::Linguistics
|
|
28
28
|
klass = Treat::Loaders::Linguistics.load(entity.language)
|
29
29
|
string = entity.to_s
|
30
30
|
|
31
|
-
if options[:count] ==
|
31
|
+
if options[:count] == 'plural'
|
32
32
|
|
33
33
|
if entity.has?(:category) &&
|
34
|
-
[
|
34
|
+
['noun', 'adjective', 'verb'].
|
35
35
|
include?(entity.category)
|
36
36
|
silence_warnings do
|
37
37
|
klass.send(
|
@@ -3,14 +3,20 @@
|
|
3
3
|
# number in words in ordinal form.
|
4
4
|
#
|
5
5
|
# Project website: http://deveiate.org/projects/Linguistics/
|
6
|
-
class Treat::Inflectors::Ordinalizers::Linguistics
|
6
|
+
class Treat::Workers::Inflectors::Ordinalizers::Linguistics
|
7
7
|
|
8
8
|
require 'treat/loaders/linguistics'
|
9
9
|
|
10
|
+
DefaultOptions = {
|
11
|
+
:language => Treat.core.language.default
|
12
|
+
}
|
13
|
+
|
10
14
|
# Desribe a number in words in ordinal form, using the
|
11
15
|
# 'linguistics' gem.
|
12
16
|
def self.ordinal(number, options = {})
|
13
|
-
|
17
|
+
options = DefaultOptions.merge(options)
|
18
|
+
klass = Treat::Loaders::
|
19
|
+
Linguistics.load(options[:language])
|
14
20
|
klass.ordinate(number.to_s)
|
15
21
|
end
|
16
22
|
|
@@ -2,7 +2,7 @@
|
|
2
2
|
# Porter stemming algorithm, ported to Ruby from a
|
3
3
|
# version coded up in Perl. This is a simplified
|
4
4
|
# implementation; for a true and fast Porter stemmer,
|
5
|
-
# see Treat::Inflectors::Stemmers::PorterC.
|
5
|
+
# see Treat::Workers::Inflectors::Stemmers::PorterC.
|
6
6
|
#
|
7
7
|
# Authored by Ray Pereda (raypereda@hotmail.com).
|
8
8
|
# Unknown license.
|
@@ -10,7 +10,7 @@
|
|
10
10
|
# Original paper: Porter, 1980. An algorithm for suffix stripping,
|
11
11
|
# Program, Vol. 14, no. 3, pp 130-137,
|
12
12
|
# Original C implementation: http://www.tartarus.org/~martin/PorterStemmer.
|
13
|
-
class Treat::Inflectors::Stemmers::Porter
|
13
|
+
class Treat::Workers::Inflectors::Stemmers::Porter
|
14
14
|
|
15
15
|
# Returns the stem of a word using a native Porter stemmer.
|
16
16
|
#
|
@@ -5,7 +5,7 @@
|
|
5
5
|
# Original paper: Porter, 1980. An algorithm for suffix stripping,
|
6
6
|
# Program, Vol. 14, no. 3, pp 130-137,
|
7
7
|
# Original C implementation: http://www.tartarus.org/~martin/PorterStemmer.
|
8
|
-
module Treat::Inflectors::Stemmers::PorterC
|
8
|
+
module Treat::Workers::Inflectors::Stemmers::PorterC
|
9
9
|
|
10
10
|
# Require the 'ruby-stemmer' gem.
|
11
11
|
silence_warnings { require 'lingua/stemmer' }
|
@@ -10,7 +10,7 @@
|
|
10
10
|
# Original paper: Jenkins, Marie-Claire, Smith, Dan,
|
11
11
|
# Conservative stemming for search and indexing, 2005.
|
12
12
|
# http://www.uea.ac.uk/polopoly_fs/1.85493!stemmer25feb.pdf
|
13
|
-
class Treat::Inflectors::Stemmers::UEA
|
13
|
+
class Treat::Workers::Inflectors::Stemmers::UEA
|
14
14
|
|
15
15
|
# Require the 'uea-stemmer' gem.
|
16
16
|
silence_warnings { require 'uea-stemmer' }
|
@@ -1,20 +1,20 @@
|
|
1
1
|
# Finds the general part of speech of an entity
|
2
2
|
# (:sentence, :noun_phrase, :verb, :adverb, etc.)
|
3
3
|
# from its tag (e.g. 'S', 'NP', 'VBZ', 'ADV', etc.).
|
4
|
-
class Treat::Lexicalizers::Categorizers::FromTag
|
4
|
+
class Treat::Workers::Lexicalizers::Categorizers::FromTag
|
5
5
|
|
6
|
-
Pttc = Treat
|
7
|
-
Wttc = Treat
|
8
|
-
Ptc = Treat
|
6
|
+
Pttc = Treat.tags.aligned.phrase_tags_to_category
|
7
|
+
Wttc = Treat.tags.aligned.word_tags_to_category
|
8
|
+
Ptc = Treat.linguistics.punctuation.punct_to_category
|
9
9
|
|
10
10
|
# Find the category of the entity from its tag.
|
11
11
|
def self.category(entity, options = {})
|
12
12
|
|
13
13
|
tag = entity.check_has(:tag)
|
14
14
|
|
15
|
-
return
|
16
|
-
return
|
17
|
-
return
|
15
|
+
return 'unknown' if tag.nil? || tag == '' || entity.type == :symbol
|
16
|
+
return 'sentence' if tag == 'S' || entity.type == :sentence
|
17
|
+
return 'number' if entity.type == :number
|
18
18
|
|
19
19
|
return Ptc[entity.to_s] if entity.type == :punctuation
|
20
20
|
|
@@ -32,7 +32,7 @@ class Treat::Lexicalizers::Categorizers::FromTag
|
|
32
32
|
if entity.has?(:tag_set)
|
33
33
|
ts = entity.get(:tag_set)
|
34
34
|
else
|
35
|
-
a = entity.ancestor_with_feature(:
|
35
|
+
a = entity.ancestor_with_feature(:tag_set)
|
36
36
|
if a
|
37
37
|
ts = a.get(:tag_set)
|
38
38
|
else
|
@@ -51,7 +51,7 @@ class Treat::Lexicalizers::Categorizers::FromTag
|
|
51
51
|
"for token #{entity.to_s}."
|
52
52
|
end
|
53
53
|
|
54
|
-
|
54
|
+
'unknown'
|
55
55
|
|
56
56
|
end
|
57
57
|
|
@@ -1,5 +1,5 @@
|
|
1
1
|
# An adaptor for synsets used by the Wordnet gem.
|
2
|
-
class Treat::Lexicalizers::Sensers::Wordnet::Synset
|
2
|
+
class Treat::Workers::Lexicalizers::Sensers::Wordnet::Synset
|
3
3
|
|
4
4
|
# The POS tag of the word.
|
5
5
|
attr_accessor :pos
|
@@ -61,7 +61,7 @@ class Treat::Lexicalizers::Sensers::Wordnet::Synset
|
|
61
61
|
# Respond to the missing method event.
|
62
62
|
def method_missing(sym, *args, &block)
|
63
63
|
ret = @original_synset.send(sym)
|
64
|
-
if ret.is_a?(Treat::Lexicalizers::Sensers::Wordnet::Synset)
|
64
|
+
if ret.is_a?(Treat::Workers::Lexicalizers::Sensers::Wordnet::Synset)
|
65
65
|
self.new(ret)
|
66
66
|
else
|
67
67
|
ret
|
@@ -1,6 +1,6 @@
|
|
1
1
|
# Obtain lexical information about a word using the
|
2
2
|
# ruby 'wordnet' gem.
|
3
|
-
class Treat::Lexicalizers::Sensers::Wordnet
|
3
|
+
class Treat::Workers::Lexicalizers::Sensers::Wordnet
|
4
4
|
|
5
5
|
# Require the 'wordnet' gem.
|
6
6
|
require 'wordnet'
|
@@ -13,7 +13,7 @@ class Treat::Lexicalizers::Sensers::Wordnet
|
|
13
13
|
end
|
14
14
|
|
15
15
|
# Require an adaptor for Wordnet synsets.
|
16
|
-
require 'treat/lexicalizers/sensers/wordnet/synset'
|
16
|
+
require 'treat/workers/lexicalizers/sensers/wordnet/synset'
|
17
17
|
|
18
18
|
# Noun, adjective and verb indexes.
|
19
19
|
@@indexes = {}
|
@@ -29,7 +29,7 @@ class Treat::Lexicalizers::Sensers::Wordnet
|
|
29
29
|
"the :nym option (:synonym, :hypernym, etc.)"
|
30
30
|
end
|
31
31
|
|
32
|
-
unless [
|
32
|
+
unless ['noun', 'adjective', 'verb'].
|
33
33
|
include?(word.category)
|
34
34
|
return []
|
35
35
|
end
|
@@ -45,7 +45,7 @@ class Treat::Lexicalizers::Sensers::Wordnet
|
|
45
45
|
|
46
46
|
lemma.synsets.each do |synset|
|
47
47
|
synsets <<
|
48
|
-
Treat::Lexicalizers::Sensers::Wordnet::Synset.new(synset)
|
48
|
+
Treat::Workers::Lexicalizers::Sensers::Wordnet::Synset.new(synset)
|
49
49
|
end
|
50
50
|
|
51
51
|
((synsets.collect do |ss|
|
@@ -7,7 +7,7 @@ begin
|
|
7
7
|
# will clash with the top-level class 'Word'
|
8
8
|
# we define when syntactic sugar is enabled.
|
9
9
|
rescue TypeError
|
10
|
-
if Treat.sweetened
|
10
|
+
if Treat.core.syntax.sweetened
|
11
11
|
patch = true
|
12
12
|
# Unset the class Word for the duration
|
13
13
|
# of loading the tagger.
|
@@ -19,7 +19,7 @@ rescue TypeError
|
|
19
19
|
end
|
20
20
|
ensure
|
21
21
|
# Reset the class Word if using syntactic sugar.
|
22
|
-
if Treat.sweetened
|
22
|
+
if Treat.core.syntax.sweetened && patch
|
23
23
|
Object.const_set(:Word, Treat::Entities::Word)
|
24
24
|
end
|
25
25
|
end
|
@@ -13,11 +13,11 @@
|
|
13
13
|
# Project website:
|
14
14
|
#
|
15
15
|
# http://rbtagger.rubyforge.org/
|
16
|
-
module Treat::Lexicalizers::Taggers::Brill
|
16
|
+
module Treat::Workers::Lexicalizers::Taggers::Brill
|
17
17
|
|
18
18
|
require 'rbtagger'
|
19
19
|
|
20
|
-
require 'treat/lexicalizers/taggers/brill/patch'
|
20
|
+
require 'treat/workers/lexicalizers/taggers/brill/patch'
|
21
21
|
|
22
22
|
# Hold one instance of the tagger.
|
23
23
|
@@tagger = nil
|
@@ -32,12 +32,6 @@ module Treat::Lexicalizers::Taggers::Brill
|
|
32
32
|
# :contextual_rules => String (Contextual rules file to use)
|
33
33
|
def self.tag(entity, options = {})
|
34
34
|
|
35
|
-
# Tokenize the sentence/phrase.
|
36
|
-
if !entity.has_children? &&
|
37
|
-
!entity.is_a?(Treat::Entities::Token)
|
38
|
-
entity.tokenize(options)
|
39
|
-
end
|
40
|
-
|
41
35
|
# Create the tagger if necessary
|
42
36
|
@@tagger ||= ::Brill::Tagger.new(options[:lexicon],
|
43
37
|
options[:lexical_rules], options[:contextual_rules])
|
@@ -12,7 +12,7 @@
|
|
12
12
|
# Project website: http://engtagger.rubyforge.org/
|
13
13
|
# Original Perl module site:
|
14
14
|
# http://cpansearch.perl.org/src/ACOBURN/Lingua-EN-Tagger-0.15/
|
15
|
-
class Treat::Lexicalizers::Taggers::Lingua
|
15
|
+
class Treat::Workers::Lexicalizers::Taggers::Lingua
|
16
16
|
|
17
17
|
# Require the 'engtagger' gem.
|
18
18
|
silence_warnings { require 'engtagger' }
|
@@ -48,11 +48,6 @@ class Treat::Lexicalizers::Taggers::Lingua
|
|
48
48
|
# particularly words used polysemously.
|
49
49
|
def self.tag(entity, options = {})
|
50
50
|
|
51
|
-
if !entity.has_children? &&
|
52
|
-
!entity.is_a?(Treat::Entities::Token)
|
53
|
-
entity.tokenize
|
54
|
-
end
|
55
|
-
|
56
51
|
options = DefaultOptions.merge(options)
|
57
52
|
|
58
53
|
@@tagger ||= ::EngTagger.new(options)
|
@@ -1,11 +1,11 @@
|
|
1
1
|
# Wrapper for the Stanford POS tagger.
|
2
|
-
class Treat::Lexicalizers::Taggers::Stanford
|
2
|
+
class Treat::Workers::Lexicalizers::Taggers::Stanford
|
3
3
|
|
4
4
|
require 'treat/loaders/stanford'
|
5
5
|
|
6
6
|
# Hold one tagger per language.
|
7
7
|
@@taggers = {}
|
8
|
-
|
8
|
+
|
9
9
|
# Hold the default options.
|
10
10
|
DefaultOptions = {
|
11
11
|
:tagger_model => nil
|
@@ -13,75 +13,64 @@ class Treat::Lexicalizers::Taggers::Stanford
|
|
13
13
|
|
14
14
|
# Tag the word using one of the Stanford taggers.
|
15
15
|
def self.tag(entity, options = {})
|
16
|
-
|
17
|
-
#
|
18
|
-
if
|
19
|
-
|
20
|
-
entity.
|
16
|
+
|
17
|
+
# Handle tags for sentences and phrases.
|
18
|
+
if entity.is_a?(Treat::Entities::Sentence) ||
|
19
|
+
(entity.is_a?(Treat::Entities::Phrase) &&
|
20
|
+
!entity.parent_sentence)
|
21
|
+
|
22
|
+
tag_set = options[:tag_set]
|
23
|
+
entity.set :tag_set, tag_set
|
24
|
+
end
|
25
|
+
|
26
|
+
if entity.is_a?(Treat::Entities::Sentence)
|
27
|
+
return 'S'
|
28
|
+
elsif entity.is_a?(Treat::Entities::Phrase)
|
29
|
+
return 'P'
|
21
30
|
end
|
22
31
|
|
23
32
|
# Handle options and initialize the tagger.
|
24
33
|
lang = entity.language
|
25
34
|
options = get_options(options, lang)
|
26
|
-
init_tagger(lang)
|
35
|
+
init_tagger(lang) unless @@taggers[lang]
|
27
36
|
tokens, list = get_token_list(entity)
|
28
|
-
|
37
|
+
|
29
38
|
# Do the tagging.
|
30
39
|
i = 0
|
31
|
-
|
40
|
+
isolated_token = entity.is_a?(Treat::Entities::Token)
|
41
|
+
|
32
42
|
@@taggers[lang].apply(list).each do |tok|
|
33
43
|
tokens[i].set :tag, tok.tag
|
34
|
-
tokens[i].set :tag_set,
|
44
|
+
tokens[i].set :tag_set,
|
35
45
|
options[:tag_set] if isolated_token
|
36
46
|
return tok.tag if isolated_token
|
37
47
|
i += 1
|
38
48
|
end
|
39
49
|
|
40
|
-
# Handle tags for sentences and phrases.
|
41
|
-
if entity.is_a?(Treat::Entities::Sentence) ||
|
42
|
-
(entity.is_a?(Treat::Entities::Phrase) &&
|
43
|
-
!entity.parent_sentence)
|
44
|
-
|
45
|
-
tag_set = Treat::Universalisation::Tags::
|
46
|
-
StanfordTagSetForLanguage[
|
47
|
-
Treat::Languages.describe(lang)]
|
48
|
-
entity.set :tag_set, tag_set
|
49
|
-
end
|
50
|
-
|
51
|
-
if entity.is_a?(Treat::Entities::Sentence)
|
52
|
-
return 'S'
|
53
|
-
elsif entity.is_a?(Treat::Entities::Phrase)
|
54
|
-
return 'P'
|
55
|
-
end
|
56
|
-
|
57
50
|
end
|
58
|
-
|
51
|
+
|
59
52
|
# Initialize the tagger for a language.
|
60
|
-
def self.init_tagger(
|
61
|
-
language = Treat::Languages.describe(lang)
|
53
|
+
def self.init_tagger(language)
|
62
54
|
Treat::Loaders::Stanford.load(language)
|
63
55
|
model = StanfordCoreNLP::Config::Models[:pos][language]
|
64
|
-
model = Treat
|
56
|
+
model = Treat.paths.models + 'stanford/' +
|
65
57
|
StanfordCoreNLP::Config::ModelFolders[:pos] + model
|
66
|
-
@@taggers[
|
58
|
+
@@taggers[language] ||=
|
67
59
|
StanfordCoreNLP::MaxentTagger.new(model)
|
68
60
|
end
|
69
|
-
|
61
|
+
|
70
62
|
# Handle the options for the tagger.
|
71
|
-
def self.get_options(options,
|
72
|
-
language = Treat::Languages.describe(lang)
|
63
|
+
def self.get_options(options, language)
|
73
64
|
options = DefaultOptions.merge(options)
|
74
|
-
options[:tag_set] =
|
75
|
-
StanfordCoreNLP::Config::TagSets[language]
|
76
65
|
if options[:tagger_model]
|
77
66
|
::StanfordCoreNLP.set_model('pos.model',
|
78
67
|
options[:tagger_model])
|
79
68
|
end
|
80
|
-
options[:tag_set] =
|
69
|
+
options[:tag_set] =
|
81
70
|
StanfordCoreNLP::Config::TagSets[language]
|
82
71
|
options
|
83
72
|
end
|
84
|
-
|
73
|
+
|
85
74
|
# Retrieve a Java ArrayList object.
|
86
75
|
def self.get_token_list(entity)
|
87
76
|
list = StanfordCoreNLP::ArrayList.new
|
@@ -95,5 +84,5 @@ class Treat::Lexicalizers::Taggers::Stanford
|
|
95
84
|
end
|
96
85
|
return tokens, list
|
97
86
|
end
|
98
|
-
|
99
|
-
end
|
87
|
+
|
88
|
+
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
class Treat::Workers::Processors::Chunkers::Autoselect
|
2
|
+
|
3
|
+
def self.chunk(entity, options = {})
|
4
|
+
unless entity.has?(:format)
|
5
|
+
raise Treat::Exception,
|
6
|
+
"Must have a format to autoselect chunker."
|
7
|
+
end
|
8
|
+
begin
|
9
|
+
k = Treat::Workers::Processors::
|
10
|
+
Chunkers.const_get(cc(entity.format))
|
11
|
+
k.chunk(entity, options)
|
12
|
+
rescue Treat::Exception
|
13
|
+
Treat::Workers::Processors::
|
14
|
+
Chunkers::TXT.chunk(entity, options)
|
15
|
+
end
|
16
|
+
|
17
|
+
end
|
18
|
+
|
19
|
+
end
|