treat 0.2.5 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +3 -3
- data/README.md +33 -0
- data/files/INFO +1 -0
- data/lib/treat.rb +40 -105
- data/lib/treat/ai.rb +12 -0
- data/lib/treat/ai/classifiers/id3.rb +27 -0
- data/lib/treat/categories.rb +82 -35
- data/lib/treat/categorizable.rb +44 -0
- data/lib/treat/classification.rb +61 -0
- data/lib/treat/configurable.rb +115 -0
- data/lib/treat/data_set.rb +42 -0
- data/lib/treat/dependencies.rb +24 -0
- data/lib/treat/downloader.rb +87 -0
- data/lib/treat/entities.rb +68 -66
- data/lib/treat/entities/abilities.rb +10 -0
- data/lib/treat/entities/abilities/buildable.rb +327 -0
- data/lib/treat/entities/abilities/checkable.rb +31 -0
- data/lib/treat/entities/abilities/copyable.rb +45 -0
- data/lib/treat/entities/abilities/countable.rb +51 -0
- data/lib/treat/entities/abilities/debuggable.rb +83 -0
- data/lib/treat/entities/abilities/delegatable.rb +123 -0
- data/lib/treat/entities/abilities/doable.rb +62 -0
- data/lib/treat/entities/abilities/exportable.rb +11 -0
- data/lib/treat/entities/abilities/iterable.rb +115 -0
- data/lib/treat/entities/abilities/magical.rb +83 -0
- data/lib/treat/entities/abilities/registrable.rb +74 -0
- data/lib/treat/entities/abilities/stringable.rb +91 -0
- data/lib/treat/entities/entities.rb +104 -0
- data/lib/treat/entities/entity.rb +122 -245
- data/lib/treat/exception.rb +4 -4
- data/lib/treat/extractors.rb +77 -80
- data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
- data/lib/treat/extractors/language/what_language.rb +50 -45
- data/lib/treat/extractors/name_tag/stanford.rb +55 -0
- data/lib/treat/extractors/tf_idf/native.rb +87 -0
- data/lib/treat/extractors/time/chronic.rb +55 -0
- data/lib/treat/extractors/time/nickel.rb +86 -62
- data/lib/treat/extractors/time/ruby.rb +53 -0
- data/lib/treat/extractors/topic_words/lda.rb +67 -58
- data/lib/treat/extractors/topics/reuters.rb +100 -87
- data/lib/treat/formatters.rb +39 -35
- data/lib/treat/formatters/readers/abw.rb +49 -29
- data/lib/treat/formatters/readers/autoselect.rb +37 -33
- data/lib/treat/formatters/readers/doc.rb +19 -13
- data/lib/treat/formatters/readers/html.rb +52 -30
- data/lib/treat/formatters/readers/image.rb +41 -40
- data/lib/treat/formatters/readers/odt.rb +59 -45
- data/lib/treat/formatters/readers/pdf.rb +28 -25
- data/lib/treat/formatters/readers/txt.rb +12 -15
- data/lib/treat/formatters/readers/xml.rb +73 -36
- data/lib/treat/formatters/serializers/xml.rb +80 -79
- data/lib/treat/formatters/serializers/yaml.rb +19 -18
- data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
- data/lib/treat/formatters/unserializers/xml.rb +94 -99
- data/lib/treat/formatters/unserializers/yaml.rb +20 -19
- data/lib/treat/formatters/visualizers/dot.rb +132 -132
- data/lib/treat/formatters/visualizers/standoff.rb +52 -44
- data/lib/treat/formatters/visualizers/tree.rb +26 -29
- data/lib/treat/groupable.rb +153 -0
- data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
- data/lib/treat/inflectors.rb +50 -45
- data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
- data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
- data/lib/treat/inflectors/declensors/active_support.rb +31 -0
- data/lib/treat/inflectors/declensors/english.rb +38 -0
- data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
- data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
- data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
- data/lib/treat/inflectors/stemmers/porter.rb +160 -0
- data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
- data/lib/treat/inflectors/stemmers/uea.rb +28 -0
- data/lib/treat/installer.rb +308 -0
- data/lib/treat/kernel.rb +105 -27
- data/lib/treat/languages.rb +122 -88
- data/lib/treat/languages/arabic.rb +15 -15
- data/lib/treat/languages/chinese.rb +15 -15
- data/lib/treat/languages/dutch.rb +15 -15
- data/lib/treat/languages/english.rb +61 -62
- data/lib/treat/languages/french.rb +19 -19
- data/lib/treat/languages/german.rb +20 -20
- data/lib/treat/languages/greek.rb +15 -15
- data/lib/treat/languages/italian.rb +16 -16
- data/lib/treat/languages/polish.rb +15 -15
- data/lib/treat/languages/portuguese.rb +15 -15
- data/lib/treat/languages/russian.rb +15 -15
- data/lib/treat/languages/spanish.rb +16 -16
- data/lib/treat/languages/swedish.rb +16 -16
- data/lib/treat/lexicalizers.rb +34 -55
- data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
- data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
- data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
- data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
- data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
- data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
- data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
- data/lib/treat/linguistics.rb +9 -0
- data/lib/treat/linguistics/categories.rb +11 -0
- data/lib/treat/linguistics/tags.rb +422 -0
- data/lib/treat/loaders/linguistics.rb +30 -0
- data/lib/treat/loaders/stanford.rb +27 -0
- data/lib/treat/object.rb +1 -0
- data/lib/treat/processors.rb +37 -44
- data/lib/treat/processors/chunkers/autoselect.rb +16 -0
- data/lib/treat/processors/chunkers/html.rb +71 -0
- data/lib/treat/processors/chunkers/txt.rb +18 -24
- data/lib/treat/processors/parsers/enju.rb +253 -208
- data/lib/treat/processors/parsers/stanford.rb +130 -131
- data/lib/treat/processors/segmenters/punkt.rb +79 -45
- data/lib/treat/processors/segmenters/stanford.rb +46 -48
- data/lib/treat/processors/segmenters/tactful.rb +43 -36
- data/lib/treat/processors/tokenizers/perl.rb +124 -92
- data/lib/treat/processors/tokenizers/ptb.rb +81 -0
- data/lib/treat/processors/tokenizers/punkt.rb +48 -42
- data/lib/treat/processors/tokenizers/stanford.rb +39 -38
- data/lib/treat/processors/tokenizers/tactful.rb +64 -55
- data/lib/treat/proxies.rb +52 -35
- data/lib/treat/retrievers.rb +26 -16
- data/lib/treat/retrievers/indexers/ferret.rb +47 -26
- data/lib/treat/retrievers/searchers/ferret.rb +69 -50
- data/lib/treat/tree.rb +241 -183
- data/spec/collection.rb +123 -0
- data/spec/document.rb +93 -0
- data/spec/entity.rb +408 -0
- data/spec/languages.rb +25 -0
- data/spec/phrase.rb +146 -0
- data/spec/samples/mathematicians/archimedes.abw +34 -0
- data/spec/samples/mathematicians/euler.html +21 -0
- data/spec/samples/mathematicians/gauss.pdf +0 -0
- data/spec/samples/mathematicians/leibniz.txt +13 -0
- data/spec/samples/mathematicians/newton.doc +0 -0
- data/spec/sandbox.rb +5 -0
- data/spec/token.rb +109 -0
- data/spec/treat.rb +52 -0
- data/spec/tree.rb +117 -0
- data/spec/word.rb +110 -0
- data/spec/zone.rb +66 -0
- data/tmp/INFO +1 -1
- metadata +100 -201
- data/INSTALL +0 -1
- data/README +0 -3
- data/TODO +0 -28
- data/lib/economist/half_cocked_basel.txt +0 -16
- data/lib/economist/hungarys_troubles.txt +0 -46
- data/lib/economist/indias_slowdown.txt +0 -15
- data/lib/economist/merkozy_rides_again.txt +0 -24
- data/lib/economist/prada_is_not_walmart.txt +0 -9
- data/lib/economist/to_infinity_and_beyond.txt +0 -15
- data/lib/ferret/_11.cfs +0 -0
- data/lib/ferret/_14.cfs +0 -0
- data/lib/ferret/_p.cfs +0 -0
- data/lib/ferret/_s.cfs +0 -0
- data/lib/ferret/_v.cfs +0 -0
- data/lib/ferret/_y.cfs +0 -0
- data/lib/ferret/segments +0 -0
- data/lib/ferret/segments_15 +0 -0
- data/lib/treat/buildable.rb +0 -157
- data/lib/treat/category.rb +0 -33
- data/lib/treat/delegatable.rb +0 -116
- data/lib/treat/doable.rb +0 -45
- data/lib/treat/entities/collection.rb +0 -14
- data/lib/treat/entities/document.rb +0 -12
- data/lib/treat/entities/phrases.rb +0 -17
- data/lib/treat/entities/tokens.rb +0 -61
- data/lib/treat/entities/zones.rb +0 -41
- data/lib/treat/extractors/coreferences/stanford.rb +0 -69
- data/lib/treat/extractors/date/chronic.rb +0 -32
- data/lib/treat/extractors/date/ruby.rb +0 -25
- data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
- data/lib/treat/extractors/language/language_extractor.rb +0 -27
- data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
- data/lib/treat/extractors/roles/naive.rb +0 -73
- data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
- data/lib/treat/extractors/statistics/position_in.rb +0 -14
- data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
- data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
- data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
- data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
- data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
- data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
- data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
- data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
- data/lib/treat/feature.rb +0 -58
- data/lib/treat/features.rb +0 -7
- data/lib/treat/formatters/visualizers/short_value.rb +0 -29
- data/lib/treat/formatters/visualizers/txt.rb +0 -45
- data/lib/treat/group.rb +0 -106
- data/lib/treat/helpers/linguistics_loader.rb +0 -18
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
- data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
- data/lib/treat/inflectors/declensions/english.rb +0 -319
- data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
- data/lib/treat/inflectors/stem/porter.rb +0 -162
- data/lib/treat/inflectors/stem/porter_c.rb +0 -26
- data/lib/treat/inflectors/stem/uea.rb +0 -30
- data/lib/treat/install.rb +0 -59
- data/lib/treat/languages/tags.rb +0 -377
- data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
- data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
- data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
- data/lib/treat/lexicalizers/tag/brill.rb +0 -91
- data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
- data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
- data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
- data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
- data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
- data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
- data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
- data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
- data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
- data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
- data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
- data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
- data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
- data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
- data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
- data/lib/treat/registrable.rb +0 -28
- data/lib/treat/sugar.rb +0 -50
- data/lib/treat/viewable.rb +0 -29
- data/lib/treat/visitable.rb +0 -28
- data/test/profile.rb +0 -2
- data/test/tc_entity.rb +0 -117
- data/test/tc_extractors.rb +0 -73
- data/test/tc_formatters.rb +0 -41
- data/test/tc_inflectors.rb +0 -34
- data/test/tc_lexicalizers.rb +0 -32
- data/test/tc_processors.rb +0 -50
- data/test/tc_resources.rb +0 -22
- data/test/tc_treat.rb +0 -60
- data/test/tc_tree.rb +0 -60
- data/test/tests.rb +0 -20
- data/test/texts.rb +0 -19
- data/test/texts/english/half_cocked_basel.txt +0 -16
- data/test/texts/english/hose_and_dry.doc +0 -0
- data/test/texts/english/hungarys_troubles.abw +0 -70
- data/test/texts/english/long.html +0 -24
- data/test/texts/english/long.txt +0 -22
- data/test/texts/english/medium.txt +0 -5
- data/test/texts/english/republican_nomination.pdf +0 -0
- data/test/texts/english/saving_the_euro.odt +0 -0
- data/test/texts/english/short.txt +0 -3
- data/test/texts/english/zero_sum.html +0 -111
@@ -1,46 +1,54 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
1
|
+
# This class allows the visualization of
|
2
|
+
# an entity in standoff format; for example:
|
3
|
+
# (S (NP John) (VP has (VP come))).
|
4
|
+
class Treat::Formatters::Visualizers::Standoff
|
5
|
+
|
6
|
+
# Rendering starts at indentation level 0 unless the
# caller supplies another :indent value.
DefaultOptions = { :indent => 0 }

# Lambda that runs visualize on every element yielded by
# entity.each (with the same options) and returns the
# concatenation of the resulting strings.
Recurse = lambda do |entity, options|
  rendered = ''
  entity.each do |child|
    rendered += visualize(child, options)
  end
  rendered
end
|
16
|
+
|
17
|
+
# FIXME (note kept from the original author): "Fix - brackets".
#
# Visualize the entity using standoff notation.
#
# Only sentences and smaller entities can be rendered:
#
# - a token becomes "(tag value)", with the value passed
#   through ptb_escape;
# - a phrase becomes "(tag" followed by its rendered
#   children (an empty tag is used when it has none);
# - a sentence becomes "(S" followed by its rendered children.
#
# Any other entity raises, because standoff notation is not
# suitable for entities larger than sentences.
#
# NOTE(review): the Phrase test runs before the Sentence
# test; if Sentence is a subclass of Phrase the "(S" branch
# can never be reached - confirm against Treat::Entities.
def self.visualize(entity, options = {})
  # merge returns a fresh hash, so the caller's options
  # are never modified by the increment below.
  options = DefaultOptions.merge(options)
  padding = ' ' * options[:indent]
  options[:indent] += 1
  output =
    if entity.is_a?(Treat::Entities::Token)
      escaped = ptb_escape(entity.value)
      "#{padding}(#{entity.tag} #{escaped})"
    elsif entity.is_a?(Treat::Entities::Phrase)
      label = entity.has?(:tag) ? entity.tag : ''
      "#{padding}(#{label}\n#{Recurse.call(entity, options)})\n"
    elsif entity.is_a?(Treat::Entities::Sentence)
      "#{padding}(S\n#{Recurse.call(entity, options)})\n"
    else
      raise 'Standoff format is unsuitable to represent' +
        ' entities larger than sentences.'
    end
  # Pull a closing bracket that ended up on its own line
  # back onto the previous line.
  output.gsub!(")\n)", "))")
  output
end
|
45
|
+
|
46
|
+
# Return a copy of val in which every character listed in
# Treat::Linguistics::Tags::PTBEscapeCharacters has been
# replaced by its escape sequence.
#
# PTBEscapeCharacters is iterated as (character, escape)
# pairs. The previous implementation substituted val itself
# instead of the escape sequence, and did so in place, which
# corrupted the value of the entity being visualized. The
# substitution now works on a copy and inserts the escape
# sequence.
def self.ptb_escape(val)
  escaped = val.dup
  Treat::Linguistics::Tags::
  PTBEscapeCharacters.each do |char, esc|
    # Block form: the escape sequence is inserted literally,
    # without backslash/back-reference interpretation.
    escaped.gsub!(char) { esc }
  end
  escaped
end
|
46
|
-
end
|
54
|
+
end
|
@@ -1,32 +1,29 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
spacer + self.visualize(child, options)
|
23
|
-
end
|
24
|
-
options[:indent] -= 1
|
25
|
-
return string
|
26
|
-
end
|
27
|
-
'> ' + entity.inspect
|
28
|
-
end
|
1
|
+
# Renders a tree of entities as plain ASCII text.
class Treat::Formatters::Visualizers::Tree

  # Rendering starts at indentation level 0.
  DefaultOptions = { :indent => 0 }

  # Build the plain text tree representation of an entity.
  #
  # An entity without children is rendered as '> ' followed
  # by its inspect string. An entity with children is
  # rendered as a '+ ' header line followed by one '+--'
  # branch per child; each child is rendered recursively
  # with :indent increased by one.
  def self.visualize(entity, options = {})
    options = DefaultOptions.merge(options)
    return '> ' + entity.inspect unless entity.has_children?

    padding = ' ' * options[:indent]
    tree = "+ #{entity.inspect}\n#{padding}|"
    # Children are rendered one level deeper; every
    # recursive call merges these options into its own copy.
    deeper = options.merge(:indent => options[:indent] + 1)
    entity.children.each do |child|
      tree = tree + "\n" + padding + '+--' +
        self.visualize(child, deeper)
    end
    tree
  end

end
|
@@ -0,0 +1,153 @@
|
|
1
|
+
# Behaviour for worker groups. A module that extends
# Treat::Groupable gains lazy loading of its worker classes,
# a cached list of the workers found on disk, a target
# check, the ability to define a worker from a block, and
# the group-level accessors installed by self.extended.
module Treat::Groupable

  # Lazily load a worker class of the group.
  #
  # The file name is derived from the group's name (each
  # namespace part passed through ucc) and from the missing
  # constant. Raises Treat::Exception when the file is not
  # readable under Treat.lib, or when requiring it does not
  # define the constant.
  def const_missing(const)
    parts = self.ancestors.first.to_s.split('::')
    file = parts.map { |part| ucc(part) }.join('/') +
      "/#{ucc(const)}"
    unless File.readable?(Treat.lib + "#{file}.rb")
      raise Treat::Exception,
        "File '#{file}.rb' corresponding to " +
        "requested worker #{self}::#{const} " +
        "does not exist."
    end
    require file
    unless const_defined?(const)
      raise Treat::Exception,
        "File #{file} does not define " +
        "#{self}::#{const}."
    end
    const_get(const)
  end

  # Cache of worker names, keyed by group name, shared by
  # every group (improves performance).
  @@list = {}

  # Return the names (symbols) of the workers in the group.
  # The list is built once per group by globbing the *.rb
  # files of the group's directory under Treat.lib.
  def list
    key = ucc(cl(self))
    return @@list[key] unless @@list[key].nil?
    names = (@@list[key] = [])
    Dir[Treat.lib + "treat/*/#{key}/*.rb"].each do |path|
      # Last path component without its '.rb' suffix.
      names << path.split('/')[-1][0..-4].intern
    end
    names
  end

  # Boolean - is the supplied class one of the group's
  # target entity types, or a subclass of one of them?
  # The strict parameter is accepted but not used.
  def has_target?(target, strict = false)
    self.targets.any? do |name|
      entity_type = Treat::Entities.const_get(cc(name))
      target < entity_type || entity_type == target
    end
  end

  # Create a new worker within the group from a block.
  # A class named after class_name is defined inside the
  # group, the name is appended to the group's list, and the
  # class receives a singleton method named after the
  # group's method which forwards (entity, options) to
  # the block.
  def add(class_name, &block)
    const_name = cc(class_name).intern
    worker = self.const_set(const_name, Class.new)
    task = self.method
    @@list[ucc(cl(self))] << class_name
    worker.send(:define_singleton_method,
                task) do |entity, options={}|
      block.call(entity, options)
    end
  end

  # Get a constant in this module, excluding those
  # defined by parent modules.
  def const_get(const)
    super(const, false)
  end

  # Hook run when a module extends Treat::Groupable:
  # installs the group accessors and the method resolver
  # on the group, then populates its list of workers.
  def self.extended(group)

    group.module_eval do

      class << self

        # The type of the group. There are three types:
        #
        # - Transformers transform the tree of an entity.
        # - Annotators compute a value and store it in the entity.
        # - Computers compute a value and do not store it.
        attr_accessor :type
        # The default worker in the group, for language-
        # independent tasks.
        attr_accessor :default
        # The entity types which the group's workers work on.
        attr_accessor :targets
        # Presets to automatically generate functions.
        attr_accessor :presets
        # The preset option to use with preset functions.
        attr_accessor :preset_option
      end

      # Return (and memoize) the name of the method that the
      # group provides, derived from the group's name as
      # returned by ucc(cl(self)):
      #
      # - '...zers': for annotators the last five characters
      #   are removed (when preceded by 'l') or replaced by
      #   'y'; otherwise the last two characters are removed.
      # - '...iers': the last four characters become 'y'.
      # - '...ers': after k, t, m, d, g, n, x or h the last
      #   three characters are removed, and a doubled final
      #   letter is reduced to one; otherwise the last two
      #   characters are removed.
      # - '...ors': the last three characters become 'e'.
      # - anything else: the name is used unchanged.
      #
      # NOTE(review): assumes ucc returns the lower-case
      # group name as a String - confirm against its
      # definition.
      @method = nil
      def self.method
        return @method if @method
        name = ucc(cl(self)).dup
        verb =
          if name.end_with?('zers')
            if type == :annotator
              name[-5..-1] = (name[-6] == 'l' ? '' : 'y')
              name
            else
              name[0..-3]
            end
          elsif name.end_with?('iers')
            name[-4..-1] = 'y'
            name
          elsif name.end_with?('ers')
            if ['k', 't', 'm', 'd',
                'g', 'n', 'x', 'h'].include?(name[-4])
              stem = name[0..-4]
              stem[-1] == stem[-2] ? stem[0..-2] : stem
            else
              name[0..-3]
            end
          elsif name.end_with?('ors')
            name[0..-4] + 'e'
          else
            name
          end
        @method = verb.intern
      end

      # Populate the group's list.
      group.list

    end

  end

end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module Treat::Helpers
|
2
|
+
|
3
|
+
# Replaces the period found between two digits by a marker
# string, and converts such markers back into periods.
class DecimalPointEscaper

  # Marker that stands in for a period between two digits.
  EscapeChar = '^^'
  # EscapeChar as regular expression source ('^' is a
  # regexp metacharacter and must be escaped).
  EscapedEscapeChar = '\^\^'

  # Replace, in place, every period located directly between
  # two digits by EscapeChar ("3.14" becomes "3^^14").
  #
  # Look-around assertions are used so that the digits are
  # not consumed by the match: chained groups such as
  # "1.2.3" are fully escaped ("1^^2^^3"), whereas a
  # digit-consuming pattern would leave every other period
  # untouched.
  #
  # Returns s, or nil when nothing was replaced (the
  # String#gsub! convention).
  def self.escape!(s)
    s.gsub!(/(?<=[0-9])\.(?=[0-9])/) { EscapeChar }
  end

  # Reverse escape!: replace, in place, every EscapeChar
  # located directly between two digits by a period.
  #
  # Returns s, or nil when nothing was replaced.
  def self.unescape!(s)
    s.gsub!(/(?<=[0-9])#{EscapedEscapeChar}(?=[0-9])/) { '.' }
  end

end
|
21
|
+
|
22
|
+
end
|
data/lib/treat/inflectors.rb
CHANGED
@@ -1,47 +1,52 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
end
|
32
|
-
# Retrieve the full text description of a cardinal number.
|
33
|
-
module CardinalWords
|
34
|
-
extend Group
|
35
|
-
self.type = :annotator
|
36
|
-
self.targets = [:number]
|
37
|
-
end
|
38
|
-
# Retrieve the full text description of an ordinal number.
|
39
|
-
module OrdinalWords
|
40
|
-
extend Group
|
41
|
-
self.type = :annotator
|
42
|
-
self.targets = [:number]
|
43
|
-
end
|
44
|
-
extend Treat::Category
|
1
|
+
# Category containing the worker groups that retrieve
# the inflections of a word or the wording of a number.
# Every group is an annotator.
module Treat::Inflectors

  # Stem (*not root form*) of a word.
  module Stemmers
    extend Treat::Groupable
    self.type = :annotator
    self.targets = [:word]
  end

  # Declensions of a noun (singular, plural);
  # presets are selected through the :count option.
  module Declensors
    extend Treat::Groupable
    self.type = :annotator
    self.targets = [:word]
    self.preset_option = :count
    self.presets = [:plural, :singular]
  end

  # Conjugations of a word given a mode, tense, person,
  # and/or number; presets are selected through the
  # :form option.
  module Conjugators
    extend Treat::Groupable
    self.type = :annotator
    self.targets = [:word]
    self.preset_option = :form
    self.presets = [:infinitive, :present_participle,
                    :plural_verb, :singular_verb]
  end

  # Full text description of a cardinal number.
  module Cardinalizers
    extend Treat::Groupable
    self.type = :annotator
    self.targets = [:number]
  end

  # Full text description of an ordinal number.
  module Ordinalizers
    extend Treat::Groupable
    self.type = :annotator
    self.targets = [:number]
  end

  # Make Inflectors categorizable. This is done last,
  # once all the groups above have been defined.
  extend Treat::Categorizable

end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
# Wrapper around the functions of the 'linguistics' gem
# that describe a number in words, in cardinal form.
#
# Project website: http://deveiate.org/projects/Linguistics/
module Treat::Inflectors::Cardinalizers::Linguistics

  require 'treat/loaders/linguistics'

  # Return the description of a cardinal number in words.
  # The options are handed over unchanged to numwords.
  #
  # Options:
  #
  # - :group => Controls how many numbers at a time are
  #   grouped together. Valid values are 0 (normal grouping),
  #   1 (single-digit grouping, e.g., "one, two, three, four"),
  #   2 (double-digit grouping, e.g., "twelve, thirty-four"), or
  #   3 (triple-digit grouping, e.g., "one twenty-three, four").
  # - :comma => Set the character/s used to separate word
  #   groups. Defaults to ", ".
  # - :and => Set the word and/or characters used where
  #   ' and ' (the default) is normally used. Setting :and to
  #   ' ', for example, will cause 2556 to be returned as
  #   "two-thousand, five hundred fifty-six" instead of
  #   "two-thousand, five hundred and fifty-six".
  # - :zero => Set the word used to represent the numeral 0
  #   in the result. 'zero' is the default.
  # - :decimal => Set the translation of any decimal points
  #   in the number; the default is 'point'.
  # - :asArray => If set to a true value, the number will be
  #   returned as an array of word groups instead of a String.
  def self.cardinal(entity, options = {})
    # Object returned by the loader for the entity's language.
    linguistics = Treat::Loaders::Linguistics.load(entity.language)
    linguistics.numwords(entity.to_s, options)
  end

end
|