treat 1.0.6 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +2 -4
- data/README.md +13 -12
- data/bin/MANIFEST +1 -0
- data/bin/stanford/bridge.jar +0 -0
- data/bin/stanford/joda-time.jar +0 -0
- data/bin/stanford/stanford-corenlp.jar +0 -0
- data/bin/stanford/stanford-parser.jar +0 -0
- data/bin/stanford/xom.jar +0 -0
- data/files/{www.economist.com/21552208 → 21552208.html} +86 -89
- data/files/{guides.rubyonrails.org/3_2_release_notes.html → 3_2_release_notes.html} +0 -0
- data/files/{INFO → MANIFEST} +0 -0
- data/files/{www.rubyinside.com/nethttp-cheat-sheet-2940.html → nethttp-cheat-sheet-2940.html} +12 -16
- data/files/weather-central-canada-heat-wave.html +1370 -0
- data/lib/treat/config/core/acronyms.rb +4 -0
- data/lib/treat/config/core/encodings.rb +8 -0
- data/lib/treat/config/core/entities.rb +2 -0
- data/lib/treat/config/core/language.rb +3 -0
- data/lib/treat/config/core/paths.rb +8 -0
- data/lib/treat/config/core/syntax.rb +1 -0
- data/lib/treat/config/core/verbosity.rb +1 -0
- data/lib/treat/config/databases/mongo.rb +3 -0
- data/lib/treat/config/languages/agnostic.rb +34 -0
- data/lib/treat/config/languages/arabic.rb +13 -0
- data/lib/treat/config/languages/chinese.rb +13 -0
- data/lib/treat/config/languages/dutch.rb +12 -0
- data/lib/treat/config/languages/english.rb +60 -0
- data/lib/treat/config/languages/french.rb +18 -0
- data/lib/treat/config/languages/german.rb +18 -0
- data/lib/treat/config/languages/greek.rb +12 -0
- data/lib/treat/config/languages/italian.rb +12 -0
- data/lib/treat/config/languages/polish.rb +12 -0
- data/lib/treat/config/languages/portuguese.rb +12 -0
- data/lib/treat/config/languages/russian.rb +12 -0
- data/lib/treat/config/languages/spanish.rb +12 -0
- data/lib/treat/config/languages/swedish.rb +12 -0
- data/lib/treat/config/libraries/stanford.rb +1 -0
- data/lib/treat/config/linguistics/categories.rb +4 -0
- data/lib/treat/config/linguistics/punctuation.rb +33 -0
- data/lib/treat/config/tags/aligned.rb +221 -0
- data/lib/treat/config/tags/enju.rb +71 -0
- data/lib/treat/config/tags/paris7.rb +17 -0
- data/lib/treat/config/tags/ptb.rb +15 -0
- data/lib/treat/config/workers/extractors.rb +39 -0
- data/lib/treat/config/workers/formatters.rb +20 -0
- data/lib/treat/config/workers/inflectors.rb +27 -0
- data/lib/treat/config/workers/learners.rb +6 -0
- data/lib/treat/config/workers/lexicalizers.rb +18 -0
- data/lib/treat/config/workers/list.rb +1 -0
- data/lib/treat/config/workers/processors.rb +19 -0
- data/lib/treat/config/workers/retrievers.rb +12 -0
- data/lib/treat/config.rb +125 -0
- data/lib/treat/{classification.rb → core/classification.rb} +1 -1
- data/lib/treat/{data_set.rb → core/data_set.rb} +1 -4
- data/lib/treat/{tree.rb → core/node.rb} +5 -5
- data/lib/treat/core/server.rb +3 -0
- data/lib/treat/core.rb +5 -0
- data/lib/treat/entities/abilities/buildable.rb +61 -56
- data/lib/treat/entities/abilities/checkable.rb +2 -2
- data/lib/treat/entities/abilities/comparable.rb +21 -0
- data/lib/treat/entities/abilities/copyable.rb +2 -0
- data/lib/treat/entities/abilities/countable.rb +1 -1
- data/lib/treat/entities/abilities/debuggable.rb +1 -1
- data/lib/treat/entities/abilities/delegatable.rb +42 -36
- data/lib/treat/entities/abilities/doable.rb +2 -2
- data/lib/treat/entities/abilities/exportable.rb +1 -1
- data/lib/treat/entities/abilities/iterable.rb +21 -33
- data/lib/treat/entities/abilities/magical.rb +8 -8
- data/lib/treat/entities/abilities/registrable.rb +0 -38
- data/lib/treat/entities/abilities/stringable.rb +19 -19
- data/lib/treat/entities/collection.rb +31 -0
- data/lib/treat/entities/document.rb +10 -0
- data/lib/treat/entities/entity.rb +18 -13
- data/lib/treat/entities/group.rb +15 -0
- data/lib/treat/entities/section.rb +13 -0
- data/lib/treat/entities/token.rb +35 -0
- data/lib/treat/entities/zone.rb +11 -0
- data/lib/treat/entities.rb +5 -75
- data/lib/treat/helpers/didyoumean.rb +57 -0
- data/lib/treat/helpers/escaping.rb +15 -0
- data/lib/treat/helpers/formatting.rb +41 -0
- data/lib/treat/helpers/platform.rb +15 -0
- data/lib/treat/helpers/reflection.rb +17 -0
- data/lib/treat/helpers/temporary.rb +27 -0
- data/lib/treat/helpers/verbosity.rb +19 -0
- data/lib/treat/helpers.rb +5 -0
- data/lib/treat/installer.rb +46 -165
- data/lib/treat/loaders/linguistics.rb +22 -27
- data/lib/treat/loaders/stanford.rb +23 -41
- data/lib/treat/loaders.rb +10 -0
- data/lib/treat/proxies.rb +73 -24
- data/lib/treat/version.rb +3 -0
- data/lib/treat/{extractors → workers/extractors}/keywords/tf_idf.rb +1 -1
- data/lib/treat/{extractors → workers/extractors}/language/what_language.rb +11 -4
- data/lib/treat/{extractors → workers/extractors}/name_tag/stanford.rb +3 -4
- data/lib/treat/{extractors → workers/extractors}/tf_idf/native.rb +4 -5
- data/lib/treat/{extractors → workers/extractors}/time/chronic.rb +1 -1
- data/lib/treat/{extractors → workers/extractors}/time/nickel.rb +1 -1
- data/lib/treat/{extractors → workers/extractors}/time/ruby.rb +1 -1
- data/lib/treat/{extractors → workers/extractors}/topic_words/lda.rb +1 -1
- data/lib/treat/{extractors → workers/extractors}/topics/reuters.rb +4 -4
- data/lib/treat/{formatters → workers/formatters}/readers/abw.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/autoselect.rb +10 -3
- data/lib/treat/{formatters → workers/formatters}/readers/doc.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/html.rb +4 -4
- data/lib/treat/{formatters → workers/formatters}/readers/image.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/odt.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/pdf.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/txt.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/xml.rb +2 -2
- data/lib/treat/workers/formatters/serializers/mongo.rb +60 -0
- data/lib/treat/{formatters → workers/formatters}/serializers/xml.rb +1 -2
- data/lib/treat/{formatters → workers/formatters}/serializers/yaml.rb +1 -1
- data/lib/treat/{formatters → workers/formatters}/unserializers/autoselect.rb +3 -1
- data/lib/treat/workers/formatters/unserializers/mongo.rb +80 -0
- data/lib/treat/{formatters → workers/formatters}/unserializers/xml.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/unserializers/yaml.rb +1 -1
- data/lib/treat/{formatters → workers/formatters}/visualizers/dot.rb +1 -1
- data/lib/treat/{formatters → workers/formatters}/visualizers/standoff.rb +2 -3
- data/lib/treat/{formatters → workers/formatters}/visualizers/tree.rb +1 -1
- data/lib/treat/{groupable.rb → workers/group.rb} +6 -12
- data/lib/treat/{inflectors → workers/inflectors}/cardinalizers/linguistics.rb +7 -2
- data/lib/treat/{inflectors → workers/inflectors}/conjugators/linguistics.rb +11 -11
- data/lib/treat/{inflectors → workers/inflectors}/declensors/active_support.rb +2 -2
- data/lib/treat/{inflectors → workers/inflectors}/declensors/english/inflect.rb +1 -1
- data/lib/treat/{inflectors → workers/inflectors}/declensors/english.rb +2 -2
- data/lib/treat/{inflectors → workers/inflectors}/declensors/linguistics.rb +4 -4
- data/lib/treat/{inflectors → workers/inflectors}/ordinalizers/linguistics.rb +8 -2
- data/lib/treat/{inflectors → workers/inflectors}/stemmers/porter.rb +2 -2
- data/lib/treat/{inflectors → workers/inflectors}/stemmers/porter_c.rb +1 -1
- data/lib/treat/{inflectors → workers/inflectors}/stemmers/uea.rb +1 -1
- data/lib/treat/{ai → workers/learners}/classifiers/id3.rb +1 -1
- data/lib/treat/{ai → workers/learners}/classifiers/mlp.rb +1 -1
- data/lib/treat/{lexicalizers → workers/lexicalizers}/categorizers/from_tag.rb +9 -9
- data/lib/treat/{lexicalizers → workers/lexicalizers}/sensers/wordnet/synset.rb +2 -2
- data/lib/treat/{lexicalizers → workers/lexicalizers}/sensers/wordnet.rb +4 -4
- data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/brill/patch.rb +2 -2
- data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/brill.rb +2 -8
- data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/lingua.rb +1 -6
- data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/stanford.rb +31 -42
- data/lib/treat/workers/processors/chunkers/autoselect.rb +19 -0
- data/lib/treat/{processors → workers/processors}/chunkers/html.rb +4 -3
- data/lib/treat/workers/processors/chunkers/txt.rb +32 -0
- data/lib/treat/{processors → workers/processors}/parsers/enju.rb +3 -3
- data/lib/treat/{processors → workers/processors}/parsers/stanford.rb +6 -8
- data/lib/treat/{processors → workers/processors}/segmenters/punkt.rb +6 -10
- data/lib/treat/{processors → workers/processors}/segmenters/stanford.rb +2 -2
- data/lib/treat/{processors → workers/processors}/segmenters/tactful.rb +3 -6
- data/lib/treat/{processors → workers/processors}/tokenizers/ptb.rb +6 -5
- data/lib/treat/{processors → workers/processors}/tokenizers/punkt.rb +1 -1
- data/lib/treat/{processors → workers/processors}/tokenizers/stanford.rb +1 -1
- data/lib/treat/{processors → workers/processors}/tokenizers/tactful.rb +3 -5
- data/lib/treat/{retrievers → workers/retrievers}/indexers/ferret.rb +1 -1
- data/lib/treat/{retrievers → workers/retrievers}/searchers/ferret.rb +1 -1
- data/lib/treat/workers.rb +96 -0
- data/lib/treat.rb +23 -49
- data/spec/collection.rb +4 -4
- data/spec/document.rb +5 -5
- data/spec/entity.rb +33 -32
- data/spec/{tree.rb → node.rb} +5 -5
- data/spec/phrase.rb +5 -39
- data/spec/sandbox.rb +212 -6
- data/spec/token.rb +12 -9
- data/spec/treat.rb +12 -9
- data/spec/word.rb +10 -9
- data/spec/zone.rb +6 -2
- data/tmp/{INFO → MANIFEST} +0 -0
- data/tmp/english.yaml +10340 -0
- metadata +149 -139
- data/lib/treat/ai.rb +0 -12
- data/lib/treat/categories.rb +0 -90
- data/lib/treat/categorizable.rb +0 -44
- data/lib/treat/configurable.rb +0 -115
- data/lib/treat/dependencies.rb +0 -25
- data/lib/treat/downloader.rb +0 -87
- data/lib/treat/entities/abilities.rb +0 -10
- data/lib/treat/entities/entities.rb +0 -102
- data/lib/treat/exception.rb +0 -7
- data/lib/treat/extractors.rb +0 -79
- data/lib/treat/formatters/serializers/mongo.rb +0 -64
- data/lib/treat/formatters.rb +0 -41
- data/lib/treat/helpers/decimal_point_escaper.rb +0 -22
- data/lib/treat/inflectors.rb +0 -52
- data/lib/treat/kernel.rb +0 -208
- data/lib/treat/languages/arabic.rb +0 -16
- data/lib/treat/languages/chinese.rb +0 -16
- data/lib/treat/languages/dutch.rb +0 -16
- data/lib/treat/languages/english.rb +0 -63
- data/lib/treat/languages/french.rb +0 -20
- data/lib/treat/languages/german.rb +0 -20
- data/lib/treat/languages/greek.rb +0 -16
- data/lib/treat/languages/italian.rb +0 -17
- data/lib/treat/languages/language.rb +0 -10
- data/lib/treat/languages/list.txt +0 -504
- data/lib/treat/languages/polish.rb +0 -16
- data/lib/treat/languages/portuguese.rb +0 -16
- data/lib/treat/languages/russian.rb +0 -16
- data/lib/treat/languages/spanish.rb +0 -16
- data/lib/treat/languages/swedish.rb +0 -16
- data/lib/treat/languages.rb +0 -132
- data/lib/treat/lexicalizers.rb +0 -37
- data/lib/treat/object.rb +0 -7
- data/lib/treat/processors/chunkers/autoselect.rb +0 -16
- data/lib/treat/processors/chunkers/txt.rb +0 -21
- data/lib/treat/processors.rb +0 -38
- data/lib/treat/retrievers.rb +0 -27
- data/lib/treat/server.rb +0 -26
- data/lib/treat/universalisation/encodings.rb +0 -12
- data/lib/treat/universalisation/tags.rb +0 -453
- data/lib/treat/universalisation.rb +0 -9
- data/spec/languages.rb +0 -25
@@ -9,6 +9,7 @@ module Treat::Entities::Abilities::Delegatable
|
|
9
9
|
opt = group.preset_option
|
10
10
|
return unless opt
|
11
11
|
|
12
|
+
self.class_eval do
|
12
13
|
group.presets.each do |preset|
|
13
14
|
define_method(preset) do |worker=nil, options={}|
|
14
15
|
return get(preset) if has?(preset)
|
@@ -19,15 +20,17 @@ module Treat::Entities::Abilities::Delegatable
|
|
19
20
|
features[preset] = f if f
|
20
21
|
end
|
21
22
|
end
|
23
|
+
end
|
22
24
|
|
23
25
|
end
|
24
26
|
|
25
27
|
# Add the workers to perform a task on an entity class.
|
26
28
|
def add_workers(group)
|
27
|
-
|
28
29
|
self.class_eval do
|
30
|
+
|
29
31
|
task = group.method
|
30
32
|
add_presets(group)
|
33
|
+
|
31
34
|
define_method(task) do |worker=nil, options={}|
|
32
35
|
if worker.is_a?(Hash)
|
33
36
|
options, worker =
|
@@ -37,8 +40,8 @@ module Treat::Entities::Abilities::Delegatable
|
|
37
40
|
@features[task]
|
38
41
|
else
|
39
42
|
self.class.call_worker(
|
40
|
-
|
41
|
-
|
43
|
+
self, task, worker,
|
44
|
+
group, options
|
42
45
|
)
|
43
46
|
end
|
44
47
|
end
|
@@ -55,30 +58,25 @@ module Treat::Entities::Abilities::Delegatable
|
|
55
58
|
end
|
56
59
|
|
57
60
|
print_debug(entity, task, worker,
|
58
|
-
group, options) if Treat.debug
|
59
|
-
|
61
|
+
group, options) if Treat.core.verbosity.debug
|
60
62
|
if not group.list.include?(worker)
|
61
63
|
raise Treat::Exception,
|
62
64
|
worker_not_found(worker, group)
|
63
|
-
|
64
|
-
|
65
|
-
worker = group.const_get(
|
66
|
-
cc(worker.to_s).intern
|
67
|
-
)
|
68
|
-
|
69
|
-
result = worker.send(group.method, entity, options)
|
65
|
+
end
|
70
66
|
|
71
|
-
|
72
|
-
|
73
|
-
end
|
67
|
+
worker = group.const_get(cc(worker.to_s).intern)
|
68
|
+
result = worker.send(group.method, entity, options)
|
74
69
|
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
result
|
79
|
-
end
|
70
|
+
if group.type == :annotator && result
|
71
|
+
entity.features[task] = result
|
72
|
+
end
|
80
73
|
|
74
|
+
if group.type == :transformer
|
75
|
+
entity
|
76
|
+
else
|
77
|
+
result
|
81
78
|
end
|
79
|
+
|
82
80
|
end
|
83
81
|
|
84
82
|
# Find which worker to use if none has been supplied.
|
@@ -93,23 +91,31 @@ module Treat::Entities::Abilities::Delegatable
|
|
93
91
|
# inside the given group.
|
94
92
|
def find_worker_for_language(language, group)
|
95
93
|
|
96
|
-
lang = Treat
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
94
|
+
lang = Treat.languages[language]
|
95
|
+
cat = group.to_s.split('::')[2].downcase.intern
|
96
|
+
group = ucc(cl(group)).intern
|
97
|
+
|
98
|
+
if lang.nil?
|
99
|
+
raise Treat::Exception,
|
100
|
+
"No configuration file loaded for language #{language}."
|
101
|
+
end
|
102
|
+
|
103
|
+
workers = lang.workers
|
104
|
+
|
105
|
+
if !workers.respond_to?(cat) ||
|
106
|
+
!workers[cat].respond_to?(group)
|
107
|
+
workers = Treat.languages.agnostic.workers
|
108
|
+
end
|
109
|
+
|
110
|
+
if !workers.respond_to?(cat) ||
|
111
|
+
!workers[cat].respond_to?(group)
|
112
|
+
raise Treat::Exception,
|
113
|
+
"No #{group} is/are available for the " +
|
114
|
+
"#{language.to_s.capitalize} language."
|
111
115
|
end
|
112
|
-
|
116
|
+
|
117
|
+
|
118
|
+
workers[cat][group].first
|
113
119
|
|
114
120
|
end
|
115
121
|
|
@@ -33,7 +33,7 @@ module Treat::Entities::Abilities::Doable
|
|
33
33
|
entity_types = group.targets
|
34
34
|
f = nil
|
35
35
|
entity_types.each do |t|
|
36
|
-
f = true if Treat::Entities.
|
36
|
+
f = true if is_a?(Treat::Entities.const_get(cc(t)))
|
37
37
|
end
|
38
38
|
if f || entity_types.include?(:entity)
|
39
39
|
send(task, worker, options)
|
@@ -55,7 +55,7 @@ module Treat::Entities::Abilities::Doable
|
|
55
55
|
|
56
56
|
# Get the group of a task.
|
57
57
|
def get_group(task)
|
58
|
-
g = Treat::
|
58
|
+
g = Treat::Workers.lookup(task)
|
59
59
|
unless g
|
60
60
|
raise Treat::Exception,
|
61
61
|
"Task #{task} does not exist."
|
@@ -11,7 +11,7 @@ module Treat::Entities::Abilities::Iterable
|
|
11
11
|
types = [:entity] if types.size == 0
|
12
12
|
f = false
|
13
13
|
types.each do |t2|
|
14
|
-
if Treat::Entities.
|
14
|
+
if is_a?(Treat::Entities.const_get(cc(t2)))
|
15
15
|
f = true; break
|
16
16
|
end
|
17
17
|
end
|
@@ -54,57 +54,45 @@ module Treat::Entities::Abilities::Iterable
|
|
54
54
|
|
55
55
|
# Returns the first ancestor of this entity
|
56
56
|
# that has the given type.
|
57
|
-
def
|
57
|
+
def ancestor_with_type(type)
|
58
|
+
return unless has_parent?
|
58
59
|
ancestor = @parent
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
f = true; break
|
64
|
-
end
|
65
|
-
end
|
66
|
-
f
|
67
|
-
end
|
68
|
-
if ancestor
|
69
|
-
while not match_types.call(ancestor.type, type)
|
70
|
-
return nil unless (ancestor && ancestor.has_parent?)
|
71
|
-
ancestor = ancestor.parent
|
72
|
-
end
|
73
|
-
match_types.call(ancestor.type, types) ? ancestor : nil
|
60
|
+
type_klass = Treat::Entities.const_get(cc(type))
|
61
|
+
while not ancestor.is_a?(type_klass)
|
62
|
+
return nil unless (ancestor && ancestor.has_parent?)
|
63
|
+
ancestor = ancestor.parent
|
74
64
|
end
|
65
|
+
ancestor
|
75
66
|
end
|
76
67
|
|
77
|
-
alias :ancestor_with_type :ancestor_with_types
|
78
|
-
|
79
68
|
# Yields each ancestors of this entity that
|
80
|
-
# has
|
81
|
-
def each_ancestor(
|
82
|
-
types = [:entity] if types.empty?
|
69
|
+
# has the given type.
|
70
|
+
def each_ancestor(type = :entity)
|
83
71
|
ancestor = self
|
84
|
-
while (a = ancestor.
|
72
|
+
while (a = ancestor.ancestor_with_type(type))
|
85
73
|
yield a
|
86
74
|
ancestor = ancestor.parent
|
87
75
|
end
|
88
76
|
end
|
89
77
|
|
90
|
-
# Returns an array of ancestors of this
|
91
|
-
#
|
92
|
-
def
|
93
|
-
|
94
|
-
each_ancestor(
|
95
|
-
|
78
|
+
# Returns an array of ancestors of this
|
79
|
+
# entity that have the given type.
|
80
|
+
def ancestors_with_type(type)
|
81
|
+
ancestors = []
|
82
|
+
each_ancestor(type) do |a|
|
83
|
+
ancestors << a
|
84
|
+
end
|
85
|
+
ancestors
|
96
86
|
end
|
97
87
|
|
98
88
|
# Returns the first ancestor that has a feature
|
99
89
|
# with the given name, otherwise nil.
|
100
|
-
def ancestor_with_feature(
|
101
|
-
each_ancestor
|
90
|
+
def ancestor_with_feature(feature)
|
91
|
+
each_ancestor do |ancestor|
|
102
92
|
return ancestor if ancestor.has?(feature)
|
103
93
|
end
|
104
94
|
end
|
105
95
|
|
106
|
-
alias :ancestors_with_type :ancestors_with_types
|
107
|
-
|
108
96
|
# Number of children that have a given feature.
|
109
97
|
def num_children_with_feature(feature)
|
110
98
|
i = 0
|
@@ -24,8 +24,9 @@ module Treat::Entities::Abilities::Magical
|
|
24
24
|
#
|
25
25
|
def magic(sym, *args)
|
26
26
|
|
27
|
-
|
28
|
-
@@
|
27
|
+
# Cache this for performance.
|
28
|
+
@@entities_regexp ||= "(#{Treat.core.entities.list.join('|')})"
|
29
|
+
@@cats_regexp ||= "(#{Treat.linguistics.categories.join('|')})"
|
29
30
|
|
30
31
|
method = sym.to_s =~ /entities/ ?
|
31
32
|
sym.to_s.gsub('entities', 'entitys') :
|
@@ -56,17 +57,16 @@ module Treat::Entities::Abilities::Magical
|
|
56
57
|
entities_with_feature($2.intern,
|
57
58
|
args[0], $1.intern).each { |e| yield e }
|
58
59
|
elsif method =~ /^each_#{@@cats_regexp}$/
|
59
|
-
entities_with_category($1.
|
60
|
-
).each { |e| yield e }
|
60
|
+
entities_with_category($1).each { |e| yield e }
|
61
61
|
elsif method =~ /^#{@@cats_regexp}s$/
|
62
|
-
entities_with_category($1
|
62
|
+
entities_with_category($1)
|
63
63
|
elsif method =~ /^#{@@cats_regexp}$/
|
64
|
-
first_but_warn(entities_with_category($1
|
64
|
+
first_but_warn(entities_with_category($1), $1)
|
65
65
|
elsif method =~ /^first_#{@@cats_regexp}$/
|
66
|
-
e = entities_with_category($1
|
66
|
+
e = entities_with_category($1)
|
67
67
|
e ? e[0] : nil
|
68
68
|
elsif method =~ /^#{@@cats_regexp}_count$/
|
69
|
-
entities_with_category($1
|
69
|
+
entities_with_category($1).size
|
70
70
|
elsif method =~ /^(.*)_count$/
|
71
71
|
num_children_with_feature($1.intern)
|
72
72
|
elsif method =~ /^#{@@cats_regexp}s_with_([a-z]*)$/
|
@@ -43,42 +43,4 @@ module Treat::Entities::Abilities::Registrable
|
|
43
43
|
end
|
44
44
|
end
|
45
45
|
|
46
|
-
def contains_id?(id)
|
47
|
-
|
48
|
-
@registry[:id][id]
|
49
|
-
|
50
|
-
end
|
51
|
-
|
52
|
-
def contains_value?(val)
|
53
|
-
|
54
|
-
@registry[:value][val] ?
|
55
|
-
true : false
|
56
|
-
|
57
|
-
end
|
58
|
-
|
59
|
-
def contains_type?(type1)
|
60
|
-
|
61
|
-
return true if @registry[:type][type1]
|
62
|
-
|
63
|
-
@registry[:type].each do |type2, count|
|
64
|
-
if Treat::Entities.
|
65
|
-
match_types[type1][type2]
|
66
|
-
return true
|
67
|
-
end
|
68
|
-
end
|
69
|
-
|
70
|
-
false
|
71
|
-
|
72
|
-
end
|
73
|
-
|
74
|
-
def contains_types?(types)
|
75
|
-
|
76
|
-
types.each do |type|
|
77
|
-
return true if contains_type?(type)
|
78
|
-
end
|
79
|
-
|
80
|
-
false
|
81
|
-
|
82
|
-
end
|
83
|
-
|
84
46
|
end
|
@@ -32,28 +32,28 @@ module Treat::Entities::Abilities::Stringable
|
|
32
32
|
end
|
33
33
|
end
|
34
34
|
|
35
|
-
# Return an informative string representation
|
36
|
-
# of the entity.
|
37
|
-
def inspect
|
38
|
-
s = "#{cl(self.class)} (#{@id.to_s})"
|
39
|
-
if caller_method(2) == :inspect
|
40
|
-
@id.to_s
|
41
|
-
else
|
42
|
-
dependencies = []
|
43
|
-
@dependencies.each do |dependency|
|
44
|
-
dependencies <<
|
45
|
-
"#{dependency.target}#{dependency.type}"
|
46
|
-
end
|
47
|
-
s += " --- #{short_value.inspect}" +
|
48
|
-
" --- #{@features.inspect} " +
|
49
|
-
" --- #{dependencies.inspect} "
|
50
|
-
end
|
51
|
-
s
|
52
|
-
end
|
53
|
-
|
54
35
|
# Print out an ASCII representation of the tree.
|
55
36
|
def print_tree; puts visualize(:tree); end
|
56
37
|
|
38
|
+
# Return an informative string representation
|
39
|
+
# of the entity.
|
40
|
+
def inspect
|
41
|
+
s = "#{cl(self.class)} (#{@id.to_s})"
|
42
|
+
if caller_method(2) == :inspect
|
43
|
+
@id.to_s
|
44
|
+
else
|
45
|
+
dependencies = []
|
46
|
+
@dependencies.each do |dependency|
|
47
|
+
dependencies <<
|
48
|
+
"#{dependency.target}#{dependency.type}"
|
49
|
+
end
|
50
|
+
s += " --- #{short_value.inspect}" +
|
51
|
+
" --- #{@features.inspect} " +
|
52
|
+
" --- #{dependencies.inspect} "
|
53
|
+
end
|
54
|
+
s
|
55
|
+
end
|
56
|
+
|
57
57
|
# Helper method to implode the string value of the subtree.
|
58
58
|
def implode
|
59
59
|
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module Treat::Entities
|
2
|
+
# Represents a collection of texts.
|
3
|
+
class Collection < Treat::Entities::Entity
|
4
|
+
|
5
|
+
# Initialize the collection with a folder
|
6
|
+
# containing the texts of the collection.
|
7
|
+
def initialize(folder = nil, id = nil)
|
8
|
+
super('', id)
|
9
|
+
set :folder, folder
|
10
|
+
i = folder + '/.index'
|
11
|
+
set :index, i if FileTest.directory?(i)
|
12
|
+
end
|
13
|
+
|
14
|
+
# Works like the default <<, but if the
|
15
|
+
# file being added is a collection or a
|
16
|
+
# document, then copy that collection or
|
17
|
+
# document into this collection's folder.
|
18
|
+
def <<(entities, copy = true)
|
19
|
+
unless entities.is_a? Array
|
20
|
+
entities = [entities]
|
21
|
+
end
|
22
|
+
entities.each do |entity|
|
23
|
+
if [:document, :collection].
|
24
|
+
include?(entity.type) && copy
|
25
|
+
entity = entity.copy_into(self)
|
26
|
+
end
|
27
|
+
end
|
28
|
+
super(entities)
|
29
|
+
end
|
30
|
+
end
|
31
|
+
end
|
@@ -1,17 +1,20 @@
|
|
1
1
|
module Treat::Entities
|
2
2
|
|
3
|
-
|
4
|
-
require 'treat/tree'
|
3
|
+
module Abilities; end
|
5
4
|
|
6
|
-
|
5
|
+
# Require abilities.
|
6
|
+
p = Treat.paths.lib +
|
7
|
+
'treat/entities/abilities/*.rb'
|
8
|
+
Dir.glob(p).each { |f| require f }
|
9
|
+
|
10
|
+
require 'birch'
|
11
|
+
|
12
|
+
class Entity < Treat::Core::Node
|
7
13
|
|
8
14
|
# A Symbol representing the lowercase
|
9
15
|
# version of the class name.
|
10
16
|
attr_accessor :type
|
11
17
|
|
12
|
-
# Require abilities.
|
13
|
-
require 'treat/entities/abilities'
|
14
|
-
|
15
18
|
# Implements support for #register,
|
16
19
|
# #registry, and #contains_* methods.
|
17
20
|
include Abilities::Registrable
|
@@ -48,14 +51,17 @@ module Treat::Entities
|
|
48
51
|
# #entities_with_type, #ancestors_with_type,
|
49
52
|
# #entities_with_feature, #entities_with_category.
|
50
53
|
include Abilities::Iterable
|
51
|
-
|
54
|
+
|
52
55
|
# Implement support for #export to export
|
53
56
|
# a line of a data set based on a classification.
|
54
57
|
include Abilities::Exportable
|
55
|
-
|
58
|
+
|
56
59
|
# Implement support for #copy_into.
|
57
60
|
include Abilities::Copyable
|
58
|
-
|
61
|
+
|
62
|
+
# Implement support for #self.compare_with
|
63
|
+
extend Abilities::Comparable
|
64
|
+
|
59
65
|
# Initialize the entity with its value and
|
60
66
|
# (optionally) a unique identifier. By default,
|
61
67
|
# the object_id will be used as id.
|
@@ -65,7 +71,7 @@ module Treat::Entities
|
|
65
71
|
@type = :entity if self == Entity
|
66
72
|
@type ||= ucc(cl(self.class)).intern
|
67
73
|
end
|
68
|
-
|
74
|
+
|
69
75
|
# Add an entity to the current entity.
|
70
76
|
# Registers the entity in the root node
|
71
77
|
# token registry if the entity is a leaf.
|
@@ -83,7 +89,6 @@ module Treat::Entities
|
|
83
89
|
entities[0]
|
84
90
|
end
|
85
91
|
|
86
|
-
|
87
92
|
# Catch missing methods to support method-like
|
88
93
|
# access to features (e.g. entity.category
|
89
94
|
# instead of entity.features[:category]) and to
|
@@ -105,13 +110,13 @@ module Treat::Entities
|
|
105
110
|
super(sym, *args, &block)
|
106
111
|
rescue NoMethodError
|
107
112
|
raise Treat::Exception,
|
108
|
-
if Treat::
|
113
|
+
if Treat::Workers.lookup(sym)
|
109
114
|
msg = "Method #{sym} cannot " +
|
110
115
|
"be called on a #{type}."
|
111
116
|
else
|
112
117
|
msg = "Method #{sym} does not exist."
|
113
118
|
msg += did_you_mean?(
|
114
|
-
Treat::
|
119
|
+
Treat::Workers.methods, sym)
|
115
120
|
end
|
116
121
|
end
|
117
122
|
else
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module Treat::Entities
|
2
|
+
|
3
|
+
# Any kind of grouped entities.
|
4
|
+
class Group < Treat::Entities::Entity; end
|
5
|
+
|
6
|
+
# Represents a group of words with a sentence ender.
|
7
|
+
class Sentence < Group; end
|
8
|
+
|
9
|
+
# Represents a group of words.
|
10
|
+
class Phrase < Group; end
|
11
|
+
|
12
|
+
# Represents a non-linguistic fragment
|
13
|
+
class Fragment < Group; end
|
14
|
+
|
15
|
+
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
module Treat::Entities
|
2
|
+
# Represents a section.
|
3
|
+
class Section < Treat::Entities::Entity; end
|
4
|
+
|
5
|
+
# Represents a page of text.
|
6
|
+
class Page < Section; end
|
7
|
+
|
8
|
+
# Represents a block of text
|
9
|
+
class Block < Section; end
|
10
|
+
|
11
|
+
# Represents a list.
|
12
|
+
class List < Section; end
|
13
|
+
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module Treat::Entities
|
2
|
+
# Represents a terminal element in the text structure.
|
3
|
+
class Token < Treat::Entities::Entity; end
|
4
|
+
|
5
|
+
# Represents a word.
|
6
|
+
class Word < Token; end
|
7
|
+
|
8
|
+
# Represents a clitic ('s).
|
9
|
+
class Enclitic < Token; end
|
10
|
+
|
11
|
+
# Represents a number.
|
12
|
+
class Number < Token
|
13
|
+
def to_i; to_s.to_i; end
|
14
|
+
def to_f; to_s.to_f; end
|
15
|
+
end
|
16
|
+
|
17
|
+
# Represents a punctuation sign.
|
18
|
+
class Punctuation < Token; end
|
19
|
+
|
20
|
+
# Represents a character that is neither
|
21
|
+
# alphabetical, numerical or a punctuation
|
22
|
+
# character (e.g. @#$%&*).
|
23
|
+
class Symbol < Token; end
|
24
|
+
|
25
|
+
# Represents a url.
|
26
|
+
class Url < Token; end
|
27
|
+
|
28
|
+
# Represents a valid RFC822 address.
|
29
|
+
class Email < Token; end
|
30
|
+
|
31
|
+
# Represents a token whose type
|
32
|
+
# cannot be identified.
|
33
|
+
class Unknown; end
|
34
|
+
|
35
|
+
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
module Treat::Entities
|
2
|
+
# Represents a zone of text
|
3
|
+
# (Title, Paragraph, List, Quote).
|
4
|
+
class Zone < Treat::Entities::Entity; end
|
5
|
+
|
6
|
+
# Represents a title, subtitle, logical header.
|
7
|
+
class Title < Zone; end
|
8
|
+
|
9
|
+
# Represents a paragraph.
|
10
|
+
class Paragraph < Zone; end
|
11
|
+
end
|
data/lib/treat/entities.rb
CHANGED
@@ -1,76 +1,6 @@
|
|
1
|
-
#
|
2
|
-
# (from a collection of texts down to an individual word) with
|
3
|
-
# a unique identifier, a value, features, children and dependencies
|
4
|
-
# linking them to other textual entities.
|
5
|
-
#
|
6
|
-
# - A Collection represents a folder containing documents (and folders).
|
7
|
-
# - A Document represents a file with a textual content.
|
8
|
-
# - A Zone represents a logical division of content in a document.
|
9
|
-
# - A Phrase is a group of words; a Sentence is a Phrase with an ender.
|
10
|
-
# - A Token represents a Word, a Number, a Punctuation or a Symbol.
|
1
|
+
# Contains the textual model used by Treat.
|
11
2
|
module Treat::Entities
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
# as non_camel_case identifiers.
|
17
|
-
attr_accessor :list
|
18
|
-
end
|
19
|
-
|
20
|
-
# Require all entities.
|
21
|
-
require 'treat/entities/entities'
|
22
|
-
|
23
|
-
# Add each constant to the list, except Entity.
|
24
|
-
self.list = []
|
25
|
-
constants.each do |constant|
|
26
|
-
unless constant == :Entity ||
|
27
|
-
constant == :Abilities
|
28
|
-
self.list << ucc(constant).intern
|
29
|
-
end
|
30
|
-
end
|
31
|
-
|
32
|
-
# Make each Entity class buildable magically.
|
33
|
-
# This enables to create Entities without calling
|
34
|
-
# #new (e.g. Word 'hello').
|
35
|
-
constants.each do |entity|
|
36
|
-
define_singleton_method(entity) do |value='', id=nil|
|
37
|
-
const_get(entity).build(value, id)
|
38
|
-
end
|
39
|
-
end
|
40
|
-
|
41
|
-
# Create entity lookup table.
|
42
|
-
@@match_types = nil
|
43
|
-
def self.match_types
|
44
|
-
return @@match_types if @@match_types
|
45
|
-
list = (Treat::Entities.list + [:entity])
|
46
|
-
@@match_types = {}
|
47
|
-
list.each do |type1|
|
48
|
-
list.each do |type2|
|
49
|
-
@@match_types[type2] ||= {}
|
50
|
-
if (type1 == type2) ||
|
51
|
-
(Treat::Entities.const_get(cc(type1)) <
|
52
|
-
Treat::Entities.const_get(cc(type2)))
|
53
|
-
@@match_types[type2][type1] = true
|
54
|
-
end
|
55
|
-
end
|
56
|
-
end
|
57
|
-
@@match_types
|
58
|
-
end
|
59
|
-
|
60
|
-
# A bottom-up ordering of general types of entities.
|
61
|
-
@@order = [Token, Phrase, Zone, Section, Document, Collection]
|
62
|
-
|
63
|
-
# Return the hierarchy level of the entity
|
64
|
-
# class, the minimum being a Token and the
|
65
|
-
# maximum being a Collection.
|
66
|
-
#
|
67
|
-
# Implement as true comparison functions.
|
68
|
-
def self.rank(type)
|
69
|
-
klass = Treat::Entities.const_get(cc(type))
|
70
|
-
compare = lambda { |a,b| a == b || a < b }
|
71
|
-
1.upto(@@order.size) do |i|
|
72
|
-
return i if compare.call(klass, @@order[i])
|
73
|
-
end
|
74
|
-
end
|
75
|
-
|
76
|
-
end
|
3
|
+
require 'treat/entities/entity'
|
4
|
+
p = Treat.paths.lib + 'treat/entities/*.rb'
|
5
|
+
Dir.glob(p).each { |f| require f }
|
6
|
+
end
|