treat 0.1.4 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +4 -4
- data/TODO +21 -54
- data/lib/economist/half_cocked_basel.txt +16 -0
- data/lib/economist/hose_and_dry.doc +0 -0
- data/lib/economist/hungarys_troubles.abw +70 -0
- data/lib/economist/republican_nomination.pdf +0 -0
- data/lib/economist/saving_the_euro.odt +0 -0
- data/lib/economist/to_infinity_and_beyond.txt +15 -0
- data/lib/economist/zero_sum.html +91 -0
- data/lib/treat.rb +58 -72
- data/lib/treat/buildable.rb +59 -15
- data/lib/treat/categories.rb +26 -14
- data/lib/treat/category.rb +2 -2
- data/lib/treat/delegatable.rb +65 -48
- data/lib/treat/doable.rb +44 -0
- data/lib/treat/entities.rb +34 -14
- data/lib/treat/entities/collection.rb +2 -0
- data/lib/treat/entities/document.rb +3 -2
- data/lib/treat/entities/entity.rb +105 -90
- data/lib/treat/entities/phrases.rb +17 -0
- data/lib/treat/entities/tokens.rb +28 -13
- data/lib/treat/entities/zones.rb +20 -0
- data/lib/treat/extractors.rb +49 -11
- data/lib/treat/extractors/coreferences/stanford.rb +68 -0
- data/lib/treat/extractors/date/chronic.rb +32 -0
- data/lib/treat/extractors/date/ruby.rb +25 -0
- data/lib/treat/extractors/keywords/tf_idf.rb +26 -0
- data/lib/treat/extractors/keywords/{topics_frequency.rb → topics_tf_idf.rb} +15 -7
- data/lib/treat/{detectors/language/language_detector.rb → extractors/language/language_extractor.rb} +5 -2
- data/lib/treat/extractors/language/what_language.rb +49 -0
- data/lib/treat/extractors/named_entity_tag/stanford.rb +53 -0
- data/lib/treat/extractors/roles/naive.rb +73 -0
- data/lib/treat/extractors/statistics/frequency_in.rb +6 -13
- data/lib/treat/extractors/statistics/{position_in_parent.rb → position_in.rb} +1 -1
- data/lib/treat/extractors/statistics/tf_idf.rb +89 -21
- data/lib/treat/extractors/statistics/transition_matrix.rb +11 -11
- data/lib/treat/extractors/statistics/transition_probability.rb +4 -4
- data/lib/treat/extractors/time/nickel.rb +30 -12
- data/lib/treat/extractors/topic_words/lda.rb +9 -9
- data/lib/treat/extractors/topics/reuters.rb +14 -15
- data/lib/treat/extractors/topics/reuters/region.xml +1 -0
- data/lib/treat/features.rb +7 -0
- data/lib/treat/formatters/readers/abw.rb +6 -1
- data/lib/treat/formatters/readers/autoselect.rb +5 -6
- data/lib/treat/formatters/readers/doc.rb +3 -1
- data/lib/treat/formatters/readers/html.rb +1 -1
- data/lib/treat/formatters/readers/image.rb +43 -0
- data/lib/treat/formatters/readers/odt.rb +1 -2
- data/lib/treat/formatters/readers/pdf.rb +9 -1
- data/lib/treat/formatters/readers/xml.rb +40 -0
- data/lib/treat/formatters/serializers/xml.rb +50 -14
- data/lib/treat/formatters/serializers/yaml.rb +7 -2
- data/lib/treat/formatters/unserializers/xml.rb +33 -7
- data/lib/treat/formatters/visualizers/dot.rb +90 -20
- data/lib/treat/formatters/visualizers/short_value.rb +2 -2
- data/lib/treat/formatters/visualizers/standoff.rb +2 -2
- data/lib/treat/formatters/visualizers/tree.rb +1 -1
- data/lib/treat/formatters/visualizers/txt.rb +13 -4
- data/lib/treat/group.rb +16 -10
- data/lib/treat/helpers/linguistics_loader.rb +18 -0
- data/lib/treat/inflectors.rb +10 -0
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +3 -3
- data/lib/treat/inflectors/conjugations/linguistics.rb +5 -12
- data/lib/treat/inflectors/declensions/english.rb +319 -0
- data/lib/treat/inflectors/declensions/linguistics.rb +12 -11
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +3 -3
- data/lib/treat/install.rb +59 -0
- data/lib/treat/kernel.rb +18 -8
- data/lib/treat/languages.rb +18 -11
- data/lib/treat/languages/arabic.rb +4 -2
- data/lib/treat/languages/chinese.rb +6 -2
- data/lib/treat/languages/dutch.rb +16 -0
- data/lib/treat/languages/english.rb +47 -19
- data/lib/treat/languages/french.rb +8 -5
- data/lib/treat/languages/german.rb +9 -6
- data/lib/treat/languages/greek.rb +16 -0
- data/lib/treat/languages/italian.rb +6 -3
- data/lib/treat/languages/polish.rb +16 -0
- data/lib/treat/languages/portuguese.rb +16 -0
- data/lib/treat/languages/russian.rb +16 -0
- data/lib/treat/languages/spanish.rb +16 -0
- data/lib/treat/languages/swedish.rb +16 -0
- data/lib/treat/languages/tags.rb +377 -0
- data/lib/treat/lexicalizers.rb +34 -23
- data/lib/treat/lexicalizers/category/from_tag.rb +17 -10
- data/lib/treat/lexicalizers/linkages/naive.rb +51 -51
- data/lib/treat/lexicalizers/synsets/wordnet.rb +5 -1
- data/lib/treat/lexicalizers/tag/brill.rb +35 -40
- data/lib/treat/lexicalizers/tag/lingua.rb +19 -14
- data/lib/treat/lexicalizers/tag/stanford.rb +59 -68
- data/lib/treat/lexicalizers/tag/tagger.rb +29 -0
- data/lib/treat/processors.rb +8 -8
- data/lib/treat/processors/chunkers/txt.rb +4 -4
- data/lib/treat/processors/parsers/enju.rb +114 -99
- data/lib/treat/processors/parsers/stanford.rb +109 -41
- data/lib/treat/processors/segmenters/punkt.rb +17 -18
- data/lib/treat/processors/segmenters/punkt/dutch.yaml +9716 -0
- data/lib/treat/processors/segmenters/punkt/english.yaml +10340 -0
- data/lib/treat/processors/segmenters/punkt/french.yaml +43159 -0
- data/lib/treat/processors/segmenters/punkt/german.yaml +9572 -0
- data/lib/treat/processors/segmenters/punkt/greek.yaml +6050 -0
- data/lib/treat/processors/segmenters/punkt/italian.yaml +14748 -0
- data/lib/treat/processors/segmenters/punkt/polish.yaml +9751 -0
- data/lib/treat/processors/segmenters/punkt/portuguese.yaml +13662 -0
- data/lib/treat/processors/segmenters/punkt/russian.yaml +4237 -0
- data/lib/treat/processors/segmenters/punkt/spanish.yaml +24034 -0
- data/lib/treat/processors/segmenters/punkt/swedish.yaml +10001 -0
- data/lib/treat/processors/segmenters/stanford.rb +38 -37
- data/lib/treat/processors/segmenters/tactful.rb +5 -4
- data/lib/treat/processors/tokenizers/macintyre.rb +7 -6
- data/lib/treat/processors/tokenizers/multilingual.rb +2 -3
- data/lib/treat/processors/tokenizers/perl.rb +2 -2
- data/lib/treat/processors/tokenizers/punkt.rb +6 -2
- data/lib/treat/processors/tokenizers/stanford.rb +25 -24
- data/lib/treat/processors/tokenizers/tactful.rb +1 -2
- data/lib/treat/proxies.rb +2 -35
- data/lib/treat/registrable.rb +17 -22
- data/lib/treat/sugar.rb +11 -11
- data/lib/treat/tree.rb +27 -17
- data/lib/treat/viewable.rb +29 -0
- data/lib/treat/visitable.rb +1 -1
- data/test/tc_entity.rb +56 -49
- data/test/tc_extractors.rb +41 -18
- data/test/tc_formatters.rb +7 -8
- data/test/tc_inflectors.rb +19 -24
- data/test/tc_lexicalizers.rb +12 -19
- data/test/tc_processors.rb +26 -12
- data/test/tc_resources.rb +2 -7
- data/test/tc_treat.rb +20 -22
- data/test/tc_tree.rb +4 -4
- data/test/tests.rb +3 -5
- data/test/texts.rb +13 -14
- data/tmp/INFO +1 -0
- metadata +78 -158
- data/bin/INFO +0 -1
- data/examples/benchmark.rb +0 -81
- data/examples/keywords.rb +0 -148
- data/lib/treat/detectors.rb +0 -31
- data/lib/treat/detectors/encoding/r_chardet19.rb +0 -27
- data/lib/treat/detectors/format/file.rb +0 -36
- data/lib/treat/detectors/language/what_language.rb +0 -29
- data/lib/treat/entities/constituents.rb +0 -15
- data/lib/treat/entities/sentence.rb +0 -8
- data/lib/treat/extractors/named_entity/abner.rb +0 -20
- data/lib/treat/extractors/named_entity/stanford.rb +0 -174
- data/lib/treat/extractors/statistics/frequency_of.rb +0 -15
- data/lib/treat/extractors/time/chronic.rb +0 -20
- data/lib/treat/extractors/time/native.rb +0 -18
- data/lib/treat/formatters/readers/gocr.rb +0 -26
- data/lib/treat/formatters/readers/ocropus.rb +0 -31
- data/lib/treat/formatters/visualizers/html.rb +0 -13
- data/lib/treat/formatters/visualizers/inspect.rb +0 -20
- data/lib/treat/inflectors/declensions/en.rb +0 -18
- data/lib/treat/languages/categories.rb +0 -5
- data/lib/treat/languages/english/categories.rb +0 -23
- data/lib/treat/languages/english/tags.rb +0 -352
- data/lib/treat/languages/xinhua.rb +0 -12
- data/lib/treat/lexicalizers/synsets/rita_wn.rb +0 -23
- data/lib/treat/string.rb +0 -5
- data/test/tc_detectors.rb +0 -26
data/lib/treat/buildable.rb
CHANGED
@@ -4,6 +4,15 @@ module Treat
|
|
4
4
|
# a string or a numeric object. This module
|
5
5
|
# is pretty much self-explanatory.
|
6
6
|
module Buildable
|
7
|
+
# Build an entity from a filename, a folder,
|
8
|
+
# a string of text or a number. The value and
|
9
|
+
# the optional +id+ are passed unchanged to
|
10
|
+
# #from_anything, which picks the builder that
|
11
|
+
# matches the kind of value supplied (a readable
|
12
|
+
# file is loaded through #from_file).
|
13
|
+
def build(file_or_value = '', id = nil)
|
14
|
+
from_anything(file_or_value, id)
|
15
|
+
end
|
7
16
|
def from_anything(file_or_value, id)
|
8
17
|
if File.readable?(file_or_value.to_s)
|
9
18
|
from_file(file_or_value)
|
@@ -17,27 +26,52 @@ module Treat
|
|
17
26
|
"Use filename, folder, text or a number."
|
18
27
|
end
|
19
28
|
end
|
20
|
-
def from_string(string)
|
29
|
+
def from_string(string, enforce_type = false)
|
30
|
+
enforce_type = true if caller_method == :build
|
21
31
|
if self == Treat::Entities::Document ||
|
22
32
|
self == Treat::Entities::Collection
|
23
33
|
raise Treat::Exception,
|
24
34
|
"Cannot create a document or collection from " +
|
25
35
|
"a string (need a readable file/folder)."
|
26
36
|
end
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
if
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
+
unless self == Treat::Entities::Entity
|
38
|
+
return self.new(string) if enforce_type
|
39
|
+
end
|
40
|
+
dot = string.count('.!?')
|
41
|
+
if self == Treat::Entities::Phrase
|
42
|
+
if dot >= 1
|
43
|
+
c = Treat::Entities::Sentence.new(string)
|
44
|
+
else
|
45
|
+
c = Treat::Entities::Phrase.new(string)
|
46
|
+
end
|
47
|
+
elsif (self == Treat::Entities::Token) ||
|
48
|
+
string.count(' ') == 0
|
49
|
+
if string == "'s"
|
50
|
+
c = Treat::Entities::Clitic.new(string)
|
51
|
+
elsif string =~ /^[[:alpha:]\-']+$/ &&
|
52
|
+
string.count(' ') == 0
|
53
|
+
c = Treat::Entities::Word.new(string)
|
54
|
+
elsif string =~ /^[[:digit:]]+$/
|
55
|
+
c = Treat::Entities::Number.new(string)
|
56
|
+
elsif string =~ /^[[:punct:]]+$/
|
57
|
+
c = Treat::Entities::Punctuation.new(string)
|
58
|
+
else
|
59
|
+
c = Treat::Entities::Symbol.new(string)
|
60
|
+
end
|
61
|
+
elsif dot > 1 || string.count("\n") > 0
|
62
|
+
c = Treat::Entities::Section.new(string)
|
63
|
+
elsif dot >= 1 && dot < 5 && string.size > 5
|
64
|
+
c = Treat::Entities::Sentence.new(string)
|
65
|
+
elsif string.strip.count(' ') > 0
|
66
|
+
c = Treat::Entities::Phrase.new(string)
|
37
67
|
else
|
38
|
-
|
68
|
+
c = Treat::Entities::Unknown.new(string) unless c
|
39
69
|
end
|
40
|
-
|
70
|
+
unless self == c.class || self == Treat::Entities::Entity || c.is_a?(self)
|
71
|
+
raise "You said that \"#{string}\" was a #{cl(self).downcase}, " +
|
72
|
+
"but Treat thinks it is a #{cl(c.class).downcase}."
|
73
|
+
end
|
74
|
+
c
|
41
75
|
end
|
42
76
|
def from_numeric(numeric)
|
43
77
|
unless self == Treat::Entities::Number
|
@@ -80,9 +114,19 @@ module Treat
|
|
80
114
|
ext = file.split('.')[-1]
|
81
115
|
# Humanize the yaml extension.
|
82
116
|
ext = 'yaml' if ext == 'yml'
|
83
|
-
if
|
84
|
-
include?(ext.downcase.intern)
|
117
|
+
if ext == 'yaml'
|
85
118
|
from_serialized_file(file)
|
119
|
+
elsif ext == 'xml'
|
120
|
+
beginning = nil
|
121
|
+
File.open(file) do |w|
|
122
|
+
beginning = w.readlines(200)
|
123
|
+
end
|
124
|
+
beginning = beginning.join(' ')
|
125
|
+
if beginning.index('<treat>')
|
126
|
+
from_serialized_file(file)
|
127
|
+
else
|
128
|
+
from_raw_file(file)
|
129
|
+
end
|
86
130
|
else
|
87
131
|
from_raw_file(file)
|
88
132
|
end
|
data/lib/treat/categories.rb
CHANGED
@@ -2,25 +2,37 @@ module Treat
|
|
2
2
|
# This module keeps track of all categories that
|
3
3
|
# exist and the methods they implement.
|
4
4
|
module Categories
|
5
|
-
class << self
|
5
|
+
class << self
|
6
|
+
# A list of all categories.
|
7
|
+
attr_accessor :list
|
8
|
+
end
|
6
9
|
# Array - list of all categories.
|
7
10
|
self.list = []
|
8
|
-
|
9
|
-
# a
|
10
|
-
def self.
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
11
|
+
@@lookup = nil
|
12
|
+
# Find the class of a group given its method.
|
13
|
+
def self.lookup(method)
|
14
|
+
return @@lookup[method] if @@lookup
|
15
|
+
@@lookup = {}
|
16
|
+
|
17
|
+
self.list.each do |category|
|
18
|
+
category.groups.each do |group|
|
19
|
+
group = category.const_get(group)
|
20
|
+
@@lookup[group.method] = group
|
21
|
+
methods = group.presets.merge(
|
22
|
+
group.preprocessors.merge(
|
23
|
+
group.postprocessors
|
24
|
+
)
|
25
|
+
)
|
26
|
+
methods.each do |x,y|
|
27
|
+
@@lookup[x] = group
|
28
|
+
end
|
29
|
+
end
|
19
30
|
end
|
20
|
-
|
31
|
+
|
32
|
+
@@lookup[method]
|
21
33
|
end
|
34
|
+
# Require all categories.
|
22
35
|
require 'treat/category'
|
23
|
-
require 'treat/detectors'
|
24
36
|
require 'treat/formatters'
|
25
37
|
require 'treat/processors'
|
26
38
|
require 'treat/lexicalizers'
|
data/lib/treat/category.rb
CHANGED
@@ -4,7 +4,7 @@ module Treat
|
|
4
4
|
module Category
|
5
5
|
# Require the Group class.
|
6
6
|
require 'treat/group'
|
7
|
-
# Add
|
7
|
+
# Add workers to the Entities based on the
|
8
8
|
# configuration for a given category.
|
9
9
|
def self.extended(category)
|
10
10
|
Treat::Categories.list << category
|
@@ -13,7 +13,7 @@ module Treat
|
|
13
13
|
group = const_get(group)
|
14
14
|
group.targets.each do |entity_type|
|
15
15
|
entity = Treat::Entities.const_get(cc(entity_type))
|
16
|
-
entity.class_eval {
|
16
|
+
entity.class_eval { add_workers group }
|
17
17
|
end
|
18
18
|
end
|
19
19
|
end
|
data/lib/treat/delegatable.rb
CHANGED
@@ -1,76 +1,93 @@
|
|
1
1
|
module Treat
|
2
2
|
# Makes a class delegatable, allowing calls on it to be forwarded
|
3
|
-
# to a
|
3
|
+
# to a worker class performing the appropriate call.
|
4
4
|
module Delegatable
|
5
|
-
# Add
|
6
|
-
def
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
define_method(decorator_m) do |delegate=nil, options={}|
|
14
|
-
options[:decorator] = decorator_m
|
15
|
-
send(m, delegate, options)
|
5
|
+
# Add preset methods to entities.
|
6
|
+
def add_presets(group)
|
7
|
+
group.presets.each do |preset_m, presets|
|
8
|
+
define_method(preset_m) do |worker=nil, options={}|
|
9
|
+
options = presets.merge(options)
|
10
|
+
m = group.method
|
11
|
+
send(m, worker, options)
|
12
|
+
features[preset_m] = unset(m)
|
16
13
|
end
|
17
14
|
end
|
18
15
|
end
|
19
|
-
|
20
|
-
|
16
|
+
def add_preprocessors(group)
|
17
|
+
group.preprocessors.each do |preprocessor_m, block|
|
18
|
+
define_method(preprocessor_m) do |worker=nil, options={}|
|
19
|
+
block.call(self, worker, options)
|
20
|
+
features[preprocessor_m] = unset(group.method)
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
# Add postprocessor methods to entities.
|
25
|
+
def add_postprocessors(group, m)
|
26
|
+
group.postprocessors.each do |postprocessor_m, block|
|
27
|
+
define_method(postprocessor_m) do |worker=nil, options={}|
|
28
|
+
options[:postprocessor] = postprocessor_m
|
29
|
+
send(m, worker, options)
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|
33
|
+
# Add worker group to all entities of a class.
|
34
|
+
def add_workers(group)
|
21
35
|
# Define each method in group.
|
22
36
|
self.class_eval do
|
23
37
|
m = group.method
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
38
|
+
add_presets(group)
|
39
|
+
add_preprocessors(group)
|
40
|
+
add_postprocessors(group, m)
|
41
|
+
define_method(m) do |worker=nil, options={}|
|
42
|
+
postprocessor =
|
43
|
+
options.delete(:postprocessor)
|
28
44
|
if !@features[m].nil?
|
29
45
|
@features[m]
|
30
46
|
else
|
31
|
-
self.class.
|
32
|
-
|
33
|
-
|
47
|
+
self.class.call_worker(
|
48
|
+
self, m, worker,
|
49
|
+
postprocessor,
|
50
|
+
group, options
|
51
|
+
)
|
34
52
|
end
|
35
53
|
end
|
36
54
|
end
|
37
55
|
end
|
38
|
-
# Call a
|
39
|
-
def
|
40
|
-
if
|
41
|
-
|
56
|
+
# Call a worker.
|
57
|
+
def call_worker(entity, m, worker, postprocessor, group, options)
|
58
|
+
if worker.nil? || worker == :default
|
59
|
+
worker = find_worker(entity, group)
|
42
60
|
end
|
43
|
-
if not group.list.include?(
|
44
|
-
raise Treat::Exception,
|
61
|
+
if not group.list.include?(worker)
|
62
|
+
raise Treat::Exception, worker_not_found(worker, group)
|
45
63
|
else
|
46
|
-
|
47
|
-
result = entity.accept(group,
|
48
|
-
if
|
49
|
-
result = group.
|
64
|
+
worker_klass = group.const_get(cc(worker.to_s).intern)
|
65
|
+
result = entity.accept(group, worker_klass, m, options)
|
66
|
+
if postprocessor
|
67
|
+
result = group.postprocessors[postprocessor].call(entity, result)
|
50
68
|
end
|
51
69
|
if group.type == :annotator
|
52
|
-
f =
|
53
|
-
entity.features[f] = result
|
70
|
+
f = postprocessor.nil? ? m : postprocessor
|
71
|
+
entity.features[f] = result unless result == nil
|
54
72
|
end
|
55
73
|
result
|
56
74
|
end
|
57
75
|
end
|
58
|
-
# Get the default
|
76
|
+
# Get the default worker for that language
|
59
77
|
# inside the given group.
|
60
|
-
def
|
78
|
+
def find_worker_for_language(language, group)
|
61
79
|
lang = Treat::Languages.describe(language)
|
62
80
|
lclass = cc(lang).intern
|
63
81
|
if Treat::Languages.constants.include?(lclass)
|
64
82
|
cat = group.to_s.split('::')[-2].intern
|
65
|
-
lclass = Treat::Languages.
|
83
|
+
lclass = Treat::Languages.get(lclass).const_get(cat)
|
66
84
|
g = ucc(cl(group)).intern
|
67
85
|
if !lclass[g] || !lclass[g][0]
|
68
86
|
d = ucc(cl(group))
|
69
87
|
d.gsub!('_', ' ')
|
70
|
-
d =
|
71
|
-
d = 'delegator to find ' + d
|
88
|
+
d = 'worker to find "' + d
|
72
89
|
raise Treat::Exception, "No #{d}" +
|
73
|
-
" is available for the #{lang} language."
|
90
|
+
"\" is available for the #{lang} language."
|
74
91
|
end
|
75
92
|
return lclass[g][0]
|
76
93
|
else
|
@@ -78,20 +95,20 @@ module Treat
|
|
78
95
|
"Language '#{lang}' is not supported (yet)."
|
79
96
|
end
|
80
97
|
end
|
81
|
-
# Get which
|
82
|
-
def
|
83
|
-
|
84
|
-
self.
|
98
|
+
# Get which worker to use if none has been supplied.
|
99
|
+
def find_worker(entity, group)
|
100
|
+
worker = group.default.nil? ?
|
101
|
+
self.find_worker_for_language(entity.language, group) :
|
85
102
|
group.default
|
86
|
-
if
|
103
|
+
if worker == :none
|
87
104
|
raise NAT::Exception,
|
88
|
-
"There is intentionally no default
|
105
|
+
"There is intentionally no default worker for #{group}."
|
89
106
|
end
|
90
|
-
|
107
|
+
worker
|
91
108
|
end
|
92
109
|
# Return an error message and suggest possible typos.
|
93
|
-
def
|
94
|
-
"Algorithm '#{ucc(klass)}' couldn't be found in group #{group}." +
|
110
|
+
def worker_not_found(klass, group)
|
111
|
+
"Algorithm '#{ucc(cl(klass))}' couldn't be found in group #{group}." +
|
95
112
|
did_you_mean?(group.list.map { |c| ucc(c) }, ucc(klass))
|
96
113
|
end
|
97
114
|
end
|
data/lib/treat/doable.rb
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
module Treat
|
2
|
+
module Doable
|
3
|
+
def do(*tasks)
|
4
|
+
tasks.each do |task|
|
5
|
+
if task.is_a?(Hash)
|
6
|
+
task.each do |k,v|
|
7
|
+
t, w = k, v
|
8
|
+
w, o = *w if w.is_a?(Array)
|
9
|
+
o ||= {}
|
10
|
+
do_task(t, w, o)
|
11
|
+
end
|
12
|
+
else
|
13
|
+
t = task.is_a?(Array) ? task[0] : task
|
14
|
+
w = task.is_a?(Array) ? task[1] : nil
|
15
|
+
w, o = *w if w.is_a?(Array)
|
16
|
+
o ||= {}
|
17
|
+
do_task(t, w, o)
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
21
|
+
def do_task(task, worker, options)
|
22
|
+
group = Categories.lookup(task)
|
23
|
+
unless group
|
24
|
+
raise Treat::Exception, "Task #{task} does not exist."
|
25
|
+
end
|
26
|
+
entity_types = group.targets
|
27
|
+
f = nil
|
28
|
+
entity_types.each do |t|
|
29
|
+
f = true if Treat::Entities.match_types[type][t]
|
30
|
+
end
|
31
|
+
if f || entity_types.include?(:entity)
|
32
|
+
send(task, worker, options)
|
33
|
+
else
|
34
|
+
each_entity(*entity_types) do |entity|
|
35
|
+
entity.do_task(task, worker, options)
|
36
|
+
end
|
37
|
+
unless entity_types.include?(type)
|
38
|
+
features.delete(task)
|
39
|
+
end
|
40
|
+
nil
|
41
|
+
end
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
data/lib/treat/entities.rb
CHANGED
@@ -6,17 +6,31 @@ module Treat
|
|
6
6
|
# - Document
|
7
7
|
# - Zone (a Section, Title, Paragraph, or List)
|
8
8
|
# - Sentence
|
9
|
-
# -
|
9
|
+
# - Phrase
|
10
10
|
# - Token (a Word, Number, Punctuation, or Symbol).
|
11
11
|
module Entities
|
12
|
+
# Cache a list of defined entity types to
|
13
|
+
# improve performance.
|
14
|
+
@@list = nil
|
15
|
+
# Provide a list of defined entity types,
|
16
|
+
# as non-camel case identifiers.
|
17
|
+
def self.list
|
18
|
+
return @@list if @@list
|
19
|
+
@@list = []
|
20
|
+
self.constants.each do |constant|
|
21
|
+
unless constant == :Entity
|
22
|
+
@@list << ucc(constant).intern
|
23
|
+
end
|
24
|
+
end
|
25
|
+
@@list
|
26
|
+
end
|
12
27
|
# Require Entity first.
|
13
28
|
require 'treat/entities/entity'
|
14
29
|
# Then require all possible entities.
|
15
30
|
require 'treat/entities/collection'
|
16
31
|
require 'treat/entities/document'
|
17
32
|
require 'treat/entities/zones'
|
18
|
-
require 'treat/entities/
|
19
|
-
require 'treat/entities/constituents'
|
33
|
+
require 'treat/entities/phrases'
|
20
34
|
require 'treat/entities/tokens'
|
21
35
|
# Make the constants buildable.
|
22
36
|
constants.each do |entity|
|
@@ -24,17 +38,23 @@ module Treat
|
|
24
38
|
const_get(entity).build(value, id)
|
25
39
|
end
|
26
40
|
end
|
27
|
-
#
|
28
|
-
#
|
29
|
-
@@
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
@@
|
41
|
+
# Create entity lookup table.
|
42
|
+
# Lookup table
|
43
|
+
@@match_types = nil
|
44
|
+
def self.match_types
|
45
|
+
return @@match_types if @@match_types
|
46
|
+
list = (Treat::Entities.list + [:entity])
|
47
|
+
@@match_types = {}
|
48
|
+
list.each do |type1|
|
49
|
+
@@match_types[type1] = {type1 => true}
|
50
|
+
list.each do |type2|
|
51
|
+
if Treat::Entities.const_get(cc(type1)) <
|
52
|
+
Treat::Entities.const_get(cc(type2))
|
53
|
+
@@match_types[type1][type2] = true
|
54
|
+
end
|
55
|
+
end
|
36
56
|
end
|
37
|
-
@@
|
57
|
+
@@match_types
|
38
58
|
end
|
39
59
|
# Return the hierarchy level of the entity
|
40
60
|
# class, the minimum being a Token and the
|
@@ -43,7 +63,7 @@ module Treat
|
|
43
63
|
klass = Entities.const_get(cc(type))
|
44
64
|
compare = lambda { |a,b| a == b || a < b }
|
45
65
|
return 0 if compare.call(klass, Token)
|
46
|
-
return 1 if compare.call(klass,
|
66
|
+
return 1 if compare.call(klass, Phrase)
|
47
67
|
return 2 if compare.call(klass, Sentence)
|
48
68
|
return 3 if compare.call(klass, Zone)
|
49
69
|
return 4 if compare.call(klass, Document)
|