treat 0.1.4 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (160) hide show
  1. data/LICENSE +4 -4
  2. data/TODO +21 -54
  3. data/lib/economist/half_cocked_basel.txt +16 -0
  4. data/lib/economist/hose_and_dry.doc +0 -0
  5. data/lib/economist/hungarys_troubles.abw +70 -0
  6. data/lib/economist/republican_nomination.pdf +0 -0
  7. data/lib/economist/saving_the_euro.odt +0 -0
  8. data/lib/economist/to_infinity_and_beyond.txt +15 -0
  9. data/lib/economist/zero_sum.html +91 -0
  10. data/lib/treat.rb +58 -72
  11. data/lib/treat/buildable.rb +59 -15
  12. data/lib/treat/categories.rb +26 -14
  13. data/lib/treat/category.rb +2 -2
  14. data/lib/treat/delegatable.rb +65 -48
  15. data/lib/treat/doable.rb +44 -0
  16. data/lib/treat/entities.rb +34 -14
  17. data/lib/treat/entities/collection.rb +2 -0
  18. data/lib/treat/entities/document.rb +3 -2
  19. data/lib/treat/entities/entity.rb +105 -90
  20. data/lib/treat/entities/phrases.rb +17 -0
  21. data/lib/treat/entities/tokens.rb +28 -13
  22. data/lib/treat/entities/zones.rb +20 -0
  23. data/lib/treat/extractors.rb +49 -11
  24. data/lib/treat/extractors/coreferences/stanford.rb +68 -0
  25. data/lib/treat/extractors/date/chronic.rb +32 -0
  26. data/lib/treat/extractors/date/ruby.rb +25 -0
  27. data/lib/treat/extractors/keywords/tf_idf.rb +26 -0
  28. data/lib/treat/extractors/keywords/{topics_frequency.rb → topics_tf_idf.rb} +15 -7
  29. data/lib/treat/{detectors/language/language_detector.rb → extractors/language/language_extractor.rb} +5 -2
  30. data/lib/treat/extractors/language/what_language.rb +49 -0
  31. data/lib/treat/extractors/named_entity_tag/stanford.rb +53 -0
  32. data/lib/treat/extractors/roles/naive.rb +73 -0
  33. data/lib/treat/extractors/statistics/frequency_in.rb +6 -13
  34. data/lib/treat/extractors/statistics/{position_in_parent.rb → position_in.rb} +1 -1
  35. data/lib/treat/extractors/statistics/tf_idf.rb +89 -21
  36. data/lib/treat/extractors/statistics/transition_matrix.rb +11 -11
  37. data/lib/treat/extractors/statistics/transition_probability.rb +4 -4
  38. data/lib/treat/extractors/time/nickel.rb +30 -12
  39. data/lib/treat/extractors/topic_words/lda.rb +9 -9
  40. data/lib/treat/extractors/topics/reuters.rb +14 -15
  41. data/lib/treat/extractors/topics/reuters/region.xml +1 -0
  42. data/lib/treat/features.rb +7 -0
  43. data/lib/treat/formatters/readers/abw.rb +6 -1
  44. data/lib/treat/formatters/readers/autoselect.rb +5 -6
  45. data/lib/treat/formatters/readers/doc.rb +3 -1
  46. data/lib/treat/formatters/readers/html.rb +1 -1
  47. data/lib/treat/formatters/readers/image.rb +43 -0
  48. data/lib/treat/formatters/readers/odt.rb +1 -2
  49. data/lib/treat/formatters/readers/pdf.rb +9 -1
  50. data/lib/treat/formatters/readers/xml.rb +40 -0
  51. data/lib/treat/formatters/serializers/xml.rb +50 -14
  52. data/lib/treat/formatters/serializers/yaml.rb +7 -2
  53. data/lib/treat/formatters/unserializers/xml.rb +33 -7
  54. data/lib/treat/formatters/visualizers/dot.rb +90 -20
  55. data/lib/treat/formatters/visualizers/short_value.rb +2 -2
  56. data/lib/treat/formatters/visualizers/standoff.rb +2 -2
  57. data/lib/treat/formatters/visualizers/tree.rb +1 -1
  58. data/lib/treat/formatters/visualizers/txt.rb +13 -4
  59. data/lib/treat/group.rb +16 -10
  60. data/lib/treat/helpers/linguistics_loader.rb +18 -0
  61. data/lib/treat/inflectors.rb +10 -0
  62. data/lib/treat/inflectors/cardinal_words/linguistics.rb +3 -3
  63. data/lib/treat/inflectors/conjugations/linguistics.rb +5 -12
  64. data/lib/treat/inflectors/declensions/english.rb +319 -0
  65. data/lib/treat/inflectors/declensions/linguistics.rb +12 -11
  66. data/lib/treat/inflectors/ordinal_words/linguistics.rb +3 -3
  67. data/lib/treat/install.rb +59 -0
  68. data/lib/treat/kernel.rb +18 -8
  69. data/lib/treat/languages.rb +18 -11
  70. data/lib/treat/languages/arabic.rb +4 -2
  71. data/lib/treat/languages/chinese.rb +6 -2
  72. data/lib/treat/languages/dutch.rb +16 -0
  73. data/lib/treat/languages/english.rb +47 -19
  74. data/lib/treat/languages/french.rb +8 -5
  75. data/lib/treat/languages/german.rb +9 -6
  76. data/lib/treat/languages/greek.rb +16 -0
  77. data/lib/treat/languages/italian.rb +6 -3
  78. data/lib/treat/languages/polish.rb +16 -0
  79. data/lib/treat/languages/portuguese.rb +16 -0
  80. data/lib/treat/languages/russian.rb +16 -0
  81. data/lib/treat/languages/spanish.rb +16 -0
  82. data/lib/treat/languages/swedish.rb +16 -0
  83. data/lib/treat/languages/tags.rb +377 -0
  84. data/lib/treat/lexicalizers.rb +34 -23
  85. data/lib/treat/lexicalizers/category/from_tag.rb +17 -10
  86. data/lib/treat/lexicalizers/linkages/naive.rb +51 -51
  87. data/lib/treat/lexicalizers/synsets/wordnet.rb +5 -1
  88. data/lib/treat/lexicalizers/tag/brill.rb +35 -40
  89. data/lib/treat/lexicalizers/tag/lingua.rb +19 -14
  90. data/lib/treat/lexicalizers/tag/stanford.rb +59 -68
  91. data/lib/treat/lexicalizers/tag/tagger.rb +29 -0
  92. data/lib/treat/processors.rb +8 -8
  93. data/lib/treat/processors/chunkers/txt.rb +4 -4
  94. data/lib/treat/processors/parsers/enju.rb +114 -99
  95. data/lib/treat/processors/parsers/stanford.rb +109 -41
  96. data/lib/treat/processors/segmenters/punkt.rb +17 -18
  97. data/lib/treat/processors/segmenters/punkt/dutch.yaml +9716 -0
  98. data/lib/treat/processors/segmenters/punkt/english.yaml +10340 -0
  99. data/lib/treat/processors/segmenters/punkt/french.yaml +43159 -0
  100. data/lib/treat/processors/segmenters/punkt/german.yaml +9572 -0
  101. data/lib/treat/processors/segmenters/punkt/greek.yaml +6050 -0
  102. data/lib/treat/processors/segmenters/punkt/italian.yaml +14748 -0
  103. data/lib/treat/processors/segmenters/punkt/polish.yaml +9751 -0
  104. data/lib/treat/processors/segmenters/punkt/portuguese.yaml +13662 -0
  105. data/lib/treat/processors/segmenters/punkt/russian.yaml +4237 -0
  106. data/lib/treat/processors/segmenters/punkt/spanish.yaml +24034 -0
  107. data/lib/treat/processors/segmenters/punkt/swedish.yaml +10001 -0
  108. data/lib/treat/processors/segmenters/stanford.rb +38 -37
  109. data/lib/treat/processors/segmenters/tactful.rb +5 -4
  110. data/lib/treat/processors/tokenizers/macintyre.rb +7 -6
  111. data/lib/treat/processors/tokenizers/multilingual.rb +2 -3
  112. data/lib/treat/processors/tokenizers/perl.rb +2 -2
  113. data/lib/treat/processors/tokenizers/punkt.rb +6 -2
  114. data/lib/treat/processors/tokenizers/stanford.rb +25 -24
  115. data/lib/treat/processors/tokenizers/tactful.rb +1 -2
  116. data/lib/treat/proxies.rb +2 -35
  117. data/lib/treat/registrable.rb +17 -22
  118. data/lib/treat/sugar.rb +11 -11
  119. data/lib/treat/tree.rb +27 -17
  120. data/lib/treat/viewable.rb +29 -0
  121. data/lib/treat/visitable.rb +1 -1
  122. data/test/tc_entity.rb +56 -49
  123. data/test/tc_extractors.rb +41 -18
  124. data/test/tc_formatters.rb +7 -8
  125. data/test/tc_inflectors.rb +19 -24
  126. data/test/tc_lexicalizers.rb +12 -19
  127. data/test/tc_processors.rb +26 -12
  128. data/test/tc_resources.rb +2 -7
  129. data/test/tc_treat.rb +20 -22
  130. data/test/tc_tree.rb +4 -4
  131. data/test/tests.rb +3 -5
  132. data/test/texts.rb +13 -14
  133. data/tmp/INFO +1 -0
  134. metadata +78 -158
  135. data/bin/INFO +0 -1
  136. data/examples/benchmark.rb +0 -81
  137. data/examples/keywords.rb +0 -148
  138. data/lib/treat/detectors.rb +0 -31
  139. data/lib/treat/detectors/encoding/r_chardet19.rb +0 -27
  140. data/lib/treat/detectors/format/file.rb +0 -36
  141. data/lib/treat/detectors/language/what_language.rb +0 -29
  142. data/lib/treat/entities/constituents.rb +0 -15
  143. data/lib/treat/entities/sentence.rb +0 -8
  144. data/lib/treat/extractors/named_entity/abner.rb +0 -20
  145. data/lib/treat/extractors/named_entity/stanford.rb +0 -174
  146. data/lib/treat/extractors/statistics/frequency_of.rb +0 -15
  147. data/lib/treat/extractors/time/chronic.rb +0 -20
  148. data/lib/treat/extractors/time/native.rb +0 -18
  149. data/lib/treat/formatters/readers/gocr.rb +0 -26
  150. data/lib/treat/formatters/readers/ocropus.rb +0 -31
  151. data/lib/treat/formatters/visualizers/html.rb +0 -13
  152. data/lib/treat/formatters/visualizers/inspect.rb +0 -20
  153. data/lib/treat/inflectors/declensions/en.rb +0 -18
  154. data/lib/treat/languages/categories.rb +0 -5
  155. data/lib/treat/languages/english/categories.rb +0 -23
  156. data/lib/treat/languages/english/tags.rb +0 -352
  157. data/lib/treat/languages/xinhua.rb +0 -12
  158. data/lib/treat/lexicalizers/synsets/rita_wn.rb +0 -23
  159. data/lib/treat/string.rb +0 -5
  160. data/test/tc_detectors.rb +0 -26
@@ -4,6 +4,15 @@ module Treat
4
4
  # a string or a numeric object. This class
5
5
  # is pretty much self-explanatory.
6
6
  module Buildable
7
+ # Build an entity from a file name, a folder,
8
+ # a string or a numeric value; this simply
9
+ # delegates to #from_anything. When called
10
+ # without arguments, +file_or_value+ defaults
11
+ # to an empty string and +id+ defaults to nil.
12
+ # See #from_anything for the dispatch rules.
13
+ def build(file_or_value = '', id = nil)
14
+ from_anything(file_or_value, id)
15
+ end
7
16
  def from_anything(file_or_value, id)
8
17
  if File.readable?(file_or_value.to_s)
9
18
  from_file(file_or_value)
@@ -17,27 +26,52 @@ module Treat
17
26
  "Use filename, folder, text or a number."
18
27
  end
19
28
  end
20
- def from_string(string)
29
+ def from_string(string, enforce_type = false)
30
+ enforce_type = true if caller_method == :build
21
31
  if self == Treat::Entities::Document ||
22
32
  self == Treat::Entities::Collection
23
33
  raise Treat::Exception,
24
34
  "Cannot create a document or collection from " +
25
35
  "a string (need a readable file/folder)."
26
36
  end
27
- dot = string.count('.') + string.count('!') + string.count('?')
28
- return Treat::Entities::Section.new(string) if dot > 1 ||
29
- (string.count("\n") > 0 && dot == 1)
30
- return Treat::Entities::Sentence.new(string) if dot == 1 && string.size > 5
31
- if string.count(' ') == 0
32
- return Treat::Entities::Clitic.new(string) if string == "'s"
33
- return Treat::Entities::Word.new(string) if string =~ /^[[:alpha:]\-']+$/
34
- return Treat::Entities::Number.new(string) if string =~ /^[[:digit:]]+$/
35
- return Treat::Entities::Punctuation.new(string) if string =~ /^[[:punct:]]+$/
36
- return Treat::Entities::Symbol.new(string)
37
+ unless self == Treat::Entities::Entity
38
+ return self.new(string) if enforce_type
39
+ end
40
+ dot = string.count('.!?')
41
+ if self == Treat::Entities::Phrase
42
+ if dot >= 1
43
+ c = Treat::Entities::Sentence.new(string)
44
+ else
45
+ c = Treat::Entities::Phrase.new(string)
46
+ end
47
+ elsif (self == Treat::Entities::Token) ||
48
+ string.count(' ') == 0
49
+ if string == "'s"
50
+ c = Treat::Entities::Clitic.new(string)
51
+ elsif string =~ /^[[:alpha:]\-']+$/ &&
52
+ string.count(' ') == 0
53
+ c = Treat::Entities::Word.new(string)
54
+ elsif string =~ /^[[:digit:]]+$/
55
+ c = Treat::Entities::Number.new(string)
56
+ elsif string =~ /^[[:punct:]]+$/
57
+ c = Treat::Entities::Punctuation.new(string)
58
+ else
59
+ c = Treat::Entities::Symbol.new(string)
60
+ end
61
+ elsif dot > 1 || string.count("\n") > 0
62
+ c = Treat::Entities::Section.new(string)
63
+ elsif dot >= 1 && dot < 5 && string.size > 5
64
+ c = Treat::Entities::Sentence.new(string)
65
+ elsif string.strip.count(' ') > 0
66
+ c = Treat::Entities::Phrase.new(string)
37
67
  else
38
- return Treat::Entities::Phrase.new(string)
68
+ c = Treat::Entities::Unknown.new(string) unless c
39
69
  end
40
- return Treat::Entities::Unknown.new(string)
70
+ unless self == c.class || self == Treat::Entities::Entity || c.is_a?(self)
71
+ raise "You said that \"#{string}\" was a #{cl(self).downcase}, " +
72
+ "but Treat thinks it is a #{cl(c.class).downcase}."
73
+ end
74
+ c
41
75
  end
42
76
  def from_numeric(numeric)
43
77
  unless self == Treat::Entities::Number
@@ -80,9 +114,19 @@ module Treat
80
114
  ext = file.split('.')[-1]
81
115
  # Humanize the yaml extension.
82
116
  ext = 'yaml' if ext == 'yml'
83
- if Treat::Formatters::Unserializers.list.
84
- include?(ext.downcase.intern)
117
+ if ext == 'yaml'
85
118
  from_serialized_file(file)
119
+ elsif ext == 'xml'
120
+ beginning = nil
121
+ File.open(file) do |w|
122
+ beginning = w.readlines(200)
123
+ end
124
+ beginning = beginning.join(' ')
125
+ if beginning.index('<treat>')
126
+ from_serialized_file(file)
127
+ else
128
+ from_raw_file(file)
129
+ end
86
130
  else
87
131
  from_raw_file(file)
88
132
  end
@@ -2,25 +2,37 @@ module Treat
2
2
  # This module keeps track of all categories that
3
3
  # exist and the methods they implement.
4
4
  module Categories
5
- class << self; attr_accessor :list; end
5
+ class << self
6
+ # A list of all categories.
7
+ attr_accessor :list
8
+ end
6
9
  # Array - list of all categories.
7
10
  self.list = []
8
- # Boolean - does any of the categories have
9
- # a method that corresponds to sym?
10
- def self.have_method?(sym); methods.include?(sym); end
11
- # Cache the list of methods once it has been computed.
12
- @@methods = []
13
- # Array - provide a list of all methods implemented
14
- # by all Treat categories.
15
- def self.methods
16
- return @@methods unless @@methods.empty?
17
- self.list.each do |ns|
18
- ns.methods.each { |method| @@methods << method }
11
+ @@lookup = nil
12
+ # Find the class of a group given its method.
13
+ def self.lookup(method)
14
+ return @@lookup[method] if @@lookup
15
+ @@lookup = {}
16
+
17
+ self.list.each do |category|
18
+ category.groups.each do |group|
19
+ group = category.const_get(group)
20
+ @@lookup[group.method] = group
21
+ methods = group.presets.merge(
22
+ group.preprocessors.merge(
23
+ group.postprocessors
24
+ )
25
+ )
26
+ methods.each do |x,y|
27
+ @@lookup[x] = group
28
+ end
29
+ end
19
30
  end
20
- @@methods
31
+
32
+ @@lookup[method]
21
33
  end
34
+ # Require all categories.
22
35
  require 'treat/category'
23
- require 'treat/detectors'
24
36
  require 'treat/formatters'
25
37
  require 'treat/processors'
26
38
  require 'treat/lexicalizers'
@@ -4,7 +4,7 @@ module Treat
4
4
  module Category
5
5
  # Require the Group class.
6
6
  require 'treat/group'
7
- # Add delegators to the Entities based on the
7
+ # Add workers to the Entities based on the
8
8
  # configuration for a given category.
9
9
  def self.extended(category)
10
10
  Treat::Categories.list << category
@@ -13,7 +13,7 @@ module Treat
13
13
  group = const_get(group)
14
14
  group.targets.each do |entity_type|
15
15
  entity = Treat::Entities.const_get(cc(entity_type))
16
- entity.class_eval { add_delegators group }
16
+ entity.class_eval { add_workers group }
17
17
  end
18
18
  end
19
19
  end
@@ -1,76 +1,93 @@
1
1
  module Treat
2
2
  # Makes a class delegatable, allowing calls on it to be forwarded
3
- # to a delegate class performing the appropriate call.
3
+ # to a worker class performing the appropriate call.
4
4
  module Delegatable
5
- # Add decorator methods to entities.
6
- def add_decorators(group, m)
7
- decorators = group.methods -
8
- Object.methods -
9
- [:type, :type=, :targets, :targets=,
10
- :default, :default=, :add,
11
- :has_target?, :list]
12
- decorators.each do |decorator_m|
13
- define_method(decorator_m) do |delegate=nil, options={}|
14
- options[:decorator] = decorator_m
15
- send(m, delegate, options)
5
+ # Add preset methods to entities.
6
+ def add_presets(group)
7
+ group.presets.each do |preset_m, presets|
8
+ define_method(preset_m) do |worker=nil, options={}|
9
+ options = presets.merge(options)
10
+ m = group.method
11
+ send(m, worker, options)
12
+ features[preset_m] = unset(m)
16
13
  end
17
14
  end
18
15
  end
19
- # Add delegator group to all entities of a class.
20
- def add_delegators(group)
16
+ def add_preprocessors(group)
17
+ group.preprocessors.each do |preprocessor_m, block|
18
+ define_method(preprocessor_m) do |worker=nil, options={}|
19
+ block.call(self, worker, options)
20
+ features[preprocessor_m] = unset(group.method)
21
+ end
22
+ end
23
+ end
24
+ # Add postprocessor methods to entities.
25
+ def add_postprocessors(group, m)
26
+ group.postprocessors.each do |postprocessor_m, block|
27
+ define_method(postprocessor_m) do |worker=nil, options={}|
28
+ options[:postprocessor] = postprocessor_m
29
+ send(m, worker, options)
30
+ end
31
+ end
32
+ end
33
+ # Add worker group to all entities of a class.
34
+ def add_workers(group)
21
35
  # Define each method in group.
22
36
  self.class_eval do
23
37
  m = group.method
24
- add_decorators(group, m)
25
- define_method(m) do |delegate=nil, options={}|
26
- decorator = options.delete(:decorator)
27
- puts self.id if !@features
38
+ add_presets(group)
39
+ add_preprocessors(group)
40
+ add_postprocessors(group, m)
41
+ define_method(m) do |worker=nil, options={}|
42
+ postprocessor =
43
+ options.delete(:postprocessor)
28
44
  if !@features[m].nil?
29
45
  @features[m]
30
46
  else
31
- self.class.call_delegator(
32
- self, m, delegate, decorator,
33
- group, options)
47
+ self.class.call_worker(
48
+ self, m, worker,
49
+ postprocessor,
50
+ group, options
51
+ )
34
52
  end
35
53
  end
36
54
  end
37
55
  end
38
- # Call a delegator.
39
- def call_delegator(entity, m, delegate, decorator, group, options)
40
- if delegate.nil?
41
- delegate = get_missing_delegate(entity, group)
56
+ # Call a worker.
57
+ def call_worker(entity, m, worker, postprocessor, group, options)
58
+ if worker.nil? || worker == :default
59
+ worker = find_worker(entity, group)
42
60
  end
43
- if not group.list.include?(delegate)
44
- raise Treat::Exception, delegate_not_found(delegate, group)
61
+ if not group.list.include?(worker)
62
+ raise Treat::Exception, worker_not_found(worker, group)
45
63
  else
46
- delegate_klass = group.const_get(:"#{cc(delegate.to_s)}")
47
- result = entity.accept(group, delegate_klass, m, options)
48
- if decorator
49
- result = group.send(decorator, entity, result)
64
+ worker_klass = group.const_get(cc(worker.to_s).intern)
65
+ result = entity.accept(group, worker_klass, m, options)
66
+ if postprocessor
67
+ result = group.postprocessors[postprocessor].call(entity, result)
50
68
  end
51
69
  if group.type == :annotator
52
- f = decorator.nil? ? m : decorator
53
- entity.features[f] = result
70
+ f = postprocessor.nil? ? m : postprocessor
71
+ entity.features[f] = result unless result == nil
54
72
  end
55
73
  result
56
74
  end
57
75
  end
58
- # Get the default delegate for that language
76
+ # Get the default worker for that language
59
77
  # inside the given group.
60
- def get_language_delegate(language, group)
78
+ def find_worker_for_language(language, group)
61
79
  lang = Treat::Languages.describe(language)
62
80
  lclass = cc(lang).intern
63
81
  if Treat::Languages.constants.include?(lclass)
64
82
  cat = group.to_s.split('::')[-2].intern
65
- lclass = Treat::Languages.const_get(lclass).const_get(cat)
83
+ lclass = Treat::Languages.get(lclass).const_get(cat)
66
84
  g = ucc(cl(group)).intern
67
85
  if !lclass[g] || !lclass[g][0]
68
86
  d = ucc(cl(group))
69
87
  d.gsub!('_', ' ')
70
- d = d[0..-2] if d[-1] == 's'
71
- d = 'delegator to find ' + d
88
+ d = 'worker to find "' + d
72
89
  raise Treat::Exception, "No #{d}" +
73
- " is available for the #{lang} language."
90
+ "\" is available for the #{lang} language."
74
91
  end
75
92
  return lclass[g][0]
76
93
  else
@@ -78,20 +95,20 @@ module Treat
78
95
  "Language '#{lang}' is not supported (yet)."
79
96
  end
80
97
  end
81
- # Get which delegate to use if none has been supplied.
82
- def get_missing_delegate(entity, group)
83
- delegate = group.default.nil? ?
84
- self.get_language_delegate(entity.language, group) :
98
+ # Get which worker to use if none has been supplied.
99
+ def find_worker(entity, group)
100
+ worker = group.default.nil? ?
101
+ self.find_worker_for_language(entity.language, group) :
85
102
  group.default
86
- if delegate == :none
103
+ if worker == :none
87
104
  raise NAT::Exception,
88
- "There is intentionally no default delegate for #{group}."
105
+ "There is intentionally no default worker for #{group}."
89
106
  end
90
- delegate
107
+ worker
91
108
  end
92
109
  # Return an error message and suggest possible typos.
93
- def delegate_not_found(klass, group)
94
- "Algorithm '#{ucc(klass)}' couldn't be found in group #{group}." +
110
+ def worker_not_found(klass, group)
111
+ "Algorithm '#{ucc(cl(klass))}' couldn't be found in group #{group}." +
95
112
  did_you_mean?(group.list.map { |c| ucc(c) }, ucc(klass))
96
113
  end
97
114
  end
@@ -0,0 +1,44 @@
1
+ module Treat
2
+ module Doable
3
+ def do(*tasks)
4
+ tasks.each do |task|
5
+ if task.is_a?(Hash)
6
+ task.each do |k,v|
7
+ t, w = k, v
8
+ w, o = *w if w.is_a?(Array)
9
+ o ||= {}
10
+ do_task(t, w, o)
11
+ end
12
+ else
13
+ t = task.is_a?(Array) ? task[0] : task
14
+ w = task.is_a?(Array) ? task[1] : nil
15
+ w, o = *w if w.is_a?(Array)
16
+ o ||= {}
17
+ do_task(t, w, o)
18
+ end
19
+ end
20
+ end
21
+ def do_task(task, worker, options)
22
+ group = Categories.lookup(task)
23
+ unless group
24
+ raise Treat::Exception, "Task #{task} does not exist."
25
+ end
26
+ entity_types = group.targets
27
+ f = nil
28
+ entity_types.each do |t|
29
+ f = true if Treat::Entities.match_types[type][t]
30
+ end
31
+ if f || entity_types.include?(:entity)
32
+ send(task, worker, options)
33
+ else
34
+ each_entity(*entity_types) do |entity|
35
+ entity.do_task(task, worker, options)
36
+ end
37
+ unless entity_types.include?(type)
38
+ features.delete(task)
39
+ end
40
+ nil
41
+ end
42
+ end
43
+ end
44
+ end
@@ -6,17 +6,31 @@ module Treat
6
6
  # - Document
7
7
  # - Zone (a Section, Title, Paragraph, or List)
8
8
  # - Sentence
9
- # - Constituent (a Phrase or Clause)
9
+ # - Phrases
10
10
  # - Token (a Word, Number, Punctuation, or Symbol).
11
11
  module Entities
12
+ # Cache a list of defined entity types to
13
+ # improve performance.
14
+ @@list = nil
15
+ # Provide a list of defined entity types,
16
+ # as non-camel case identifiers.
17
+ def self.list
18
+ return @@list if @@list
19
+ @@list = []
20
+ self.constants.each do |constant|
21
+ unless constant == :Entity
22
+ @@list << ucc(constant).intern
23
+ end
24
+ end
25
+ @@list
26
+ end
12
27
  # Require Entity first.
13
28
  require 'treat/entities/entity'
14
29
  # Then require all possible entities.
15
30
  require 'treat/entities/collection'
16
31
  require 'treat/entities/document'
17
32
  require 'treat/entities/zones'
18
- require 'treat/entities/sentence'
19
- require 'treat/entities/constituents'
33
+ require 'treat/entities/phrases'
20
34
  require 'treat/entities/tokens'
21
35
  # Make the constants buildable.
22
36
  constants.each do |entity|
@@ -24,17 +38,23 @@ module Treat
24
38
  const_get(entity).build(value, id)
25
39
  end
26
40
  end
27
- # Cache a list of defined entity types to
28
- # improve performance.
29
- @@list = []
30
- # Provide a list of defined entity types,
31
- # as non-camel case identifiers.
32
- def self.list
33
- return @@list unless @@list.empty?
34
- self.constants.each do |constant|
35
- @@list << :"#{ucc(constant)}"
41
+ # Create entity lookup table.
42
+ # The table is cached after it is first built.
43
+ @@match_types = nil
44
+ def self.match_types
45
+ return @@match_types if @@match_types
46
+ list = (Treat::Entities.list + [:entity])
47
+ @@match_types = {}
48
+ list.each do |type1|
49
+ @@match_types[type1] = {type1 => true}
50
+ list.each do |type2|
51
+ if Treat::Entities.const_get(cc(type1)) <
52
+ Treat::Entities.const_get(cc(type2))
53
+ @@match_types[type1][type2] = true
54
+ end
55
+ end
36
56
  end
37
- @@list
57
+ @@match_types
38
58
  end
39
59
  # Return the hierarchy level of the entity
40
60
  # class, the minimum being a Token and the
@@ -43,7 +63,7 @@ module Treat
43
63
  klass = Entities.const_get(cc(type))
44
64
  compare = lambda { |a,b| a == b || a < b }
45
65
  return 0 if compare.call(klass, Token)
46
- return 1 if compare.call(klass, Constituent)
66
+ return 1 if compare.call(klass, Phrase)
47
67
  return 2 if compare.call(klass, Sentence)
48
68
  return 3 if compare.call(klass, Zone)
49
69
  return 4 if compare.call(klass, Document)