treat 0.2.5 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242) hide show
  1. data/LICENSE +3 -3
  2. data/README.md +33 -0
  3. data/files/INFO +1 -0
  4. data/lib/treat.rb +40 -105
  5. data/lib/treat/ai.rb +12 -0
  6. data/lib/treat/ai/classifiers/id3.rb +27 -0
  7. data/lib/treat/categories.rb +82 -35
  8. data/lib/treat/categorizable.rb +44 -0
  9. data/lib/treat/classification.rb +61 -0
  10. data/lib/treat/configurable.rb +115 -0
  11. data/lib/treat/data_set.rb +42 -0
  12. data/lib/treat/dependencies.rb +24 -0
  13. data/lib/treat/downloader.rb +87 -0
  14. data/lib/treat/entities.rb +68 -66
  15. data/lib/treat/entities/abilities.rb +10 -0
  16. data/lib/treat/entities/abilities/buildable.rb +327 -0
  17. data/lib/treat/entities/abilities/checkable.rb +31 -0
  18. data/lib/treat/entities/abilities/copyable.rb +45 -0
  19. data/lib/treat/entities/abilities/countable.rb +51 -0
  20. data/lib/treat/entities/abilities/debuggable.rb +83 -0
  21. data/lib/treat/entities/abilities/delegatable.rb +123 -0
  22. data/lib/treat/entities/abilities/doable.rb +62 -0
  23. data/lib/treat/entities/abilities/exportable.rb +11 -0
  24. data/lib/treat/entities/abilities/iterable.rb +115 -0
  25. data/lib/treat/entities/abilities/magical.rb +83 -0
  26. data/lib/treat/entities/abilities/registrable.rb +74 -0
  27. data/lib/treat/entities/abilities/stringable.rb +91 -0
  28. data/lib/treat/entities/entities.rb +104 -0
  29. data/lib/treat/entities/entity.rb +122 -245
  30. data/lib/treat/exception.rb +4 -4
  31. data/lib/treat/extractors.rb +77 -80
  32. data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
  33. data/lib/treat/extractors/language/what_language.rb +50 -45
  34. data/lib/treat/extractors/name_tag/stanford.rb +55 -0
  35. data/lib/treat/extractors/tf_idf/native.rb +87 -0
  36. data/lib/treat/extractors/time/chronic.rb +55 -0
  37. data/lib/treat/extractors/time/nickel.rb +86 -62
  38. data/lib/treat/extractors/time/ruby.rb +53 -0
  39. data/lib/treat/extractors/topic_words/lda.rb +67 -58
  40. data/lib/treat/extractors/topics/reuters.rb +100 -87
  41. data/lib/treat/formatters.rb +39 -35
  42. data/lib/treat/formatters/readers/abw.rb +49 -29
  43. data/lib/treat/formatters/readers/autoselect.rb +37 -33
  44. data/lib/treat/formatters/readers/doc.rb +19 -13
  45. data/lib/treat/formatters/readers/html.rb +52 -30
  46. data/lib/treat/formatters/readers/image.rb +41 -40
  47. data/lib/treat/formatters/readers/odt.rb +59 -45
  48. data/lib/treat/formatters/readers/pdf.rb +28 -25
  49. data/lib/treat/formatters/readers/txt.rb +12 -15
  50. data/lib/treat/formatters/readers/xml.rb +73 -36
  51. data/lib/treat/formatters/serializers/xml.rb +80 -79
  52. data/lib/treat/formatters/serializers/yaml.rb +19 -18
  53. data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
  54. data/lib/treat/formatters/unserializers/xml.rb +94 -99
  55. data/lib/treat/formatters/unserializers/yaml.rb +20 -19
  56. data/lib/treat/formatters/visualizers/dot.rb +132 -132
  57. data/lib/treat/formatters/visualizers/standoff.rb +52 -44
  58. data/lib/treat/formatters/visualizers/tree.rb +26 -29
  59. data/lib/treat/groupable.rb +153 -0
  60. data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
  61. data/lib/treat/inflectors.rb +50 -45
  62. data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
  63. data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
  64. data/lib/treat/inflectors/declensors/active_support.rb +31 -0
  65. data/lib/treat/inflectors/declensors/english.rb +38 -0
  66. data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
  67. data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
  68. data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
  69. data/lib/treat/inflectors/stemmers/porter.rb +160 -0
  70. data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
  71. data/lib/treat/inflectors/stemmers/uea.rb +28 -0
  72. data/lib/treat/installer.rb +308 -0
  73. data/lib/treat/kernel.rb +105 -27
  74. data/lib/treat/languages.rb +122 -88
  75. data/lib/treat/languages/arabic.rb +15 -15
  76. data/lib/treat/languages/chinese.rb +15 -15
  77. data/lib/treat/languages/dutch.rb +15 -15
  78. data/lib/treat/languages/english.rb +61 -62
  79. data/lib/treat/languages/french.rb +19 -19
  80. data/lib/treat/languages/german.rb +20 -20
  81. data/lib/treat/languages/greek.rb +15 -15
  82. data/lib/treat/languages/italian.rb +16 -16
  83. data/lib/treat/languages/polish.rb +15 -15
  84. data/lib/treat/languages/portuguese.rb +15 -15
  85. data/lib/treat/languages/russian.rb +15 -15
  86. data/lib/treat/languages/spanish.rb +16 -16
  87. data/lib/treat/languages/swedish.rb +16 -16
  88. data/lib/treat/lexicalizers.rb +34 -55
  89. data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
  90. data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
  91. data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
  92. data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
  93. data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
  94. data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
  95. data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
  96. data/lib/treat/linguistics.rb +9 -0
  97. data/lib/treat/linguistics/categories.rb +11 -0
  98. data/lib/treat/linguistics/tags.rb +422 -0
  99. data/lib/treat/loaders/linguistics.rb +30 -0
  100. data/lib/treat/loaders/stanford.rb +27 -0
  101. data/lib/treat/object.rb +1 -0
  102. data/lib/treat/processors.rb +37 -44
  103. data/lib/treat/processors/chunkers/autoselect.rb +16 -0
  104. data/lib/treat/processors/chunkers/html.rb +71 -0
  105. data/lib/treat/processors/chunkers/txt.rb +18 -24
  106. data/lib/treat/processors/parsers/enju.rb +253 -208
  107. data/lib/treat/processors/parsers/stanford.rb +130 -131
  108. data/lib/treat/processors/segmenters/punkt.rb +79 -45
  109. data/lib/treat/processors/segmenters/stanford.rb +46 -48
  110. data/lib/treat/processors/segmenters/tactful.rb +43 -36
  111. data/lib/treat/processors/tokenizers/perl.rb +124 -92
  112. data/lib/treat/processors/tokenizers/ptb.rb +81 -0
  113. data/lib/treat/processors/tokenizers/punkt.rb +48 -42
  114. data/lib/treat/processors/tokenizers/stanford.rb +39 -38
  115. data/lib/treat/processors/tokenizers/tactful.rb +64 -55
  116. data/lib/treat/proxies.rb +52 -35
  117. data/lib/treat/retrievers.rb +26 -16
  118. data/lib/treat/retrievers/indexers/ferret.rb +47 -26
  119. data/lib/treat/retrievers/searchers/ferret.rb +69 -50
  120. data/lib/treat/tree.rb +241 -183
  121. data/spec/collection.rb +123 -0
  122. data/spec/document.rb +93 -0
  123. data/spec/entity.rb +408 -0
  124. data/spec/languages.rb +25 -0
  125. data/spec/phrase.rb +146 -0
  126. data/spec/samples/mathematicians/archimedes.abw +34 -0
  127. data/spec/samples/mathematicians/euler.html +21 -0
  128. data/spec/samples/mathematicians/gauss.pdf +0 -0
  129. data/spec/samples/mathematicians/leibniz.txt +13 -0
  130. data/spec/samples/mathematicians/newton.doc +0 -0
  131. data/spec/sandbox.rb +5 -0
  132. data/spec/token.rb +109 -0
  133. data/spec/treat.rb +52 -0
  134. data/spec/tree.rb +117 -0
  135. data/spec/word.rb +110 -0
  136. data/spec/zone.rb +66 -0
  137. data/tmp/INFO +1 -1
  138. metadata +100 -201
  139. data/INSTALL +0 -1
  140. data/README +0 -3
  141. data/TODO +0 -28
  142. data/lib/economist/half_cocked_basel.txt +0 -16
  143. data/lib/economist/hungarys_troubles.txt +0 -46
  144. data/lib/economist/indias_slowdown.txt +0 -15
  145. data/lib/economist/merkozy_rides_again.txt +0 -24
  146. data/lib/economist/prada_is_not_walmart.txt +0 -9
  147. data/lib/economist/to_infinity_and_beyond.txt +0 -15
  148. data/lib/ferret/_11.cfs +0 -0
  149. data/lib/ferret/_14.cfs +0 -0
  150. data/lib/ferret/_p.cfs +0 -0
  151. data/lib/ferret/_s.cfs +0 -0
  152. data/lib/ferret/_v.cfs +0 -0
  153. data/lib/ferret/_y.cfs +0 -0
  154. data/lib/ferret/segments +0 -0
  155. data/lib/ferret/segments_15 +0 -0
  156. data/lib/treat/buildable.rb +0 -157
  157. data/lib/treat/category.rb +0 -33
  158. data/lib/treat/delegatable.rb +0 -116
  159. data/lib/treat/doable.rb +0 -45
  160. data/lib/treat/entities/collection.rb +0 -14
  161. data/lib/treat/entities/document.rb +0 -12
  162. data/lib/treat/entities/phrases.rb +0 -17
  163. data/lib/treat/entities/tokens.rb +0 -61
  164. data/lib/treat/entities/zones.rb +0 -41
  165. data/lib/treat/extractors/coreferences/stanford.rb +0 -69
  166. data/lib/treat/extractors/date/chronic.rb +0 -32
  167. data/lib/treat/extractors/date/ruby.rb +0 -25
  168. data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
  169. data/lib/treat/extractors/language/language_extractor.rb +0 -27
  170. data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
  171. data/lib/treat/extractors/roles/naive.rb +0 -73
  172. data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
  173. data/lib/treat/extractors/statistics/position_in.rb +0 -14
  174. data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
  175. data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
  176. data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
  177. data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
  178. data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
  179. data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
  180. data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
  181. data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
  182. data/lib/treat/feature.rb +0 -58
  183. data/lib/treat/features.rb +0 -7
  184. data/lib/treat/formatters/visualizers/short_value.rb +0 -29
  185. data/lib/treat/formatters/visualizers/txt.rb +0 -45
  186. data/lib/treat/group.rb +0 -106
  187. data/lib/treat/helpers/linguistics_loader.rb +0 -18
  188. data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
  189. data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
  190. data/lib/treat/inflectors/declensions/english.rb +0 -319
  191. data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
  192. data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
  193. data/lib/treat/inflectors/stem/porter.rb +0 -162
  194. data/lib/treat/inflectors/stem/porter_c.rb +0 -26
  195. data/lib/treat/inflectors/stem/uea.rb +0 -30
  196. data/lib/treat/install.rb +0 -59
  197. data/lib/treat/languages/tags.rb +0 -377
  198. data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
  199. data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
  200. data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
  201. data/lib/treat/lexicalizers/tag/brill.rb +0 -91
  202. data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
  203. data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
  204. data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
  205. data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
  206. data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
  207. data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
  208. data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
  209. data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
  210. data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
  211. data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
  212. data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
  213. data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
  214. data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
  215. data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
  216. data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
  217. data/lib/treat/registrable.rb +0 -28
  218. data/lib/treat/sugar.rb +0 -50
  219. data/lib/treat/viewable.rb +0 -29
  220. data/lib/treat/visitable.rb +0 -28
  221. data/test/profile.rb +0 -2
  222. data/test/tc_entity.rb +0 -117
  223. data/test/tc_extractors.rb +0 -73
  224. data/test/tc_formatters.rb +0 -41
  225. data/test/tc_inflectors.rb +0 -34
  226. data/test/tc_lexicalizers.rb +0 -32
  227. data/test/tc_processors.rb +0 -50
  228. data/test/tc_resources.rb +0 -22
  229. data/test/tc_treat.rb +0 -60
  230. data/test/tc_tree.rb +0 -60
  231. data/test/tests.rb +0 -20
  232. data/test/texts.rb +0 -19
  233. data/test/texts/english/half_cocked_basel.txt +0 -16
  234. data/test/texts/english/hose_and_dry.doc +0 -0
  235. data/test/texts/english/hungarys_troubles.abw +0 -70
  236. data/test/texts/english/long.html +0 -24
  237. data/test/texts/english/long.txt +0 -22
  238. data/test/texts/english/medium.txt +0 -5
  239. data/test/texts/english/republican_nomination.pdf +0 -0
  240. data/test/texts/english/saving_the_euro.odt +0 -0
  241. data/test/texts/english/short.txt +0 -3
  242. data/test/texts/english/zero_sum.html +0 -111
@@ -0,0 +1,83 @@
1
+ module Treat::Entities::Abilities::Magical
2
+
3
+ # Parse "magic methods", which allow the following
4
+ # syntaxes to be used (where 'word' can be replaced
5
+ # by any entity type, e.g. token, zone, etc.):
6
+ #
7
+ # - each_word : iterate over each entity of type word.
8
+ # - words: return an array of words in the entity.
9
+ # - word: return the first word in the entity.
10
+ # - word_count: return the number of words in the entity.
11
+ # - words_with_*(value) (where * is an arbitrary feature):
12
+ # return the words that have the given feature.
13
+ # - word_with_*(value) : return the first word with
14
+ # the feature specified by * in value.
15
+ #
16
+ # Also provides magical methods for types of words:
17
+ #
18
+ # - each_noun:
19
+ # - nouns:
20
+ # - noun:
21
+ # - noun_count:
22
+ # - nouns_with_*(value)
23
+ # - noun_with_*(value)
24
+ #
25
+ def magic(sym, *args)
26
+
27
+ @@entities_regexp ||= "(#{Treat::Entities.list.join('|')})"
28
+ @@cats_regexp ||= "(#{Treat::Linguistics::WordCategories.join('|')})"
29
+
30
+ method = sym.to_s =~ /entities/ ?
31
+ sym.to_s.gsub('entities', 'entitys') :
32
+ method = sym.to_s
33
+
34
+ if method =~ /^#{@@entities_regexp}s$/
35
+ entities_with_type($1.intern)
36
+ elsif method =~ /^#{@@entities_regexp}$/
37
+ first_but_warn(entities_with_type($1.intern), $1)
38
+ elsif method =~ /^parent_#{@@entities_regexp}$/
39
+ ancestor_with_type($1.intern)
40
+ elsif method =~ /^each_#{@@entities_regexp}$/
41
+ each_entity($1.intern) { |e| yield e }
42
+ elsif method =~ /^#{@@entities_regexp}_count$/
43
+ entities_with_type($1.intern).size
44
+ elsif method =~ /^#{@@entities_regexp}s_with_([a-z]+)$/
45
+ entities_with_feature($2.intern, args[0], $1.intern)
46
+ elsif method =~ /^#{@@entities_regexp}_with_([a-z]*)$/
47
+ first_but_warn(entities_with_feature(
48
+ $2.intern, args[0], $1.intern), $1)
49
+ elsif method =~ /^each_#{@@entities_regexp}_with_([a-z]*)$/
50
+ entities_with_feature($2.intern, args[0],
51
+ $1.intern).each { |e| yield e }
52
+ elsif method =~ /^each_with_([a-z]*)$/
53
+ entities_with_feature($2.intern,
54
+ args[0], $1.intern).each { |e| yield e }
55
+ elsif method =~ /^each_#{@@cats_regexp}$/
56
+ entities_with_category($1.intern
57
+ ).each { |e| yield e }
58
+ elsif method =~ /^#{@@cats_regexp}s$/
59
+ entities_with_category($1.intern)
60
+ elsif method =~ /^#{@@cats_regexp}$/
61
+ first_but_warn(entities_with_category($1.intern), $1)
62
+ elsif method =~ /^#{@@cats_regexp}_count$/
63
+ entities_with_category($1.intern).size
64
+ elsif method =~ /^#{@@cats_regexp}s_with_([a-z]*)$/
65
+ entities_with_feature($2.intern, args[0], $1)
66
+ elsif method =~ /^#{@@cats_regexp}_with_([a-z]*)$/
67
+ first_but_warn(entities_with_feature(
68
+ $2.intern, args[0], $1.intern), $1)
69
+ elsif method =~ /^([a-z]*)_of_first_#{@@entities_regexp}$/
70
+ f = send(:"#{$2}s".intern).first
71
+ f ? f.send($1.intern) : nil
72
+ elsif method =~ /^frequency_in_#{@@entities_regexp}$/
73
+ frequency_in($1.intern)
74
+ # first_word
75
+ # tag_of_first_verb
76
+ # tag_of_title
77
+ else
78
+ return :no_magic
79
+ end
80
+ end
81
+
82
+
83
+ end
@@ -0,0 +1,74 @@
1
+ # Registers occurrences of textual values inside
2
+ # all child entities. Useful to calculate frequency.
3
+ module Treat::Entities::Abilities::Registrable
4
+
5
+ # Registers a token in the @registry hash.
6
+ def register(entity)
7
+
8
+ if entity.is_a?(Treat::Entities::Token) ||
9
+ entity.is_a?(Treat::Entities::Phrase)
10
+ val = entity.to_s.downcase
11
+ @registry[:value][val] ||= 0
12
+ @registry[:value][val] += 1
13
+ end
14
+
15
+ @registry[:id][entity.id] = true
16
+ @registry[:type][entity.type] ||= 0
17
+ @registry[:type][entity.type] += 1
18
+ @registry[:position][entity.id] = @count
19
+ @count += 1
20
+
21
+ @parent.register(entity) if has_parent?
22
+
23
+ end
24
+
25
+ # Backtrack up the tree to find a token registry,
26
+ # by default the one in the root node of any entity.
27
+ def registry(type = nil)
28
+ if has_parent? &&
29
+ type != self.type
30
+ @parent.registry(type)
31
+ else
32
+ @registry
33
+ end
34
+ end
35
+
36
+ def contains_id?(id)
37
+
38
+ @registry[:id][id]
39
+
40
+ end
41
+
42
+ def contains_value?(val)
43
+
44
+ @registry[:value][val] ?
45
+ true : false
46
+
47
+ end
48
+
49
+ def contains_type?(type1)
50
+
51
+ return true if @registry[:type][type1]
52
+
53
+ @registry[:type].each do |type2, count|
54
+ if Treat::Entities.
55
+ match_types[type1][type2]
56
+ return true
57
+ end
58
+ end
59
+
60
+ false
61
+
62
+ end
63
+
64
+ def contains_types?(types)
65
+
66
+ types.each do |type|
67
+ return true if contains_type?(type)
68
+ end
69
+
70
+ false
71
+
72
+ end
73
+
74
+ end
@@ -0,0 +1,91 @@
1
+ # Gives entities the ability to be converted
2
+ # to string representations (#to_string, #to_s,
3
+ # #to_str, #inspect, #print_tree).
4
+ module Treat::Entities::Abilities::Stringable
5
+
6
+ # Return the entity's true string value in
7
+ # plain text format. Non-terminal entities
8
+ # will normally have an empty value.
9
+ def to_string; @value; end
10
+
11
+ # Returns the entity's string value by
12
+ # imploding the value of all terminal
13
+ # entities in the subtree of that entity.
14
+ def to_s
15
+ @value != '' ? @value : implode.strip
16
+ end
17
+
18
+ # #to_str is the same as #to_s.
19
+ alias :to_str :to_s
20
+
21
+ # Return a shortened value of the entity's
22
+ # string value using [...], with a cutoff
23
+ # number of words or length.
24
+ def short_value(max_length = 30)
25
+ s = to_s
26
+ words = s.split(' ')
27
+ if s.length < max_length
28
+ s
29
+ else
30
+ words[0..2].join(' ') + ' [...] ' +
31
+ words[-2..-1].join(' ')
32
+ end
33
+ end
34
+
35
+ # Return an informative string representation
36
+ # of the entity.
37
+ def inspect
38
+ s = "#{cl(self.class)} (#{@id.to_s})"
39
+ if caller_method(2) == :inspect
40
+ @id.to_s
41
+ else
42
+ dependencies = []
43
+ @dependencies.each do |dependency|
44
+ dependencies <<
45
+ "#{dependency.target}#{dependency.type}"
46
+ end
47
+ s += " --- #{short_value.inspect}" +
48
+ " --- #{@features.inspect} " +
49
+ " --- #{dependencies.inspect} "
50
+ end
51
+ s
52
+ end
53
+
54
+ # Print out an ASCII representation of the tree.
55
+ def print_tree; puts visualize(:tree); end
56
+
57
+ # Helper method to implode the string value of the subtree.
58
+ def implode
59
+
60
+ return @value.dup if !has_children?
61
+
62
+ value = ''
63
+
64
+ each do |child|
65
+
66
+ if child.is_a?(Treat::Entities::Section)
67
+ value += "\n\n"
68
+ end
69
+
70
+ if child.is_a?(Treat::Entities::Token) || child.value != ''
71
+ if child.is_a?(Treat::Entities::Punctuation) ||
72
+ child.is_a?(Treat::Entities::Clitic)
73
+ value.strip!
74
+ end
75
+ value += child.to_s + ' '
76
+ else
77
+ value += child.implode
78
+ end
79
+
80
+ if child.is_a?(Treat::Entities::Title) ||
81
+ child.is_a?(Treat::Entities::Paragraph)
82
+ value += "\n\n"
83
+ end
84
+
85
+ end
86
+
87
+ value
88
+
89
+ end
90
+
91
+ end
@@ -0,0 +1,104 @@
1
+ module Treat::Entities
2
+
3
+ # Require the generic entity class.
4
+ require 'treat/entities/entity'
5
+
6
+ # Represents a collection of texts.
7
+ class Collection < Entity
8
+
9
+ # Initialize the collection with a folder
10
+ # containing the texts of the collection.
11
+ def initialize(folder = nil, id = nil)
12
+ super('', id)
13
+ set :folder, folder
14
+ i = folder + '/.index'
15
+ set :index, i if FileTest.directory?(i)
16
+ end
17
+
18
+ # Works like the default <<, but if the
19
+ # file being added is a collection or a
20
+ # document, then copy that collection or
21
+ # document into this collection's folder.
22
+ def <<(entities, copy = true)
23
+ unless entities.is_a? Array
24
+ entities = [entities]
25
+ end
26
+ entities.each do |entity|
27
+ if [:document, :collection].
28
+ include?(entity.type) && copy
29
+ entity = entity.copy_into(self)
30
+ end
31
+ end
32
+ super(entities)
33
+ end
34
+
35
+ end
36
+
37
+ # Represents a document.
38
+ class Document < Entity
39
+
40
+ def initialize(file = nil, id = nil)
41
+ super('', id)
42
+ set :file, file
43
+ end
44
+
45
+ end
46
+
47
+ # Represents a section, usually with a title
48
+ # and at least one paragraph.
49
+ class Section < Entity; end
50
+
51
+ # Represents a zone of text
52
+ # (Title, Paragraph, List, Quote).
53
+ class Zone < Entity; end
54
+
55
+ # Represents a title, subtitle, logical header.
56
+ class Title < Zone; end
57
+
58
+ # Represents a paragraph.
59
+ class Paragraph < Zone; end
60
+
61
+ # Represents a list.
62
+ class List < Zone; end
63
+
64
+ # Represents a group of words.
65
+ class Phrase < Entity; end
66
+
67
+ # Represents a group of words with a sentence ender.
68
+ class Sentence < Phrase; end
69
+
70
+ # Represents a terminal element in the text structure.
71
+ class Token < Entity
72
+ end
73
+
74
+ # Represents a word.
75
+ class Word < Token
76
+ end
77
+
78
+ # Represents a clitic ('s).
79
+ class Clitic < Token; end
80
+
81
+ # Represents a number.
82
+ class Number < Token
83
+ def to_i; to_s.to_i; end
84
+ def to_f; to_s.to_f; end
85
+ end
86
+
87
+ # Represents a punctuation sign.
88
+ class Punctuation < Token; end
89
+
90
+ # Represents a character that is neither
91
+ # alphabetical, numerical, nor a punctuation
92
+ # character (e.g. @#$%&*).
93
+ class Symbol < Token; end
94
+
95
+ # Represents a url.
96
+ class Url < Token; end
97
+
98
+ # Represents a valid RFC822 address.
99
+ class Email < Token; end
100
+
101
+ # Represents an entity of unknown type.
102
+ class Unknown; end
103
+
104
+ end
@@ -1,258 +1,135 @@
1
- require 'treat/tree'
2
- require 'treat/feature'
3
- require 'treat/delegatable'
4
- require 'treat/visitable'
5
- require 'treat/registrable'
6
- require 'treat/buildable'
7
- require 'treat/doable'
8
- require 'treat/viewable'
9
- require 'treat/features'
1
+ module Treat::Entities
10
2
 
11
- module Treat
12
- module Entities
13
- class Entity < Tree::Node
14
- # A Symbol representing the lowercase version of the class name.
15
- attr_accessor :type
16
- # Implements support for #register
17
- include Registrable
18
- # Implement support for #accept.
19
- include Visitable
20
- # Implement support for #self.add_workers
21
- extend Delegatable
22
- # Implement support for #self.from_*
23
- extend Buildable
24
- # Implement support for #do.
25
- include Doable
26
- # Implement support for to_s, inspect, etc.
27
- include Viewable
28
- # Initialize the entity with its value and
29
- # (optionally) a unique identifier. By default,
30
- # the object_id will be used as id. Also initialize
31
- # the token registry in the root node.
32
- def initialize(value = '', id = nil)
33
- id ||= object_id
34
- super(value, id)
35
- @type = :entity
36
- # @match_types = Treat::Entities.match_types
37
- end
38
- # Catch missing methods to support method-like
39
- # access to features (e.g. entity.categoryinstead of
40
- # entity.features[:cat]) and to support magic
41
- # methods (see #parse_magic_method). If the
42
- # feature does not exist
43
- def method_missing(sym, *args, &block)
44
- return self.build(*args) if sym == nil
45
- if !@features.has_key?(sym)
46
- r = parse_magic_method(sym, *args, &block)
47
- if r == :no_magic
48
- begin
49
- super(sym, *args, &block)
50
- rescue NoMethodError
51
- return false if sym.to_s[-1] == '?'
52
- if Categories.lookup(sym)
53
- msg = "Method #{sym} cannot be called on a #{type}."
54
- else
55
- msg = "Method #{sym} does not exist."
56
- msg += did_you_mean?(Category.methods, sym)
57
- end
58
- raise Treat::Exception, msg
59
- end
60
- else
61
- r
62
- end
63
- else
64
- @features[sym]
65
- end
66
- end
67
- # Parse "magic methods", which allow the following
68
- # syntaxes to be used (where 'word' can be replaced
69
- # by any entity type, e.g. token, zone, etc.):
70
- #
71
- # - each_word : iterate over each entity of type word.
72
- # - words: return an array of words in the entity.
73
- # - word: return the first word in the entity.
74
- # - word_count: return the number of words in the entity.
75
- # - words_with_*(value) (where is an arbitrary feature):
76
- # return the words that have the given feature.
77
- # - word_with_*(value) : return the first word with
78
- # the feature specified by * in value.
79
- #
80
- # Also provides magical methods for types of words:
81
- #
82
- # - each_noun:
83
- # - nouns:
84
- # - noun:
85
- # - noun_count:
86
- # - nouns_with_*(value)
87
- # - noun_with_*(value)
88
- #
89
- def parse_magic_method(sym, *args)
90
- @@entities_regexp ||= "(#{Treat::Entities.list.join('|')})"
91
- @@cats_regexp ||= "(#{Treat::Languages::WordCategories.join('|')})"
92
- method = sym.to_s =~ /entities/ ?
93
- sym.to_s.gsub('entities', 'entitys') :
94
- method = sym.to_s
95
- if method =~ /^#{@@entities_regexp}s$/
96
- a = []
97
- each_entity($1.intern) { |e| a << e }
98
- a
99
- elsif method =~ /^#{@@entities_regexp}$/
100
- a = []
101
- each_entity($1.intern) { |e| a << e }
102
- first_but_warn(a, $1)
103
- elsif method =~ /^parent_#{@@entities_regexp}$/
104
- ancestor_with_types($1.intern)
105
- elsif method =~ /^each_#{@@entities_regexp}$/
106
- each_entity($1.intern) { |e| yield e }
107
- elsif method =~ /^#{@@entities_regexp}_count$/
108
- i = 0
109
- each_entity($1.intern) { |e| i += 1 }
110
- i
111
- elsif method =~ /^#{@@entities_regexp}s_with_([a-z]+)$/
112
- a = []
113
- each_entity($1.intern) do |e|
114
- a << e if e.has?($2.intern) &&
115
- e.send($2.intern) == args[0]
116
- end
117
- a
118
- elsif method =~ /^#{@@entities_regexp}_with_([a-z]*)$/
119
- a = []
120
- each_entity($1.intern) do |e|
121
- a << e if e.has?($2.intern) &&
122
- e.send($2.intern) == args[0]
123
- end
124
- first_but_warn(a, $1)
125
- elsif method =~ /^each_with_([a-z]*)$/
126
- each_entity do |e|
127
- yield e if e.has?($1.intern) &&
128
- e.send($1.intern) == args[0]
129
- end
130
- elsif method =~ /^each_#{@@cats_regexp}$/
131
- each_entity(:word) { |e| yield e if e.category == $1.intern }
132
- elsif method =~ /^#{@@cats_regexp}s$/
133
- a = []
134
- each_entity(:word) { |e| a << e if e.category == $1.intern }
135
- a
136
- elsif method =~ /^#{@@cats_regexp}$/
137
- a = []
138
- each_entity(:word) { |e| a << e if e.category == $1.intern }
139
- first_but_warn(a, $1)
140
- elsif method =~ /^#{@@cats_regexp}_count$/
141
- i = 0
142
- each_entity(:word) { |e| i += 1 if e.category == $1.intern }
143
- i
144
- elsif method =~ /^#{@@cats_regexp}s_with_([a-z]*)$/
145
- a = []
146
- each_entity(:word) do |e|
147
- a << e if e.category == $1.intern &&
148
- e.has?($2.intern) && e.send($2.intern) == args[0]
149
- end
150
- a
151
- elsif method =~ /^#{@@cats_regexp}_with_([a-z]*)$/
152
- a = []
153
- each_entity(:word) do |e|
154
- a << e if e.category== $1.intern &&
155
- e.has?($2.intern) && e.send($2.intern) == args[0]
156
- end
157
- first_but_warn(a, $1)
158
- elsif method =~ /^is_#{@@entities_regexp}\?$/
159
- type.to_s == $1
160
- elsif method =~ /^is_#{@@cats_regexp}\?$/
161
- category.to_s == $1
162
- else
163
- return :no_magic
164
- end
3
+ # Require base class for Entity.
4
+ require 'treat/tree'
5
+
6
+ class Entity < Treat::Tree::Node
7
+
8
+ # A Symbol representing the lowercase
9
+ # version of the class name.
10
+ attr_accessor :type
11
+
12
+ # Require abilities.
13
+ require 'treat/entities/abilities'
14
+
15
+ # Implements support for #register,
16
+ # #registry, and #contains_* methods.
17
+ include Abilities::Registrable
18
+
19
+ # Implement support for #self.add_workers
20
+ extend Abilities::Delegatable
21
+
22
+ # Implement support for #self.print_debug and
23
+ # #self.invalid_call_msg
24
+ extend Abilities::Debuggable
25
+
26
+ # Implement support for #self.build
27
+ # and #self.from_*
28
+ extend Abilities::Buildable
29
+
30
+ # Implement support for #do.
31
+ include Abilities::Doable
32
+
33
+ # Implement support for #frequency,
34
+ # #frequency_in_parent and #position_in_parent.
35
+ include Abilities::Countable
36
+
37
+ # Implement support for #magic.
38
+ include Abilities::Magical
39
+
40
+ # Implement support for #to_s, #inspect, etc.
41
+ include Abilities::Stringable
42
+
43
+ # Implement support for #check_has
44
+ # and #check_hasnt_children?
45
+ include Abilities::Checkable
46
+
47
+ # Implement support for #each_entity, as well as
48
+ # #entities_with_type, #ancestors_with_type,
49
+ # #entities_with_feature, #entities_with_category.
50
+ include Abilities::Iterable
51
+
52
+ # Implement support for #export to export
53
+ # a line of a data set based on a classification.
54
+ include Abilities::Exportable
55
+
56
+ # Implement support for #copy_into.
57
+ include Abilities::Copyable
58
+
59
+ # Initialize the entity with its value and
60
+ # (optionally) a unique identifier. By default,
61
+ # the object_id will be used as id.
62
+ def initialize(value = '', id = nil)
63
+ id ||= object_id
64
+ super(value, id)
65
+ @type = :entity if self == Entity
66
+ @type ||= ucc(cl(self.class)).intern
67
+ unless is_a?(Treat::Entities::Token)
68
+ @count = 0
69
+ @registry = {
70
+ :id => {},
71
+ :value => {},
72
+ :type => {},
73
+ :position => {}
74
+ }
165
75
  end
166
- # Add an entity to the current entity.
167
- # Registers the entity in the root node
168
- # token registry if the entity is a leaf.
169
- #
170
- # @see Treat::Registrable
171
- def <<(entities, clear_parent = true)
172
- entities = [entities] unless entities.is_a? Array
173
- entities.each do |entity|
174
- if entity.is_a?(Treat::Entities::Token) ||
175
- entity.is_a?(Treat::Entities::Phrase)
176
- register_token(entity) unless entity.value == ''
177
- end
178
- end
179
- super(entities)
180
- @parent.value = '' if has_parent?
181
- entities[0]
76
+ end
77
+
78
+
79
+ # Add an entity to the current entity.
80
+ # Registers each added entity in this
81
+ # entity's registry and in its ancestors'.
82
+ #
83
+ # @see Treat::Entities::Abilities::Registrable
84
+ def <<(entities, clear_parent = true)
85
+ unless entities.is_a? Array
86
+ entities = [entities]
182
87
  end
183
- # Yields each entity of any of the supplied
184
- # types in the children tree of this Entity.
185
- # Note that this function is recursive, unlike
186
- # #each. It does not yield the top element being
187
- # recursed.
188
- #
189
- # This function NEEDS to be ported to C (see source).
190
- def each_entity(*types)
191
- types = [:entity] if types.size == 0
192
- f = false
193
- types.each { |t2| f = true if Treat::Entities.match_types[t2][type] }
194
- yield self if f
195
- unless @children.size == 0
196
- @children.each do |child|
197
- child.each_entity(*types) { |y| yield y }
198
- end
199
- end
88
+ entities.each do |entity|
89
+ register(entity)
200
90
  end
201
-
202
- # Replace with:
203
- #inline do |builder|
204
- #
205
- # builder.c_raw <<-EOS, :arity => -1
91
+ super(entities)
92
+ @parent.value = '' if has_parent?
93
+ entities[0]
94
+ end
206
95
 
207
96
 
97
+ # Catch missing methods to support method-like
98
+ # access to features (e.g. entity.category
99
+ # instead of entity.features[:category]) and to
100
+ # support magic methods (see #magic).
101
+ #
102
+ # If the feature or magic method does not exist,
103
+ # or can't be parsed, raises an exception.
104
+ #
105
+ # Also catches the "empty" method call (e.g.
106
+ # Word('hello') or Word 'hello') as syntactic
107
+ # sugar for the #self.build method.
108
+ def method_missing(sym, *args, &block)
109
+ return self.build(*args) if sym == nil
208
110
 
209
- #EOS
210
- #end
211
- # Returns the first ancestor of this entity that has the given type.
212
- def ancestor_with_types(*types)
213
- ancestor = @parent
214
- match_types = lambda do |t1, t2s|
215
- f = false
216
- t2s.each do |t2|
217
- if Treat::Entities.match_types[t1][t2]
218
- f = true; break
219
- end
220
- end
221
- f
222
- end
223
- if ancestor
224
- while not match_types.call(ancestor.type, types)
225
- return nil unless (ancestor && ancestor.has_parent?)
226
- ancestor = ancestor.parent
111
+ if !@features.has_key?(sym)
112
+ r = magic(sym, *args, &block)
113
+ return r unless r == :no_magic
114
+ begin
115
+ super(sym, *args, &block)
116
+ rescue NoMethodError
117
+ raise Treat::Exception,
118
+ if Treat::Categories.lookup(sym)
119
+ msg = "Method #{sym} cannot " +
120
+ "be called on a #{type}."
121
+ else
122
+ msg = "Method #{sym} does not exist."
123
+ msg += did_you_mean?(
124
+ Treat::Categories.methods, sym)
227
125
  end
228
- match_types.call(ancestor.type, types) ? ancestor : nil
229
- end
230
- end
231
- alias :ancestor_with_type :ancestor_with_types
232
- # Returns the (direct) ancestors of this entity that
233
- # have the given type.
234
- def ancestors_with_types(*types)
235
- ancestor = self
236
- ancestors = []
237
- while (a = ancestor.ancestor_with_types(*types))
238
- ancestors << a
239
- ancestor = ancestor.parent
240
126
  end
241
- ancestors
242
- end
243
- alias :ancestors_with_type :ancestors_with_types
244
- # Return the first element in the array, warning if not
245
- # the only one in the array. Used for magic methods: e.g.,
246
- # the magic method "word" if called on a sentence
247
- # with many words, Treat will return the first word
248
- # but warn the user.
249
- def first_but_warn(array, type)
250
- if array.size > 1
251
- warn "Warning: requested one #{type}, but" +
252
- " there are many #{type}s in the given entity."
253
- end
254
- array[0]
127
+ else
128
+ @features[sym]
255
129
  end
130
+
256
131
  end
132
+
257
133
  end
134
+
258
135
  end