treat 0.2.5 → 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (242) hide show
  1. data/LICENSE +3 -3
  2. data/README.md +33 -0
  3. data/files/INFO +1 -0
  4. data/lib/treat.rb +40 -105
  5. data/lib/treat/ai.rb +12 -0
  6. data/lib/treat/ai/classifiers/id3.rb +27 -0
  7. data/lib/treat/categories.rb +82 -35
  8. data/lib/treat/categorizable.rb +44 -0
  9. data/lib/treat/classification.rb +61 -0
  10. data/lib/treat/configurable.rb +115 -0
  11. data/lib/treat/data_set.rb +42 -0
  12. data/lib/treat/dependencies.rb +24 -0
  13. data/lib/treat/downloader.rb +87 -0
  14. data/lib/treat/entities.rb +68 -66
  15. data/lib/treat/entities/abilities.rb +10 -0
  16. data/lib/treat/entities/abilities/buildable.rb +327 -0
  17. data/lib/treat/entities/abilities/checkable.rb +31 -0
  18. data/lib/treat/entities/abilities/copyable.rb +45 -0
  19. data/lib/treat/entities/abilities/countable.rb +51 -0
  20. data/lib/treat/entities/abilities/debuggable.rb +83 -0
  21. data/lib/treat/entities/abilities/delegatable.rb +123 -0
  22. data/lib/treat/entities/abilities/doable.rb +62 -0
  23. data/lib/treat/entities/abilities/exportable.rb +11 -0
  24. data/lib/treat/entities/abilities/iterable.rb +115 -0
  25. data/lib/treat/entities/abilities/magical.rb +83 -0
  26. data/lib/treat/entities/abilities/registrable.rb +74 -0
  27. data/lib/treat/entities/abilities/stringable.rb +91 -0
  28. data/lib/treat/entities/entities.rb +104 -0
  29. data/lib/treat/entities/entity.rb +122 -245
  30. data/lib/treat/exception.rb +4 -4
  31. data/lib/treat/extractors.rb +77 -80
  32. data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
  33. data/lib/treat/extractors/language/what_language.rb +50 -45
  34. data/lib/treat/extractors/name_tag/stanford.rb +55 -0
  35. data/lib/treat/extractors/tf_idf/native.rb +87 -0
  36. data/lib/treat/extractors/time/chronic.rb +55 -0
  37. data/lib/treat/extractors/time/nickel.rb +86 -62
  38. data/lib/treat/extractors/time/ruby.rb +53 -0
  39. data/lib/treat/extractors/topic_words/lda.rb +67 -58
  40. data/lib/treat/extractors/topics/reuters.rb +100 -87
  41. data/lib/treat/formatters.rb +39 -35
  42. data/lib/treat/formatters/readers/abw.rb +49 -29
  43. data/lib/treat/formatters/readers/autoselect.rb +37 -33
  44. data/lib/treat/formatters/readers/doc.rb +19 -13
  45. data/lib/treat/formatters/readers/html.rb +52 -30
  46. data/lib/treat/formatters/readers/image.rb +41 -40
  47. data/lib/treat/formatters/readers/odt.rb +59 -45
  48. data/lib/treat/formatters/readers/pdf.rb +28 -25
  49. data/lib/treat/formatters/readers/txt.rb +12 -15
  50. data/lib/treat/formatters/readers/xml.rb +73 -36
  51. data/lib/treat/formatters/serializers/xml.rb +80 -79
  52. data/lib/treat/formatters/serializers/yaml.rb +19 -18
  53. data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
  54. data/lib/treat/formatters/unserializers/xml.rb +94 -99
  55. data/lib/treat/formatters/unserializers/yaml.rb +20 -19
  56. data/lib/treat/formatters/visualizers/dot.rb +132 -132
  57. data/lib/treat/formatters/visualizers/standoff.rb +52 -44
  58. data/lib/treat/formatters/visualizers/tree.rb +26 -29
  59. data/lib/treat/groupable.rb +153 -0
  60. data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
  61. data/lib/treat/inflectors.rb +50 -45
  62. data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
  63. data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
  64. data/lib/treat/inflectors/declensors/active_support.rb +31 -0
  65. data/lib/treat/inflectors/declensors/english.rb +38 -0
  66. data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
  67. data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
  68. data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
  69. data/lib/treat/inflectors/stemmers/porter.rb +160 -0
  70. data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
  71. data/lib/treat/inflectors/stemmers/uea.rb +28 -0
  72. data/lib/treat/installer.rb +308 -0
  73. data/lib/treat/kernel.rb +105 -27
  74. data/lib/treat/languages.rb +122 -88
  75. data/lib/treat/languages/arabic.rb +15 -15
  76. data/lib/treat/languages/chinese.rb +15 -15
  77. data/lib/treat/languages/dutch.rb +15 -15
  78. data/lib/treat/languages/english.rb +61 -62
  79. data/lib/treat/languages/french.rb +19 -19
  80. data/lib/treat/languages/german.rb +20 -20
  81. data/lib/treat/languages/greek.rb +15 -15
  82. data/lib/treat/languages/italian.rb +16 -16
  83. data/lib/treat/languages/polish.rb +15 -15
  84. data/lib/treat/languages/portuguese.rb +15 -15
  85. data/lib/treat/languages/russian.rb +15 -15
  86. data/lib/treat/languages/spanish.rb +16 -16
  87. data/lib/treat/languages/swedish.rb +16 -16
  88. data/lib/treat/lexicalizers.rb +34 -55
  89. data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
  90. data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
  91. data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
  92. data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
  93. data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
  94. data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
  95. data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
  96. data/lib/treat/linguistics.rb +9 -0
  97. data/lib/treat/linguistics/categories.rb +11 -0
  98. data/lib/treat/linguistics/tags.rb +422 -0
  99. data/lib/treat/loaders/linguistics.rb +30 -0
  100. data/lib/treat/loaders/stanford.rb +27 -0
  101. data/lib/treat/object.rb +1 -0
  102. data/lib/treat/processors.rb +37 -44
  103. data/lib/treat/processors/chunkers/autoselect.rb +16 -0
  104. data/lib/treat/processors/chunkers/html.rb +71 -0
  105. data/lib/treat/processors/chunkers/txt.rb +18 -24
  106. data/lib/treat/processors/parsers/enju.rb +253 -208
  107. data/lib/treat/processors/parsers/stanford.rb +130 -131
  108. data/lib/treat/processors/segmenters/punkt.rb +79 -45
  109. data/lib/treat/processors/segmenters/stanford.rb +46 -48
  110. data/lib/treat/processors/segmenters/tactful.rb +43 -36
  111. data/lib/treat/processors/tokenizers/perl.rb +124 -92
  112. data/lib/treat/processors/tokenizers/ptb.rb +81 -0
  113. data/lib/treat/processors/tokenizers/punkt.rb +48 -42
  114. data/lib/treat/processors/tokenizers/stanford.rb +39 -38
  115. data/lib/treat/processors/tokenizers/tactful.rb +64 -55
  116. data/lib/treat/proxies.rb +52 -35
  117. data/lib/treat/retrievers.rb +26 -16
  118. data/lib/treat/retrievers/indexers/ferret.rb +47 -26
  119. data/lib/treat/retrievers/searchers/ferret.rb +69 -50
  120. data/lib/treat/tree.rb +241 -183
  121. data/spec/collection.rb +123 -0
  122. data/spec/document.rb +93 -0
  123. data/spec/entity.rb +408 -0
  124. data/spec/languages.rb +25 -0
  125. data/spec/phrase.rb +146 -0
  126. data/spec/samples/mathematicians/archimedes.abw +34 -0
  127. data/spec/samples/mathematicians/euler.html +21 -0
  128. data/spec/samples/mathematicians/gauss.pdf +0 -0
  129. data/spec/samples/mathematicians/leibniz.txt +13 -0
  130. data/spec/samples/mathematicians/newton.doc +0 -0
  131. data/spec/sandbox.rb +5 -0
  132. data/spec/token.rb +109 -0
  133. data/spec/treat.rb +52 -0
  134. data/spec/tree.rb +117 -0
  135. data/spec/word.rb +110 -0
  136. data/spec/zone.rb +66 -0
  137. data/tmp/INFO +1 -1
  138. metadata +100 -201
  139. data/INSTALL +0 -1
  140. data/README +0 -3
  141. data/TODO +0 -28
  142. data/lib/economist/half_cocked_basel.txt +0 -16
  143. data/lib/economist/hungarys_troubles.txt +0 -46
  144. data/lib/economist/indias_slowdown.txt +0 -15
  145. data/lib/economist/merkozy_rides_again.txt +0 -24
  146. data/lib/economist/prada_is_not_walmart.txt +0 -9
  147. data/lib/economist/to_infinity_and_beyond.txt +0 -15
  148. data/lib/ferret/_11.cfs +0 -0
  149. data/lib/ferret/_14.cfs +0 -0
  150. data/lib/ferret/_p.cfs +0 -0
  151. data/lib/ferret/_s.cfs +0 -0
  152. data/lib/ferret/_v.cfs +0 -0
  153. data/lib/ferret/_y.cfs +0 -0
  154. data/lib/ferret/segments +0 -0
  155. data/lib/ferret/segments_15 +0 -0
  156. data/lib/treat/buildable.rb +0 -157
  157. data/lib/treat/category.rb +0 -33
  158. data/lib/treat/delegatable.rb +0 -116
  159. data/lib/treat/doable.rb +0 -45
  160. data/lib/treat/entities/collection.rb +0 -14
  161. data/lib/treat/entities/document.rb +0 -12
  162. data/lib/treat/entities/phrases.rb +0 -17
  163. data/lib/treat/entities/tokens.rb +0 -61
  164. data/lib/treat/entities/zones.rb +0 -41
  165. data/lib/treat/extractors/coreferences/stanford.rb +0 -69
  166. data/lib/treat/extractors/date/chronic.rb +0 -32
  167. data/lib/treat/extractors/date/ruby.rb +0 -25
  168. data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
  169. data/lib/treat/extractors/language/language_extractor.rb +0 -27
  170. data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
  171. data/lib/treat/extractors/roles/naive.rb +0 -73
  172. data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
  173. data/lib/treat/extractors/statistics/position_in.rb +0 -14
  174. data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
  175. data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
  176. data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
  177. data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
  178. data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
  179. data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
  180. data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
  181. data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
  182. data/lib/treat/feature.rb +0 -58
  183. data/lib/treat/features.rb +0 -7
  184. data/lib/treat/formatters/visualizers/short_value.rb +0 -29
  185. data/lib/treat/formatters/visualizers/txt.rb +0 -45
  186. data/lib/treat/group.rb +0 -106
  187. data/lib/treat/helpers/linguistics_loader.rb +0 -18
  188. data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
  189. data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
  190. data/lib/treat/inflectors/declensions/english.rb +0 -319
  191. data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
  192. data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
  193. data/lib/treat/inflectors/stem/porter.rb +0 -162
  194. data/lib/treat/inflectors/stem/porter_c.rb +0 -26
  195. data/lib/treat/inflectors/stem/uea.rb +0 -30
  196. data/lib/treat/install.rb +0 -59
  197. data/lib/treat/languages/tags.rb +0 -377
  198. data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
  199. data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
  200. data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
  201. data/lib/treat/lexicalizers/tag/brill.rb +0 -91
  202. data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
  203. data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
  204. data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
  205. data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
  206. data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
  207. data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
  208. data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
  209. data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
  210. data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
  211. data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
  212. data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
  213. data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
  214. data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
  215. data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
  216. data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
  217. data/lib/treat/registrable.rb +0 -28
  218. data/lib/treat/sugar.rb +0 -50
  219. data/lib/treat/viewable.rb +0 -29
  220. data/lib/treat/visitable.rb +0 -28
  221. data/test/profile.rb +0 -2
  222. data/test/tc_entity.rb +0 -117
  223. data/test/tc_extractors.rb +0 -73
  224. data/test/tc_formatters.rb +0 -41
  225. data/test/tc_inflectors.rb +0 -34
  226. data/test/tc_lexicalizers.rb +0 -32
  227. data/test/tc_processors.rb +0 -50
  228. data/test/tc_resources.rb +0 -22
  229. data/test/tc_treat.rb +0 -60
  230. data/test/tc_tree.rb +0 -60
  231. data/test/tests.rb +0 -20
  232. data/test/texts.rb +0 -19
  233. data/test/texts/english/half_cocked_basel.txt +0 -16
  234. data/test/texts/english/hose_and_dry.doc +0 -0
  235. data/test/texts/english/hungarys_troubles.abw +0 -70
  236. data/test/texts/english/long.html +0 -24
  237. data/test/texts/english/long.txt +0 -22
  238. data/test/texts/english/medium.txt +0 -5
  239. data/test/texts/english/republican_nomination.pdf +0 -0
  240. data/test/texts/english/saving_the_euro.odt +0 -0
  241. data/test/texts/english/short.txt +0 -3
  242. data/test/texts/english/zero_sum.html +0 -111
@@ -0,0 +1,83 @@
1
+ module Treat::Entities::Abilities::Magical
2
+
3
+ # Parse "magic methods", which allow the following
4
+ # syntaxes to be used (where 'word' can be replaced
5
+ # by any entity type, e.g. token, zone, etc.):
6
+ #
7
+ # - each_word : iterate over each entity of type word.
8
+ # - words: return an array of words in the entity.
9
+ # - word: return the first word in the entity.
10
+ # - word_count: return the number of words in the entity.
11
+ # - words_with_*(value) (where * is an arbitrary feature):
12
+ # return the words that have the given feature.
13
+ # - word_with_*(value) : return the first word with
14
+ # the feature specified by * in value.
15
+ #
16
+ # Also provides magical methods for types of words:
17
+ #
18
+ # - each_noun:
19
+ # - nouns:
20
+ # - noun:
21
+ # - noun_count:
22
+ # - nouns_with_*(value)
23
+ # - noun_with_*(value)
24
+ #
25
+ def magic(sym, *args)
26
+
27
+ @@entities_regexp ||= "(#{Treat::Entities.list.join('|')})"
28
+ @@cats_regexp ||= "(#{Treat::Linguistics::WordCategories.join('|')})"
29
+
30
+ method = sym.to_s =~ /entities/ ?
31
+ sym.to_s.gsub('entities', 'entitys') :
32
+ method = sym.to_s
33
+
34
+ if method =~ /^#{@@entities_regexp}s$/
35
+ entities_with_type($1.intern)
36
+ elsif method =~ /^#{@@entities_regexp}$/
37
+ first_but_warn(entities_with_type($1.intern), $1)
38
+ elsif method =~ /^parent_#{@@entities_regexp}$/
39
+ ancestor_with_type($1.intern)
40
+ elsif method =~ /^each_#{@@entities_regexp}$/
41
+ each_entity($1.intern) { |e| yield e }
42
+ elsif method =~ /^#{@@entities_regexp}_count$/
43
+ entities_with_type($1.intern).size
44
+ elsif method =~ /^#{@@entities_regexp}s_with_([a-z]+)$/
45
+ entities_with_feature($2.intern, args[0], $1.intern)
46
+ elsif method =~ /^#{@@entities_regexp}_with_([a-z]*)$/
47
+ first_but_warn(entities_with_feature(
48
+ $2.intern, args[0], $1.intern), $1)
49
+ elsif method =~ /^each_#{@@entities_regexp}_with_([a-z]*)$/
50
+ entities_with_feature($2.intern, args[0],
51
+ $1.intern).each { |e| yield e }
52
+ elsif method =~ /^each_with_([a-z]*)$/
53
+ entities_with_feature($2.intern,
54
+ args[0], $1.intern).each { |e| yield e }
55
+ elsif method =~ /^each_#{@@cats_regexp}$/
56
+ entities_with_category($1.intern
57
+ ).each { |e| yield e }
58
+ elsif method =~ /^#{@@cats_regexp}s$/
59
+ entities_with_category($1.intern)
60
+ elsif method =~ /^#{@@cats_regexp}$/
61
+ first_but_warn(entities_with_category($1.intern), $1)
62
+ elsif method =~ /^#{@@cats_regexp}_count$/
63
+ entities_with_category($1.intern).size
64
+ elsif method =~ /^#{@@cats_regexp}s_with_([a-z]*)$/
65
+ entities_with_feature($2.intern, args[0], $1)
66
+ elsif method =~ /^#{@@cats_regexp}_with_([a-z]*)$/
67
+ first_but_warn(entities_with_feature(
68
+ $2.intern, args[0], $1.intern), $1)
69
+ elsif method =~ /^([a-z]*)_of_first_#{@@entities_regexp}$/
70
+ f = send(:"#{$2}s".intern).first
71
+ f ? f.send($1.intern) : nil
72
+ elsif method =~ /^frequency_in_#{@@entities_regexp}$/
73
+ frequency_in($1.intern)
74
+ # first_word
75
+ # tag_of_first_verb
76
+ # tag_of_title
77
+ else
78
+ return :no_magic
79
+ end
80
+ end
81
+
82
+
83
+ end
@@ -0,0 +1,74 @@
1
+ # Registers occurrences of textual values inside
2
+ # all child entities. Useful to calculate frequency.
3
+ module Treat::Entities::Abilities::Registrable
4
+
5
+ # Registers a token in the @registry hash.
6
+ def register(entity)
7
+
8
+ if entity.is_a?(Treat::Entities::Token) ||
9
+ entity.is_a?(Treat::Entities::Phrase)
10
+ val = entity.to_s.downcase
11
+ @registry[:value][val] ||= 0
12
+ @registry[:value][val] += 1
13
+ end
14
+
15
+ @registry[:id][entity.id] = true
16
+ @registry[:type][entity.type] ||= 0
17
+ @registry[:type][entity.type] += 1
18
+ @registry[:position][entity.id] = @count
19
+ @count += 1
20
+
21
+ @parent.register(entity) if has_parent?
22
+
23
+ end
24
+
25
+ # Backtrack up the tree to find a token registry,
26
+ # by default the one in the root node of any entity.
27
+ def registry(type = nil)
28
+ if has_parent? &&
29
+ type != self.type
30
+ @parent.registry(type)
31
+ else
32
+ @registry
33
+ end
34
+ end
35
+
36
+ def contains_id?(id)
37
+
38
+ @registry[:id][id]
39
+
40
+ end
41
+
42
+ def contains_value?(val)
43
+
44
+ @registry[:value][val] ?
45
+ true : false
46
+
47
+ end
48
+
49
+ def contains_type?(type1)
50
+
51
+ return true if @registry[:type][type1]
52
+
53
+ @registry[:type].each do |type2, count|
54
+ if Treat::Entities.
55
+ match_types[type1][type2]
56
+ return true
57
+ end
58
+ end
59
+
60
+ false
61
+
62
+ end
63
+
64
+ def contains_types?(types)
65
+
66
+ types.each do |type|
67
+ return true if contains_type?(type)
68
+ end
69
+
70
+ false
71
+
72
+ end
73
+
74
+ end
@@ -0,0 +1,91 @@
1
+ # Gives entities the ability to be converted
2
+ # to string representations (#to_string, #to_s,
3
+ # #to_str, #inspect, #print_tree).
4
+ module Treat::Entities::Abilities::Stringable
5
+
6
+ # Return the entity's true string value in
7
+ # plain text format. Non-terminal entities
8
+ # will normally have an empty value.
9
+ def to_string; @value; end
10
+
11
+ # Returns the entity's string value by
12
+ # imploding the value of all terminal
13
+ # entities in the subtree of that entity.
14
+ def to_s
15
+ @value != '' ? @value : implode.strip
16
+ end
17
+
18
+ # #to_str is the same as #to_s.
19
+ alias :to_str :to_s
20
+
21
+ # Return a shortened value of the entity's
22
+ # string value using [...], with a cutoff
23
+ # number of words or length.
24
+ def short_value(max_length = 30)
25
+ s = to_s
26
+ words = s.split(' ')
27
+ if s.length < max_length
28
+ s
29
+ else
30
+ words[0..2].join(' ') + ' [...] ' +
31
+ words[-2..-1].join(' ')
32
+ end
33
+ end
34
+
35
+ # Return an informative string representation
36
+ # of the entity.
37
+ def inspect
38
+ s = "#{cl(self.class)} (#{@id.to_s})"
39
+ if caller_method(2) == :inspect
40
+ @id.to_s
41
+ else
42
+ dependencies = []
43
+ @dependencies.each do |dependency|
44
+ dependencies <<
45
+ "#{dependency.target}#{dependency.type}"
46
+ end
47
+ s += " --- #{short_value.inspect}" +
48
+ " --- #{@features.inspect} " +
49
+ " --- #{dependencies.inspect} "
50
+ end
51
+ s
52
+ end
53
+
54
+ # Print out an ASCII representation of the tree.
55
+ def print_tree; puts visualize(:tree); end
56
+
57
+ # Helper method to implode the string value of the subtree.
58
+ def implode
59
+
60
+ return @value.dup if !has_children?
61
+
62
+ value = ''
63
+
64
+ each do |child|
65
+
66
+ if child.is_a?(Treat::Entities::Section)
67
+ value += "\n\n"
68
+ end
69
+
70
+ if child.is_a?(Treat::Entities::Token) || child.value != ''
71
+ if child.is_a?(Treat::Entities::Punctuation) ||
72
+ child.is_a?(Treat::Entities::Clitic)
73
+ value.strip!
74
+ end
75
+ value += child.to_s + ' '
76
+ else
77
+ value += child.implode
78
+ end
79
+
80
+ if child.is_a?(Treat::Entities::Title) ||
81
+ child.is_a?(Treat::Entities::Paragraph)
82
+ value += "\n\n"
83
+ end
84
+
85
+ end
86
+
87
+ value
88
+
89
+ end
90
+
91
+ end
@@ -0,0 +1,104 @@
1
+ module Treat::Entities
2
+
3
+ # Require the generic entity class.
4
+ require 'treat/entities/entity'
5
+
6
+ # Represents a collection of texts.
7
+ class Collection < Entity
8
+
9
+ # Initialize the collection with a folder
10
+ # containing the texts of the collection.
11
+ def initialize(folder = nil, id = nil)
12
+ super('', id)
13
+ set :folder, folder
14
+ i = folder + '/.index'
15
+ set :index, i if FileTest.directory?(i)
16
+ end
17
+
18
+ # Works like the default <<, but if the
19
+ # file being added is a collection or a
20
+ # document, then copy that collection or
21
+ # document into this collection's folder.
22
+ def <<(entities, copy = true)
23
+ unless entities.is_a? Array
24
+ entities = [entities]
25
+ end
26
+ entities.each do |entity|
27
+ if [:document, :collection].
28
+ include?(entity.type) && copy
29
+ entity = entity.copy_into(self)
30
+ end
31
+ end
32
+ super(entities)
33
+ end
34
+
35
+ end
36
+
37
+ # Represents a document.
38
+ class Document < Entity
39
+
40
+ def initialize(file = nil, id = nil)
41
+ super('', id)
42
+ set :file, file
43
+ end
44
+
45
+ end
46
+
47
+ # Represents a section, usually with a title
48
+ # and at least one paragraph.
49
+ class Section < Entity; end
50
+
51
+ # Represents a zone of text
52
+ # (Title, Paragraph, List, Quote).
53
+ class Zone < Entity; end
54
+
55
+ # Represents a title, subtitle, logical header.
56
+ class Title < Zone; end
57
+
58
+ # Represents a paragraph.
59
+ class Paragraph < Zone; end
60
+
61
+ # Represents a list.
62
+ class List < Zone; end
63
+
64
+ # Represents a group of words.
65
+ class Phrase < Entity; end
66
+
67
+ # Represents a group of words with a sentence ender.
68
+ class Sentence < Phrase; end
69
+
70
+ # Represents a terminal element in the text structure.
71
+ class Token < Entity
72
+ end
73
+
74
+ # Represents a word.
75
+ class Word < Token
76
+ end
77
+
78
+ # Represents a clitic ('s).
79
+ class Clitic < Token; end
80
+
81
+ # Represents a number.
82
+ class Number < Token
83
+ def to_i; to_s.to_i; end
84
+ def to_f; to_s.to_f; end
85
+ end
86
+
87
+ # Represents a punctuation sign.
88
+ class Punctuation < Token; end
89
+
90
+ # Represents a character that is neither
91
+ # alphabetical, numerical, nor a punctuation
92
+ # character (e.g. @#$%&*).
93
+ class Symbol < Token; end
94
+
95
+ # Represents a url.
96
+ class Url < Token; end
97
+
98
+ # Represents a valid RFC822 address.
99
+ class Email < Token; end
100
+
101
+ # Represents an entity of unknown type.
102
+ class Unknown; end
103
+
104
+ end
@@ -1,258 +1,135 @@
1
- require 'treat/tree'
2
- require 'treat/feature'
3
- require 'treat/delegatable'
4
- require 'treat/visitable'
5
- require 'treat/registrable'
6
- require 'treat/buildable'
7
- require 'treat/doable'
8
- require 'treat/viewable'
9
- require 'treat/features'
1
+ module Treat::Entities
10
2
 
11
- module Treat
12
- module Entities
13
- class Entity < Tree::Node
14
- # A Symbol representing the lowercase version of the class name.
15
- attr_accessor :type
16
- # Implements support for #register
17
- include Registrable
18
- # Implement support for #accept.
19
- include Visitable
20
- # Implement support for #self.add_workers
21
- extend Delegatable
22
- # Implement support for #self.from_*
23
- extend Buildable
24
- # Implement support for #do.
25
- include Doable
26
- # Implement support for to_s, inspect, etc.
27
- include Viewable
28
- # Initialize the entity with its value and
29
- # (optionally) a unique identifier. By default,
30
- # the object_id will be used as id. Also initialize
31
- # the token registry in the root node.
32
- def initialize(value = '', id = nil)
33
- id ||= object_id
34
- super(value, id)
35
- @type = :entity
36
- # @match_types = Treat::Entities.match_types
37
- end
38
- # Catch missing methods to support method-like
39
- # access to features (e.g. entity.categoryinstead of
40
- # entity.features[:cat]) and to support magic
41
- # methods (see #parse_magic_method). If the
42
- # feature does not exist
43
- def method_missing(sym, *args, &block)
44
- return self.build(*args) if sym == nil
45
- if !@features.has_key?(sym)
46
- r = parse_magic_method(sym, *args, &block)
47
- if r == :no_magic
48
- begin
49
- super(sym, *args, &block)
50
- rescue NoMethodError
51
- return false if sym.to_s[-1] == '?'
52
- if Categories.lookup(sym)
53
- msg = "Method #{sym} cannot be called on a #{type}."
54
- else
55
- msg = "Method #{sym} does not exist."
56
- msg += did_you_mean?(Category.methods, sym)
57
- end
58
- raise Treat::Exception, msg
59
- end
60
- else
61
- r
62
- end
63
- else
64
- @features[sym]
65
- end
66
- end
67
- # Parse "magic methods", which allow the following
68
- # syntaxes to be used (where 'word' can be replaced
69
- # by any entity type, e.g. token, zone, etc.):
70
- #
71
- # - each_word : iterate over each entity of type word.
72
- # - words: return an array of words in the entity.
73
- # - word: return the first word in the entity.
74
- # - word_count: return the number of words in the entity.
75
- # - words_with_*(value) (where is an arbitrary feature):
76
- # return the words that have the given feature.
77
- # - word_with_*(value) : return the first word with
78
- # the feature specified by * in value.
79
- #
80
- # Also provides magical methods for types of words:
81
- #
82
- # - each_noun:
83
- # - nouns:
84
- # - noun:
85
- # - noun_count:
86
- # - nouns_with_*(value)
87
- # - noun_with_*(value)
88
- #
89
- def parse_magic_method(sym, *args)
90
- @@entities_regexp ||= "(#{Treat::Entities.list.join('|')})"
91
- @@cats_regexp ||= "(#{Treat::Languages::WordCategories.join('|')})"
92
- method = sym.to_s =~ /entities/ ?
93
- sym.to_s.gsub('entities', 'entitys') :
94
- method = sym.to_s
95
- if method =~ /^#{@@entities_regexp}s$/
96
- a = []
97
- each_entity($1.intern) { |e| a << e }
98
- a
99
- elsif method =~ /^#{@@entities_regexp}$/
100
- a = []
101
- each_entity($1.intern) { |e| a << e }
102
- first_but_warn(a, $1)
103
- elsif method =~ /^parent_#{@@entities_regexp}$/
104
- ancestor_with_types($1.intern)
105
- elsif method =~ /^each_#{@@entities_regexp}$/
106
- each_entity($1.intern) { |e| yield e }
107
- elsif method =~ /^#{@@entities_regexp}_count$/
108
- i = 0
109
- each_entity($1.intern) { |e| i += 1 }
110
- i
111
- elsif method =~ /^#{@@entities_regexp}s_with_([a-z]+)$/
112
- a = []
113
- each_entity($1.intern) do |e|
114
- a << e if e.has?($2.intern) &&
115
- e.send($2.intern) == args[0]
116
- end
117
- a
118
- elsif method =~ /^#{@@entities_regexp}_with_([a-z]*)$/
119
- a = []
120
- each_entity($1.intern) do |e|
121
- a << e if e.has?($2.intern) &&
122
- e.send($2.intern) == args[0]
123
- end
124
- first_but_warn(a, $1)
125
- elsif method =~ /^each_with_([a-z]*)$/
126
- each_entity do |e|
127
- yield e if e.has?($1.intern) &&
128
- e.send($1.intern) == args[0]
129
- end
130
- elsif method =~ /^each_#{@@cats_regexp}$/
131
- each_entity(:word) { |e| yield e if e.category == $1.intern }
132
- elsif method =~ /^#{@@cats_regexp}s$/
133
- a = []
134
- each_entity(:word) { |e| a << e if e.category == $1.intern }
135
- a
136
- elsif method =~ /^#{@@cats_regexp}$/
137
- a = []
138
- each_entity(:word) { |e| a << e if e.category == $1.intern }
139
- first_but_warn(a, $1)
140
- elsif method =~ /^#{@@cats_regexp}_count$/
141
- i = 0
142
- each_entity(:word) { |e| i += 1 if e.category == $1.intern }
143
- i
144
- elsif method =~ /^#{@@cats_regexp}s_with_([a-z]*)$/
145
- a = []
146
- each_entity(:word) do |e|
147
- a << e if e.category == $1.intern &&
148
- e.has?($2.intern) && e.send($2.intern) == args[0]
149
- end
150
- a
151
- elsif method =~ /^#{@@cats_regexp}_with_([a-z]*)$/
152
- a = []
153
- each_entity(:word) do |e|
154
- a << e if e.category== $1.intern &&
155
- e.has?($2.intern) && e.send($2.intern) == args[0]
156
- end
157
- first_but_warn(a, $1)
158
- elsif method =~ /^is_#{@@entities_regexp}\?$/
159
- type.to_s == $1
160
- elsif method =~ /^is_#{@@cats_regexp}\?$/
161
- category.to_s == $1
162
- else
163
- return :no_magic
164
- end
3
+ # Require base class for Entity.
4
+ require 'treat/tree'
5
+
6
+ class Entity < Treat::Tree::Node
7
+
8
+ # A Symbol representing the lowercase
9
+ # version of the class name.
10
+ attr_accessor :type
11
+
12
+ # Require abilities.
13
+ require 'treat/entities/abilities'
14
+
15
+ # Implements support for #register,
16
+ # #registry, and #contains_* methods.
17
+ include Abilities::Registrable
18
+
19
+ # Implement support for #self.add_workers
20
+ extend Abilities::Delegatable
21
+
22
+ # Implement support for #self.print_debug and
23
+ # #self.invalid_call_msg
24
+ extend Abilities::Debuggable
25
+
26
+ # Implement support for #self.build
27
+ # and #self.from_*
28
+ extend Abilities::Buildable
29
+
30
+ # Implement support for #do.
31
+ include Abilities::Doable
32
+
33
+ # Implement support for #frequency,
34
+ # #frequency_in_parent and #position_in_parent.
35
+ include Abilities::Countable
36
+
37
+ # Implement support for #magic.
38
+ include Abilities::Magical
39
+
40
+ # Implement support for #to_s, #inspect, etc.
41
+ include Abilities::Stringable
42
+
43
+ # Implement support for #check_has
44
+ # and #check_hasnt_children?
45
+ include Abilities::Checkable
46
+
47
+ # Implement support for #each_entity, as well as
48
+ # #entities_with_type, #ancestors_with_type,
49
+ # #entities_with_feature, #entities_with_category.
50
+ include Abilities::Iterable
51
+
52
+ # Implement support for #export to export
53
+ # a line of a data set based on a classification.
54
+ include Abilities::Exportable
55
+
56
+ # Implement support for #copy_into.
57
+ include Abilities::Copyable
58
+
59
+ # Initialize the entity with its value and
60
+ # (optionally) a unique identifier. By default,
61
+ # the object_id will be used as id.
62
+ def initialize(value = '', id = nil)
63
+ id ||= object_id
64
+ super(value, id)
65
+ @type = :entity if self == Entity
66
+ @type ||= ucc(cl(self.class)).intern
67
+ unless is_a?(Treat::Entities::Token)
68
+ @count = 0
69
+ @registry = {
70
+ :id => {},
71
+ :value => {},
72
+ :type => {},
73
+ :position => {}
74
+ }
165
75
  end
166
- # Add an entity to the current entity.
167
- # Registers the entity in the root node
168
- # token registry if the entity is a leaf.
169
- #
170
- # @see Treat::Registrable
171
- def <<(entities, clear_parent = true)
172
- entities = [entities] unless entities.is_a? Array
173
- entities.each do |entity|
174
- if entity.is_a?(Treat::Entities::Token) ||
175
- entity.is_a?(Treat::Entities::Phrase)
176
- register_token(entity) unless entity.value == ''
177
- end
178
- end
179
- super(entities)
180
- @parent.value = '' if has_parent?
181
- entities[0]
76
+ end
77
+
78
+
79
+ # Add an entity to the current entity.
80
+ # Passes every added entity to #register,
81
+ # whether or not that entity is a leaf.
82
+ #
83
+ # @see Treat::Entities::Abilities::Registrable
84
+ def <<(entities, clear_parent = true)
85
+ unless entities.is_a? Array
86
+ entities = [entities]
182
87
  end
183
- # Yields each entity of any of the supplied
184
- # types in the children tree of this Entity.
185
- # Note that this function is recursive, unlike
186
- # #each. It does not yield the top element being
187
- # recursed.
188
- #
189
- # This function NEEDS to be ported to C (see source).
190
- def each_entity(*types)
191
- types = [:entity] if types.size == 0
192
- f = false
193
- types.each { |t2| f = true if Treat::Entities.match_types[t2][type] }
194
- yield self if f
195
- unless @children.size == 0
196
- @children.each do |child|
197
- child.each_entity(*types) { |y| yield y }
198
- end
199
- end
88
+ entities.each do |entity|
89
+ register(entity)
200
90
  end
201
-
202
- # Replace with:
203
- #inline do |builder|
204
- #
205
- # builder.c_raw <<-EOS, :arity => -1
91
+ super(entities)
92
+ @parent.value = '' if has_parent?
93
+ entities[0]
94
+ end
206
95
 
207
96
 
97
+ # Catch missing methods to support method-like
98
+ # access to features (e.g. entity.category
99
+ # instead of entity.features[:category]) and to
100
+ # support magic methods (see #magic).
101
+ #
102
+ # If the feature or magic method does not exist,
103
+ # or can't be parsed, raises an exception.
104
+ #
105
+ # Also catches the "empty" method call (e.g.
106
+ # Word('hello') or Word 'hello') as syntactic
107
+ # sugar for the #self.build method.
108
+ def method_missing(sym, *args, &block)
109
+ return self.build(*args) if sym == nil
208
110
 
209
- #EOS
210
- #end
211
- # Returns the first ancestor of this entity that has the given type.
212
- def ancestor_with_types(*types)
213
- ancestor = @parent
214
- match_types = lambda do |t1, t2s|
215
- f = false
216
- t2s.each do |t2|
217
- if Treat::Entities.match_types[t1][t2]
218
- f = true; break
219
- end
220
- end
221
- f
222
- end
223
- if ancestor
224
- while not match_types.call(ancestor.type, types)
225
- return nil unless (ancestor && ancestor.has_parent?)
226
- ancestor = ancestor.parent
111
+ if !@features.has_key?(sym)
112
+ r = magic(sym, *args, &block)
113
+ return r unless r == :no_magic
114
+ begin
115
+ super(sym, *args, &block)
116
+ rescue NoMethodError
117
+ raise Treat::Exception,
118
+ if Treat::Categories.lookup(sym)
119
+ msg = "Method #{sym} cannot " +
120
+ "be called on a #{type}."
121
+ else
122
+ msg = "Method #{sym} does not exist."
123
+ msg += did_you_mean?(
124
+ Treat::Categories.methods, sym)
227
125
  end
228
- match_types.call(ancestor.type, types) ? ancestor : nil
229
- end
230
- end
231
- alias :ancestor_with_type :ancestor_with_types
232
- # Returns the (direct) ancestors of this entity that
233
- # have the given type.
234
- def ancestors_with_types(*types)
235
- ancestor = self
236
- ancestors = []
237
- while (a = ancestor.ancestor_with_types(*types))
238
- ancestors << a
239
- ancestor = ancestor.parent
240
126
  end
241
- ancestors
242
- end
243
- alias :ancestors_with_type :ancestors_with_types
244
- # Return the first element in the array, warning if not
245
- # the only one in the array. Used for magic methods: e.g.,
246
- # the magic method "word" if called on a sentence
247
- # with many words, Treat will return the first word
248
- # but warn the user.
249
- def first_but_warn(array, type)
250
- if array.size > 1
251
- warn "Warning: requested one #{type}, but" +
252
- " there are many #{type}s in the given entity."
253
- end
254
- array[0]
127
+ else
128
+ @features[sym]
255
129
  end
130
+
256
131
  end
132
+
257
133
  end
134
+
258
135
  end