treat 0.2.5 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242) hide show
  1. data/LICENSE +3 -3
  2. data/README.md +33 -0
  3. data/files/INFO +1 -0
  4. data/lib/treat.rb +40 -105
  5. data/lib/treat/ai.rb +12 -0
  6. data/lib/treat/ai/classifiers/id3.rb +27 -0
  7. data/lib/treat/categories.rb +82 -35
  8. data/lib/treat/categorizable.rb +44 -0
  9. data/lib/treat/classification.rb +61 -0
  10. data/lib/treat/configurable.rb +115 -0
  11. data/lib/treat/data_set.rb +42 -0
  12. data/lib/treat/dependencies.rb +24 -0
  13. data/lib/treat/downloader.rb +87 -0
  14. data/lib/treat/entities.rb +68 -66
  15. data/lib/treat/entities/abilities.rb +10 -0
  16. data/lib/treat/entities/abilities/buildable.rb +327 -0
  17. data/lib/treat/entities/abilities/checkable.rb +31 -0
  18. data/lib/treat/entities/abilities/copyable.rb +45 -0
  19. data/lib/treat/entities/abilities/countable.rb +51 -0
  20. data/lib/treat/entities/abilities/debuggable.rb +83 -0
  21. data/lib/treat/entities/abilities/delegatable.rb +123 -0
  22. data/lib/treat/entities/abilities/doable.rb +62 -0
  23. data/lib/treat/entities/abilities/exportable.rb +11 -0
  24. data/lib/treat/entities/abilities/iterable.rb +115 -0
  25. data/lib/treat/entities/abilities/magical.rb +83 -0
  26. data/lib/treat/entities/abilities/registrable.rb +74 -0
  27. data/lib/treat/entities/abilities/stringable.rb +91 -0
  28. data/lib/treat/entities/entities.rb +104 -0
  29. data/lib/treat/entities/entity.rb +122 -245
  30. data/lib/treat/exception.rb +4 -4
  31. data/lib/treat/extractors.rb +77 -80
  32. data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
  33. data/lib/treat/extractors/language/what_language.rb +50 -45
  34. data/lib/treat/extractors/name_tag/stanford.rb +55 -0
  35. data/lib/treat/extractors/tf_idf/native.rb +87 -0
  36. data/lib/treat/extractors/time/chronic.rb +55 -0
  37. data/lib/treat/extractors/time/nickel.rb +86 -62
  38. data/lib/treat/extractors/time/ruby.rb +53 -0
  39. data/lib/treat/extractors/topic_words/lda.rb +67 -58
  40. data/lib/treat/extractors/topics/reuters.rb +100 -87
  41. data/lib/treat/formatters.rb +39 -35
  42. data/lib/treat/formatters/readers/abw.rb +49 -29
  43. data/lib/treat/formatters/readers/autoselect.rb +37 -33
  44. data/lib/treat/formatters/readers/doc.rb +19 -13
  45. data/lib/treat/formatters/readers/html.rb +52 -30
  46. data/lib/treat/formatters/readers/image.rb +41 -40
  47. data/lib/treat/formatters/readers/odt.rb +59 -45
  48. data/lib/treat/formatters/readers/pdf.rb +28 -25
  49. data/lib/treat/formatters/readers/txt.rb +12 -15
  50. data/lib/treat/formatters/readers/xml.rb +73 -36
  51. data/lib/treat/formatters/serializers/xml.rb +80 -79
  52. data/lib/treat/formatters/serializers/yaml.rb +19 -18
  53. data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
  54. data/lib/treat/formatters/unserializers/xml.rb +94 -99
  55. data/lib/treat/formatters/unserializers/yaml.rb +20 -19
  56. data/lib/treat/formatters/visualizers/dot.rb +132 -132
  57. data/lib/treat/formatters/visualizers/standoff.rb +52 -44
  58. data/lib/treat/formatters/visualizers/tree.rb +26 -29
  59. data/lib/treat/groupable.rb +153 -0
  60. data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
  61. data/lib/treat/inflectors.rb +50 -45
  62. data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
  63. data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
  64. data/lib/treat/inflectors/declensors/active_support.rb +31 -0
  65. data/lib/treat/inflectors/declensors/english.rb +38 -0
  66. data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
  67. data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
  68. data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
  69. data/lib/treat/inflectors/stemmers/porter.rb +160 -0
  70. data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
  71. data/lib/treat/inflectors/stemmers/uea.rb +28 -0
  72. data/lib/treat/installer.rb +308 -0
  73. data/lib/treat/kernel.rb +105 -27
  74. data/lib/treat/languages.rb +122 -88
  75. data/lib/treat/languages/arabic.rb +15 -15
  76. data/lib/treat/languages/chinese.rb +15 -15
  77. data/lib/treat/languages/dutch.rb +15 -15
  78. data/lib/treat/languages/english.rb +61 -62
  79. data/lib/treat/languages/french.rb +19 -19
  80. data/lib/treat/languages/german.rb +20 -20
  81. data/lib/treat/languages/greek.rb +15 -15
  82. data/lib/treat/languages/italian.rb +16 -16
  83. data/lib/treat/languages/polish.rb +15 -15
  84. data/lib/treat/languages/portuguese.rb +15 -15
  85. data/lib/treat/languages/russian.rb +15 -15
  86. data/lib/treat/languages/spanish.rb +16 -16
  87. data/lib/treat/languages/swedish.rb +16 -16
  88. data/lib/treat/lexicalizers.rb +34 -55
  89. data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
  90. data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
  91. data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
  92. data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
  93. data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
  94. data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
  95. data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
  96. data/lib/treat/linguistics.rb +9 -0
  97. data/lib/treat/linguistics/categories.rb +11 -0
  98. data/lib/treat/linguistics/tags.rb +422 -0
  99. data/lib/treat/loaders/linguistics.rb +30 -0
  100. data/lib/treat/loaders/stanford.rb +27 -0
  101. data/lib/treat/object.rb +1 -0
  102. data/lib/treat/processors.rb +37 -44
  103. data/lib/treat/processors/chunkers/autoselect.rb +16 -0
  104. data/lib/treat/processors/chunkers/html.rb +71 -0
  105. data/lib/treat/processors/chunkers/txt.rb +18 -24
  106. data/lib/treat/processors/parsers/enju.rb +253 -208
  107. data/lib/treat/processors/parsers/stanford.rb +130 -131
  108. data/lib/treat/processors/segmenters/punkt.rb +79 -45
  109. data/lib/treat/processors/segmenters/stanford.rb +46 -48
  110. data/lib/treat/processors/segmenters/tactful.rb +43 -36
  111. data/lib/treat/processors/tokenizers/perl.rb +124 -92
  112. data/lib/treat/processors/tokenizers/ptb.rb +81 -0
  113. data/lib/treat/processors/tokenizers/punkt.rb +48 -42
  114. data/lib/treat/processors/tokenizers/stanford.rb +39 -38
  115. data/lib/treat/processors/tokenizers/tactful.rb +64 -55
  116. data/lib/treat/proxies.rb +52 -35
  117. data/lib/treat/retrievers.rb +26 -16
  118. data/lib/treat/retrievers/indexers/ferret.rb +47 -26
  119. data/lib/treat/retrievers/searchers/ferret.rb +69 -50
  120. data/lib/treat/tree.rb +241 -183
  121. data/spec/collection.rb +123 -0
  122. data/spec/document.rb +93 -0
  123. data/spec/entity.rb +408 -0
  124. data/spec/languages.rb +25 -0
  125. data/spec/phrase.rb +146 -0
  126. data/spec/samples/mathematicians/archimedes.abw +34 -0
  127. data/spec/samples/mathematicians/euler.html +21 -0
  128. data/spec/samples/mathematicians/gauss.pdf +0 -0
  129. data/spec/samples/mathematicians/leibniz.txt +13 -0
  130. data/spec/samples/mathematicians/newton.doc +0 -0
  131. data/spec/sandbox.rb +5 -0
  132. data/spec/token.rb +109 -0
  133. data/spec/treat.rb +52 -0
  134. data/spec/tree.rb +117 -0
  135. data/spec/word.rb +110 -0
  136. data/spec/zone.rb +66 -0
  137. data/tmp/INFO +1 -1
  138. metadata +100 -201
  139. data/INSTALL +0 -1
  140. data/README +0 -3
  141. data/TODO +0 -28
  142. data/lib/economist/half_cocked_basel.txt +0 -16
  143. data/lib/economist/hungarys_troubles.txt +0 -46
  144. data/lib/economist/indias_slowdown.txt +0 -15
  145. data/lib/economist/merkozy_rides_again.txt +0 -24
  146. data/lib/economist/prada_is_not_walmart.txt +0 -9
  147. data/lib/economist/to_infinity_and_beyond.txt +0 -15
  148. data/lib/ferret/_11.cfs +0 -0
  149. data/lib/ferret/_14.cfs +0 -0
  150. data/lib/ferret/_p.cfs +0 -0
  151. data/lib/ferret/_s.cfs +0 -0
  152. data/lib/ferret/_v.cfs +0 -0
  153. data/lib/ferret/_y.cfs +0 -0
  154. data/lib/ferret/segments +0 -0
  155. data/lib/ferret/segments_15 +0 -0
  156. data/lib/treat/buildable.rb +0 -157
  157. data/lib/treat/category.rb +0 -33
  158. data/lib/treat/delegatable.rb +0 -116
  159. data/lib/treat/doable.rb +0 -45
  160. data/lib/treat/entities/collection.rb +0 -14
  161. data/lib/treat/entities/document.rb +0 -12
  162. data/lib/treat/entities/phrases.rb +0 -17
  163. data/lib/treat/entities/tokens.rb +0 -61
  164. data/lib/treat/entities/zones.rb +0 -41
  165. data/lib/treat/extractors/coreferences/stanford.rb +0 -69
  166. data/lib/treat/extractors/date/chronic.rb +0 -32
  167. data/lib/treat/extractors/date/ruby.rb +0 -25
  168. data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
  169. data/lib/treat/extractors/language/language_extractor.rb +0 -27
  170. data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
  171. data/lib/treat/extractors/roles/naive.rb +0 -73
  172. data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
  173. data/lib/treat/extractors/statistics/position_in.rb +0 -14
  174. data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
  175. data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
  176. data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
  177. data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
  178. data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
  179. data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
  180. data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
  181. data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
  182. data/lib/treat/feature.rb +0 -58
  183. data/lib/treat/features.rb +0 -7
  184. data/lib/treat/formatters/visualizers/short_value.rb +0 -29
  185. data/lib/treat/formatters/visualizers/txt.rb +0 -45
  186. data/lib/treat/group.rb +0 -106
  187. data/lib/treat/helpers/linguistics_loader.rb +0 -18
  188. data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
  189. data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
  190. data/lib/treat/inflectors/declensions/english.rb +0 -319
  191. data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
  192. data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
  193. data/lib/treat/inflectors/stem/porter.rb +0 -162
  194. data/lib/treat/inflectors/stem/porter_c.rb +0 -26
  195. data/lib/treat/inflectors/stem/uea.rb +0 -30
  196. data/lib/treat/install.rb +0 -59
  197. data/lib/treat/languages/tags.rb +0 -377
  198. data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
  199. data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
  200. data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
  201. data/lib/treat/lexicalizers/tag/brill.rb +0 -91
  202. data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
  203. data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
  204. data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
  205. data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
  206. data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
  207. data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
  208. data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
  209. data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
  210. data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
  211. data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
  212. data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
  213. data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
  214. data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
  215. data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
  216. data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
  217. data/lib/treat/registrable.rb +0 -28
  218. data/lib/treat/sugar.rb +0 -50
  219. data/lib/treat/viewable.rb +0 -29
  220. data/lib/treat/visitable.rb +0 -28
  221. data/test/profile.rb +0 -2
  222. data/test/tc_entity.rb +0 -117
  223. data/test/tc_extractors.rb +0 -73
  224. data/test/tc_formatters.rb +0 -41
  225. data/test/tc_inflectors.rb +0 -34
  226. data/test/tc_lexicalizers.rb +0 -32
  227. data/test/tc_processors.rb +0 -50
  228. data/test/tc_resources.rb +0 -22
  229. data/test/tc_treat.rb +0 -60
  230. data/test/tc_tree.rb +0 -60
  231. data/test/tests.rb +0 -20
  232. data/test/texts.rb +0 -19
  233. data/test/texts/english/half_cocked_basel.txt +0 -16
  234. data/test/texts/english/hose_and_dry.doc +0 -0
  235. data/test/texts/english/hungarys_troubles.abw +0 -70
  236. data/test/texts/english/long.html +0 -24
  237. data/test/texts/english/long.txt +0 -22
  238. data/test/texts/english/medium.txt +0 -5
  239. data/test/texts/english/republican_nomination.pdf +0 -0
  240. data/test/texts/english/saving_the_euro.odt +0 -0
  241. data/test/texts/english/short.txt +0 -3
  242. data/test/texts/english/zero_sum.html +0 -111
@@ -1,40 +1,41 @@
1
- module Treat
2
- module Processors
3
- module Tokenizers
4
- # A wrapper for the Stanford parser's Penn-Treebank
5
- # style tokenizer.
6
- class Stanford
7
- require 'stanford-core-nlp'
8
- DefaultOptions = {
9
- :silence => false,
10
- :log_to_file => nil
11
- }
12
- @@tokenizer = nil
13
- # Tokenize the entity using a Penn-Treebank style tokenizer
14
- # included with the Stanford Parser.
15
- #
16
- # Options:
17
- # - (String) :log_to_file => a filename to log output to
18
- # instead of displaying it.
19
- def self.tokenize(entity, options = {})
20
- options = DefaultOptions.merge(options)
21
- options[:log_to_file] = '/dev/null' if options[:silence]
22
- if options[:log_to_file]
23
- ::StanfordCoreNLP.log_file = options[:log_to_file]
24
- end
25
- @@tokenizer ||= ::StanfordCoreNLP.load(:tokenize)
26
- text = ::StanfordCoreNLP::Text.new(entity.to_s)
27
- @@tokenizer.annotate(text)
28
- text.get(:tokens).each do |token|
29
- t = Treat::Entities::Token.from_string(token.value)
30
- entity << t
31
- t.set :character_offset_begin,
32
- token.get(:character_offset_begin)
33
- t.set :character_offset_end,
34
- token.get(:character_offset_end)
35
- end
36
- end
37
- end
1
+ # A wrapper for the Stanford parser's
2
+ # Penn-Treebank style tokenizer.
3
+ class Treat::Processors::Tokenizers::Stanford
4
+
5
+ require 'treat/loaders/stanford'
6
+
7
+ @@tokenizer = nil
8
+
9
+ # Tokenize the entity using a Penn-Treebank
10
+ # style tokenizer.
11
+ #
12
+ # Options: none.
13
+ def self.tokenize(entity, options = {})
14
+
15
+ entity.check_hasnt_children
16
+
17
+ s = entity.to_s
18
+
19
+ @@tokenizer ||=
20
+ ::StanfordCoreNLP.load(:tokenize)
21
+ text =
22
+ ::StanfordCoreNLP::Text.new(s)
23
+ @@tokenizer.annotate(text)
24
+
25
+ add_tokens(entity, text.get(:tokens))
26
+
27
+ end
28
+
29
+ # Add the tokens to the entity.
30
+ def self.add_tokens(entity, tokens)
31
+ tokens.each do |token|
32
+ val = token.value
33
+ val = '(' if val == '-LRB-' # Fix for other special chars
34
+ val = ')' if val == '-RRB'
35
+ t = Treat::Entities::Token.
36
+ from_string(token.value)
37
+ entity << t
38
38
  end
39
39
  end
40
- end
40
+
41
+ end
@@ -1,58 +1,67 @@
1
- module Treat
2
- module Processors
3
- module Tokenizers
4
- # A tokenizer class lifted from the 'tactful-tokenizer' gem.
5
- #
6
- # Copyright © 2010 Matthew Bunday. All rights reserved.
7
- # Released under the GNU GPL v3. Modified by Louis Mullie.
8
- #
9
- # Project website: https://github.com/SlyShy/Tactful_Tokenizer
10
- class Tactful
11
- ReTokenize = [
12
- # Uniform Quotes
13
- [/''|``/, '"'],
14
- # Separate punctuation from words.
15
- [/(^|\s)(')/, '\1\2'],
16
- [/(?=[\("`{\[:;&#*@\.])(.)/, '\1 '],
17
- [/(.)(?=[?!\)";}\]*:@\.'])|(?=[\)}\]])(.)|(.)(?=[({\[])|((^|\s)-)(?=[^-])/, '\1 '],
18
- # Treat double-hyphen as a single token.
19
- [/([^-])(--+)([^-])/, '\1 \2 \3'],
20
- [/(\s|^)(,)(?=(\S))/, '\1\2 '],
21
- # Only separate a comma if a space follows.
22
- [/(.)(,)(\s|$)/, '\1 \2\3'],
23
- # Combine dots separated by whitespace to be a single token.
24
- [/\.\s\.\s\./, '...'],
25
- # Separate "No.6"
26
- [/([\W]\.)(\d+)/, '\1 \2'],
27
- # Separate words from ellipses
28
- [/([^\.]|^)(\.{2,})(.?)/, '\1 \2 \3'],
29
- [/(^|\s)(\.{2,})([^\.\s])/, '\1\2 \3'],
30
- [/(^|\s)(\.{2,})([^\.\s])/, '\1 \2\3'],
31
- ##### Some additional fixes.
32
- # Fix %, $, &
33
- [/(\d)%/, '\1 %'],
34
- [/\$(\.?\d)/, '$ \1'],
35
- [/(\W)& (\W)/, '\1&\2'],
36
- [/(\W\W+)&(\W\W+)/, '\1 & \2'],
37
- # Fix (n 't) -> ( n't)
38
- [/n 't( |$)/, " n't\\1"],
39
- [/N 'T( |$)/, " N'T\\1"],
40
- # Treebank tokenizer special words
41
- [/([Cc])annot/, '\1an not']
42
- ]
43
- # Tokenize the entity using a rule-based algorithm
44
- # that has been lifted from the 'tactful-tokenizer'
45
- # gem.
46
- def self.tokenize(entity, options = {})
47
- s = entity.to_s
48
- ReTokenize.each do |rules|
49
- s.gsub!(rules[0], rules[1])
50
- end
51
- s.split(' ').each do |token|
52
- entity << Entities::Token.from_string(token)
53
- end
54
- end
55
- end
1
+ # A tokenizer class lifted from the 'tactful-tokenizer' gem.
2
+ #
3
+ # Copyright © 2010 Matthew Bunday. All rights reserved.
4
+ # Released under the GNU GPL v3. Modified by Louis Mullie.
5
+ #
6
+ # Project website: https://github.com/SlyShy/Tactful_Tokenizer
7
+ class Treat::Processors::Tokenizers::Tactful
8
+
9
+ require 'treat/helpers/decimal_point_escaper'
10
+
11
+ ReTokenize = [
12
+ # Uniform Quotes
13
+ [/''|``/, '"'],
14
+ # Separate punctuation from words.
15
+ [/(^|\s)(')/, '\1\2'],
16
+ [/(?=[\("`{\[:;&#*@\.])(.)/, '\1 '],
17
+ [/(.)(?=[?!\)";}\]*:@\.'])|(?=[\)}\]])(.)|(.)(?=[({\[])|((^|\s)-)(?=[^-])/, '\1 '],
18
+ # Treat double-hyphen as a single token.
19
+ [/([^-])(--+)([^-])/, '\1 \2 \3'],
20
+ [/(\s|^)(,)(?=(\S))/, '\1\2 '],
21
+ # Only separate a comma if a space follows.
22
+ [/(.)(,)(\s|$)/, '\1 \2\3'],
23
+ # Combine dots separated by whitespace to be a single token.
24
+ [/\.\s\.\s\./, '...'],
25
+ # Separate "No.6"
26
+ [/([\W]\.)(\d+)/, '\1 \2'],
27
+ # Separate words from ellipses
28
+ [/([^\.]|^)(\.{2,})(.?)/, '\1 \2 \3'],
29
+ [/(^|\s)(\.{2,})([^\.\s])/, '\1\2 \3'],
30
+ [/(^|\s)(\.{2,})([^\.\s])/, '\1 \2\3'],
31
+ ##### Some additional fixes.
32
+ # Fix %, $, &
33
+ [/(\d)%/, '\1 %'],
34
+ [/\$(\.?\d)/, '$ \1'],
35
+ [/(\W)& (\W)/, '\1&\2'],
36
+ [/(\W\W+)&(\W\W+)/, '\1 & \2'],
37
+ # Fix (n 't) -> ( n't)
38
+ [/n 't( |$)/, " n't\\1"],
39
+ [/N 'T( |$)/, " N'T\\1"],
40
+ # Treebank tokenizer special words
41
+ [/([Cc])annot/, '\1an not']
42
+
43
+ ]
44
+
45
+
46
+ # Tokenize the entity using a rule-based algorithm
47
+ # that has been lifted from the 'tactful-tokenizer'
48
+ # gem.
49
+ def self.tokenize(entity, options = {})
50
+
51
+ entity.check_hasnt_children
52
+
53
+ s = entity.to_s
54
+ Treat::Helpers::DecimalPointEscaper.escape!(s)
55
+
56
+ ReTokenize.each do |rules|
57
+ s.gsub!(rules[0], rules[1])
56
58
  end
59
+
60
+ s.split(' ').each do |token|
61
+ entity << Treat::Entities::Token.
62
+ from_string(token)
63
+ end
64
+
57
65
  end
66
+
58
67
  end
data/lib/treat/proxies.rb CHANGED
@@ -1,40 +1,57 @@
1
- module Treat
2
- # Proxies install Treat functions on core Ruby classes.
3
- module Proxies
4
- # The module proxy provides functionanaty common
5
- # to the different types of proxies.
6
- module Proxy
7
- # Build the entity corresponding to the proxied
8
- # object and send the method call to the entity.
9
- def method_missing(sym, *args, &block)
10
- if sym == :do || Treat::Categories.lookup(sym)
11
- to_entity.send(sym, *args)
12
- else
13
- super(sym, *args, &block)
14
- end
15
- end
16
- def to_entity(builder = nil)
17
- Treat::Entities::Unknown(self.to_s)
1
+ # Proxies install builders on core Ruby objects,
2
+ # so that methods called on them may be passed
3
+ # to the entity that can be built from the core
4
+ # class instance.
5
+ module Treat::Proxies
6
+
7
+ # Provides a base functionality for proxies.
8
+ module Proxy
9
+
10
+ # Build the entity corresponding to the proxied
11
+ # object and send the method call to the entity.
12
+ def method_missing(sym, *args, &block)
13
+ if sym == :do || Treat::Categories.lookup(sym)
14
+ to_entity.send(sym, *args)
15
+ else
16
+ super(sym, *args, &block)
18
17
  end
19
18
  end
20
- # Install Treat functions on String objects.
21
- module String
22
- include Treat::Proxies::Proxy
23
- # Return the entity corresponding to the string.
24
- def to_entity
25
- Treat::Entities::Entity.from_string(self.to_s)
26
- end
19
+
20
+ # Create an unknown type of entity by default.
21
+ def to_entity(builder = nil)
22
+ Treat::Entities::Unknown(self.to_s)
27
23
  end
28
- # Install Treat functions on Numeric objects.
29
- module Numeric
30
- include Treat::Proxies::Proxy
31
- # Return the entity corresponding to the number.
32
- def to_entity(builder = nil)
33
- Treat::Entities::Number.from_numeric(self)
34
- end
24
+
25
+ end
26
+
27
+ # Install Treat functions on String objects.
28
+ module String
29
+
30
+ # Include base proxy functionality.
31
+ include Treat::Proxies::Proxy
32
+
33
+ # Return the entity corresponding to the string.
34
+ def to_entity
35
+ Treat::Entities::Entity.from_string(self.to_s)
36
+ end
37
+
38
+ end
39
+
40
+ # Install Treat functions on Numeric objects.
41
+ module Numeric
42
+
43
+ # Include base proxy functionality.
44
+ include Treat::Proxies::Proxy
45
+
46
+ # Return the entity corresponding to the number.
47
+ def to_entity(builder = nil)
48
+ Treat::Entities::Number.from_numeric(self)
35
49
  end
36
- # Include the proxies in the core classes.
37
- ::String.class_eval { include Treat::Proxies::String }
38
- ::Numeric.class_eval { include Treat::Proxies::Numeric }
50
+
39
51
  end
40
- end
52
+
53
+ # Include the proxies in the core classes.
54
+ ::String.class_eval { include Treat::Proxies::String }
55
+ ::Numeric.class_eval { include Treat::Proxies::Numeric }
56
+
57
+ end
@@ -1,17 +1,27 @@
1
- module Treat
2
- module Retrievers
3
- module Indexers
4
- extend Group
5
- self.type = :annotator
6
- self.targets = [:collection]
7
- self.default = :ferret
8
- end
9
- module Searchers
10
- extend Group
11
- self.type = :computer
12
- self.targets = [:entity]
13
- self.default = :ferret
14
- end
15
- extend Treat::Category
1
+ # Retrievers find documents in collections.
2
+ module Treat::Retrievers
3
+
4
+ # Indexers create an index of words used
5
+ # in the documents within a collection.
6
+ module Indexers
7
+ extend Treat::Groupable
8
+ self.type = :annotator
9
+ self.targets = [:collection]
10
+ self.default = :ferret
16
11
  end
17
- end
12
+
13
+ # Searchers perform full-text search
14
+ # on indexed collections in order
15
+ # to retrieve documents matching
16
+ # a query.
17
+ module Searchers
18
+ extend Treat::Groupable
19
+ self.type = :computer
20
+ self.targets = [:collection]
21
+ self.default = :ferret
22
+ end
23
+
24
+ # Make Retrievers categorizable.
25
+ extend Treat::Categorizable
26
+
27
+ end
@@ -1,28 +1,49 @@
1
- module Treat
2
- module Retrievers
3
- module Indexers
4
- class Ferret
5
- silence_warnings { require 'ferret' }
6
- require 'find'
7
- require 'fileutils'
8
- # Create a Ferret index for the collection and
9
- # store the path to the index under "folder."
10
- def self.index(collection, options = {})
11
- path = "#{collection.folder}/.index"
12
- FileUtils.mkdir(path) unless File.readable?(path)
13
- index = ::Ferret::Index::Index.new(
14
- :default_field => 'content',
15
- :path => path
16
- )
17
- collection.each_document do |doc|
18
- index.add_document(
19
- :file => doc.file,
20
- :content => doc.to_s
21
- )
22
- end
23
- path
24
- end
25
- end
1
+ # A wrapper for the indexing functions of Ferret,
2
+ # a port of the Java Lucene search engine.
3
+ #
4
+ # Documentation:
5
+ # http://rubydoc.info/gems/ferret
6
+ class Treat::Retrievers::Indexers::Ferret
7
+
8
+ # Require Ferret and file utilities.
9
+ silence_warnings { require 'ferret' }
10
+ require 'find'
11
+ require 'fileutils'
12
+
13
+ # Create a Ferret index for the collection and
14
+ # store the index in the collection, under the
15
+ # path collection-folder/.index
16
+ #
17
+ # Annotates the collection with the path to the
18
+ # index for future use (e.g. in searching).
19
+ def self.index(collection, options = {})
20
+
21
+ path = "#{collection.folder}/.index"
22
+ return path if FileTest.directory?(path)
23
+
24
+ begin
25
+ FileUtils.mkdir(path)
26
+ rescue Exception => e
27
+ raise Treat::Exception,
28
+ "Could not create folder for index " +
29
+ "under the collection's folder. " +
30
+ "(#{e.message})."
26
31
  end
32
+
33
+ index = ::Ferret::Index::Index.new(
34
+ :default_field => 'content',
35
+ :path => path
36
+ )
37
+
38
+ collection.each_document do |doc|
39
+ index.add_document(
40
+ :file => doc.file,
41
+ :content => doc.to_s
42
+ )
43
+ end
44
+
45
+ path
46
+
27
47
  end
28
- end
48
+
49
+ end
@@ -1,53 +1,72 @@
1
- module Treat
2
- module Retrievers
3
- module Searchers
4
- class Ferret
5
- silence_warnings { require 'ferret' }
6
- require 'find'
7
- DefaultOptions = {
8
- :q => nil,
9
- :limit => :all,
10
- :callback => nil
11
- }
12
- # Returns an array of retrieved documents.
13
- #
14
- # Options:
15
- #
16
- # - (String) :q => a search query.
17
- # - (Symbol) :limit => number of documents.
18
- def self.search(collection, options = {})
19
- options = DefaultOptions.merge(options)
20
- unless collection.has?(:index) && collection.index
21
- raise Treat::Exception, 'This collection has not been indexed.'
22
- end
23
- unless options[:q]
24
- raise Treat::Exception,
25
- 'You must set a query by using the :q option.'
26
- end
27
- path = "#{collection.folder}/.index"
28
- unless File.readable?(path)
29
- raise Treat::Exception, "The index at location #{path} cannot be found."
30
- end
31
- index = ::Ferret::Index::Index.new(
32
- :default_field => 'content',
33
- :path => path
34
- )
35
- query = options.delete(:q)
36
- files = {}
37
- index.search_each(query, options) do |doc, score|
38
- files[index[doc]['file']] = score
39
- end
40
- docs = []
41
- files.each do |doc, score|
42
- doc2 = collection.document_with_file(doc)
43
- if options[:callback]
44
- options[:callback].call(doc2, score)
45
- end
46
- docs << doc2
47
- end
48
- docs
49
- end
1
+ # A simple interface to the Ferret information
2
+ # retrieval library, which performs full-text
3
+ # search within documents of a collection.
4
+ #
5
+ # Documentation:
6
+ # http://rubydoc.info/gems/ferret
7
+ class Treat::Retrievers::Searchers::Ferret
8
+
9
+ silence_warnings { require 'ferret' }
10
+ require 'find'
11
+
12
+ DefaultOptions = {
13
+ :q => nil,
14
+ :limit => :all,
15
+ :callback => nil
16
+ }
17
+
18
+ # Returns an array of retrieved documents.
19
+ #
20
+ # Options:
21
+ #
22
+ # - (String) :q => a search query.
23
+ # - (Symbol) :limit => number of documents.
24
+ def self.search(collection, options = {})
25
+
26
+ options = DefaultOptions.merge(options)
27
+
28
+ unless collection.has?(:index)
29
+ raise Treat::Exception,
30
+ "This collection must be indexed to be searchable."
31
+ end
32
+
33
+ unless options[:q]
34
+ raise Treat::Exception,
35
+ 'You must set a query by using the :q option.'
36
+ end
37
+
38
+ path = collection.index
39
+
40
+ unless FileTest.directory?(path)
41
+ raise Treat::Exception,
42
+ "The index at location #{path} cannot be found."
43
+ end
44
+
45
+ index = ::Ferret::Index::Index.new(
46
+ :default_field => 'content',
47
+ :path => path
48
+ )
49
+
50
+ query = options.delete(:q)
51
+ files = {}
52
+ index.search_each(query, options) do |doc, score|
53
+ files[index[doc]['file']] = score
54
+ end
55
+
56
+ docs = []
57
+ files.each do |doc, score|
58
+ doc2 = collection.document_with_file(doc)
59
+ unless doc2
60
+ raise Treat::Exception,
61
+ "Couldn't retrieve indexed " +
62
+ "document with filename #{doc}."
63
+ end
64
+ if options[:callback]
65
+ options[:callback].call(doc2, score)
50
66
  end
67
+ docs << doc2
51
68
  end
69
+
70
+ docs
52
71
  end
53
- end
72
+ end