treat 0.2.5 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242)
  1. data/LICENSE +3 -3
  2. data/README.md +33 -0
  3. data/files/INFO +1 -0
  4. data/lib/treat.rb +40 -105
  5. data/lib/treat/ai.rb +12 -0
  6. data/lib/treat/ai/classifiers/id3.rb +27 -0
  7. data/lib/treat/categories.rb +82 -35
  8. data/lib/treat/categorizable.rb +44 -0
  9. data/lib/treat/classification.rb +61 -0
  10. data/lib/treat/configurable.rb +115 -0
  11. data/lib/treat/data_set.rb +42 -0
  12. data/lib/treat/dependencies.rb +24 -0
  13. data/lib/treat/downloader.rb +87 -0
  14. data/lib/treat/entities.rb +68 -66
  15. data/lib/treat/entities/abilities.rb +10 -0
  16. data/lib/treat/entities/abilities/buildable.rb +327 -0
  17. data/lib/treat/entities/abilities/checkable.rb +31 -0
  18. data/lib/treat/entities/abilities/copyable.rb +45 -0
  19. data/lib/treat/entities/abilities/countable.rb +51 -0
  20. data/lib/treat/entities/abilities/debuggable.rb +83 -0
  21. data/lib/treat/entities/abilities/delegatable.rb +123 -0
  22. data/lib/treat/entities/abilities/doable.rb +62 -0
  23. data/lib/treat/entities/abilities/exportable.rb +11 -0
  24. data/lib/treat/entities/abilities/iterable.rb +115 -0
  25. data/lib/treat/entities/abilities/magical.rb +83 -0
  26. data/lib/treat/entities/abilities/registrable.rb +74 -0
  27. data/lib/treat/entities/abilities/stringable.rb +91 -0
  28. data/lib/treat/entities/entities.rb +104 -0
  29. data/lib/treat/entities/entity.rb +122 -245
  30. data/lib/treat/exception.rb +4 -4
  31. data/lib/treat/extractors.rb +77 -80
  32. data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
  33. data/lib/treat/extractors/language/what_language.rb +50 -45
  34. data/lib/treat/extractors/name_tag/stanford.rb +55 -0
  35. data/lib/treat/extractors/tf_idf/native.rb +87 -0
  36. data/lib/treat/extractors/time/chronic.rb +55 -0
  37. data/lib/treat/extractors/time/nickel.rb +86 -62
  38. data/lib/treat/extractors/time/ruby.rb +53 -0
  39. data/lib/treat/extractors/topic_words/lda.rb +67 -58
  40. data/lib/treat/extractors/topics/reuters.rb +100 -87
  41. data/lib/treat/formatters.rb +39 -35
  42. data/lib/treat/formatters/readers/abw.rb +49 -29
  43. data/lib/treat/formatters/readers/autoselect.rb +37 -33
  44. data/lib/treat/formatters/readers/doc.rb +19 -13
  45. data/lib/treat/formatters/readers/html.rb +52 -30
  46. data/lib/treat/formatters/readers/image.rb +41 -40
  47. data/lib/treat/formatters/readers/odt.rb +59 -45
  48. data/lib/treat/formatters/readers/pdf.rb +28 -25
  49. data/lib/treat/formatters/readers/txt.rb +12 -15
  50. data/lib/treat/formatters/readers/xml.rb +73 -36
  51. data/lib/treat/formatters/serializers/xml.rb +80 -79
  52. data/lib/treat/formatters/serializers/yaml.rb +19 -18
  53. data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
  54. data/lib/treat/formatters/unserializers/xml.rb +94 -99
  55. data/lib/treat/formatters/unserializers/yaml.rb +20 -19
  56. data/lib/treat/formatters/visualizers/dot.rb +132 -132
  57. data/lib/treat/formatters/visualizers/standoff.rb +52 -44
  58. data/lib/treat/formatters/visualizers/tree.rb +26 -29
  59. data/lib/treat/groupable.rb +153 -0
  60. data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
  61. data/lib/treat/inflectors.rb +50 -45
  62. data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
  63. data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
  64. data/lib/treat/inflectors/declensors/active_support.rb +31 -0
  65. data/lib/treat/inflectors/declensors/english.rb +38 -0
  66. data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
  67. data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
  68. data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
  69. data/lib/treat/inflectors/stemmers/porter.rb +160 -0
  70. data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
  71. data/lib/treat/inflectors/stemmers/uea.rb +28 -0
  72. data/lib/treat/installer.rb +308 -0
  73. data/lib/treat/kernel.rb +105 -27
  74. data/lib/treat/languages.rb +122 -88
  75. data/lib/treat/languages/arabic.rb +15 -15
  76. data/lib/treat/languages/chinese.rb +15 -15
  77. data/lib/treat/languages/dutch.rb +15 -15
  78. data/lib/treat/languages/english.rb +61 -62
  79. data/lib/treat/languages/french.rb +19 -19
  80. data/lib/treat/languages/german.rb +20 -20
  81. data/lib/treat/languages/greek.rb +15 -15
  82. data/lib/treat/languages/italian.rb +16 -16
  83. data/lib/treat/languages/polish.rb +15 -15
  84. data/lib/treat/languages/portuguese.rb +15 -15
  85. data/lib/treat/languages/russian.rb +15 -15
  86. data/lib/treat/languages/spanish.rb +16 -16
  87. data/lib/treat/languages/swedish.rb +16 -16
  88. data/lib/treat/lexicalizers.rb +34 -55
  89. data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
  90. data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
  91. data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
  92. data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
  93. data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
  94. data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
  95. data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
  96. data/lib/treat/linguistics.rb +9 -0
  97. data/lib/treat/linguistics/categories.rb +11 -0
  98. data/lib/treat/linguistics/tags.rb +422 -0
  99. data/lib/treat/loaders/linguistics.rb +30 -0
  100. data/lib/treat/loaders/stanford.rb +27 -0
  101. data/lib/treat/object.rb +1 -0
  102. data/lib/treat/processors.rb +37 -44
  103. data/lib/treat/processors/chunkers/autoselect.rb +16 -0
  104. data/lib/treat/processors/chunkers/html.rb +71 -0
  105. data/lib/treat/processors/chunkers/txt.rb +18 -24
  106. data/lib/treat/processors/parsers/enju.rb +253 -208
  107. data/lib/treat/processors/parsers/stanford.rb +130 -131
  108. data/lib/treat/processors/segmenters/punkt.rb +79 -45
  109. data/lib/treat/processors/segmenters/stanford.rb +46 -48
  110. data/lib/treat/processors/segmenters/tactful.rb +43 -36
  111. data/lib/treat/processors/tokenizers/perl.rb +124 -92
  112. data/lib/treat/processors/tokenizers/ptb.rb +81 -0
  113. data/lib/treat/processors/tokenizers/punkt.rb +48 -42
  114. data/lib/treat/processors/tokenizers/stanford.rb +39 -38
  115. data/lib/treat/processors/tokenizers/tactful.rb +64 -55
  116. data/lib/treat/proxies.rb +52 -35
  117. data/lib/treat/retrievers.rb +26 -16
  118. data/lib/treat/retrievers/indexers/ferret.rb +47 -26
  119. data/lib/treat/retrievers/searchers/ferret.rb +69 -50
  120. data/lib/treat/tree.rb +241 -183
  121. data/spec/collection.rb +123 -0
  122. data/spec/document.rb +93 -0
  123. data/spec/entity.rb +408 -0
  124. data/spec/languages.rb +25 -0
  125. data/spec/phrase.rb +146 -0
  126. data/spec/samples/mathematicians/archimedes.abw +34 -0
  127. data/spec/samples/mathematicians/euler.html +21 -0
  128. data/spec/samples/mathematicians/gauss.pdf +0 -0
  129. data/spec/samples/mathematicians/leibniz.txt +13 -0
  130. data/spec/samples/mathematicians/newton.doc +0 -0
  131. data/spec/sandbox.rb +5 -0
  132. data/spec/token.rb +109 -0
  133. data/spec/treat.rb +52 -0
  134. data/spec/tree.rb +117 -0
  135. data/spec/word.rb +110 -0
  136. data/spec/zone.rb +66 -0
  137. data/tmp/INFO +1 -1
  138. metadata +100 -201
  139. data/INSTALL +0 -1
  140. data/README +0 -3
  141. data/TODO +0 -28
  142. data/lib/economist/half_cocked_basel.txt +0 -16
  143. data/lib/economist/hungarys_troubles.txt +0 -46
  144. data/lib/economist/indias_slowdown.txt +0 -15
  145. data/lib/economist/merkozy_rides_again.txt +0 -24
  146. data/lib/economist/prada_is_not_walmart.txt +0 -9
  147. data/lib/economist/to_infinity_and_beyond.txt +0 -15
  148. data/lib/ferret/_11.cfs +0 -0
  149. data/lib/ferret/_14.cfs +0 -0
  150. data/lib/ferret/_p.cfs +0 -0
  151. data/lib/ferret/_s.cfs +0 -0
  152. data/lib/ferret/_v.cfs +0 -0
  153. data/lib/ferret/_y.cfs +0 -0
  154. data/lib/ferret/segments +0 -0
  155. data/lib/ferret/segments_15 +0 -0
  156. data/lib/treat/buildable.rb +0 -157
  157. data/lib/treat/category.rb +0 -33
  158. data/lib/treat/delegatable.rb +0 -116
  159. data/lib/treat/doable.rb +0 -45
  160. data/lib/treat/entities/collection.rb +0 -14
  161. data/lib/treat/entities/document.rb +0 -12
  162. data/lib/treat/entities/phrases.rb +0 -17
  163. data/lib/treat/entities/tokens.rb +0 -61
  164. data/lib/treat/entities/zones.rb +0 -41
  165. data/lib/treat/extractors/coreferences/stanford.rb +0 -69
  166. data/lib/treat/extractors/date/chronic.rb +0 -32
  167. data/lib/treat/extractors/date/ruby.rb +0 -25
  168. data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
  169. data/lib/treat/extractors/language/language_extractor.rb +0 -27
  170. data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
  171. data/lib/treat/extractors/roles/naive.rb +0 -73
  172. data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
  173. data/lib/treat/extractors/statistics/position_in.rb +0 -14
  174. data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
  175. data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
  176. data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
  177. data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
  178. data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
  179. data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
  180. data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
  181. data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
  182. data/lib/treat/feature.rb +0 -58
  183. data/lib/treat/features.rb +0 -7
  184. data/lib/treat/formatters/visualizers/short_value.rb +0 -29
  185. data/lib/treat/formatters/visualizers/txt.rb +0 -45
  186. data/lib/treat/group.rb +0 -106
  187. data/lib/treat/helpers/linguistics_loader.rb +0 -18
  188. data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
  189. data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
  190. data/lib/treat/inflectors/declensions/english.rb +0 -319
  191. data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
  192. data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
  193. data/lib/treat/inflectors/stem/porter.rb +0 -162
  194. data/lib/treat/inflectors/stem/porter_c.rb +0 -26
  195. data/lib/treat/inflectors/stem/uea.rb +0 -30
  196. data/lib/treat/install.rb +0 -59
  197. data/lib/treat/languages/tags.rb +0 -377
  198. data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
  199. data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
  200. data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
  201. data/lib/treat/lexicalizers/tag/brill.rb +0 -91
  202. data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
  203. data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
  204. data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
  205. data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
  206. data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
  207. data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
  208. data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
  209. data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
  210. data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
  211. data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
  212. data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
  213. data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
  214. data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
  215. data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
  216. data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
  217. data/lib/treat/registrable.rb +0 -28
  218. data/lib/treat/sugar.rb +0 -50
  219. data/lib/treat/viewable.rb +0 -29
  220. data/lib/treat/visitable.rb +0 -28
  221. data/test/profile.rb +0 -2
  222. data/test/tc_entity.rb +0 -117
  223. data/test/tc_extractors.rb +0 -73
  224. data/test/tc_formatters.rb +0 -41
  225. data/test/tc_inflectors.rb +0 -34
  226. data/test/tc_lexicalizers.rb +0 -32
  227. data/test/tc_processors.rb +0 -50
  228. data/test/tc_resources.rb +0 -22
  229. data/test/tc_treat.rb +0 -60
  230. data/test/tc_tree.rb +0 -60
  231. data/test/tests.rb +0 -20
  232. data/test/texts.rb +0 -19
  233. data/test/texts/english/half_cocked_basel.txt +0 -16
  234. data/test/texts/english/hose_and_dry.doc +0 -0
  235. data/test/texts/english/hungarys_troubles.abw +0 -70
  236. data/test/texts/english/long.html +0 -24
  237. data/test/texts/english/long.txt +0 -22
  238. data/test/texts/english/medium.txt +0 -5
  239. data/test/texts/english/republican_nomination.pdf +0 -0
  240. data/test/texts/english/saving_the_euro.odt +0 -0
  241. data/test/texts/english/short.txt +0 -3
  242. data/test/texts/english/zero_sum.html +0 -111
@@ -1,7 +1,7 @@
1
1
  module Treat
2
2
  # Custom exception class for the Treat toolkit.
3
3
  # Used to distinguish between errors raised by
4
- # gems or Ruby from errors raised by the toolkit.
5
- class Exception < ::Exception
6
- end
7
- end
4
+ # gems/Ruby from errors raised by the toolkit.
5
+ class Exception < ::Exception; end
6
+ class InvalidInputException < Exception; end
7
+ end
@@ -1,82 +1,79 @@
1
- module Treat
2
- # Extractors extract specific information out of texts.
3
- module Extractors
4
- # Detecs language.
5
- module Language
6
- extend Group
7
- require 'treat/extractors/language/language_extractor.rb'
8
- self.type = :annotator
9
- self.targets = [:entity]
10
- self.default = :what_language
11
- end
12
- # Extracts the time of an object and annotates it
13
- # with specific information regarding time.
14
- module Time
15
- extend Group
16
- self.type = :annotator
17
- self.targets = [:phrase]
18
- end
19
- # Extracts the time of an object and annotates it
20
- # with specific information regarding time.
21
- module Date
22
- extend Group
23
- self.type = :annotator
24
- self.targets = [:phrase]
25
- end
26
- # Extract the topic from a text.
27
- module Topics
28
- extend Group
29
- self.type = :annotator
30
- self.targets = [:document, :zone]
31
- end
32
- # Extract the keywords from a text.
33
- module Keywords
34
- extend Group
35
- self.type = :annotator
36
- self.targets = [:document, :zone]
37
- end
38
- # Extract the topic words from a text.
39
- module TopicWords
40
- extend Group
41
- self.type = :annotator
42
- self.targets = [:collection]
43
- end
44
- # Extract named entities from texts.
45
- module NamedEntityTag
46
- extend Group
47
- self.type = :annotator
48
- self.targets = [:phrase, :word]
49
- end
50
- # Extract named entities from texts.
51
- module Coreferences
52
- extend Group
53
- self.type = :annotator
54
- self.targets = [:zone]
55
- end
56
- # This module should be moved out of here ASAP.
57
- module Statistics
58
- extend Group
59
- self.type = :annotator
60
- self.targets = [:word]
61
- self.default = :none
62
- self.preprocessors = {
63
- :frequency_in => lambda do |entity, worker, options|
64
- options = {:parent => worker}.merge(options)
65
- entity.statistics(:frequency_in, options)
66
- end,
67
- :tf_idf => lambda do |entity, worker, options|
68
- entity.statistics(:tf_idf, options)
69
- end,
70
- :position_in => lambda do |entity, options|
71
- entity.statistics(:position_in, options)
72
- end
73
- }
74
- end
75
- module Roles
76
- extend Group
77
- self.type = :annotator
78
- self.targets = [:phrase]
79
- end
80
- extend Treat::Category
1
+ # Extractors extract information out of texts.
2
+ module Treat::Extractors
3
+
4
+ # Extracts the language from an entity.
5
+ module Language
6
+ extend Treat::Groupable
7
+ self.type = :annotator
8
+ self.targets = [:entity]
9
+ self.default = :what_language
81
10
  end
11
+
12
+ # Extracts the date/time of a phrase.
13
+ module Time
14
+ extend Treat::Groupable
15
+ self.type = :annotator
16
+ self.targets = [:phrase]
17
+ end
18
+
19
+ # Extract the topic from a document.
20
+ module Topics
21
+ extend Treat::Groupable
22
+ self.type = :annotator
23
+ self.targets = [:document]
24
+ end
25
+
26
+ # Extract the keywords from a text.
27
+ module Keywords
28
+ extend Treat::Groupable
29
+ self.type = :annotator
30
+ self.targets = [:document]
31
+ end
32
+
33
+ # Extract clusters of topic words from a collection.
34
+ module TopicWords
35
+ extend Treat::Groupable
36
+ self.type = :annotator
37
+ self.targets = [:collection]
38
+ end
39
+
40
+ # Extract named entities from phrases.
41
+ module NameTag
42
+ extend Treat::Groupable
43
+ self.type = :annotator
44
+ self.targets = [:phrase, :word]
45
+ end
46
+
47
+ # Extract coreferences from a zone.
48
+ module Coreferences
49
+ extend Treat::Groupable
50
+ self.type = :annotator
51
+ self.targets = [:zone]
52
+ end
53
+
54
+ # Retrieve the main grammatical roles
55
+ # in the phrase (subject, verb, object).
56
+ module Roles
57
+ extend Treat::Groupable
58
+ self.type = :annotator
59
+ self.targets = [:phrase]
60
+ end
61
+
62
+ module TfIdf
63
+ extend Treat::Groupable
64
+ self.type = :annotator
65
+ self.targets = [:word]
66
+ self.default = :native
67
+ end
68
+
69
+ module Summary
70
+ extend Treat::Groupable
71
+ self.type = :annotator
72
+ self.targets = [:document]
73
+ self.default = :keyword_count
74
+ end
75
+
76
+ # Make Extractors categorizable.
77
+ extend Treat::Categorizable
78
+
82
79
  end
@@ -1,26 +1,60 @@
1
- module Treat
2
- module Extractors
3
- module Keywords
4
- class TfIdf
5
- DefaultOptions = { num_keywords: 5 }
6
- def self.keywords(entity, options = {})
7
- options = DefaultOptions.merge(options)
8
- tf_idfs = {}
9
- entity.each_word do |word|
10
- tf_idfs[word.value] ||= word.tf_idf
11
- end
12
- tf_idfs = tf_idfs.sort_by {|k,v| v}.reverse
13
- return tf_idfs if tf_idfs.size <= options[:num_keywords]
14
- keywords = []
15
- i = 0
16
- tf_idfs.each do |info|
17
- break if i > options[:num_keywords]
18
- keywords << info[0]
19
- i += 1
20
- end
21
- keywords
1
+ # This retrieves a supplied number of keywords
2
+ # by selecting the N words with the highest TF*IDF
3
+ # for each document.
4
+ class Treat::Extractors::Keywords::TfIdf
5
+
6
+ # Default options - retrieve 5 keywords.
7
+ DefaultOptions = { :number => 5 }
8
+
9
+ # Annotate a document with an array containing
10
+ # the N words with the highest TF*IDF in that
11
+ # document.
12
+ def self.keywords(entity, options = {})
13
+
14
+ options = DefaultOptions.merge(options)
15
+ tf_idfs = {}
16
+ entity.each_word do |word|
17
+ word.check_has(:tf_idf, false)
18
+ tf_idfs[word] ||= word.get(:tf_idf)
19
+ end
20
+
21
+ tf_idfs = tf_idfs.
22
+ sort_by {|k,v| v}.reverse
23
+
24
+ if tf_idfs.size <= options[:number]
25
+ return tf_idfs
26
+ end
27
+
28
+ keywords = []
29
+ i = 0
30
+
31
+ tf_idfs.each do |word|
32
+
33
+ w = word[0].to_s
34
+ next if keywords.include?(w)
35
+
36
+ entity.each_word_with_value(w) do |w2|
37
+
38
+ ps = w2.parent_phrase
39
+
40
+ if ps.has?(:keyword_count)
41
+ ps.set :keyword_count,
42
+ ps.keyword_count + 1
43
+ else
44
+ ps.set :keyword_count, 1
22
45
  end
46
+ ps.set :keyword_density,
47
+ (ps.keyword_count / ps.size)
48
+
23
49
  end
50
+
51
+ break if i > options[:number]
52
+ keywords << w
53
+
54
+ i += 1
24
55
  end
56
+
57
+ keywords
25
58
  end
26
- end
59
+
60
+ end
@@ -1,49 +1,54 @@
1
- module Treat
2
- module Extractors
3
- module Language
4
- # Require the 'whatlanguage' gem.
5
- silence_warnings { require 'whatlanguage' }
6
- String.class_eval { undef :language }
7
- DefaultOptions = {
8
- :bias => [:eng, :fre, :chi, :ger, :ara, :spa]
9
- }
10
- # Adaptor for the 'whatlanguage' gem, which
11
- # performs probabilistic language detection.
12
- # The library works by checking for the presence
13
- # of words with bloom filters built from dictionaries
14
- # based upon each source language.
15
- class WhatLanguage < LanguageExtractor
16
- # Keep only once instance of the gem class.
17
- @@detector = nil
18
- # Detect the language of an entity using the
19
- # 'whatlanguage' gem. Return an identifier
20
- # corresponding to the ISO-639-2 code for the
21
- # language.
22
- #
23
- # Options:
24
- # - (Array of Symbols) bias => Languages to bias
25
- # toward when more than one language is detected
26
- # with equal probability.
27
- def self.language(entity, options = {})
28
- options = DefaultOptions.merge(options)
29
- predetection = super(entity, options)
30
- return predetection if predetection
31
- @@detector ||= ::WhatLanguage.new(:possibilities)
32
- possibilities = @@detector.process_text(entity.to_s)
33
- lang = {}
34
- possibilities.each do |k,v|
35
- lang[Treat::Languages.code(k)] = v
36
- end
37
- max = lang.values.max
38
- ordered = lang.select { |i,j| j == max }.keys
39
- ordered.each do |l|
40
- if options[:bias].include?(l)
41
- return l
42
- end
43
- end
44
- return ordered.first
1
+ module Treat::Extractors::Language
2
+
3
+ # Adaptor for the 'whatlanguage' gem, which
4
+ # performs probabilistic language detection.
5
+ # The library works by checking for the presence
6
+ # of words with bloom filters built from
7
+ # dictionaries based upon each source language.
8
+ class WhatLanguage
9
+
10
+ # Require the 'whatlanguage' gem.
11
+ silence_warnings { require 'whatlanguage' }
12
+
13
+ # Undefine the method defined by the gem.
14
+ String.class_eval { undef :language }
15
+
16
+ # By default, bias towards common languages.
17
+ DefaultOptions = {
18
+ :bias => [:eng, :fre, :chi, :ger, :ara, :spa]
19
+ }
20
+
21
+ # Keep only one instance of the gem class.
22
+ @@detector = nil
23
+
24
+ # Detect the language of an entity using the
25
+ # 'whatlanguage' gem. Return an identifier
26
+ # corresponding to the ISO-639-2 code for the
27
+ # language.
28
+ #
29
+ # Options:
30
+ #
31
+ # - (Array of Symbols) bias => Languages to bias
32
+ # toward when more than one language is detected
33
+ # with equal probability.
34
+ def self.language(entity, options = {})
35
+ options = DefaultOptions.merge(options)
36
+ @@detector ||= ::WhatLanguage.new(:possibilities)
37
+ possibilities = @@detector.process_text(entity.to_s)
38
+ lang = {}
39
+ possibilities.each do |k,v|
40
+ lang[Treat::Languages.code(k)] = v
41
+ end
42
+ max = lang.values.max
43
+ ordered = lang.select { |i,j| j == max }.keys
44
+ ordered.each do |l|
45
+ if options[:bias].include?(l)
46
+ return l
45
47
  end
46
48
  end
49
+ return ordered.first
47
50
  end
51
+
48
52
  end
49
- end
53
+
54
+ end
@@ -0,0 +1,55 @@
1
+ # Detects the named entity tag in sentences by using
2
+ # the stanford-core-nlp gem, which interfaces with
3
+ # the Stanford Named Entity Recognizer (CRF classifier).
4
+ class Treat::Extractors::NameTag::Stanford
5
+
6
+ require 'treat/loaders/stanford'
7
+
8
+ @@classifiers = {}
9
+
10
+ def self.name_tag(entity, options = {})
11
+
12
+ pp = nil
13
+
14
+ lang = entity.language
15
+
16
+ language = Treat::Languages.describe(lang)
17
+
18
+ isolated_token = entity.is_a?(Treat::Entities::Token)
19
+ tokens = isolated_token ? [entity] : entity.tokens
20
+
21
+ ms = StanfordCoreNLP::Config::Models[:ner][language]
22
+ ms = Treat.models + 'stanford/' +
23
+ StanfordCoreNLP::Config::ModelFolders[:ner] +
24
+ ms['3class']
25
+
26
+ @@classifiers[lang] ||=
27
+ StanfordCoreNLP::CRFClassifier.
28
+ getClassifier(ms)
29
+
30
+ token_list = StanfordCoreNLP.get_list(tokens)
31
+ sentence = @@classifiers[lang].classify_sentence(token_list)
32
+
33
+ i = 0
34
+ n = 0
35
+
36
+ sentence.each do |s_token|
37
+ tag = s_token.get(:answer).to_s.downcase
38
+ tag = nil if tag == 'o'
39
+ return tag if isolated_token
40
+ if tag
41
+ tokens[i].set :name_tag, tag
42
+ n += 1
43
+ end
44
+ i += 1
45
+ end
46
+
47
+ entity.set :named_entity_count, n
48
+
49
+ nil
50
+
51
+ end
52
+
53
+
54
+
55
+ end
@@ -0,0 +1,87 @@
1
+ # Calculates the TF*IDF score of words.
2
+ module Treat::Extractors::TfIdf::Native
3
+ DefaultOptions = {
4
+ :tf => :natural,
5
+ :idf => :logarithm,
6
+ :remove_common_words => true,
7
+ :precision => 4
8
+ }
9
+ Algorithms = {
10
+ :tf => {
11
+ :natural => lambda { |tf| tf },
12
+ :logarithm => lambda { |tf| Math.log(1 + tf) },
13
+ :sqrt =>lambda { |tf| Math.sqrt(tf) }
14
+ },
15
+ :idf => {
16
+ :logarithm => lambda { |n,df| Math.log(n/(1 + df)) },
17
+ :none => lambda { |n,idf| 1 }
18
+ }
19
+ }
20
+ # Optimization caches for tf idf.
21
+ @@n = {} # Number of documents in the collection (n).
22
+ @@df= {} # Number of documents that have a given value (document count).
23
+ @@f = {} # Number of times a word appears in a given document (term count).
24
+ @@wc = {} # Number of words in a given document (word count).
25
+ @@cw = {} # Common words to filter out.
26
+ def self.tf_idf(entity, options={})
27
+ l = Treat::Languages.get(entity.language)
28
+ if l.const_defined?(:CommonWords)
29
+ @@cw[entity.language] =
30
+ l.const_get(:CommonWords)
31
+ return 0 if @@cw[entity.language].include?(entity.value)
32
+ end
33
+ return 0 if entity.value.length <= 2
34
+ options = DefaultOptions.merge(options)
35
+ lambdas = options.partition do |k,v|
36
+ [:tf, :idf, :normalization].include?(k)
37
+ end[0]
38
+ lambdas.each do |opt,val|
39
+ if opt.is_a?(Symbol)
40
+ if Algorithms[opt][val]
41
+ options[opt] = Algorithms[opt][val]
42
+ else
43
+ raise Treat::Exception,
44
+ "The specified algorithm '#{val}' "+
45
+ "to calculate #{opt} does not exist."
46
+ end
47
+ end
48
+ end
49
+ collection = entity.parent_collection
50
+ unless collection
51
+ raise Treat::Exception, "Cannot get the TF*IDF scores " +
52
+ "for a document that is not in a collection."
53
+ end
54
+ document = entity.parent_document
55
+ dc = collection.document_count
56
+ if !collection || !document
57
+ raise Treat::Exception,
58
+ "Tf*Idf requires a collection with documents."
59
+ end
60
+ val = entity.value.downcase
61
+ @@n[collection.id] = dc if @@n[collection.id].nil?
62
+ @@df[collection.id] ||= {}
63
+ if @@df[collection.id][val].nil?
64
+ df = 0
65
+ collection.each_document do |doc|
66
+ @@f[doc.id] ||= {}
67
+ if @@f[doc.id][val].nil?
68
+ @@f[doc.id][val] =
69
+ doc.frequency_of(val)
70
+ end
71
+ df += 1 if @@f[doc.id][val] > 0
72
+ end
73
+ @@df[collection.id][val] = df
74
+ end
75
+ f = @@f[document.id][entity.value].to_f
76
+ df = @@df[collection.id][entity.value].to_f
77
+ tf = options[:tf].call(f).to_f
78
+ if options[:normalize_word_count]
79
+ @@wc[document.id] ||= document.word_count
80
+ tf /= @@wc[document.id]
81
+ end
82
+ n = @@n[collection.id].to_f
83
+ idf = options[:idf].call(n, df)
84
+ tf_idf = tf * idf
85
+ tf_idf.abs.round(options[:precision])
86
+ end
87
+ end