treat 0.2.5 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242) hide show
  1. data/LICENSE +3 -3
  2. data/README.md +33 -0
  3. data/files/INFO +1 -0
  4. data/lib/treat.rb +40 -105
  5. data/lib/treat/ai.rb +12 -0
  6. data/lib/treat/ai/classifiers/id3.rb +27 -0
  7. data/lib/treat/categories.rb +82 -35
  8. data/lib/treat/categorizable.rb +44 -0
  9. data/lib/treat/classification.rb +61 -0
  10. data/lib/treat/configurable.rb +115 -0
  11. data/lib/treat/data_set.rb +42 -0
  12. data/lib/treat/dependencies.rb +24 -0
  13. data/lib/treat/downloader.rb +87 -0
  14. data/lib/treat/entities.rb +68 -66
  15. data/lib/treat/entities/abilities.rb +10 -0
  16. data/lib/treat/entities/abilities/buildable.rb +327 -0
  17. data/lib/treat/entities/abilities/checkable.rb +31 -0
  18. data/lib/treat/entities/abilities/copyable.rb +45 -0
  19. data/lib/treat/entities/abilities/countable.rb +51 -0
  20. data/lib/treat/entities/abilities/debuggable.rb +83 -0
  21. data/lib/treat/entities/abilities/delegatable.rb +123 -0
  22. data/lib/treat/entities/abilities/doable.rb +62 -0
  23. data/lib/treat/entities/abilities/exportable.rb +11 -0
  24. data/lib/treat/entities/abilities/iterable.rb +115 -0
  25. data/lib/treat/entities/abilities/magical.rb +83 -0
  26. data/lib/treat/entities/abilities/registrable.rb +74 -0
  27. data/lib/treat/entities/abilities/stringable.rb +91 -0
  28. data/lib/treat/entities/entities.rb +104 -0
  29. data/lib/treat/entities/entity.rb +122 -245
  30. data/lib/treat/exception.rb +4 -4
  31. data/lib/treat/extractors.rb +77 -80
  32. data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
  33. data/lib/treat/extractors/language/what_language.rb +50 -45
  34. data/lib/treat/extractors/name_tag/stanford.rb +55 -0
  35. data/lib/treat/extractors/tf_idf/native.rb +87 -0
  36. data/lib/treat/extractors/time/chronic.rb +55 -0
  37. data/lib/treat/extractors/time/nickel.rb +86 -62
  38. data/lib/treat/extractors/time/ruby.rb +53 -0
  39. data/lib/treat/extractors/topic_words/lda.rb +67 -58
  40. data/lib/treat/extractors/topics/reuters.rb +100 -87
  41. data/lib/treat/formatters.rb +39 -35
  42. data/lib/treat/formatters/readers/abw.rb +49 -29
  43. data/lib/treat/formatters/readers/autoselect.rb +37 -33
  44. data/lib/treat/formatters/readers/doc.rb +19 -13
  45. data/lib/treat/formatters/readers/html.rb +52 -30
  46. data/lib/treat/formatters/readers/image.rb +41 -40
  47. data/lib/treat/formatters/readers/odt.rb +59 -45
  48. data/lib/treat/formatters/readers/pdf.rb +28 -25
  49. data/lib/treat/formatters/readers/txt.rb +12 -15
  50. data/lib/treat/formatters/readers/xml.rb +73 -36
  51. data/lib/treat/formatters/serializers/xml.rb +80 -79
  52. data/lib/treat/formatters/serializers/yaml.rb +19 -18
  53. data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
  54. data/lib/treat/formatters/unserializers/xml.rb +94 -99
  55. data/lib/treat/formatters/unserializers/yaml.rb +20 -19
  56. data/lib/treat/formatters/visualizers/dot.rb +132 -132
  57. data/lib/treat/formatters/visualizers/standoff.rb +52 -44
  58. data/lib/treat/formatters/visualizers/tree.rb +26 -29
  59. data/lib/treat/groupable.rb +153 -0
  60. data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
  61. data/lib/treat/inflectors.rb +50 -45
  62. data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
  63. data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
  64. data/lib/treat/inflectors/declensors/active_support.rb +31 -0
  65. data/lib/treat/inflectors/declensors/english.rb +38 -0
  66. data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
  67. data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
  68. data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
  69. data/lib/treat/inflectors/stemmers/porter.rb +160 -0
  70. data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
  71. data/lib/treat/inflectors/stemmers/uea.rb +28 -0
  72. data/lib/treat/installer.rb +308 -0
  73. data/lib/treat/kernel.rb +105 -27
  74. data/lib/treat/languages.rb +122 -88
  75. data/lib/treat/languages/arabic.rb +15 -15
  76. data/lib/treat/languages/chinese.rb +15 -15
  77. data/lib/treat/languages/dutch.rb +15 -15
  78. data/lib/treat/languages/english.rb +61 -62
  79. data/lib/treat/languages/french.rb +19 -19
  80. data/lib/treat/languages/german.rb +20 -20
  81. data/lib/treat/languages/greek.rb +15 -15
  82. data/lib/treat/languages/italian.rb +16 -16
  83. data/lib/treat/languages/polish.rb +15 -15
  84. data/lib/treat/languages/portuguese.rb +15 -15
  85. data/lib/treat/languages/russian.rb +15 -15
  86. data/lib/treat/languages/spanish.rb +16 -16
  87. data/lib/treat/languages/swedish.rb +16 -16
  88. data/lib/treat/lexicalizers.rb +34 -55
  89. data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
  90. data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
  91. data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
  92. data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
  93. data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
  94. data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
  95. data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
  96. data/lib/treat/linguistics.rb +9 -0
  97. data/lib/treat/linguistics/categories.rb +11 -0
  98. data/lib/treat/linguistics/tags.rb +422 -0
  99. data/lib/treat/loaders/linguistics.rb +30 -0
  100. data/lib/treat/loaders/stanford.rb +27 -0
  101. data/lib/treat/object.rb +1 -0
  102. data/lib/treat/processors.rb +37 -44
  103. data/lib/treat/processors/chunkers/autoselect.rb +16 -0
  104. data/lib/treat/processors/chunkers/html.rb +71 -0
  105. data/lib/treat/processors/chunkers/txt.rb +18 -24
  106. data/lib/treat/processors/parsers/enju.rb +253 -208
  107. data/lib/treat/processors/parsers/stanford.rb +130 -131
  108. data/lib/treat/processors/segmenters/punkt.rb +79 -45
  109. data/lib/treat/processors/segmenters/stanford.rb +46 -48
  110. data/lib/treat/processors/segmenters/tactful.rb +43 -36
  111. data/lib/treat/processors/tokenizers/perl.rb +124 -92
  112. data/lib/treat/processors/tokenizers/ptb.rb +81 -0
  113. data/lib/treat/processors/tokenizers/punkt.rb +48 -42
  114. data/lib/treat/processors/tokenizers/stanford.rb +39 -38
  115. data/lib/treat/processors/tokenizers/tactful.rb +64 -55
  116. data/lib/treat/proxies.rb +52 -35
  117. data/lib/treat/retrievers.rb +26 -16
  118. data/lib/treat/retrievers/indexers/ferret.rb +47 -26
  119. data/lib/treat/retrievers/searchers/ferret.rb +69 -50
  120. data/lib/treat/tree.rb +241 -183
  121. data/spec/collection.rb +123 -0
  122. data/spec/document.rb +93 -0
  123. data/spec/entity.rb +408 -0
  124. data/spec/languages.rb +25 -0
  125. data/spec/phrase.rb +146 -0
  126. data/spec/samples/mathematicians/archimedes.abw +34 -0
  127. data/spec/samples/mathematicians/euler.html +21 -0
  128. data/spec/samples/mathematicians/gauss.pdf +0 -0
  129. data/spec/samples/mathematicians/leibniz.txt +13 -0
  130. data/spec/samples/mathematicians/newton.doc +0 -0
  131. data/spec/sandbox.rb +5 -0
  132. data/spec/token.rb +109 -0
  133. data/spec/treat.rb +52 -0
  134. data/spec/tree.rb +117 -0
  135. data/spec/word.rb +110 -0
  136. data/spec/zone.rb +66 -0
  137. data/tmp/INFO +1 -1
  138. metadata +100 -201
  139. data/INSTALL +0 -1
  140. data/README +0 -3
  141. data/TODO +0 -28
  142. data/lib/economist/half_cocked_basel.txt +0 -16
  143. data/lib/economist/hungarys_troubles.txt +0 -46
  144. data/lib/economist/indias_slowdown.txt +0 -15
  145. data/lib/economist/merkozy_rides_again.txt +0 -24
  146. data/lib/economist/prada_is_not_walmart.txt +0 -9
  147. data/lib/economist/to_infinity_and_beyond.txt +0 -15
  148. data/lib/ferret/_11.cfs +0 -0
  149. data/lib/ferret/_14.cfs +0 -0
  150. data/lib/ferret/_p.cfs +0 -0
  151. data/lib/ferret/_s.cfs +0 -0
  152. data/lib/ferret/_v.cfs +0 -0
  153. data/lib/ferret/_y.cfs +0 -0
  154. data/lib/ferret/segments +0 -0
  155. data/lib/ferret/segments_15 +0 -0
  156. data/lib/treat/buildable.rb +0 -157
  157. data/lib/treat/category.rb +0 -33
  158. data/lib/treat/delegatable.rb +0 -116
  159. data/lib/treat/doable.rb +0 -45
  160. data/lib/treat/entities/collection.rb +0 -14
  161. data/lib/treat/entities/document.rb +0 -12
  162. data/lib/treat/entities/phrases.rb +0 -17
  163. data/lib/treat/entities/tokens.rb +0 -61
  164. data/lib/treat/entities/zones.rb +0 -41
  165. data/lib/treat/extractors/coreferences/stanford.rb +0 -69
  166. data/lib/treat/extractors/date/chronic.rb +0 -32
  167. data/lib/treat/extractors/date/ruby.rb +0 -25
  168. data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
  169. data/lib/treat/extractors/language/language_extractor.rb +0 -27
  170. data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
  171. data/lib/treat/extractors/roles/naive.rb +0 -73
  172. data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
  173. data/lib/treat/extractors/statistics/position_in.rb +0 -14
  174. data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
  175. data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
  176. data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
  177. data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
  178. data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
  179. data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
  180. data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
  181. data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
  182. data/lib/treat/feature.rb +0 -58
  183. data/lib/treat/features.rb +0 -7
  184. data/lib/treat/formatters/visualizers/short_value.rb +0 -29
  185. data/lib/treat/formatters/visualizers/txt.rb +0 -45
  186. data/lib/treat/group.rb +0 -106
  187. data/lib/treat/helpers/linguistics_loader.rb +0 -18
  188. data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
  189. data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
  190. data/lib/treat/inflectors/declensions/english.rb +0 -319
  191. data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
  192. data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
  193. data/lib/treat/inflectors/stem/porter.rb +0 -162
  194. data/lib/treat/inflectors/stem/porter_c.rb +0 -26
  195. data/lib/treat/inflectors/stem/uea.rb +0 -30
  196. data/lib/treat/install.rb +0 -59
  197. data/lib/treat/languages/tags.rb +0 -377
  198. data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
  199. data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
  200. data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
  201. data/lib/treat/lexicalizers/tag/brill.rb +0 -91
  202. data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
  203. data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
  204. data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
  205. data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
  206. data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
  207. data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
  208. data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
  209. data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
  210. data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
  211. data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
  212. data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
  213. data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
  214. data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
  215. data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
  216. data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
  217. data/lib/treat/registrable.rb +0 -28
  218. data/lib/treat/sugar.rb +0 -50
  219. data/lib/treat/viewable.rb +0 -29
  220. data/lib/treat/visitable.rb +0 -28
  221. data/test/profile.rb +0 -2
  222. data/test/tc_entity.rb +0 -117
  223. data/test/tc_extractors.rb +0 -73
  224. data/test/tc_formatters.rb +0 -41
  225. data/test/tc_inflectors.rb +0 -34
  226. data/test/tc_lexicalizers.rb +0 -32
  227. data/test/tc_processors.rb +0 -50
  228. data/test/tc_resources.rb +0 -22
  229. data/test/tc_treat.rb +0 -60
  230. data/test/tc_tree.rb +0 -60
  231. data/test/tests.rb +0 -20
  232. data/test/texts.rb +0 -19
  233. data/test/texts/english/half_cocked_basel.txt +0 -16
  234. data/test/texts/english/hose_and_dry.doc +0 -0
  235. data/test/texts/english/hungarys_troubles.abw +0 -70
  236. data/test/texts/english/long.html +0 -24
  237. data/test/texts/english/long.txt +0 -22
  238. data/test/texts/english/medium.txt +0 -5
  239. data/test/texts/english/republican_nomination.pdf +0 -0
  240. data/test/texts/english/saving_the_euro.odt +0 -0
  241. data/test/texts/english/short.txt +0 -3
  242. data/test/texts/english/zero_sum.html +0 -111
data/lib/treat/doable.rb DELETED
@@ -1,45 +0,0 @@
1
module Treat
  # Mixin that lets an entity run one or more named tasks on itself,
  # or delegate them to the descendant entities the task targets.
  module Doable
    DEBUG = true

    # Run each of the supplied tasks in order.
    #
    # Every task may be given as:
    # * a bare task name:                  :tokenize
    # * a [task, worker] pair:             [:segment, :punkt]
    # * a [task, [worker, options]] pair:  [:parse, [:stanford, {...}]]
    # * a Hash of task => worker or task => [worker, options].
    #
    # A missing worker is passed on as nil and missing options as {}.
    # Returns the array of tasks that was supplied.
    def do(*tasks)
      tasks.each do |task|
        if task.is_a?(Hash)
          task.each { |name, spec| run_task_spec(name, spec) }
        elsif task.is_a?(Array)
          run_task_spec(task[0], task[1])
        else
          run_task_spec(task, nil)
        end
      end
    end

    # Perform a single task with the given worker and options.
    #
    # The task is looked up through Categories.lookup; a Treat::Exception
    # is raised when the lookup returns nothing. If one of the group's
    # target types matches this entity's type (per
    # Treat::Entities.match_types), or the group targets :entity, the task
    # is sent to this entity and its result is returned. Otherwise the
    # task is delegated to every entity yielded by each_entity for the
    # target types and nil is returned.
    def do_task(task, worker, options)
      group = Categories.lookup(task)
      raise Treat::Exception, "Task #{task} does not exist." unless group

      entity_types = group.targets
      applies_here =
        entity_types.any? { |t| Treat::Entities.match_types[t][type] } ||
        entity_types.include?(:entity)
      return send(task, worker, options) if applies_here

      each_entity(*entity_types) do |entity|
        entity.do_task(task, worker, options)
      end
      # Drop the task's feature from this entity when its own type is
      # not one of the task's targets.
      features.delete(task) unless entity_types.include?(type)
      nil
    end

    private

    # Split a worker specification (a worker, or a [worker, options]
    # array) into its parts and run the task with them.
    def run_task_spec(name, spec)
      worker, options = spec.is_a?(Array) ? spec : [spec, nil]
      do_task(name, worker, options || {})
    end
  end
end
@@ -1,14 +0,0 @@
1
module Treat
  module Entities
    # Represents a collection of texts.
    class Collection < Entity
      # Initialize the collection with a folder containing the texts
      # of the collection.
      #
      # folder - the folder holding the collection's texts (may be nil).
      # id     - optional identifier forwarded to Entity#initialize.
      #          The previous version passed a bare `id` to super even
      #          though `initialize` had no such parameter; it is now an
      #          explicit optional argument, as on Document.
      def initialize(folder = nil, id = nil)
        super('', id)
        @type = :collection
        set :folder, folder
      end
    end
  end
end
@@ -1,12 +0,0 @@
1
module Treat
  module Entities
    # Represents a document.
    class Document < Entity
      # Build a document with an empty value.
      #
      # file - optional source file; stored as the :file feature only
      #        when one is given.
      # id   - optional identifier forwarded to Entity#initialize.
      def initialize(file = nil, id = nil)
        super('', id)
        set(:file, file) if file
        @type = :document
      end
    end
  end
end
@@ -1,17 +0,0 @@
1
module Treat
  module Entities
    # Represents any syntactic phrase of a sentence.
    class Phrase < Entity
      # value - the text of the phrase.
      # id    - optional identifier forwarded to Entity#initialize.
      def initialize(value = '', id = nil)
        # Bare super forwards value and id unchanged.
        super
        @type = :phrase
      end
    end

    # Represents a sentence, a specialised kind of phrase.
    class Sentence < Phrase
      # value - the text of the sentence.
      # id    - optional identifier forwarded to Phrase#initialize.
      def initialize(value = '', id = nil)
        super
        @type = :sentence
      end
    end
  end
end
@@ -1,61 +0,0 @@
1
module Treat
  module Entities
    # Represents a terminal element in the text structure.
    class Token < Entity
      # value - the text of the token.
      # id    - optional identifier forwarded to Entity#initialize.
      def initialize(value = '', id = nil)
        # Bare super forwards value and id unchanged.
        super
        @type = :token
      end

      # All tokens are leaves.
      def is_leaf?
        true
      end
    end

    # Represents a word.
    class Word < Token
      def initialize(value = '', id = nil)
        super
        @type = :word
      end
    end

    # Represents a clitic ('s).
    class Clitic < Token
      def initialize(value = '', id = nil)
        super
        @type = :clitic
      end
    end

    # Represents a number.
    class Number < Token
      def initialize(value = '', id = nil)
        super
        @type = :number
      end

      # Convert the number to an integer (via its string form).
      def to_i
        to_s.to_i
      end

      # Convert the number to a float (via its string form).
      def to_f
        to_s.to_f
      end
    end

    # Represents a punctuation sign.
    class Punctuation < Token
      def initialize(value = '', id = nil)
        super
        @type = :punctuation
      end
    end

    # Represents a character that is neither alphabetical, numerical
    # nor a punctuation character (e.g. @#$%&*).
    #
    # Inside the Treat::Entities namespace this constant takes
    # precedence over Ruby's core ::Symbol class.
    class Symbol < Token
      def initialize(value = '', id = nil)
        super
        @type = :symbol
      end
    end

    # Represents an entity of unknown type.
    class Unknown < Token
      def initialize(value = '', id = nil)
        super
        @type = :unknown
      end
    end
  end
end
@@ -1,41 +0,0 @@
1
module Treat
  module Entities
    # Represents a zone of text; the concrete kinds defined here are
    # Title, Paragraph, List and Section.
    class Zone < Entity
      # value - the text of the zone.
      # id    - optional identifier forwarded to Entity#initialize.
      def initialize(value = '', id = nil)
        # Bare super forwards value and id unchanged.
        super
        @type = :zone
      end
    end

    # Represents a title, subtitle or logical header.
    class Title < Zone
      def initialize(value = '', id = nil)
        super
        @type = :title
      end
    end

    # Represents a paragraph.
    class Paragraph < Zone
      def initialize(value = '', id = nil)
        super
        @type = :paragraph
      end
    end

    # Represents a list.
    class List < Zone
      def initialize(value = '', id = nil)
        super
        @type = :list
      end
    end

    # Represents a section, usually with a title and at least one
    # paragraph.
    class Section < Zone
      def initialize(value = '', id = nil)
        super
        @type = :section
      end
    end
  end
end
@@ -1,69 +0,0 @@
1
module Treat
  module Extractors
    module Coreferences
      # Coreference extraction through the 'stanford-core-nlp' gem.
      class Stanford
        require 'stanford-core-nlp'

        # Annotation pipeline, loaded on first use and then reused.
        @@pipeline = nil

        # Annotate the text of the entity, rebuild the entity's children
        # as sentences and tokens from the annotation, and link tokens
        # that share a coreference cluster.
        #
        # Tokens belonging to a cluster get a :coreferents feature
        # listing the other tokens of that cluster and a :refers_to link
        # to each of them. Tokens whose named entity tag is not 'o' get
        # a :named_entity_tag feature.
        #
        # Returns a Hash mapping consecutive integers (from 0) to the
        # token arrays of every cluster that does not have exactly one
        # member.
        #
        # Options: none are read by this method.
        def self.coreferences(entity, options = {})
          # Read the text BEFORE the children are removed below. The
          # previous version captured this value but then re-read
          # entity.to_s after remove_all!.
          source_text = entity.to_s
          if entity.has_children?
            warn "The Stanford Coreference Resolver currently requires " +
            "an unsegmented, untokenized block of text to work with. " +
            "Removing and replacing all children of '#{entity.short_value}'."
            entity.remove_all!
          end

          @@pipeline ||= ::StanfordCoreNLP.load(
            :tokenize, :ssplit, :pos,
            :lemma, :parse, :ner, :dcoref
          )
          text = ::StanfordCoreNLP::Text.new(source_text)
          @@pipeline.annotate(text)

          # cluster id (String) => tokens that belong to the cluster
          clusters = {}
          text.get(:sentences).each do |sentence|
            s = Treat::Entities::Sentence.
              from_string(sentence.get(:value).to_s, true)
            sentence.get(:tokens).each do |token|
              t = Treat::Entities::Token.from_string(token.value.to_s)
              tag = token.get(:named_entity_tag).to_s.downcase
              corefid = token.get(:coref_cluster_id).to_s
              unless corefid == ''
                (clusters[corefid] ||= []) << t
                # Temporary marker, removed again in the linking pass.
                t.set :coref_cluster_id, corefid
              end
              t.set :named_entity_tag, tag.intern unless tag == 'o'
              s << t
            end
            entity << s
          end

          entity.each_token do |token|
            next unless token.has?(:coref_cluster_id)
            links = clusters[token.coref_cluster_id].dup
            links.delete(token)
            token.unset(:coref_cluster_id)
            next if links.empty?
            token.set :coreferents, links
            links.each { |target| token.link(target, :refers_to) }
          end

          coreferences = {}
          clusters.each_value do |members|
            next if !members || members.size == 1
            coreferences[coreferences.size] = members
          end
          coreferences
        end
      end
    end
  end
end
@@ -1,32 +0,0 @@
1
module Treat
  module Extractors
    module Date
      # A wrapper for the 'chronic' gem, which parses
      # date information.
      #
      # Project website: http://chronic.rubyforge.org/
      class Chronic
        silence_warnings { require 'chronic' }
        require 'date'

        # Return the date information contained within the entity
        # by parsing it with the 'chronic' gem.
        #
        # Returns nil when the entity already has a :time feature or
        # when nothing could be parsed; otherwise returns the parsed
        # value converted with #to_date. Any :date feature on the
        # entity's :phrase ancestors is unset once parsing has run.
        #
        # Options: none.
        def self.date(entity, options = {})
          return if entity.has?(:time)

          # Work on fresh strings: the previous gsub!/strip! calls
          # modified the object returned by entity.to_s in place, which
          # alters the entity's own text if to_s hands back that string.
          s = entity.to_s.gsub('\/', '/').strip

          date = nil
          silence_warnings do
            date = ::Chronic.parse(s, {:guess => true})
          end

          entity.ancestors_with_type(:phrase).each do |a|
            a.unset(:date) if a.has?(:date)
          end
          return date.to_date if date
        end
      end
    end
  end
end
@@ -1,25 +0,0 @@
1
module Treat
  module Extractors
    module Date
      # A wrapper for Ruby's native date parsing.
      class Ruby
        require 'date'

        # Return a Date for the date/time contained within the entity,
        # using Ruby's native DateTime parser. The entity's string form
        # is stripped and every backslash-slash pair is replaced by a
        # plain slash before parsing. Returns nil when any StandardError
        # is raised along the way.
        #
        # Options: none.
        def self.date(entity, options = {})
          cleaned = entity.to_s.strip.gsub('\/', '/')
          ::DateTime.parse(cleaned).to_date
        rescue StandardError
          nil
        end
      end
    end
  end
end
@@ -1,48 +0,0 @@
1
module Treat
  module Extractors
    module Keywords
      # Selects keywords among the words of an entity that appear in a
      # set of topic words and whose TF*IDF score exceeds a threshold.
      class TopicsTfIdf
        # num_keywords     - maximum number of keywords returned.
        # tf_idf_threshold - a word's tf_idf must be strictly greater
        #                    than this value to be kept.
        # topic_words      - topic => words mapping; when nil it is read
        #                    from entity.parent_collection.topic_words.
        DefaultOptions = {num_keywords: 5, tf_idf_threshold: 0.5, topic_words: nil}

        # Return at most options[:num_keywords] keyword strings for the
        # entity.
        #
        # Raises Treat::Exception when the entity's type ranks below
        # :sentence according to Treat::Entities.rank.
        def self.keywords(entity, options = {})
          options = DefaultOptions.merge(options)
          unless options[:topic_words]
            options[:topic_words] = entity.parent_collection.topic_words
          end
          if Treat::Entities.rank(entity.type) <
            Treat::Entities.rank(:sentence)
            # NOTE(review): the message says "key sentences" although
            # this extractor returns keywords; wording left untouched.
            raise Treat::Exception, 'Cannot get the key ' +
            'sentences of an entity smaller than a sentence.'
          end
          find_keywords(entity, options)
        end

        # Collect, in order of first appearance, the distinct word
        # values that occur in at least one topic's word list and whose
        # tf_idf exceeds options[:tf_idf_threshold]. Each selected word
        # gets its :is_keyword? feature set to true.
        #
        # Returns the first options[:num_keywords] of them. The previous
        # loop stopped one iteration too late and returned
        # num_keywords + 1 entries.
        def self.find_keywords(entity, options)
          keywords = []
          entity.each_word do |word|
            tf_idf = word.tf_idf
            value = word.value
            next if keywords.include?(value)
            in_topic = options[:topic_words].any? do |_, topic_words|
              topic_words.include?(value)
            end
            next unless in_topic && tf_idf > options[:tf_idf_threshold]
            keywords << value
            word.set :is_keyword?, true
          end
          # A negative limit yields an empty list rather than an error.
          keywords.first([options[:num_keywords], 0].max)
        end
      end
    end
  end
end
@@ -1,27 +0,0 @@
1
module Treat
  module Extractors
    module Language
      # A generic language detector, which is called before
      # any language detector and ensures that configuration
      # options concerning language are enforced (e.g. returns
      # the default language when Treat.detect_language is false).
      class LanguageExtractor
        # Return a language for the entity when configuration alone
        # decides it, or nil when it does not.
        #
        # * Text made only of digits      -> Treat.default_language
        # * Treat.detect_language == false -> Treat.default_language
        # * Entity ranked below Treat.language_detection_level and
        #   having a parent -> the language of its ancestor of that
        #   level, when such an ancestor exists.
        #
        # Options: none are read by this method.
        def self.language(entity, options = {})
          # Anchor on the whole string. The previous ^...$ anchors match
          # per line, so a multi-line text containing one digits-only
          # line was treated as a number. Trailing newlines are still
          # tolerated, as they were with $.
          if entity.to_s =~ /\A[[:digit:]]+\n*\z/
            return Treat.default_language
          end
          # Compared with false on purpose: a nil setting does not
          # disable detection.
          return Treat.default_language if Treat.detect_language == false

          dlvl = Treat.language_detection_level
          if (Entities.rank(entity.type) < Entities.rank(dlvl)) &&
            entity.has_parent?
            anc = entity.ancestor_with_type(dlvl)
            return anc.language if anc
          end
          nil
        end
      end
    end
  end
end
@@ -1,53 +0,0 @@
1
module Treat
  module Extractors
    module NamedEntityTag
      # Named entity tagging through the 'stanford-core-nlp' gem.
      class Stanford
        require 'stanford-core-nlp'
        StanfordCoreNLP.load_class('ArrayList', 'java.util')
        StanfordCoreNLP.load_class('Word', 'edu.stanford.nlp.ling')

        # Annotation pipeline, loaded on first use and then reused.
        @@pipeline = nil

        # Annotate the entity and record named entity tags.
        #
        # For a Phrase, one new Token is appended per annotated token,
        # carrying a :named_entity_tag feature unless its tag is 'o'.
        # For a Token, the tag of the first annotated token is set on
        # the entity itself, again unless it is 'o'.
        #
        # Options: none are read by this method.
        def self.named_entity_tag(entity, options = {})
          phrase = nil
          input =
            if entity.is_a?(Treat::Entities::Token) && entity.has_parent?
              phrase = entity.parent_phrase
              get_list(phrase.tokens)
            else
              entity.to_s
            end

          @@pipeline ||= ::StanfordCoreNLP.load(
            :tokenize, :ssplit, :pos, :lemma, :parse, :ner
          )

          text = ::StanfordCoreNLP::Text.new(input)
          @@pipeline.annotate(text)

          target = phrase || entity

          case entity
          when Treat::Entities::Phrase
            text.get(:tokens).each do |annotated|
              fresh = Treat::Entities::Token.from_string(annotated.value.to_s)
              label = annotated.get(:named_entity_tag).to_s.downcase
              fresh.set :named_entity_tag, label.intern unless label == 'o'
              target << fresh
            end
          when Treat::Entities::Token
            # NOTE(review): this always reads the FIRST annotated token,
            # even when the input was the whole parent phrase - confirm
            # that this is the tag intended for `entity`.
            label = text.get(:tokens).iterator.next.
              get(:named_entity_tag).to_s.downcase
            entity.set :named_entity_tag, label.intern unless label == 'o'
          end
        end

        # Build a StanfordCoreNLP::ArrayList holding one
        # StanfordCoreNLP::Word per element of words (using its to_s).
        def self.get_list(words)
          list = StanfordCoreNLP::ArrayList.new
          words.each { |w| list.add(StanfordCoreNLP::Word.new(w.to_s)) }
          list
        end
      end
    end
  end
end