treat 0.2.5 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242)
  1. data/LICENSE +3 -3
  2. data/README.md +33 -0
  3. data/files/INFO +1 -0
  4. data/lib/treat.rb +40 -105
  5. data/lib/treat/ai.rb +12 -0
  6. data/lib/treat/ai/classifiers/id3.rb +27 -0
  7. data/lib/treat/categories.rb +82 -35
  8. data/lib/treat/categorizable.rb +44 -0
  9. data/lib/treat/classification.rb +61 -0
  10. data/lib/treat/configurable.rb +115 -0
  11. data/lib/treat/data_set.rb +42 -0
  12. data/lib/treat/dependencies.rb +24 -0
  13. data/lib/treat/downloader.rb +87 -0
  14. data/lib/treat/entities.rb +68 -66
  15. data/lib/treat/entities/abilities.rb +10 -0
  16. data/lib/treat/entities/abilities/buildable.rb +327 -0
  17. data/lib/treat/entities/abilities/checkable.rb +31 -0
  18. data/lib/treat/entities/abilities/copyable.rb +45 -0
  19. data/lib/treat/entities/abilities/countable.rb +51 -0
  20. data/lib/treat/entities/abilities/debuggable.rb +83 -0
  21. data/lib/treat/entities/abilities/delegatable.rb +123 -0
  22. data/lib/treat/entities/abilities/doable.rb +62 -0
  23. data/lib/treat/entities/abilities/exportable.rb +11 -0
  24. data/lib/treat/entities/abilities/iterable.rb +115 -0
  25. data/lib/treat/entities/abilities/magical.rb +83 -0
  26. data/lib/treat/entities/abilities/registrable.rb +74 -0
  27. data/lib/treat/entities/abilities/stringable.rb +91 -0
  28. data/lib/treat/entities/entities.rb +104 -0
  29. data/lib/treat/entities/entity.rb +122 -245
  30. data/lib/treat/exception.rb +4 -4
  31. data/lib/treat/extractors.rb +77 -80
  32. data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
  33. data/lib/treat/extractors/language/what_language.rb +50 -45
  34. data/lib/treat/extractors/name_tag/stanford.rb +55 -0
  35. data/lib/treat/extractors/tf_idf/native.rb +87 -0
  36. data/lib/treat/extractors/time/chronic.rb +55 -0
  37. data/lib/treat/extractors/time/nickel.rb +86 -62
  38. data/lib/treat/extractors/time/ruby.rb +53 -0
  39. data/lib/treat/extractors/topic_words/lda.rb +67 -58
  40. data/lib/treat/extractors/topics/reuters.rb +100 -87
  41. data/lib/treat/formatters.rb +39 -35
  42. data/lib/treat/formatters/readers/abw.rb +49 -29
  43. data/lib/treat/formatters/readers/autoselect.rb +37 -33
  44. data/lib/treat/formatters/readers/doc.rb +19 -13
  45. data/lib/treat/formatters/readers/html.rb +52 -30
  46. data/lib/treat/formatters/readers/image.rb +41 -40
  47. data/lib/treat/formatters/readers/odt.rb +59 -45
  48. data/lib/treat/formatters/readers/pdf.rb +28 -25
  49. data/lib/treat/formatters/readers/txt.rb +12 -15
  50. data/lib/treat/formatters/readers/xml.rb +73 -36
  51. data/lib/treat/formatters/serializers/xml.rb +80 -79
  52. data/lib/treat/formatters/serializers/yaml.rb +19 -18
  53. data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
  54. data/lib/treat/formatters/unserializers/xml.rb +94 -99
  55. data/lib/treat/formatters/unserializers/yaml.rb +20 -19
  56. data/lib/treat/formatters/visualizers/dot.rb +132 -132
  57. data/lib/treat/formatters/visualizers/standoff.rb +52 -44
  58. data/lib/treat/formatters/visualizers/tree.rb +26 -29
  59. data/lib/treat/groupable.rb +153 -0
  60. data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
  61. data/lib/treat/inflectors.rb +50 -45
  62. data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
  63. data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
  64. data/lib/treat/inflectors/declensors/active_support.rb +31 -0
  65. data/lib/treat/inflectors/declensors/english.rb +38 -0
  66. data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
  67. data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
  68. data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
  69. data/lib/treat/inflectors/stemmers/porter.rb +160 -0
  70. data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
  71. data/lib/treat/inflectors/stemmers/uea.rb +28 -0
  72. data/lib/treat/installer.rb +308 -0
  73. data/lib/treat/kernel.rb +105 -27
  74. data/lib/treat/languages.rb +122 -88
  75. data/lib/treat/languages/arabic.rb +15 -15
  76. data/lib/treat/languages/chinese.rb +15 -15
  77. data/lib/treat/languages/dutch.rb +15 -15
  78. data/lib/treat/languages/english.rb +61 -62
  79. data/lib/treat/languages/french.rb +19 -19
  80. data/lib/treat/languages/german.rb +20 -20
  81. data/lib/treat/languages/greek.rb +15 -15
  82. data/lib/treat/languages/italian.rb +16 -16
  83. data/lib/treat/languages/polish.rb +15 -15
  84. data/lib/treat/languages/portuguese.rb +15 -15
  85. data/lib/treat/languages/russian.rb +15 -15
  86. data/lib/treat/languages/spanish.rb +16 -16
  87. data/lib/treat/languages/swedish.rb +16 -16
  88. data/lib/treat/lexicalizers.rb +34 -55
  89. data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
  90. data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
  91. data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
  92. data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
  93. data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
  94. data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
  95. data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
  96. data/lib/treat/linguistics.rb +9 -0
  97. data/lib/treat/linguistics/categories.rb +11 -0
  98. data/lib/treat/linguistics/tags.rb +422 -0
  99. data/lib/treat/loaders/linguistics.rb +30 -0
  100. data/lib/treat/loaders/stanford.rb +27 -0
  101. data/lib/treat/object.rb +1 -0
  102. data/lib/treat/processors.rb +37 -44
  103. data/lib/treat/processors/chunkers/autoselect.rb +16 -0
  104. data/lib/treat/processors/chunkers/html.rb +71 -0
  105. data/lib/treat/processors/chunkers/txt.rb +18 -24
  106. data/lib/treat/processors/parsers/enju.rb +253 -208
  107. data/lib/treat/processors/parsers/stanford.rb +130 -131
  108. data/lib/treat/processors/segmenters/punkt.rb +79 -45
  109. data/lib/treat/processors/segmenters/stanford.rb +46 -48
  110. data/lib/treat/processors/segmenters/tactful.rb +43 -36
  111. data/lib/treat/processors/tokenizers/perl.rb +124 -92
  112. data/lib/treat/processors/tokenizers/ptb.rb +81 -0
  113. data/lib/treat/processors/tokenizers/punkt.rb +48 -42
  114. data/lib/treat/processors/tokenizers/stanford.rb +39 -38
  115. data/lib/treat/processors/tokenizers/tactful.rb +64 -55
  116. data/lib/treat/proxies.rb +52 -35
  117. data/lib/treat/retrievers.rb +26 -16
  118. data/lib/treat/retrievers/indexers/ferret.rb +47 -26
  119. data/lib/treat/retrievers/searchers/ferret.rb +69 -50
  120. data/lib/treat/tree.rb +241 -183
  121. data/spec/collection.rb +123 -0
  122. data/spec/document.rb +93 -0
  123. data/spec/entity.rb +408 -0
  124. data/spec/languages.rb +25 -0
  125. data/spec/phrase.rb +146 -0
  126. data/spec/samples/mathematicians/archimedes.abw +34 -0
  127. data/spec/samples/mathematicians/euler.html +21 -0
  128. data/spec/samples/mathematicians/gauss.pdf +0 -0
  129. data/spec/samples/mathematicians/leibniz.txt +13 -0
  130. data/spec/samples/mathematicians/newton.doc +0 -0
  131. data/spec/sandbox.rb +5 -0
  132. data/spec/token.rb +109 -0
  133. data/spec/treat.rb +52 -0
  134. data/spec/tree.rb +117 -0
  135. data/spec/word.rb +110 -0
  136. data/spec/zone.rb +66 -0
  137. data/tmp/INFO +1 -1
  138. metadata +100 -201
  139. data/INSTALL +0 -1
  140. data/README +0 -3
  141. data/TODO +0 -28
  142. data/lib/economist/half_cocked_basel.txt +0 -16
  143. data/lib/economist/hungarys_troubles.txt +0 -46
  144. data/lib/economist/indias_slowdown.txt +0 -15
  145. data/lib/economist/merkozy_rides_again.txt +0 -24
  146. data/lib/economist/prada_is_not_walmart.txt +0 -9
  147. data/lib/economist/to_infinity_and_beyond.txt +0 -15
  148. data/lib/ferret/_11.cfs +0 -0
  149. data/lib/ferret/_14.cfs +0 -0
  150. data/lib/ferret/_p.cfs +0 -0
  151. data/lib/ferret/_s.cfs +0 -0
  152. data/lib/ferret/_v.cfs +0 -0
  153. data/lib/ferret/_y.cfs +0 -0
  154. data/lib/ferret/segments +0 -0
  155. data/lib/ferret/segments_15 +0 -0
  156. data/lib/treat/buildable.rb +0 -157
  157. data/lib/treat/category.rb +0 -33
  158. data/lib/treat/delegatable.rb +0 -116
  159. data/lib/treat/doable.rb +0 -45
  160. data/lib/treat/entities/collection.rb +0 -14
  161. data/lib/treat/entities/document.rb +0 -12
  162. data/lib/treat/entities/phrases.rb +0 -17
  163. data/lib/treat/entities/tokens.rb +0 -61
  164. data/lib/treat/entities/zones.rb +0 -41
  165. data/lib/treat/extractors/coreferences/stanford.rb +0 -69
  166. data/lib/treat/extractors/date/chronic.rb +0 -32
  167. data/lib/treat/extractors/date/ruby.rb +0 -25
  168. data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
  169. data/lib/treat/extractors/language/language_extractor.rb +0 -27
  170. data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
  171. data/lib/treat/extractors/roles/naive.rb +0 -73
  172. data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
  173. data/lib/treat/extractors/statistics/position_in.rb +0 -14
  174. data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
  175. data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
  176. data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
  177. data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
  178. data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
  179. data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
  180. data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
  181. data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
  182. data/lib/treat/feature.rb +0 -58
  183. data/lib/treat/features.rb +0 -7
  184. data/lib/treat/formatters/visualizers/short_value.rb +0 -29
  185. data/lib/treat/formatters/visualizers/txt.rb +0 -45
  186. data/lib/treat/group.rb +0 -106
  187. data/lib/treat/helpers/linguistics_loader.rb +0 -18
  188. data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
  189. data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
  190. data/lib/treat/inflectors/declensions/english.rb +0 -319
  191. data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
  192. data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
  193. data/lib/treat/inflectors/stem/porter.rb +0 -162
  194. data/lib/treat/inflectors/stem/porter_c.rb +0 -26
  195. data/lib/treat/inflectors/stem/uea.rb +0 -30
  196. data/lib/treat/install.rb +0 -59
  197. data/lib/treat/languages/tags.rb +0 -377
  198. data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
  199. data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
  200. data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
  201. data/lib/treat/lexicalizers/tag/brill.rb +0 -91
  202. data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
  203. data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
  204. data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
  205. data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
  206. data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
  207. data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
  208. data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
  209. data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
  210. data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
  211. data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
  212. data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
  213. data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
  214. data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
  215. data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
  216. data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
  217. data/lib/treat/registrable.rb +0 -28
  218. data/lib/treat/sugar.rb +0 -50
  219. data/lib/treat/viewable.rb +0 -29
  220. data/lib/treat/visitable.rb +0 -28
  221. data/test/profile.rb +0 -2
  222. data/test/tc_entity.rb +0 -117
  223. data/test/tc_extractors.rb +0 -73
  224. data/test/tc_formatters.rb +0 -41
  225. data/test/tc_inflectors.rb +0 -34
  226. data/test/tc_lexicalizers.rb +0 -32
  227. data/test/tc_processors.rb +0 -50
  228. data/test/tc_resources.rb +0 -22
  229. data/test/tc_treat.rb +0 -60
  230. data/test/tc_tree.rb +0 -60
  231. data/test/tests.rb +0 -20
  232. data/test/texts.rb +0 -19
  233. data/test/texts/english/half_cocked_basel.txt +0 -16
  234. data/test/texts/english/hose_and_dry.doc +0 -0
  235. data/test/texts/english/hungarys_troubles.abw +0 -70
  236. data/test/texts/english/long.html +0 -24
  237. data/test/texts/english/long.txt +0 -22
  238. data/test/texts/english/medium.txt +0 -5
  239. data/test/texts/english/republican_nomination.pdf +0 -0
  240. data/test/texts/english/saving_the_euro.odt +0 -0
  241. data/test/texts/english/short.txt +0 -3
  242. data/test/texts/english/zero_sum.html +0 -111
@@ -0,0 +1,55 @@
1
+ # A wrapper for the 'chronic' gem, which parses
2
+ # date information.
3
+ #
4
+ # Project website: http://chronic.rubyforge.org/
5
+ class Treat::Extractors::Time::Chronic
6
+
7
+ # Require the 'chronic' gem.
8
+ silence_warnings { require 'chronic' }
9
+
10
+ # Require the Ruby DateTime module
11
+ require 'date'
12
+
13
+ # Return the date information contained within
14
+ # the entity by parsing it with the 'chronic' gem.
15
+ #
16
+ # Options: none.
17
+ def self.time(entity, options = {})
18
+
19
+ s = entity.to_s
20
+ return if s =~ /^[0-9]+$/
21
+ time = nil
22
+
23
+ silence_warnings do
24
+ time = ::Chronic.parse(s, {:guess => true})
25
+ end
26
+
27
+ if entity.has_parent? && remove_time_from_ancestors(entity, time)
28
+ nil
29
+ else
30
+ time
31
+ end
32
+
33
+ end
34
+
35
+ # Keeps the lowest-level time annotations that do
36
+ # not conflict with a higher time annotation.
37
+ # Returns true if the entity conflicts with a
38
+ # higher-level time annotation.
39
+ def self.remove_time_from_ancestors(entity, time)
40
+
41
+ entity.ancestors_with_type(:phrase).each do |a|
42
+
43
+ next if !a.has?(:time)
44
+ unless a.get(:time) == time
45
+ return true
46
+ end
47
+ a.unset(:time)
48
+
49
+ end
50
+
51
+ false
52
+
53
+ end
54
+
55
+ end
@@ -1,71 +1,95 @@
1
- module Treat
2
- module Extractors
3
- module Time
4
- # A wrapper for the 'nickel' gem, which parses
5
- # times and dates and supplies additional information
6
- # concerning these. The additional information supplied
7
- # that this class annotates entities with is:
8
- #
9
- # - time_recurrence: frequency of recurrence in words*.
10
- # - time_recurrence_interval: frequency of recurrence in days.
11
- # - start_time: a DateTime object representing the beginning of
12
- # an event.
13
- # - end_time: a DateTime object representing the end of an event.
14
- #
15
- # Examples of values for time_recurrence are:
16
- #
17
- # - single: "lunch with megan tomorrow at noon"
18
- # - daily: "Art exhibit until March 1st"
19
- # - weekly: "math class every wed from 8-11am"
20
- # - daymonthly: "open bar at joes the first friday of every month"
21
- # - datemonthly: "pay credit card bill on the 22nd of each month"
22
- #
23
- # Project website: http://naturalinputs.com/
24
- class Nickel
25
- require 'date'
26
- silence_warnings { require 'nickel' }
27
- # Extract time information from a bit of text.
28
- def self.time(entity, options = {})
29
- return nil if entity.to_s.strip == ''
30
- n = nil
31
- silence_warnings { n = ::Nickel.parse(entity.to_s.strip) }
32
- occ = n.occurrences[0]
33
- return nil unless occ
1
+ # A wrapper for the 'nickel' gem, which parses
2
+ # times and dates and supplies additional information
3
+ # concerning these. The additional information supplied
4
+ # that this class annotates entities with is:
5
+ #
6
+ # - time_recurrence: frequency of recurrence in words*.
7
+ # - time_recurrence_interval: frequency of recurrence in days.
8
+ # - start_time: a DateTime object representing the beginning of
9
+ # an event.
10
+ # - end_time: a DateTime object representing the end of an event.
11
+ #
12
+ # Examples of values for time_recurrence are:
13
+ #
14
+ # - single: "lunch with megan tomorrow at noon"
15
+ # - daily: "Art exhibit until March 1st"
16
+ # - weekly: "math class every wed from 8-11am"
17
+ # - daymonthly: "open bar at joes the first friday of every month"
18
+ # - datemonthly: "pay credit card bill on the 22nd of each month"
19
+ #
20
+ # Project website: http://naturalinputs.com/
21
+ class Treat::Extractors::Time::Nickel
34
22
 
35
- rec = occ.type.to_s.gsub('single', 'once').intern
36
- time_recurrence = rec
37
- interval = occ.interval ? occ.interval : :none
38
- time_recurrence_interval = interval
23
+ require 'date'
39
24
 
40
- s = [occ.start_date, occ.start_time]
41
- ds = [s[0].year, s[0].month, s[0].day] if s[0]
42
- ts = [s[1].hour, s[1].minute, s[1].second] if s[1]
25
+ silence_warnings { require 'nickel' }
43
26
 
44
- e = [occ.end_date, occ.end_time]
45
- de = [e[0].year, e[0].month, e[0].day] if e[0]
46
- te = [e[1].hour, e[1].minute, e[1].second] if e[1]
27
+ # Extract time information from a bit of text.
28
+ def self.time(entity, options = {})
29
+
30
+ s = entity.to_s
31
+ return if s =~ /^[0-9]+$/
32
+
33
+ n = nil
34
+
35
+ begin
36
+ silence_warnings { n = ::Nickel.parse(s.to_s.strip) }
37
+ rescue
38
+ return
39
+ end
40
+
41
+ occ = n.occurrences[0]
42
+
43
+ return unless occ
44
+
45
+ rec = occ.type.to_s.gsub('single', 'once').intern
46
+ time_recurrence = rec
47
+ interval = occ.interval ?
48
+ occ.interval : :none
49
+ time_recurrence_interval = interval
50
+
51
+
52
+ s = [occ.start_date, occ.start_time]
53
+ ds = [s[0].year, s[0].month, s[0].day] if s[0]
54
+ ts = [s[1].hour, s[1].minute, s[1].second] if s[1]
47
55
 
48
- start_time = ::DateTime.civil(*ds) if ds && !ts
49
- start_time = ::DateTime.civil(*ds, *ts) if ds && ts
50
- end_time = ::DateTime.civil(*de) if de && !te
51
- end_time = ::DateTime.civil(*de, *te) if de && te
56
+ e = [occ.end_date, occ.end_time]
57
+ de = [e[0].year, e[0].month, e[0].day] if e[0]
58
+ te = [e[1].hour, e[1].minute, e[1].second] if e[1]
52
59
 
53
- time = Treat::Features::Time.new( # Fix - time message.
54
- start_time, end_time, time_recurrence,
55
- time_recurrence_interval
56
- )
60
+ start_time = ::DateTime.civil(*ds) if ds && !ts
61
+ start_time = ::DateTime.civil(*ds, *ts) if ds && ts
62
+ end_time = ::DateTime.civil(*de) if de && !te
63
+ end_time = ::DateTime.civil(*de, *te) if de && te
64
+
65
+ return unless start_time
57
66
 
58
- # Keeps the lowest-level time annotations
59
- # that do not conflict with the highest-level
60
- # time annotation.
61
- entity.ancestors_with_type(:phrase).each do |a|
62
- unless a.id == entity.id || a.children[0].size == 0
63
- a.unset(:time)
64
- end
65
- end
66
- time
67
- end
68
- end
67
+ if entity.has_parent? &&
68
+ remove_time_from_ancestors(entity, start_time)
69
+ nil
70
+ else
71
+ entity.set :time_recurrence,
72
+ time_recurrence
73
+ entity.set :time_recurrence_interval,
74
+ time_recurrence_interval
75
+ entity.set :end_time, end_time if end_time
76
+ start_time
69
77
  end
78
+
70
79
  end
80
+
81
+ # Keeps the lowest-level time annotations that do
82
+ # not conflict with a higher time annotation.
83
+ # Returns true if the entity conflicts with a
84
+ # higher-level time annotation.
85
+ def self.remove_time_from_ancestors(entity, time)
86
+ entity.ancestors_with_type(:phrase).each do |a|
87
+ next if !a.has?(:time)
88
+ return false unless a.get(:time).to_s == time.to_s
89
+ a.unset(:time, :time_recurrence,
90
+ :time_recurrence_interval, :end_time)
91
+ end
92
+ true
93
+ end
94
+
71
95
  end
@@ -0,0 +1,53 @@
1
+ # A wrapper for Ruby's native date/time parsing.
2
+ class Treat::Extractors::Time::Ruby
3
+
4
+ # Require Ruby's date module.
5
+ require 'date'
6
+
7
+ # Return a DateTime object representing the date/time
8
+ # contained within the entity, using Ruby's native
9
+ # date/time parser. This extractor is suitable for the
10
+ # detection of well-structured dates and times, such as
11
+ # 2011/02/03 5:00.
12
+ #
13
+ # Options: none.
14
+ def self.time(entity, options = {})
15
+ s = entity.to_s
16
+ return if s =~ /^[0-9]+$/
17
+ begin
18
+ time = ::DateTime.parse(s)
19
+ if entity.has_parent? &&
20
+ remove_time_from_ancestors(entity, time)
21
+ nil
22
+ else
23
+ time
24
+ end
25
+ rescue
26
+ nil
27
+ end
28
+ end
29
+
30
+
31
+ # Keeps the lowest-level time annotations that do
32
+ # not conflict with a higher time annotation.
33
+ # Returns true if the entity conflicts with a
34
+ # higher-level time annotation.
35
+ def self.remove_time_from_ancestors(entity, time)
36
+
37
+ entity.ancestors_with_type(:phrase).each do |a|
38
+
39
+ next if !a.has?(:time)
40
+
41
+ unless a.get(:time) == time
42
+ return true
43
+ end
44
+
45
+ a.unset(:time)
46
+
47
+ end
48
+
49
+ false
50
+
51
+ end
52
+
53
+ end
@@ -1,63 +1,72 @@
1
- module Treat
2
- module Extractors
3
- module TopicWords
4
- # An adapter for the 'lda-ruby' gem, which clusters
5
- # documents into topics based on Latent Dirichlet
6
- # Allocation.
7
- #
8
- # Original paper:
9
- # Blei, David M., Ng, Andrew Y., and Jordan, Michael
10
- # I. 2003. Latent dirichlet allocation. Journal of
11
- # Machine Learning Research. 3 (Mar. 2003), 993-1022.
12
- #
13
- # Project website: https://github.com/ealdent/lda-ruby
14
- class LDA
15
- # Require the lda-ruby gem.
16
- silence_warnings { require 'lda-ruby' }
17
- # Monkey patch the TextCorpus class to call it without
18
- # having to create any files.
19
- Lda::TextCorpus.class_eval do
20
- # Ruby, Y U NO SHUT UP!
21
- silence_warnings { undef :initialize }
22
- # Redefine initialize to take in an array of sections
23
- def initialize(sections)
24
- super(nil)
25
- sections.each do |section|
26
- add_document(Lda::TextDocument.new(self, section))
27
- end
28
- end
29
- end
30
- # Default options for the LDA algorithm.
31
- DefaultOptions = {
32
- :num_topics => 20,
33
- :words_per_topic => 10,
34
- :iterations => 20
35
- }
36
- # Retrieve the topic words of a collection.
37
- def self.topic_words(collection, options = {})
38
- options = DefaultOptions.merge(options)
39
- # Create a corpus with the collection
40
- sections = collection.sections.collect do |t|
41
- t.to_s.encode('UTF-8', :invalid => :replace,
42
- :undef => :replace, :replace => "?") # Fix
43
- end
44
- corpus = Lda::TextCorpus.new(sections)
1
+ # An adapter for the 'lda-ruby' gem, which clusters
2
+ # documents into topics based on Latent Dirichlet
3
+ # Allocation.
4
+ #
5
+ # Original paper:
6
+ # Blei, David M., Ng, Andrew Y., and Jordan, Michael
7
+ # I. 2003. Latent dirichlet allocation. Journal of
8
+ # Machine Learning Research. 3 (Mar. 2003), 993-1022.
9
+ #
10
+ # Project website: https://github.com/ealdent/lda-ruby
11
+ module Treat::Extractors::TopicWords::LDA
45
12
 
46
- # Create an Lda object for training
47
- lda = Lda::Lda.new(corpus)
48
- lda.num_topics = options[:num_topics]
49
- lda.max_iter = options[:iterations]
50
- # Run the EM algorithm using random starting points
51
- silence_stdout { lda.em('random') }
52
- # Load the vocabulary.
53
- if options[:vocabulary]
54
- lda.load_vocabulary(options[:vocabulary])
55
- end
56
-
57
- # Get the topic words.
58
- lda.top_words(options[:words_per_topic])
59
- end
13
+ # Require the lda-ruby gem.
14
+ silence_warnings { require 'lda-ruby' }
15
+
16
+ # Monkey patch the TextCorpus class to
17
+ # call it without having to create any files.
18
+ Lda::TextCorpus.class_eval do
19
+ # Ruby, Y U NO SHUT UP!
20
+ silence_warnings { undef :initialize }
21
+ # Redefine initialize to take in an
22
+ # array of sections.
23
+ def initialize(sections)
24
+ super(nil)
25
+ sections.each do |section|
26
+ add_document(
27
+ Lda::TextDocument.new(self, section))
60
28
  end
61
29
  end
62
30
  end
31
+
32
+ # Default options for the LDA algorithm.
33
+ DefaultOptions = {
34
+ :num_topics => 20,
35
+ :words_per_topic => 10,
36
+ :iterations => 20,
37
+ :vocabulary => nil
38
+ }
39
+
40
+ # Retrieve the topic words of a collection.
41
+ def self.topic_words(collection, options = {})
42
+
43
+ options = DefaultOptions.merge(options)
44
+
45
+ docs = collection.documents.map { |d| d.to_s }
46
+ # Create a corpus with the collection
47
+ corpus = Lda::TextCorpus.new(docs)
48
+
49
+ # Create an Lda object for training
50
+ lda = Lda::Lda.new(corpus)
51
+ lda.num_topics = options[:num_topics]
52
+ lda.max_iter = options[:iterations]
53
+ # Run the EM algorithm using random
54
+ # starting points
55
+
56
+ silence_stdout do
57
+ lda.em('random')
58
+ end
59
+
60
+ # Load the vocabulary.
61
+ if options[:vocabulary]
62
+ lda.load_vocabulary(options[:vocabulary])
63
+ end
64
+
65
+ # Get the topic words.
66
+ lda.top_words(
67
+ options[:words_per_topic]
68
+ ).values
69
+
70
+ end
71
+
63
72
  end
@@ -1,92 +1,105 @@
1
- module Treat
2
- module Extractors
3
- module Topics
4
- # A Ruby Part text categorizer that was trained
5
- # using the Reuters news story corpus. Version 0.1
6
- #
7
- # Copyright 2005 Mark Watson. All rights reserved.
8
- # This software is released under the GPL.
9
- # Rewrite for inclusion in Treat by Louis Mullie (2011).
10
- #
11
- # Original project website: http://www.markwatson.com/opensource/
12
- class Reuters
13
- # Require the Nokogiri XML parser.
14
- require 'nokogiri'
15
- # Hashes to hold the topics.
16
- @@industry = {}
17
- @@region = {}
18
- @@topics = {}
19
- # Get the topic of the text.
20
- #
21
- # Options: none.
22
- def self.topics(text, options = {})
23
- stems = []
24
- @@reduce = 0
25
- unless text.words.size > 0
26
- raise Treat::Exception,
27
- "Annotator 'topics' requires processor 'tokenize'."
28
- end
29
- text.words.collect! do |tok|
30
- stem = tok.stem.downcase
31
- val = tok.value.downcase
32
- stems << stem
33
- unless stem == val
34
- stems << val
35
- end
36
- end
37
- get_topics
38
- score_words(@@industry, stems) +
39
- score_words(@@region, stems) +
40
- score_words(@@topics, stems)
41
- #Treat::Feature.new(topics)
42
- end
43
- # Read the topics from the XML files.
44
- def self.get_topics
45
- return unless @@industry.size == 0
46
- @@industry = read_xml(Treat.lib + '/treat/extractors/topics/reuters/industry.xml')
47
- @@region = read_xml(Treat.lib + '/treat/extractors/topics/reuters/region.xml')
48
- @@topics = read_xml(Treat.lib + '/treat/extractors/topics/reuters/topics.xml')
49
- end
50
- def self.read_xml(file_name)
51
- hash = {}
52
- doc = Nokogiri::XML(File.read(file_name))
53
- doc.root.children.each do |category|
54
- cat = category["cat"]
55
- next if cat.nil?
56
- cat = cat.downcase
57
- hash[cat] ||= {}
58
- hash[cat][category["name"]] =
59
- category["score"].to_f
60
- end
61
- hash
62
- end
63
- def self.score_words(hash, word_list)
64
- category_names = hash.keys
65
- count_hash = {}
66
- category_names.each do |cat_name|
67
- cat_name = cat_name.downcase
68
- count_hash[cat_name] ||= 0
69
- word_list.each do |word|
70
- unless hash[cat_name][word].nil?
71
- count_hash[cat_name] +=
72
- hash[cat_name][word]
73
- end
74
- end
75
- end
76
- count_hash = best_of_hash(count_hash)
77
- count_hash.keys
78
- end
79
- def self.best_of_hash(hash, cutoff = 0.0, scale = 1.0)
80
- ret = {}
81
- hash.keys.each do |key|
82
- if hash[key] > cutoff
83
- ret[key] = hash[key] * scale
84
- ret[key] = ret[key].round(2)
85
- end
86
- end
87
- ret
1
+ # A Ruby text categorizer that was trained using
2
+ # the Reuters news story corpus.
3
+ #
4
+ # Copyright 2005 Mark Watson. All rights reserved.
5
+ # Rewrite for inclusion in Treat by Louis Mullie (2011).
6
+ #
7
+ # Original project website:
8
+ # http://www.markwatson.com/opensource/
9
+ module Treat::Extractors::Topics::Reuters
10
+
11
+ # Require the Nokogiri XML parser.
12
+ require 'nokogiri'
13
+
14
+ # Hashes to hold the topics.
15
+ @@industry = {}
16
+ @@region = {}
17
+ @@topics = {}
18
+
19
+ # Get the general topic of the text using
20
+ # a Reuters-trained model.
21
+ #
22
+ # Options: none.
23
+ def self.topics(text, options = {})
24
+ stems = []
25
+ @@reduce = 0
26
+ unless text.words.size > 0
27
+ raise Treat::Exception,
28
+ "Annotator 'topics' requires " +
29
+ "processor 'tokenize'."
30
+ end
31
+ text.words.collect! do |tok|
32
+ stem = tok.stem.downcase
33
+ val = tok.value.downcase
34
+ stems << stem
35
+ unless stem == val
36
+ stems << val
37
+ end
38
+ end
39
+ get_topics
40
+ score_words(@@industry, stems) +
41
+ score_words(@@region, stems) +
42
+ score_words(@@topics, stems)
43
+ #Treat::Feature.new(topics)
44
+ end
45
+
46
+ # Read the topics from the XML files.
47
+ def self.get_topics
48
+ return unless @@industry.size == 0
49
+ @@industry = read_xml(Treat.models +
50
+ 'reuters/industry.xml')
51
+ @@region = read_xml(Treat.models +
52
+ 'reuters/region.xml')
53
+ @@topics = read_xml(Treat.models +
54
+ 'reuters/topics.xml')
55
+ end
56
+
57
+ # Read an XML file and populate a
58
+ # hash of topics.
59
+ def self.read_xml(file_name)
60
+ hash = {}
61
+ doc = Nokogiri::XML(File.read(file_name))
62
+ doc.root.children.each do |category|
63
+ cat = category["cat"]
64
+ next if cat.nil?
65
+ cat = cat.downcase
66
+ hash[cat] ||= {}
67
+ hash[cat][category["name"]] =
68
+ category["score"].to_f
69
+ end
70
+ hash
71
+ end
72
+
73
+ # Score the words by adding the scores
74
+ # of each word occurrence.
75
+ def self.score_words(hash, word_list)
76
+ category_names = hash.keys
77
+ count_hash = {}
78
+ category_names.each do |cat_name|
79
+ cat_name = cat_name.downcase
80
+ count_hash[cat_name] ||= 0
81
+ word_list.each do |word|
82
+ unless hash[cat_name][word].nil?
83
+ count_hash[cat_name] +=
84
+ hash[cat_name][word]
88
85
  end
89
86
  end
90
87
  end
88
+ count_hash = best_of_hash(count_hash)
89
+ count_hash.keys
90
+ end
91
+
92
+ # Retrieve the words with the scores above
93
+ # cutoff inside the hash of scored words.
94
+ def self.best_of_hash(hash, cutoff = 0.0, scale = 1.0)
95
+ ret = {}
96
+ hash.keys.each do |key|
97
+ if hash[key] > cutoff
98
+ ret[key] = hash[key] * scale
99
+ ret[key] = ret[key].round(2)
100
+ end
101
+ end
102
+ ret
91
103
  end
104
+
92
105
  end