treat 1.0.6 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (210) hide show
  1. data/LICENSE +2 -4
  2. data/README.md +13 -12
  3. data/bin/MANIFEST +1 -0
  4. data/bin/stanford/bridge.jar +0 -0
  5. data/bin/stanford/joda-time.jar +0 -0
  6. data/bin/stanford/stanford-corenlp.jar +0 -0
  7. data/bin/stanford/stanford-parser.jar +0 -0
  8. data/bin/stanford/xom.jar +0 -0
  9. data/files/{www.economist.com/21552208 → 21552208.html} +86 -89
  10. data/files/{guides.rubyonrails.org/3_2_release_notes.html → 3_2_release_notes.html} +0 -0
  11. data/files/{INFO → MANIFEST} +0 -0
  12. data/files/{www.rubyinside.com/nethttp-cheat-sheet-2940.html → nethttp-cheat-sheet-2940.html} +12 -16
  13. data/files/weather-central-canada-heat-wave.html +1370 -0
  14. data/lib/treat/config/core/acronyms.rb +4 -0
  15. data/lib/treat/config/core/encodings.rb +8 -0
  16. data/lib/treat/config/core/entities.rb +2 -0
  17. data/lib/treat/config/core/language.rb +3 -0
  18. data/lib/treat/config/core/paths.rb +8 -0
  19. data/lib/treat/config/core/syntax.rb +1 -0
  20. data/lib/treat/config/core/verbosity.rb +1 -0
  21. data/lib/treat/config/databases/mongo.rb +3 -0
  22. data/lib/treat/config/languages/agnostic.rb +34 -0
  23. data/lib/treat/config/languages/arabic.rb +13 -0
  24. data/lib/treat/config/languages/chinese.rb +13 -0
  25. data/lib/treat/config/languages/dutch.rb +12 -0
  26. data/lib/treat/config/languages/english.rb +60 -0
  27. data/lib/treat/config/languages/french.rb +18 -0
  28. data/lib/treat/config/languages/german.rb +18 -0
  29. data/lib/treat/config/languages/greek.rb +12 -0
  30. data/lib/treat/config/languages/italian.rb +12 -0
  31. data/lib/treat/config/languages/polish.rb +12 -0
  32. data/lib/treat/config/languages/portuguese.rb +12 -0
  33. data/lib/treat/config/languages/russian.rb +12 -0
  34. data/lib/treat/config/languages/spanish.rb +12 -0
  35. data/lib/treat/config/languages/swedish.rb +12 -0
  36. data/lib/treat/config/libraries/stanford.rb +1 -0
  37. data/lib/treat/config/linguistics/categories.rb +4 -0
  38. data/lib/treat/config/linguistics/punctuation.rb +33 -0
  39. data/lib/treat/config/tags/aligned.rb +221 -0
  40. data/lib/treat/config/tags/enju.rb +71 -0
  41. data/lib/treat/config/tags/paris7.rb +17 -0
  42. data/lib/treat/config/tags/ptb.rb +15 -0
  43. data/lib/treat/config/workers/extractors.rb +39 -0
  44. data/lib/treat/config/workers/formatters.rb +20 -0
  45. data/lib/treat/config/workers/inflectors.rb +27 -0
  46. data/lib/treat/config/workers/learners.rb +6 -0
  47. data/lib/treat/config/workers/lexicalizers.rb +18 -0
  48. data/lib/treat/config/workers/list.rb +1 -0
  49. data/lib/treat/config/workers/processors.rb +19 -0
  50. data/lib/treat/config/workers/retrievers.rb +12 -0
  51. data/lib/treat/config.rb +125 -0
  52. data/lib/treat/{classification.rb → core/classification.rb} +1 -1
  53. data/lib/treat/{data_set.rb → core/data_set.rb} +1 -4
  54. data/lib/treat/{tree.rb → core/node.rb} +5 -5
  55. data/lib/treat/core/server.rb +3 -0
  56. data/lib/treat/core.rb +5 -0
  57. data/lib/treat/entities/abilities/buildable.rb +61 -56
  58. data/lib/treat/entities/abilities/checkable.rb +2 -2
  59. data/lib/treat/entities/abilities/comparable.rb +21 -0
  60. data/lib/treat/entities/abilities/copyable.rb +2 -0
  61. data/lib/treat/entities/abilities/countable.rb +1 -1
  62. data/lib/treat/entities/abilities/debuggable.rb +1 -1
  63. data/lib/treat/entities/abilities/delegatable.rb +42 -36
  64. data/lib/treat/entities/abilities/doable.rb +2 -2
  65. data/lib/treat/entities/abilities/exportable.rb +1 -1
  66. data/lib/treat/entities/abilities/iterable.rb +21 -33
  67. data/lib/treat/entities/abilities/magical.rb +8 -8
  68. data/lib/treat/entities/abilities/registrable.rb +0 -38
  69. data/lib/treat/entities/abilities/stringable.rb +19 -19
  70. data/lib/treat/entities/collection.rb +31 -0
  71. data/lib/treat/entities/document.rb +10 -0
  72. data/lib/treat/entities/entity.rb +18 -13
  73. data/lib/treat/entities/group.rb +15 -0
  74. data/lib/treat/entities/section.rb +13 -0
  75. data/lib/treat/entities/token.rb +35 -0
  76. data/lib/treat/entities/zone.rb +11 -0
  77. data/lib/treat/entities.rb +5 -75
  78. data/lib/treat/helpers/didyoumean.rb +57 -0
  79. data/lib/treat/helpers/escaping.rb +15 -0
  80. data/lib/treat/helpers/formatting.rb +41 -0
  81. data/lib/treat/helpers/platform.rb +15 -0
  82. data/lib/treat/helpers/reflection.rb +17 -0
  83. data/lib/treat/helpers/temporary.rb +27 -0
  84. data/lib/treat/helpers/verbosity.rb +19 -0
  85. data/lib/treat/helpers.rb +5 -0
  86. data/lib/treat/installer.rb +46 -165
  87. data/lib/treat/loaders/linguistics.rb +22 -27
  88. data/lib/treat/loaders/stanford.rb +23 -41
  89. data/lib/treat/loaders.rb +10 -0
  90. data/lib/treat/proxies.rb +73 -24
  91. data/lib/treat/version.rb +3 -0
  92. data/lib/treat/{extractors → workers/extractors}/keywords/tf_idf.rb +1 -1
  93. data/lib/treat/{extractors → workers/extractors}/language/what_language.rb +11 -4
  94. data/lib/treat/{extractors → workers/extractors}/name_tag/stanford.rb +3 -4
  95. data/lib/treat/{extractors → workers/extractors}/tf_idf/native.rb +4 -5
  96. data/lib/treat/{extractors → workers/extractors}/time/chronic.rb +1 -1
  97. data/lib/treat/{extractors → workers/extractors}/time/nickel.rb +1 -1
  98. data/lib/treat/{extractors → workers/extractors}/time/ruby.rb +1 -1
  99. data/lib/treat/{extractors → workers/extractors}/topic_words/lda.rb +1 -1
  100. data/lib/treat/{extractors → workers/extractors}/topics/reuters.rb +4 -4
  101. data/lib/treat/{formatters → workers/formatters}/readers/abw.rb +2 -2
  102. data/lib/treat/{formatters → workers/formatters}/readers/autoselect.rb +10 -3
  103. data/lib/treat/{formatters → workers/formatters}/readers/doc.rb +2 -2
  104. data/lib/treat/{formatters → workers/formatters}/readers/html.rb +4 -4
  105. data/lib/treat/{formatters → workers/formatters}/readers/image.rb +2 -2
  106. data/lib/treat/{formatters → workers/formatters}/readers/odt.rb +2 -2
  107. data/lib/treat/{formatters → workers/formatters}/readers/pdf.rb +2 -2
  108. data/lib/treat/{formatters → workers/formatters}/readers/txt.rb +2 -2
  109. data/lib/treat/{formatters → workers/formatters}/readers/xml.rb +2 -2
  110. data/lib/treat/workers/formatters/serializers/mongo.rb +60 -0
  111. data/lib/treat/{formatters → workers/formatters}/serializers/xml.rb +1 -2
  112. data/lib/treat/{formatters → workers/formatters}/serializers/yaml.rb +1 -1
  113. data/lib/treat/{formatters → workers/formatters}/unserializers/autoselect.rb +3 -1
  114. data/lib/treat/workers/formatters/unserializers/mongo.rb +80 -0
  115. data/lib/treat/{formatters → workers/formatters}/unserializers/xml.rb +2 -2
  116. data/lib/treat/{formatters → workers/formatters}/unserializers/yaml.rb +1 -1
  117. data/lib/treat/{formatters → workers/formatters}/visualizers/dot.rb +1 -1
  118. data/lib/treat/{formatters → workers/formatters}/visualizers/standoff.rb +2 -3
  119. data/lib/treat/{formatters → workers/formatters}/visualizers/tree.rb +1 -1
  120. data/lib/treat/{groupable.rb → workers/group.rb} +6 -12
  121. data/lib/treat/{inflectors → workers/inflectors}/cardinalizers/linguistics.rb +7 -2
  122. data/lib/treat/{inflectors → workers/inflectors}/conjugators/linguistics.rb +11 -11
  123. data/lib/treat/{inflectors → workers/inflectors}/declensors/active_support.rb +2 -2
  124. data/lib/treat/{inflectors → workers/inflectors}/declensors/english/inflect.rb +1 -1
  125. data/lib/treat/{inflectors → workers/inflectors}/declensors/english.rb +2 -2
  126. data/lib/treat/{inflectors → workers/inflectors}/declensors/linguistics.rb +4 -4
  127. data/lib/treat/{inflectors → workers/inflectors}/ordinalizers/linguistics.rb +8 -2
  128. data/lib/treat/{inflectors → workers/inflectors}/stemmers/porter.rb +2 -2
  129. data/lib/treat/{inflectors → workers/inflectors}/stemmers/porter_c.rb +1 -1
  130. data/lib/treat/{inflectors → workers/inflectors}/stemmers/uea.rb +1 -1
  131. data/lib/treat/{ai → workers/learners}/classifiers/id3.rb +1 -1
  132. data/lib/treat/{ai → workers/learners}/classifiers/mlp.rb +1 -1
  133. data/lib/treat/{lexicalizers → workers/lexicalizers}/categorizers/from_tag.rb +9 -9
  134. data/lib/treat/{lexicalizers → workers/lexicalizers}/sensers/wordnet/synset.rb +2 -2
  135. data/lib/treat/{lexicalizers → workers/lexicalizers}/sensers/wordnet.rb +4 -4
  136. data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/brill/patch.rb +2 -2
  137. data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/brill.rb +2 -8
  138. data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/lingua.rb +1 -6
  139. data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/stanford.rb +31 -42
  140. data/lib/treat/workers/processors/chunkers/autoselect.rb +19 -0
  141. data/lib/treat/{processors → workers/processors}/chunkers/html.rb +4 -3
  142. data/lib/treat/workers/processors/chunkers/txt.rb +32 -0
  143. data/lib/treat/{processors → workers/processors}/parsers/enju.rb +3 -3
  144. data/lib/treat/{processors → workers/processors}/parsers/stanford.rb +6 -8
  145. data/lib/treat/{processors → workers/processors}/segmenters/punkt.rb +6 -10
  146. data/lib/treat/{processors → workers/processors}/segmenters/stanford.rb +2 -2
  147. data/lib/treat/{processors → workers/processors}/segmenters/tactful.rb +3 -6
  148. data/lib/treat/{processors → workers/processors}/tokenizers/ptb.rb +6 -5
  149. data/lib/treat/{processors → workers/processors}/tokenizers/punkt.rb +1 -1
  150. data/lib/treat/{processors → workers/processors}/tokenizers/stanford.rb +1 -1
  151. data/lib/treat/{processors → workers/processors}/tokenizers/tactful.rb +3 -5
  152. data/lib/treat/{retrievers → workers/retrievers}/indexers/ferret.rb +1 -1
  153. data/lib/treat/{retrievers → workers/retrievers}/searchers/ferret.rb +1 -1
  154. data/lib/treat/workers.rb +96 -0
  155. data/lib/treat.rb +23 -49
  156. data/spec/collection.rb +4 -4
  157. data/spec/document.rb +5 -5
  158. data/spec/entity.rb +33 -32
  159. data/spec/{tree.rb → node.rb} +5 -5
  160. data/spec/phrase.rb +5 -39
  161. data/spec/sandbox.rb +212 -6
  162. data/spec/token.rb +12 -9
  163. data/spec/treat.rb +12 -9
  164. data/spec/word.rb +10 -9
  165. data/spec/zone.rb +6 -2
  166. data/tmp/{INFO → MANIFEST} +0 -0
  167. data/tmp/english.yaml +10340 -0
  168. metadata +149 -139
  169. data/lib/treat/ai.rb +0 -12
  170. data/lib/treat/categories.rb +0 -90
  171. data/lib/treat/categorizable.rb +0 -44
  172. data/lib/treat/configurable.rb +0 -115
  173. data/lib/treat/dependencies.rb +0 -25
  174. data/lib/treat/downloader.rb +0 -87
  175. data/lib/treat/entities/abilities.rb +0 -10
  176. data/lib/treat/entities/entities.rb +0 -102
  177. data/lib/treat/exception.rb +0 -7
  178. data/lib/treat/extractors.rb +0 -79
  179. data/lib/treat/formatters/serializers/mongo.rb +0 -64
  180. data/lib/treat/formatters.rb +0 -41
  181. data/lib/treat/helpers/decimal_point_escaper.rb +0 -22
  182. data/lib/treat/inflectors.rb +0 -52
  183. data/lib/treat/kernel.rb +0 -208
  184. data/lib/treat/languages/arabic.rb +0 -16
  185. data/lib/treat/languages/chinese.rb +0 -16
  186. data/lib/treat/languages/dutch.rb +0 -16
  187. data/lib/treat/languages/english.rb +0 -63
  188. data/lib/treat/languages/french.rb +0 -20
  189. data/lib/treat/languages/german.rb +0 -20
  190. data/lib/treat/languages/greek.rb +0 -16
  191. data/lib/treat/languages/italian.rb +0 -17
  192. data/lib/treat/languages/language.rb +0 -10
  193. data/lib/treat/languages/list.txt +0 -504
  194. data/lib/treat/languages/polish.rb +0 -16
  195. data/lib/treat/languages/portuguese.rb +0 -16
  196. data/lib/treat/languages/russian.rb +0 -16
  197. data/lib/treat/languages/spanish.rb +0 -16
  198. data/lib/treat/languages/swedish.rb +0 -16
  199. data/lib/treat/languages.rb +0 -132
  200. data/lib/treat/lexicalizers.rb +0 -37
  201. data/lib/treat/object.rb +0 -7
  202. data/lib/treat/processors/chunkers/autoselect.rb +0 -16
  203. data/lib/treat/processors/chunkers/txt.rb +0 -21
  204. data/lib/treat/processors.rb +0 -38
  205. data/lib/treat/retrievers.rb +0 -27
  206. data/lib/treat/server.rb +0 -26
  207. data/lib/treat/universalisation/encodings.rb +0 -12
  208. data/lib/treat/universalisation/tags.rb +0 -453
  209. data/lib/treat/universalisation.rb +0 -9
  210. data/spec/languages.rb +0 -25
data/lib/treat/proxies.rb CHANGED
@@ -1,57 +1,106 @@
1
- # Proxies install builders on core Ruby objects,
2
- # so that methods called on them may be passed
3
- # to the entity that can be built from the core
4
- # class instance.
5
- module Treat::Proxies
6
-
1
+ # Proxies install builders on core Ruby objects;
2
+ # when a method defined by Treat is called on these
3
+ # objects, the Ruby object is cast to a Treat entity
4
+ # and the method is called on the resultant type.
5
+ module Treat::Core::Proxies
6
+
7
7
  # Provides a base functionality for proxies.
8
8
  module Proxy
9
-
9
+
10
10
  # Build the entity corresponding to the proxied
11
11
  # object and send the method call to the entity.
12
12
  def method_missing(sym, *args, &block)
13
- if sym == :do || Treat::Categories.lookup(sym)
13
+ if sym == :do || Treat::Workers.lookup(sym)
14
14
  to_entity.send(sym, *args)
15
15
  else
16
16
  super(sym, *args, &block)
17
17
  end
18
18
  end
19
-
19
+
20
20
  # Create an unknown type of entity by default.
21
21
  def to_entity(builder = nil)
22
22
  Treat::Entities::Unknown(self.to_s)
23
23
  end
24
-
24
+
25
25
  end
26
-
26
+
27
27
  # Install Treat functions on String objects.
28
28
  module String
29
-
29
+
30
30
  # Include base proxy functionality.
31
- include Treat::Proxies::Proxy
32
-
31
+ include Treat::Core::Proxies::Proxy
32
+
33
33
  # Return the entity corresponding to the string.
34
34
  def to_entity
35
- Treat::Entities::Entity.from_string(self.to_s)
35
+ Treat::Entities::Entity.from_string(self)
36
36
  end
37
-
37
+
38
38
  end
39
-
39
+
40
40
  # Install Treat functions on Numeric objects.
41
41
  module Numeric
42
-
42
+
43
43
  # Include base proxy functionality.
44
- include Treat::Proxies::Proxy
45
-
44
+ include Treat::Core::Proxies::Proxy
45
+
46
46
  # Return the entity corresponding to the number.
47
47
  def to_entity(builder = nil)
48
48
  Treat::Entities::Number.from_numeric(self)
49
49
  end
50
+
51
+ end
50
52
 
53
+ # Include Treat methods on strings.
54
+ ::String.class_eval do
55
+ include Treat::Core::Proxies::String
51
56
  end
52
57
 
53
- # Include the proxies in the core classes.
54
- ::String.class_eval { include Treat::Proxies::String }
55
- ::Numeric.class_eval { include Treat::Proxies::Numeric }
58
+ # Include Treat methods on numerics.
59
+ ::Numeric.class_eval do
60
+ include Treat::Core::Proxies::Numeric
61
+ end
62
+
63
+ # This is kind of ugly; need to find a
64
+ # better solution eventually (?)
65
+ Treat::Entities::Entity.class_eval do
66
+
67
+ # Rename the true language detection
68
+ # method to :language_proxied, and
69
+ # only call it if language detection
70
+ # is turned on in the configuration.
71
+ alias :language_proxied :language
72
+
73
+ # Proxy the #language method, defined on
74
+ # all textual entities, in order to catch
75
+ # the method call if language detection is
76
+ # turned off and return the default language
77
+ # in that case.
78
+ def language(extractor = nil, options = {})
56
79
 
57
- end
80
+ return Treat.core.language.default if
81
+ !Treat.core.language.detect
82
+
83
+ if is_a?(Treat::Entities::Symbol) ||
84
+ is_a?(Treat::Entities::Number)
85
+ return Treat.core.language.default
86
+ end
87
+
88
+ dlvl = Treat.core.language.detect_at
89
+ dklass = Treat::Entities.const_get(cc(dlvl))
90
+
91
+ if self.class.compare_with(
92
+ dklass) < 1 && has_parent?
93
+ anc = ancestor_with_type(dlvl)
94
+ return anc.language if anc
95
+ end
96
+
97
+ extractor ||= Treat.workers.
98
+ extractors.language.default
99
+
100
+ language_proxied(extractor, options)
101
+
102
+ end
103
+
104
+ end
105
+
106
+ end
@@ -0,0 +1,3 @@
1
+ module Treat
2
+ VERSION = "1.1.0"
3
+ end
@@ -1,7 +1,7 @@
1
1
  # This retrieves a supplied number of keywords
2
2
  # by selecting the N words with the highest TF*IDF
3
3
  # for each document.
4
- class Treat::Extractors::Keywords::TfIdf
4
+ class Treat::Workers::Extractors::Keywords::TfIdf
5
5
 
6
6
  # Default options - retrieve 5 keywords.
7
7
  DefaultOptions = { :number => 5 }
@@ -1,4 +1,4 @@
1
- module Treat::Extractors::Language
1
+ module Treat::Workers::Extractors::Language
2
2
 
3
3
  # Adaptor for the 'whatlanguage' gem, which
4
4
  # performs probabilistic language detection.
@@ -15,7 +15,7 @@ module Treat::Extractors::Language
15
15
 
16
16
  # By default, bias towards common languages.
17
17
  DefaultOptions = {
18
- :bias => [:eng, :fre, :chi, :ger, :ara, :spa]
18
+ :bias_toward => [:english, :french, :chinese, :german, :arabic, :spanish]
19
19
  }
20
20
 
21
21
  # Keep only one instance of the gem class.
@@ -32,21 +32,28 @@ module Treat::Extractors::Language
32
32
  # toward when more than one language is detected
33
33
  # with equal probability.
34
34
  def self.language(entity, options = {})
35
+
35
36
  options = DefaultOptions.merge(options)
37
+
36
38
  @@detector ||= ::WhatLanguage.new(:possibilities)
37
39
  possibilities = @@detector.process_text(entity.to_s)
38
40
  lang = {}
41
+
39
42
  possibilities.each do |k,v|
40
- lang[Treat::Languages.code(k)] = v
43
+ lang[k.intern] = v
41
44
  end
45
+
42
46
  max = lang.values.max
43
47
  ordered = lang.select { |i,j| j == max }.keys
48
+
44
49
  ordered.each do |l|
45
- if options[:bias].include?(l)
50
+ if options[:bias_toward].include?(l)
46
51
  return l
47
52
  end
48
53
  end
54
+
49
55
  return ordered.first
56
+
50
57
  end
51
58
 
52
59
  end
@@ -1,7 +1,7 @@
1
1
  # Detects the named entity tag in sentences by using
2
2
  # the stanford-core-nlp gem, which interfaces with
3
3
  # the Stanford Deterministic Coreference Resolver.
4
- class Treat::Extractors::NameTag::Stanford
4
+ class Treat::Workers::Extractors::NameTag::Stanford
5
5
 
6
6
  require 'treat/loaders/stanford'
7
7
  Treat::Loaders::Stanford.load
@@ -13,9 +13,8 @@ class Treat::Extractors::NameTag::Stanford
13
13
  pp = nil
14
14
 
15
15
  lang = entity.language
16
-
17
- language = Treat::Languages.describe(lang)
18
- Treat::Loaders::Stanford.load(language)
16
+
17
+ Treat::Loaders::Stanford.load(lang)
19
18
 
20
19
  isolated_token = entity.is_a?(Treat::Entities::Token)
21
20
  tokens = isolated_token ? [entity] : entity.tokens
@@ -1,5 +1,5 @@
1
1
  # Calculates the TF*IDF score of words.
2
- module Treat::Extractors::TfIdf::Native
2
+ module Treat::Workers::Extractors::TfIdf::Native
3
3
  DefaultOptions = {
4
4
  :tf => :natural,
5
5
  :idf => :logarithm,
@@ -24,10 +24,9 @@ module Treat::Extractors::TfIdf::Native
24
24
  @@wc = {} # Number of words in a given document (word count).
25
25
  @@cw = {} # Common words to filter out.
26
26
  def self.tf_idf(entity, options={})
27
- l = Treat::Languages.get(entity.language)
28
- if l.const_defined?(:CommonWords)
29
- @@cw[entity.language] =
30
- l.const_get(:CommonWords)
27
+ l = Treat.languages.send(entity.language)
28
+ if l.stop_words
29
+ @@cw[entity.language] = l.stop_words.list
31
30
  return 0 if @@cw[entity.language].include?(entity.value)
32
31
  end
33
32
  return 0 if entity.value.length <= 2
@@ -2,7 +2,7 @@
2
2
  # date information.
3
3
  #
4
4
  # Project website: http://chronic.rubyforge.org/
5
- class Treat::Extractors::Time::Chronic
5
+ class Treat::Workers::Extractors::Time::Chronic
6
6
 
7
7
  # Require the 'chronic' gem.
8
8
  silence_warnings { require 'chronic' }
@@ -18,7 +18,7 @@
18
18
  # - datemonthly: "pay credit card bill on the 22nd of each month"
19
19
  #
20
20
  # Project website: http://naturalinputs.com/
21
- class Treat::Extractors::Time::Nickel
21
+ class Treat::Workers::Extractors::Time::Nickel
22
22
 
23
23
  require 'date'
24
24
 
@@ -1,5 +1,5 @@
1
1
  # A wrapper for Ruby's native date/time parsing.
2
- class Treat::Extractors::Time::Ruby
2
+ class Treat::Workers::Extractors::Time::Ruby
3
3
 
4
4
  # Require Ruby's date module.
5
5
  require 'date'
@@ -8,7 +8,7 @@
8
8
  # Machine Learning Research. 3 (Mar. 2003), 993-1022.
9
9
  #
10
10
  # Project website: https://github.com/ealdent/lda-ruby
11
- module Treat::Extractors::TopicWords::LDA
11
+ module Treat::Workers::Extractors::TopicWords::LDA
12
12
 
13
13
  # Require the lda-ruby gem.
14
14
  silence_warnings { require 'lda-ruby' }
@@ -6,7 +6,7 @@
6
6
  #
7
7
  # Original project website:
8
8
  # http://www.markwatson.com/opensource/
9
- module Treat::Extractors::Topics::Reuters
9
+ module Treat::Workers::Extractors::Topics::Reuters
10
10
 
11
11
  # Require the Nokogiri XML parser.
12
12
  require 'nokogiri'
@@ -46,11 +46,11 @@ module Treat::Extractors::Topics::Reuters
46
46
  # Read the topics from the XML files.
47
47
  def self.get_topics
48
48
  return unless @@industry.size == 0
49
- @@industry = read_xml(Treat.models +
49
+ @@industry = read_xml(Treat.paths.models +
50
50
  'reuters/industry.xml')
51
- @@region = read_xml(Treat.models +
51
+ @@region = read_xml(Treat.paths.models +
52
52
  'reuters/region.xml')
53
- @@topics = read_xml(Treat.models +
53
+ @@topics = read_xml(Treat.paths.models +
54
54
  'reuters/topics.xml')
55
55
  end
56
56
 
@@ -8,7 +8,7 @@
8
8
  # Todo: reimplement with Nokogiri and use
9
9
  # XML node information to better translate
10
10
  # the format of the text.
11
- class Treat::Formatters::Readers::ABW
11
+ class Treat::Workers::Formatters::Readers::ABW
12
12
 
13
13
  silence_warnings do
14
14
  require 'rexml/document'
@@ -25,7 +25,7 @@ class Treat::Formatters::Readers::ABW
25
25
  IO.read(document.file), xml_h)
26
26
 
27
27
  document.value = xml_h.plain_text
28
- document.set :format, :abw_word
28
+ document.set :format, 'abw'
29
29
  document
30
30
 
31
31
  end
@@ -1,9 +1,9 @@
1
- class Treat::Formatters::Readers::Autoselect
1
+ class Treat::Workers::Formatters::Readers::Autoselect
2
2
 
3
3
  ExtensionRegexp = /^.*?\.([a-zA-Z0-9]{2,5})$/
4
4
  ImageExtensions = ['gif', 'jpg', 'jpeg', 'png']
5
5
  DefaultOptions = {
6
- :default_to => :txt
6
+ :default_to => 'txt'
7
7
  }
8
8
 
9
9
  # Choose a reader to use.
@@ -16,6 +16,7 @@ class Treat::Formatters::Readers::Autoselect
16
16
  end
17
17
 
18
18
  def self.detect_format(filename, default_to = nil)
19
+
19
20
  default_to ||= DefaultOptions[:default_to]
20
21
  ext = filename.scan(ExtensionRegexp)
21
22
  ext = (ext.is_a?(Array) && ext[0] && ext[0][0]) ? ext[0][0] : ''
@@ -25,7 +26,13 @@ class Treat::Formatters::Readers::Autoselect
25
26
  format = 'yaml' if format == 'yml'
26
27
 
27
28
  format = default_to if format.to_s == ''
28
-
29
+
30
+ begin
31
+ Treat::Workers::Formatters::Readers.const_get(cc(format))
32
+ rescue Treat::Exception
33
+ format = default_to
34
+ end
35
+
29
36
  format.intern
30
37
 
31
38
  end
@@ -1,5 +1,5 @@
1
1
  # A wrapper for the 'antiword' command-line utility.
2
- class Treat::Formatters::Readers::DOC
2
+ class Treat::Workers::Formatters::Readers::DOC
3
3
 
4
4
  # Extract the readable text from a DOC file
5
5
  # using the antiword command-line utility.
@@ -13,7 +13,7 @@ class Treat::Formatters::Readers::DOC
13
13
  f.gsub!('#keep#', "\n\n")
14
14
 
15
15
  document.value = f
16
- document.set :format, :doc
16
+ document.set :format, 'doc'
17
17
  document
18
18
 
19
19
  end
@@ -4,8 +4,8 @@
4
4
  #
5
5
  # Project homepage:
6
6
  # https://github.com/iterationlabs/ruby-readability
7
- class Treat::Formatters::Readers::HTML
8
-
7
+ class Treat::Workers::Formatters::Readers::HTML
8
+
9
9
  silence_warnings { require 'ruby-readability' }
10
10
 
11
11
  # By default, don't backup the original HTML
@@ -45,9 +45,9 @@ class Treat::Formatters::Readers::HTML
45
45
  html.gsub!(/<!--[^>]*-->/m, '')
46
46
  d = Readability::Document.new(html, options)
47
47
  document.value = "<h1>#{d.title}</h1>\n" + d.content
48
- document.set :format, :html
48
+ document.set :format, 'html'
49
49
  end
50
-
50
+
51
51
  document
52
52
 
53
53
  end
@@ -11,7 +11,7 @@
11
11
  #
12
12
  # Breuel, Thomas M. The Ocropus Open Source OCR System.
13
13
  # DFKI and U. Kaiserslautern, Germany.
14
- class Treat::Formatters::Readers::Image
14
+ class Treat::Workers::Formatters::Readers::Image
15
15
 
16
16
  # Read a file using the Google Ocropus reader.
17
17
  #
@@ -29,7 +29,7 @@ class Treat::Formatters::Readers::Image
29
29
  doc.set :file, "#{tmp}/output.html"
30
30
  doc = doc.read(:html)
31
31
  doc.set :file, f
32
- doc.set :format, :image
32
+ doc.set :format, 'image'
33
33
  end
34
34
  end
35
35
 
@@ -10,7 +10,7 @@
10
10
  # Todo: reimplement with Nokogiri and use
11
11
  # XML node information to better translate
12
12
  # the format of the text.
13
- class Treat::Formatters::Readers::ODT
13
+ class Treat::Workers::Formatters::Readers::ODT
14
14
 
15
15
  # Require the 'zip' gem to unarchive the ODT files
16
16
  silence_warnings { require 'zip' }
@@ -30,7 +30,7 @@ class Treat::Formatters::Readers::ODT
30
30
  REXML::Document.parse_stream(f, xml_h)
31
31
 
32
32
  document.value = xml_h.plain_text
33
- document.set :format, :odt_office
33
+ document.set :format, 'odt'
34
34
  document
35
35
 
36
36
  end
@@ -1,7 +1,7 @@
1
1
  # encoding: utf-8
2
2
  # A wrapper for the Poppler pdf2text utility, which
3
3
  # extracts the text from a PDF file.
4
- module Treat::Formatters::Readers::PDF
4
+ module Treat::Workers::Formatters::Readers::PDF
5
5
 
6
6
  # Read a PDF file using the Poppler pdf2text utility.
7
7
  #
@@ -21,7 +21,7 @@ module Treat::Formatters::Readers::PDF
21
21
  f.gsub!('#keep#', "\n\n")
22
22
 
23
23
  document.value = f
24
- document.set :format, :pdf
24
+ document.set :format, 'pdf'
25
25
  document
26
26
 
27
27
  end
@@ -1,5 +1,5 @@
1
1
  # This class simply reads a plain text file.
2
- class Treat::Formatters::Readers::TXT
2
+ class Treat::Workers::Formatters::Readers::TXT
3
3
 
4
4
  # Build an entity from a string
5
5
  # in plain text format.
@@ -7,7 +7,7 @@ class Treat::Formatters::Readers::TXT
7
7
  # Options: none.
8
8
  def self.read(document, options = {})
9
9
  document.value = File.read(document.file)
10
- document.set :format, :txt
10
+ document.set :format, 'txt'
11
11
  document
12
12
  end
13
13
 
@@ -1,4 +1,4 @@
1
- class Treat::Formatters::Readers::XML
1
+ class Treat::Workers::Formatters::Readers::XML
2
2
 
3
3
  require 'treat/loaders/stanford'
4
4
  Treat::Loaders::Stanford.load
@@ -70,7 +70,7 @@ class Treat::Formatters::Readers::XML
70
70
 
71
71
  end
72
72
 
73
- document.set :format, :xml
73
+ document.set :format, 'xml'
74
74
  document
75
75
 
76
76
  end
@@ -0,0 +1,60 @@
1
+ # Stores an entity in a Mongo collection.
2
+ class Treat::Workers::Formatters::Serializers::Mongo
3
+
4
+ # Require the Mongo DB driver.
5
+ require 'mongo'
6
+
7
+ DefaultOptions = {
8
+ :recursive => true,
9
+ :stop_at => :token
10
+ }
11
+
12
+ def self.serialize(entity, options = {})
13
+
14
+ options = DefaultOptions.merge(options)
15
+ stop_at = options[:stop_at] ?
16
+ Treat::Entities.const_get(
17
+ options[:stop_at].to_s.capitalize) :
18
+ Treat::Entities::Token
19
+
20
+ if !Treat.databases.mongo.db && !options[:db]
21
+ raise Treat::Exception,
22
+ 'Must supply the database name in config. ' +
23
+ '(Treat.databases.mongo.db = ...) or pass ' +
24
+ 'it as a parameter to #serialize.'
25
+ end
26
+
27
+ @@database ||= Mongo::Connection.
28
+ new(Treat.databases.mongo.host).
29
+ db(Treat.databases.mongo.db || options[:db])
30
+
31
+ type = cl(entity.class.superclass).downcase
32
+ type = entity.type.to_s if type == 'entity'
33
+ types = type + 's'
34
+
35
+ coll = @@database.collection(types)
36
+
37
+ entity_token = {
38
+ :id => entity.id,
39
+ :value => entity.value,
40
+ :string => entity.to_s,
41
+ :type => entity.type,
42
+ :children => entity.children.map { |c| [c.id, c.type] },
43
+ :parent => (entity.has_parent? ? entity.parent.id : nil),
44
+ :features => entity.features
45
+ }
46
+
47
+ coll.insert(entity_token)
48
+
49
+ if options[:recursive] && entity.has_children?
50
+ entity.each do |child|
51
+ next if child.class.compare_with(stop_at) < 0
52
+ self.serialize(child, options)
53
+ end
54
+ end
55
+
56
+ end
57
+
58
+ end
59
+
60
+
@@ -1,5 +1,5 @@
1
1
  # This class converts an entity to a storable XML format.
2
- class Treat::Formatters::Serializers::XML
2
+ class Treat::Workers::Formatters::Serializers::XML
3
3
 
4
4
  # Require the Nokogiri XML parser.
5
5
  require 'nokogiri'
@@ -74,7 +74,6 @@ class Treat::Formatters::Serializers::XML
74
74
  f.write(string)
75
75
  end
76
76
  end
77
- # puts string
78
77
  end
79
78
  string
80
79
  end
@@ -1,5 +1,5 @@
1
1
  # This class serializes entities in YAML format.
2
- class Treat::Formatters::Serializers::YAML
2
+ class Treat::Workers::Formatters::Serializers::YAML
3
3
 
4
4
  silence_warnings do
5
5
  # Require the Psych YAML serializer.
@@ -1,4 +1,4 @@
1
- class Treat::Formatters::Unserializers::Autoselect
1
+ class Treat::Workers::Formatters::Unserializers::Autoselect
2
2
 
3
3
  def self.unserialize(document, options = {})
4
4
  file = document.file
@@ -6,6 +6,8 @@ class Treat::Formatters::Unserializers::Autoselect
6
6
  document.unserialize(:yaml, options)
7
7
  elsif file.index('xml')
8
8
  document.unserialize(:xml, options)
9
+ elsif file.index('mongo')
10
+ document.unserialize(:mongo, options)
9
11
  else
10
12
  raise Treat::Exception,
11
13
  "Unreadable serialized format for file #{file}."
@@ -0,0 +1,80 @@
1
+ module Treat::Workers::Formatters::Unserializers::Mongo
2
+
3
+ DefaultOptions = {
4
+ :recursive => true,
5
+ :stop_at => nil
6
+ }
7
+
8
+ require 'mongo'
9
+
10
+ def self.unserialize(entity, options={})
11
+
12
+ options = DefaultOptions.merge(options)
13
+ options[:stop_at] = options[:stop_at] ?
14
+ Treat::Entities.const_get(
15
+ options[:stop_at].to_s.capitalize) :
16
+ Treat::Entities::Token
17
+
18
+ if !Treat.databases.mongo.db && !options[:db]
19
+ raise Treat::Exception,
20
+ 'Must supply the database name in config. ' +
21
+ '(Treat.databases.mongo.db = ...) or pass ' +
22
+ 'it as a parameter to #unserialize.'
23
+ end
24
+
25
+ @@database ||= Mongo::Connection.
26
+ new(Treat.databases.mongo.host).
27
+ db(Treat.databases.mongo.db || options[:db])
28
+
29
+ self.do_unserialize(entity, options)
30
+
31
+ end
32
+
33
+ def self.do_unserialize(entity, options)
34
+
35
+ supertype = cl(Treat::Entities.const_get(
36
+ entity.type.to_s.capitalize.intern).superclass).downcase
37
+ supertype = entity.type.to_s if supertype == 'entity'
38
+ supertypes = supertype + 's'
39
+
40
+ coll = @@database.collection(supertypes)
41
+ record = coll.find_one(:id => entity.id)
42
+
43
+ unless record
44
+ raise Treat::Exception,
45
+ "Couldn't find record ID #{entity.id}."
46
+ end
47
+
48
+ # Convert feature keys to symbols.
49
+ features = record['features']
50
+ new_feat = {}
51
+ features.each do |feature, value|
52
+ new_feat[feature.intern] = value
53
+ end
54
+ entity.features = new_feat
55
+
56
+ # Set the entity's value.
57
+ entity.value = record['value']
58
+
59
+ if entity.class.compare_with(
60
+ options[:stop_at]) == 0
61
+ entity.value = record['string']
62
+ end
63
+
64
+ return entity unless options[:recursive]
65
+
66
+ record['children'].each do |c|
67
+ cid, ctype = *c
68
+ cklass = Treat::Entities.const_get(
69
+ ctype.capitalize.intern)
70
+ next if cklass.compare_with(
71
+ options[:stop_at]) < 0
72
+ entity << self.do_unserialize(
73
+ cklass.new('', cid), options)
74
+ end
75
+
76
+ entity
77
+
78
+ end
79
+
80
+ end
@@ -1,6 +1,6 @@
1
1
  # Recreates the entity tree corresponding to
2
2
  # a serialized XML file.
3
- module Treat::Formatters::Unserializers::XML
3
+ module Treat::Workers::Formatters::Unserializers::XML
4
4
 
5
5
  require 'nokogiri'
6
6
 
@@ -78,7 +78,7 @@ module Treat::Formatters::Unserializers::XML
78
78
  current_value = ''
79
79
  type = xml_reader.name.intern
80
80
 
81
- if Treat::Entities.list.include?(type)
81
+ if Treat.core.entities.list.include?(type)
82
82
  if !current_element
83
83
  current_element = self.revive(type, current_value, id)
84
84
  else