treat 1.2.0 → 2.0.0rc1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (217) hide show
  1. data/LICENSE +2 -2
  2. data/README.md +12 -21
  3. data/lib/treat/autoload.rb +44 -0
  4. data/lib/treat/config/config.rb +38 -0
  5. data/lib/treat/config/configurable.rb +51 -0
  6. data/lib/treat/config/data/config.rb +50 -0
  7. data/lib/treat/config/data/core.rb +52 -0
  8. data/lib/treat/config/data/databases.rb +10 -0
  9. data/lib/treat/config/data/entities.rb +15 -0
  10. data/lib/treat/config/data/languages/agnostic.rb +31 -0
  11. data/lib/treat/config/{languages → data/languages}/arabic.rb +0 -0
  12. data/lib/treat/config/{languages → data/languages}/chinese.rb +0 -0
  13. data/lib/treat/config/{languages → data/languages}/dutch.rb +1 -1
  14. data/lib/treat/config/data/languages/english.rb +95 -0
  15. data/lib/treat/config/data/languages/french.rb +148 -0
  16. data/lib/treat/config/data/languages/german.rb +135 -0
  17. data/lib/treat/config/{languages → data/languages}/greek.rb +1 -1
  18. data/lib/treat/config/data/languages/italian.rb +162 -0
  19. data/lib/treat/config/data/languages/polish.rb +11 -0
  20. data/lib/treat/config/{languages → data/languages}/portuguese.rb +1 -1
  21. data/lib/treat/config/{languages → data/languages}/russian.rb +1 -1
  22. data/lib/treat/config/data/languages/spanish.rb +291 -0
  23. data/lib/treat/config/data/languages/swedish.rb +289 -0
  24. data/lib/treat/config/data/libraries.rb +12 -0
  25. data/lib/treat/config/data/linguistics.rb +44 -0
  26. data/lib/treat/config/data/tags.rb +328 -0
  27. data/lib/treat/config/{workers → data/workers}/extractors.rb +2 -10
  28. data/lib/treat/config/{workers → data/workers}/formatters.rb +0 -0
  29. data/lib/treat/config/{workers → data/workers}/inflectors.rb +0 -0
  30. data/lib/treat/config/{workers → data/workers}/learners.rb +0 -0
  31. data/lib/treat/config/{workers → data/workers}/lexicalizers.rb +4 -3
  32. data/lib/treat/config/{workers → data/workers}/processors.rb +3 -3
  33. data/lib/treat/config/{workers → data/workers}/retrievers.rb +0 -0
  34. data/lib/treat/config/importable.rb +31 -0
  35. data/lib/treat/config/paths.rb +23 -0
  36. data/lib/treat/config/tags.rb +37 -0
  37. data/lib/treat/core/dsl.rb +55 -0
  38. data/lib/treat/{installer.rb → core/installer.rb} +10 -12
  39. data/lib/treat/core/server.rb +40 -0
  40. data/lib/treat/entities/entities.rb +101 -0
  41. data/lib/treat/entities/{abilities/doable.rb → entity/applicable.rb} +5 -3
  42. data/lib/treat/entities/{abilities → entity}/buildable.rb +118 -63
  43. data/lib/treat/entities/{abilities → entity}/checkable.rb +2 -2
  44. data/lib/treat/entities/{abilities → entity}/comparable.rb +6 -6
  45. data/lib/treat/entities/{abilities → entity}/countable.rb +2 -1
  46. data/lib/treat/entities/entity/debuggable.rb +86 -0
  47. data/lib/treat/entities/{abilities → entity}/delegatable.rb +16 -26
  48. data/lib/treat/entities/{abilities → entity}/exportable.rb +2 -2
  49. data/lib/treat/entities/{abilities → entity}/iterable.rb +4 -16
  50. data/lib/treat/entities/{abilities → entity}/magical.rb +22 -17
  51. data/lib/treat/entities/entity/registrable.rb +36 -0
  52. data/lib/treat/entities/{abilities → entity}/stringable.rb +18 -15
  53. data/lib/treat/entities/entity.rb +86 -77
  54. data/lib/treat/exception.rb +3 -0
  55. data/lib/treat/helpers/hash.rb +29 -0
  56. data/lib/treat/helpers/help.rb +35 -0
  57. data/lib/treat/helpers/object.rb +55 -0
  58. data/lib/treat/helpers/string.rb +124 -0
  59. data/lib/treat/{core → learning}/data_set.rb +11 -11
  60. data/lib/treat/{core → learning}/export.rb +3 -3
  61. data/lib/treat/{core → learning}/problem.rb +26 -16
  62. data/lib/treat/{core → learning}/question.rb +5 -9
  63. data/lib/treat/loaders/linguistics.rb +8 -9
  64. data/lib/treat/loaders/stanford.rb +5 -11
  65. data/lib/treat/modules.rb +33 -0
  66. data/lib/treat/proxies/array.rb +27 -0
  67. data/lib/treat/proxies/language.rb +47 -0
  68. data/lib/treat/proxies/number.rb +18 -0
  69. data/lib/treat/proxies/proxy.rb +25 -0
  70. data/lib/treat/proxies/string.rb +18 -0
  71. data/lib/treat/version.rb +10 -1
  72. data/lib/treat/{workers.rb → workers/categorizable.rb} +18 -19
  73. data/lib/treat/workers/extractors/keywords/tf_idf.rb +11 -11
  74. data/lib/treat/workers/extractors/language/what_language.rb +8 -6
  75. data/lib/treat/workers/extractors/name_tag/stanford.rb +10 -4
  76. data/lib/treat/workers/extractors/similarity/levenshtein.rb +36 -0
  77. data/lib/treat/workers/extractors/similarity/tf_idf.rb +27 -0
  78. data/lib/treat/workers/extractors/tf_idf/native.rb +4 -4
  79. data/lib/treat/workers/extractors/time/chronic.rb +2 -4
  80. data/lib/treat/workers/extractors/time/nickel.rb +19 -20
  81. data/lib/treat/workers/extractors/time/ruby.rb +2 -1
  82. data/lib/treat/workers/extractors/topic_words/lda.rb +12 -12
  83. data/lib/treat/workers/extractors/topics/reuters.rb +9 -13
  84. data/lib/treat/workers/formatters/readers/autoselect.rb +1 -1
  85. data/lib/treat/workers/formatters/readers/image.rb +19 -9
  86. data/lib/treat/workers/formatters/readers/odt.rb +2 -1
  87. data/lib/treat/workers/formatters/readers/pdf.rb +20 -3
  88. data/lib/treat/workers/formatters/readers/xml.rb +0 -1
  89. data/lib/treat/workers/formatters/serializers/mongo.rb +10 -20
  90. data/lib/treat/workers/formatters/serializers/xml.rb +17 -26
  91. data/lib/treat/workers/formatters/serializers/yaml.rb +5 -4
  92. data/lib/treat/workers/formatters/unserializers/mongo.rb +4 -4
  93. data/lib/treat/workers/formatters/unserializers/xml.rb +3 -4
  94. data/lib/treat/workers/formatters/unserializers/yaml.rb +3 -4
  95. data/lib/treat/workers/formatters/visualizers/dot.rb +1 -0
  96. data/lib/treat/workers/formatters/visualizers/standoff.rb +2 -3
  97. data/lib/treat/workers/formatters/visualizers/tree.rb +2 -3
  98. data/lib/treat/workers/{group.rb → groupable.rb} +9 -9
  99. data/lib/treat/workers/inflectors/cardinalizers/linguistics.rb +1 -3
  100. data/lib/treat/workers/inflectors/conjugators/linguistics.rb +5 -7
  101. data/lib/treat/workers/inflectors/declensors/english.rb +13 -20
  102. data/lib/treat/workers/inflectors/declensors/linguistics.rb +29 -28
  103. data/lib/treat/workers/inflectors/ordinalizers/linguistics.rb +0 -2
  104. data/lib/treat/workers/inflectors/stemmers/porter.rb +8 -10
  105. data/lib/treat/workers/inflectors/stemmers/porter_c.rb +7 -7
  106. data/lib/treat/workers/inflectors/stemmers/uea.rb +3 -8
  107. data/lib/treat/workers/learners/classifiers/id3.rb +17 -14
  108. data/lib/treat/workers/learners/classifiers/linear.rb +15 -27
  109. data/lib/treat/workers/learners/classifiers/mlp.rb +32 -19
  110. data/lib/treat/workers/learners/classifiers/svm.rb +28 -21
  111. data/lib/treat/workers/lexicalizers/categorizers/from_tag.rb +19 -3
  112. data/lib/treat/workers/lexicalizers/sensers/wordnet.rb +15 -7
  113. data/lib/treat/workers/lexicalizers/taggers/brill/patch.rb +4 -1
  114. data/lib/treat/workers/lexicalizers/taggers/brill.rb +8 -19
  115. data/lib/treat/workers/lexicalizers/taggers/lingua.rb +4 -15
  116. data/lib/treat/workers/lexicalizers/taggers/stanford.rb +22 -13
  117. data/lib/treat/workers/processors/chunkers/autoselect.rb +2 -3
  118. data/lib/treat/workers/processors/chunkers/html.rb +1 -6
  119. data/lib/treat/workers/processors/parsers/enju.rb +2 -4
  120. data/lib/treat/workers/processors/parsers/stanford.rb +13 -7
  121. data/lib/treat/workers/processors/segmenters/punkt.rb +25 -11
  122. data/lib/treat/workers/processors/segmenters/scalpel.rb +20 -0
  123. data/lib/treat/workers/processors/segmenters/srx.rb +42 -0
  124. data/lib/treat/workers/processors/segmenters/stanford.rb +5 -5
  125. data/lib/treat/workers/processors/segmenters/tactful.rb +21 -11
  126. data/lib/treat/workers/processors/tokenizers/ptb.rb +40 -30
  127. data/lib/treat/workers/processors/tokenizers/punkt.rb +14 -19
  128. data/lib/treat/workers/processors/tokenizers/stanford.rb +38 -22
  129. data/lib/treat/workers/retrievers/indexers/ferret.rb +6 -3
  130. data/lib/treat/workers/retrievers/searchers/ferret.rb +2 -2
  131. data/lib/treat/workers/workers.rb +6 -0
  132. data/lib/treat.rb +18 -32
  133. data/models/MANIFEST +1 -0
  134. data/spec/core/data_set.rb +174 -0
  135. data/spec/core/export.rb +52 -0
  136. data/spec/core/problem.rb +144 -0
  137. data/spec/core/question.rb +52 -0
  138. data/spec/{collection.rb → entities/collection.rb} +20 -35
  139. data/spec/{document.rb → entities/document.rb} +3 -54
  140. data/spec/{entity.rb → entities/entity.rb} +10 -9
  141. data/spec/entities/phrase.rb +33 -0
  142. data/spec/{token.rb → entities/token.rb} +0 -57
  143. data/spec/entities/word.rb +3 -0
  144. data/spec/{zone.rb → entities/zone.rb} +0 -26
  145. data/spec/helper.rb +116 -32
  146. data/spec/sandbox.rb +258 -25
  147. data/spec/treat.rb +26 -34
  148. data/spec/workers/agnostic.rb +137 -0
  149. data/spec/workers/english.rb +194 -0
  150. data/spec/workers/examples/english/economist/hungarys_troubles.txt +46 -0
  151. data/spec/workers/examples/english/economist/saving_the_euro.odt +0 -0
  152. data/spec/{samples → workers/examples/english}/mathematicians/archimedes.abw +0 -0
  153. data/spec/{samples → workers/examples/english}/mathematicians/euler.html +0 -0
  154. data/spec/{samples → workers/examples/english}/mathematicians/gauss.pdf +0 -0
  155. data/spec/{samples → workers/examples/english}/mathematicians/leibniz.txt +0 -0
  156. data/spec/{samples → workers/examples/english}/mathematicians/newton.doc +0 -0
  157. data/spec/workers/examples/english/phrase.xml +5 -0
  158. data/spec/workers/examples/english/test.txt +1 -0
  159. data/spec/workers/language.rb +280 -0
  160. data/spec/workers.rb +28 -0
  161. metadata +122 -105
  162. data/lib/treat/config/core/acronyms.rb +0 -5
  163. data/lib/treat/config/core/encodings.rb +0 -8
  164. data/lib/treat/config/core/entities.rb +0 -2
  165. data/lib/treat/config/core/language.rb +0 -3
  166. data/lib/treat/config/core/paths.rb +0 -8
  167. data/lib/treat/config/core/syntax.rb +0 -1
  168. data/lib/treat/config/core/verbosity.rb +0 -1
  169. data/lib/treat/config/databases/default.rb +0 -1
  170. data/lib/treat/config/databases/mongo.rb +0 -1
  171. data/lib/treat/config/languages/agnostic.rb +0 -34
  172. data/lib/treat/config/languages/english.rb +0 -60
  173. data/lib/treat/config/languages/french.rb +0 -18
  174. data/lib/treat/config/languages/german.rb +0 -18
  175. data/lib/treat/config/languages/italian.rb +0 -12
  176. data/lib/treat/config/languages/polish.rb +0 -12
  177. data/lib/treat/config/languages/spanish.rb +0 -12
  178. data/lib/treat/config/languages/swedish.rb +0 -12
  179. data/lib/treat/config/libraries/punkt.rb +0 -1
  180. data/lib/treat/config/libraries/reuters.rb +0 -1
  181. data/lib/treat/config/libraries/stanford.rb +0 -1
  182. data/lib/treat/config/linguistics/categories.rb +0 -4
  183. data/lib/treat/config/linguistics/punctuation.rb +0 -33
  184. data/lib/treat/config/tags/aligned.rb +0 -221
  185. data/lib/treat/config/tags/enju.rb +0 -71
  186. data/lib/treat/config/tags/paris7.rb +0 -17
  187. data/lib/treat/config/tags/ptb.rb +0 -15
  188. data/lib/treat/config/workers/list.rb +0 -1
  189. data/lib/treat/config.rb +0 -135
  190. data/lib/treat/core.rb +0 -5
  191. data/lib/treat/entities/abilities/copyable.rb +0 -47
  192. data/lib/treat/entities/abilities/debuggable.rb +0 -83
  193. data/lib/treat/entities/abilities/registrable.rb +0 -46
  194. data/lib/treat/entities/collection.rb +0 -40
  195. data/lib/treat/entities/document.rb +0 -10
  196. data/lib/treat/entities/group.rb +0 -18
  197. data/lib/treat/entities/section.rb +0 -13
  198. data/lib/treat/entities/token.rb +0 -47
  199. data/lib/treat/entities/zone.rb +0 -12
  200. data/lib/treat/entities.rb +0 -6
  201. data/lib/treat/helpers/didyoumean.rb +0 -57
  202. data/lib/treat/helpers/escaping.rb +0 -15
  203. data/lib/treat/helpers/formatting.rb +0 -41
  204. data/lib/treat/helpers/objtohash.rb +0 -8
  205. data/lib/treat/helpers/platform.rb +0 -15
  206. data/lib/treat/helpers/reflection.rb +0 -17
  207. data/lib/treat/helpers/temporary.rb +0 -27
  208. data/lib/treat/helpers/verbosity.rb +0 -19
  209. data/lib/treat/helpers.rb +0 -5
  210. data/lib/treat/loaders.rb +0 -10
  211. data/lib/treat/proxies.rb +0 -106
  212. data/lib/treat/workers/formatters/unserializers/autoselect.rb +0 -17
  213. data/lib/treat/workers/inflectors/declensors/active_support.rb +0 -31
  214. data/lib/treat/workers/processors/tokenizers/tactful.rb +0 -68
  215. data/spec/core.rb +0 -441
  216. data/spec/phrase.rb +0 -112
  217. data/spec/word.rb +0 -111
@@ -1,9 +1,15 @@
1
- # Detects the named entity tag in sentences by using
2
- # the stanford-core-nlp gem, which interfaces with
3
- # the Stanford Deterministic Coreference Resolver.
1
+ # Named entity tag extraction using the Stanford NLP
2
+ # Deterministic Coreference Resolver, which implements a
3
+ # multi-pass sieve coreference resolution (or anaphora
4
+ # resolution) system.
5
+ #
6
+ # Original paper: Heeyoung Lee, Yves Peirsman, Angel
7
+ # Chang, Nathanael Chambers, Mihai Surdeanu, Dan Jurafsky.
8
+ # Stanford's Multi-Pass Sieve Coreference Resolution
9
+ # System at the CoNLL-2011 Shared Task. In Proceedings
10
+ # of the CoNLL-2011 Shared Task, 2011.
4
11
  class Treat::Workers::Extractors::NameTag::Stanford
5
12
 
6
- require 'treat/loaders/stanford'
7
13
  Treat::Loaders::Stanford.load
8
14
 
9
15
  @@classifiers = {}
@@ -0,0 +1,36 @@
1
# Levenshtein (edit) distance between an entity and another string.
#
# NOTE(review): the file path is similarity/levenshtein.rb, but this
# opens the Similarity constant itself rather than a nested Levenshtein
# class (compare Similarity::TfIdf) -- confirm the intended name.
class Treat::Workers::Extractors::Similarity

  # Default options: the string to compare with, and the cost
  # of each elementary edit operation.
  DefaultOptions = {
    with: '',
    ins_cost: 1,
    del_cost: 1,
    sub_cost: 1
  }

  # Return the Levenshtein distance between two strings,
  # taking into account the costs of insertion, deletion,
  # and substitution. Used by did_you_mean? to detect typos.
  #
  # entity  - any object; compared through its #to_s value.
  # options - Hash of options:
  #   :with     - the object to compare with (through #to_s).
  #   :ins_cost - cost of inserting one character.
  #   :del_cost - cost of deleting one character.
  #   :sub_cost - cost of substituting one character.
  #
  # Returns the minimal total cost of turning options[:with]
  # into the entity's string value.
  def self.similarity(entity, options)
    options = DefaultOptions.merge(options)
    first, other = entity.to_s, options[:with].to_s
    ins, del, sub = options.values_at(:ins_cost, :del_cost, :sub_cost)

    # Row 0: building each prefix of +first+ from the empty
    # string takes one insertion per character.
    previous = (0..first.length).map { |j| j * ins }

    (1..other.length).each do |i|
      # Column 0: reducing a prefix of +other+ to the empty
      # string takes one deletion per character.
      current = [i * del]
      (1..first.length).each do |j|
        # Columns index +first+ and rows index +other+.
        swap = first[j - 1] == other[i - 1] ? 0 : sub
        current[j] = [
          previous[j - 1] + swap,
          current[j - 1] + ins,
          previous[j] + del
        ].min
      end
      # Only the previous row is ever read, so keep two rows.
      previous = current
    end

    previous[first.length]
  end

end
@@ -0,0 +1,27 @@
1
# Builds a document collection with the 'tf-idf-similarity' gem from
# the documents of a Treat collection and prints the collection's
# similarity matrix.
class Treat::Workers::Extractors::Similarity::TfIdf

  require 'tf-idf-similarity'

  @collections = {}

  # Count the (downcased) words of every document in the
  # collection, hand those counts to a TfIdfSimilarity
  # document, and print the inspected similarity matrix
  # to standard output.
  #
  # Returns whatever puts returns (nil).
  def self.tf_idf(collection, options={})
    corpus = TfIdfSimilarity::Collection.new
    collection.each_document do |document|
      entry = TfIdfSimilarity::Document.new(document.to_s)
      # Missing keys default to 0; adding 1.0 makes the counts floats.
      counts = Hash.new(0)
      document.each_word do |word|
        counts[word.value.downcase] += 1.0
      end
      # nil when the document yields no words.
      total = counts.values.reduce(:+)
      # Override the gem's own counts with Treat's tokenization.
      entry.instance_eval do
        @term_counts = counts
        @size = total
      end
      corpus << entry
    end
    puts corpus.similarity_matrix.inspect
  end

end
@@ -1,5 +1,5 @@
1
1
  # Calculates the TF*IDF score of words.
2
- module Treat::Workers::Extractors::TfIdf::Native
2
+ class Treat::Workers::Extractors::TfIdf::Native
3
3
  DefaultOptions = {
4
4
  :tf => :natural,
5
5
  :idf => :logarithm,
@@ -24,9 +24,9 @@ module Treat::Workers::Extractors::TfIdf::Native
24
24
  @@wc = {} # Number of words in a given document (word count).
25
25
  @@cw = {} # Common words to filter out.
26
26
  def self.tf_idf(entity, options={})
27
- l = Treat.languages.send(entity.language)
28
- if l.stop_words
29
- @@cw[entity.language] = l.stop_words.list
27
+ l = Treat.languages[entity.language]
28
+ if l.respond_to?(:stop_words)
29
+ @@cw[entity.language] = l.stop_words
30
30
  return 0 if @@cw[entity.language].include?(entity.value)
31
31
  end
32
32
  return 0 if entity.value.length <= 2
@@ -1,7 +1,5 @@
1
- # A wrapper for the 'chronic' gem, which parses
2
- # date information.
3
- #
4
- # Project website: http://chronic.rubyforge.org/
1
+ # Time/date extraction using a rule-based, pure
2
+ # Ruby natural language date parser.
5
3
  class Treat::Workers::Extractors::Time::Chronic
6
4
 
7
5
  # Require the 'chronic' gem.
@@ -1,23 +1,5 @@
1
- # A wrapper for the 'nickel' gem, which parses
2
- # times and dates and supplies additional information
3
- # concerning these. The additional information supplied
4
- # that this class annotates entities with is:
5
- #
6
- # - time_recurrence: frequency of recurrence in words*.
7
- # - time_recurrence_interval: frequency of recurrence in days.
8
- # - start_time: a DateTime object representing the beginning of
9
- # an event.
10
- # - end_time: a DateTime object representing the end of an event.
11
- #
12
- # Examples of values for time_recurrence are:
13
- #
14
- # - single: "lunch with megan tomorrow at noon"
15
- # - daily: "Art exhibit until March 1st"
16
- # - weekly: "math class every wed from 8-11am"
17
- # - daymonthly: "open bar at joes the first friday of every month"
18
- # - datemonthly: "pay credit card bill on the 22nd of each month"
19
- #
20
- # Project website: http://naturalinputs.com/
1
+ # Time extraction using a pure Ruby natural language
2
+ # time parser.
21
3
  class Treat::Workers::Extractors::Time::Nickel
22
4
 
23
5
  require 'date'
@@ -25,6 +7,23 @@ class Treat::Workers::Extractors::Time::Nickel
25
7
  silence_warnings { require 'nickel' }
26
8
 
27
9
  # Extract time information from a bit of text.
10
+ #
11
+ # In addition to the :time annotation, this class will provide:
12
+ #
13
+ # - time_recurrence: frequency of recurrence in words*.
14
+ # - time_recurrence_interval: frequency of recurrence in days.
15
+ # - start_time: a DateTime object representing the beginning of
16
+ # an event.
17
+ # - end_time: a DateTime object representing the end of an event.
18
+ #
19
+ # Examples of values for time_recurrence are:
20
+ #
21
+ # - single: "lunch with megan tomorrow at noon"
22
+ # - daily: "Art exhibit until March 1st"
23
+ # - weekly: "math class every wed from 8-11am"
24
+ # - daymonthly: "open bar at joes the first friday of every month"
25
+ # - datemonthly: "pay credit card bill on the 22nd of each month"
26
+ #
28
27
  def self.time(entity, options = {})
29
28
 
30
29
  s = entity.to_s
@@ -1,4 +1,5 @@
1
- # A wrapper for Ruby's native date/time parsing.
1
+ # Date extraction using Ruby's standard library
2
+ # DateTime.parse() method.
2
3
  class Treat::Workers::Extractors::Time::Ruby
3
4
 
4
5
  # Require Ruby's date module.
@@ -1,14 +1,14 @@
1
- # An adapter for the 'lda-ruby' gem, which clusters
2
- # documents into topics based on Latent Dirichlet
3
- # Allocation.
1
+ # Topic word retrieval using a thin wrapper over a
2
+ # C implementation of Latent Dirichlet Allocation (LDA),
3
+ # a statistical model that posits each document
4
+ # is a mixture of a small number of topics and that
5
+ # each word's creation is attributable to one of the
6
+ # document's topics.
4
7
  #
5
- # Original paper:
6
- # Blei, David M., Ng, Andrew Y., and Jordan, Michael
7
- # I. 2003. Latent dirichlet allocation. Journal of
8
+ # Original paper: Blei, David, Ng, Andrew, and Jordan,
9
+ # Michael. 2003. Latent dirichlet allocation. Journal of
8
10
  # Machine Learning Research. 3 (Mar. 2003), 993-1022.
9
- #
10
- # Project website: https://github.com/ealdent/lda-ruby
11
- module Treat::Workers::Extractors::TopicWords::LDA
11
+ class Treat::Workers::Extractors::TopicWords::LDA
12
12
 
13
13
  # Require the lda-ruby gem.
14
14
  silence_warnings { require 'lda-ruby' }
@@ -53,9 +53,9 @@ module Treat::Workers::Extractors::TopicWords::LDA
53
53
  # Run the EM algorithm using random
54
54
  # starting points
55
55
 
56
- silence_stdout do
57
- lda.em('random')
58
- end
56
# Suppress the library's console output only when
# Treat is configured to be silent.
if Treat.core.verbosity.silence
  silence_stdout { lda.em('random') }
else
  lda.em('random')
end
59
59
 
60
60
  # Load the vocabulary.
61
61
  if options[:vocabulary]
@@ -1,12 +1,9 @@
1
1
  # A Ruby text categorizer that was trained using
2
- # the Reuters news story corpus.
2
+ # the Reuters news story corpus. Works well for
3
+ # news articles, not so well for other sources.
3
4
  #
4
- # Copyright 2005 Mark Watson. All rights reserved.
5
- # Rewrite for inclusion in Treat by Louis Mullie (2011).
6
- #
7
- # Original project website:
8
- # http://www.markwatson.com/opensource/
9
- module Treat::Workers::Extractors::Topics::Reuters
5
+ # Authors: Mark Watson, 2005; Louis Mullie, 2011.
6
+ class Treat::Workers::Extractors::Topics::Reuters
10
7
 
11
8
  # Require the Nokogiri XML parser.
12
9
  require 'nokogiri'
@@ -46,12 +43,11 @@ module Treat::Workers::Extractors::Topics::Reuters
46
43
  # Read the topics from the XML files.
47
44
  def self.get_topics
48
45
  return unless @@industry.size == 0
49
- @@industry = read_xml(Treat.paths.models +
50
- 'reuters/industry.xml')
51
- @@region = read_xml(Treat.paths.models +
52
- 'reuters/region.xml')
53
- @@topics = read_xml(Treat.paths.models +
54
- 'reuters/topics.xml')
46
+ path = (Treat.libraries.reuters.model_path ||
47
+ (Treat.paths.models + 'reuters/'))
48
+ @@industry = read_xml(path + 'industry.xml')
49
+ @@region = read_xml(path + 'region.xml')
50
+ @@topics = read_xml(path + 'topics.xml')
55
51
  end
56
52
 
57
53
  # Read an XML file and populate a
@@ -28,7 +28,7 @@ class Treat::Workers::Formatters::Readers::Autoselect
28
28
  format = default_to if format.to_s == ''
29
29
 
30
30
  begin
31
- Treat::Workers::Formatters::Readers.const_get(cc(format))
31
+ Treat::Workers::Formatters::Readers.const_get(format.cc)
32
32
  rescue Treat::Exception
33
33
  format = default_to
34
34
  end
@@ -7,9 +7,8 @@
7
7
  # statistical natural language modeling, and multi-
8
8
  # lingual capabilities."
9
9
  #
10
- # Original paper:
11
- #
12
- # Breuel, Thomas M. The Ocropus Open Source OCR System.
10
+ # Original paper: Google Ocropus Engine: Breuel,
11
+ # Thomas M. The Ocropus Open Source OCR System.
13
12
  # DFKI and U. Kaiserslautern, Germany.
14
13
  class Treat::Workers::Formatters::Readers::Image
15
14
 
@@ -21,24 +20,35 @@ class Treat::Workers::Formatters::Readers::Image
21
20
  def self.read(document, options = {})
22
21
 
23
22
  read = lambda do |doc|
24
- create_temp_dir do |tmp|
23
+ self.create_temp_dir do |tmp|
25
24
  `ocropus book2pages #{tmp}/out #{doc.file}`
26
25
  `ocropus pages2lines #{tmp}/out`
27
26
  `ocropus lines2fsts #{tmp}/out`
28
27
  `ocropus buildhtml #{tmp}/out > #{tmp}/output.html`
29
28
  doc.set :file, "#{tmp}/output.html"
29
+ doc.set :format, :html
30
30
  doc = doc.read(:html)
31
- doc.set :file, f
32
- doc.set :format, 'image'
33
31
  end
34
32
  end
35
33
 
36
- options[:silent] ?
37
- silence_stdout { read.call(document) } :
38
- read.call(document)
34
+ Treat.core.verbosity.silence ? silence_stdout {
35
+ read.call(document) } : read.call(document)
39
36
 
40
37
  document
41
38
 
42
39
  end
43
40
 
41
# Create a directory that gets deleted after execution of the block.
#
# The directory is created under Treat.paths.tmp (which is itself
# created when missing) with a random numeric name, and its path
# is yielded to the block.
#
# Returns the value of the block.
def self.create_temp_dir(&block)
  unless FileTest.directory?(Treat.paths.tmp)
    FileUtils.mkdir(Treat.paths.tmp)
  end
  dname = Treat.paths.tmp +
  "#{Random.rand(10000000).to_s}"
  Dir.mkdir(dname)
  block.call(dname)
ensure
  # dname is still nil when set-up failed before it was assigned;
  # skipping the cleanup then lets the original error propagate.
  FileUtils.rm_rf(dname) if dname
end
53
+
44
54
  end
@@ -24,7 +24,8 @@ class Treat::Workers::Formatters::Readers::ODT
24
24
  Zip::ZipFile::CREATE) do |zipfile|
25
25
  f = zipfile.read('content.xml')
26
26
  end
27
- raise "Couldn't unzip dot file " +
27
+ raise Treat::Exception,
28
+ "Couldn't unzip dot file " +
28
29
  "#{document.file}!" unless f
29
30
  xml_h = ODTXmlHandler.new
30
31
  REXML::Document.parse_stream(f, xml_h)
@@ -1,15 +1,16 @@
1
1
  # encoding: utf-8
2
2
  # A wrapper for the Poppler pdf2text utility, which
3
3
  # extracts the text from a PDF file.
4
- module Treat::Workers::Formatters::Readers::PDF
4
+ class Treat::Workers::Formatters::Readers::PDF
5
5
 
6
+ require 'fileutils'
7
+
6
8
  # Read a PDF file using the Poppler pdf2text utility.
7
9
  #
8
10
  # Options: none.
9
11
  def self.read(document, options = {})
10
12
 
11
- create_temp_file(:txt) do |tmp|
12
-
13
+ self.create_temp_file(:txt) do |tmp|
13
14
  `pdftotext #{document.file} #{tmp} `.strip
14
15
  f = File.read(tmp)
15
16
  f.gsub!("\t\r ", '')
@@ -27,5 +28,21 @@ module Treat::Workers::Formatters::Readers::PDF
27
28
  end
28
29
 
29
30
  end
31
+
32
# Create a temporary file which is deleted
# after execution of the block.
#
# ext   - extension appended to the random numeric file name.
# value - optional content written to the file before the
#         block runs.
#
# The file is created under Treat.paths.tmp (which is itself
# created when missing) and its path is yielded to the block.
#
# Returns the value of the block.
def self.create_temp_file(ext, value = nil, &block)
  unless FileTest.directory?(Treat.paths.tmp)
    FileUtils.mkdir(Treat.paths.tmp)
  end
  fname = Treat.paths.tmp +
  "#{Random.rand(10000000).to_s}.#{ext}"
  # Close the handle before yielding so that +value+ is actually
  # on disk when the block (or an external tool) reads the path.
  File.open(fname, 'w') do |f|
    f.write(value) if value
  end
  block.call(fname)
ensure
  # fname is nil, or the file absent, when set-up failed early;
  # skipping the delete then lets the original error propagate.
  File.delete(fname) if fname && File.exist?(fname)
end
30
47
 
31
48
  end
@@ -1,6 +1,5 @@
1
1
  class Treat::Workers::Formatters::Readers::XML
2
2
 
3
- require 'treat/loaders/stanford'
4
3
  Treat::Loaders::Stanford.load
5
4
  require 'cgi'
6
5
 
@@ -1,4 +1,4 @@
1
- # Stores an entity in a Mongo collection.
1
+ # Serialization of entities to a Mongo database.
2
2
  class Treat::Workers::Formatters::Serializers::Mongo
3
3
 
4
4
  # Require the Mongo DB
@@ -16,7 +16,9 @@ class Treat::Workers::Formatters::Serializers::Mongo
16
16
  Treat::Entities.const_get(
17
17
  options[:stop_at].to_s.capitalize) : nil
18
18
 
19
- if !Treat.databases.mongo.db && !options[:db]
19
+ options[:db] ||= Treat.databases.mongo.db
20
+
21
+ if !options[:db]
20
22
  raise Treat::Exception,
21
23
  'Must supply the database name in config. ' +
22
24
  '(Treat.databases.mongo.db = ...) or pass ' +
@@ -25,30 +27,18 @@ class Treat::Workers::Formatters::Serializers::Mongo
25
27
 
26
28
  @@database ||= Mongo::Connection.
27
29
  new(Treat.databases.mongo.host).
28
- db(options[:db] || Treat.databases.mongo.db)
30
+ db(options[:db])
29
31
 
30
- supertype = cl(Treat::Entities.const_get(
31
- entity.type.to_s.capitalize.intern).superclass).downcase
32
+ supertype = Treat::Entities.const_get(
33
+ entity.type.to_s.capitalize.intern).superclass.mn.downcase
32
34
  supertype = entity.type.to_s if supertype == 'entity'
33
35
  supertypes = supertype + 's'
34
36
 
35
37
  coll = @@database.collection(supertypes)
38
+ entity_token = self.do_serialize(entity, options)
39
+ coll.update({id: entity.id}, entity_token, {upsert: true})
36
40
 
37
- if entity.type == :collection
38
- docs = @@database.collection('documents') # Take a design decision here.
39
- coll.update(
40
- {id: entity.id}, self.do_serialize(entity,
41
- options.merge({:stop_at => Treat::Entities::Document})),
42
- {upsert: true})
43
- entity.each_document do |doc|
44
- docs.update(
45
- {id: doc.id}, self.do_serialize(doc, options),
46
- {upsert: true})
47
- end
48
- else
49
- entity_token = self.do_serialize(entity, options)
50
- coll.update({id: entity.id}, entity_token, {upsert: true})
51
- end
41
+ {db: options[:db], collection: supertypes, id: entity.id}
52
42
 
53
43
  end
54
44
 
@@ -1,4 +1,4 @@
1
- # This class converts an entity to a storable XML format.
1
+ # Serialization of entities to XML format.
2
2
  class Treat::Workers::Formatters::Serializers::XML
3
3
 
4
4
  # Require the Nokogiri XML parser.
@@ -8,18 +8,20 @@ class Treat::Workers::Formatters::Serializers::XML
8
8
  # Options:
9
9
  # - (String) :file => a file to write to.
10
10
  def self.serialize(entity, options = {})
11
- if options[:indent].nil?
12
- options = options.merge({:indent => 0})
13
- end
14
- indent = options[:indent]
15
- if options[:indent] == 0
16
- enc = entity.to_s.encoding.to_s.downcase
17
- string = "<?xml version=\"1.0\" " +
18
- "encoding=\"#{enc}\" ?>\n<treat>\n"
19
- else
20
- string = ''
21
- end
22
- spaces = ''
11
+ options[:file] ||= (entity.id.to_s + '.xml')
12
+ options[:indent] = 0
13
+ enc = entity.to_s.encoding.to_s.downcase
14
+ string = "<?xml version=\"1.0\" " +
15
+ "encoding=\"#{enc}\" ?>\n<treat>\n"
16
+ val = self.recurse(entity, options)
17
+ string += "#{val}\n</treat>"
18
+ File.open(options[:file], 'w') do |f|
19
+ f.write(string)
20
+ end; return string
21
+ end
22
+
23
+ def self.recurse(entity, options)
24
+ spaces, string = '', ''
23
25
  options[:indent].times { spaces << ' ' }
24
26
  attributes = " id='#{entity.id}'"
25
27
  if !entity.features.nil? && entity.features.size != 0
@@ -55,27 +57,16 @@ class Treat::Workers::Formatters::Serializers::XML
55
57
  if entity.has_children?
56
58
  options[:indent] += 1
57
59
  entity.children.each do |child|
58
- string =
59
- string +
60
- serialize(child, options)
60
+ string += self.recurse(child, options)
61
61
  end
62
62
  options[:indent] -= 1
63
63
  else
64
- string = string + "#{escape(entity.value)}"
64
+ string += "#{escape(entity.value)}"
65
65
  end
66
66
  unless entity.is_a?(Treat::Entities::Token)
67
67
  string += "#{spaces}"
68
68
  end
69
69
  string += "</#{tag}>\n"
70
- if indent == 0
71
- string += "\n</treat>"
72
- if options[:file]
73
- File.open(options[:file], 'w') do |f|
74
- f.write(string)
75
- end
76
- end
77
- end
78
- string
79
70
  end
80
71
 
81
72
  def self.escape(input)
@@ -1,9 +1,9 @@
1
- # This class serializes entities in YAML format.
1
+ # Serialization of entities to YAML format.
2
2
  class Treat::Workers::Formatters::Serializers::YAML
3
3
 
4
4
  silence_warnings do
5
5
  # Require the Psych YAML serializer.
6
- require 'psych'
6
+ require 'yaml'
7
7
  end
8
8
 
9
9
  # Serialize an entity in YAML format.
@@ -11,13 +11,14 @@ class Treat::Workers::Formatters::Serializers::YAML
11
11
  # Options:
12
12
  # - (String) :file => a file to write to.
13
13
  def self.serialize(entity, options = {})
14
- yaml = ::Psych.dump(entity)
14
+ yaml = ::YAML.dump(entity)
15
+ options[:file] ||= (entity.id.to_s + '.yml')
15
16
  if options[:file]
16
17
  File.open(options[:file], 'w') do |f|
17
18
  f.write(yaml)
18
19
  end
19
20
  end
20
- yaml
21
+ options[:file]
21
22
  end
22
23
 
23
24
  end
@@ -1,4 +1,5 @@
1
- module Treat::Workers::Formatters::Unserializers::Mongo
1
+ # Unserialization of entities stored in a Mongo database.
2
+ class Treat::Workers::Formatters::Unserializers::Mongo
2
3
 
3
4
  require 'mongo'
4
5
 
@@ -18,8 +19,8 @@ module Treat::Workers::Formatters::Unserializers::Mongo
18
19
  new(Treat.databases.mongo.host).
19
20
  db(Treat.databases.mongo.db || db)
20
21
 
21
- supertype = cl(Treat::Entities.const_get(
22
- entity.type.to_s.capitalize.intern).superclass).downcase
22
+ supertype = Treat::Entities.const_get(
23
+ entity.type.to_s.capitalize.intern).superclass.mn.downcase
23
24
  supertype = entity.type.to_s if supertype == 'entity'
24
25
  supertypes = supertype + 's'
25
26
  supertypes = 'documents' if entity.type == :collection
@@ -58,7 +59,6 @@ module Treat::Workers::Formatters::Unserializers::Mongo
58
59
  const_get(record['type'].
59
60
  capitalize.intern).new(
60
61
  record['value'], record['id'])
61
-
62
62
  features = record['features']
63
63
  new_feat = {}
64
64
  features.each do |feature, value|
@@ -1,6 +1,5 @@
1
- # Recreates the entity tree corresponding to
2
- # a serialized XML file.
3
- module Treat::Workers::Formatters::Unserializers::XML
1
+ # Unserialization of entities stored in XML format.
2
+ class Treat::Workers::Formatters::Unserializers::XML
4
3
 
5
4
  require 'nokogiri'
6
5
 
@@ -106,7 +105,7 @@ module Treat::Workers::Formatters::Unserializers::XML
106
105
  end
107
106
 
108
107
  def self.revive(type, value, id)
109
- klass = Treat::Entities.const_get(cc(type))
108
+ klass = Treat::Entities.const_get(type.cc)
110
109
  klass.new(value, id)
111
110
  end
112
111
 
@@ -1,10 +1,9 @@
1
- # This class is a wrapper for the Psych YAML
2
- # parser; it unserializes YAML files.
1
+ # Unserialization of entities stored in YAML format.
3
2
  class Treat::Workers::Formatters::Unserializers::YAML
4
3
 
5
4
  silence_warnings do
6
5
  # Require the Psych YAML parser.
7
- require 'psych'
6
+ require 'yaml'
8
7
  end
9
8
 
10
9
  # Require date to revive DateTime.
@@ -14,7 +13,7 @@ class Treat::Workers::Formatters::Unserializers::YAML
14
13
  #
15
14
  # Options: none.
16
15
  def self.unserialize(document, options = {})
17
- document << ::Psych.load(
16
+ document << ::YAML.load(
18
17
  File.read(document.file))
19
18
  document
20
19
  end
@@ -1,3 +1,4 @@
1
+ # Visualization of entities in DOT graph format.
1
2
  class Treat::Workers::Formatters::Visualizers::DOT
2
3
 
3
4
  require 'date'
@@ -1,6 +1,5 @@
1
- # This class allows the visualization of
2
- # an entity in standoff format; for example:
3
- # (S (NP John) (VP has (VP come))).
1
+ # Visualization of entities in standoff (tag-bracketed)
2
+ # format, based on the Stanford tag-bracketed format.
4
3
  class Treat::Workers::Formatters::Visualizers::Standoff
5
4
 
6
5
  # Start out with an indent of 0.
@@ -1,9 +1,8 @@
1
- # This class generates an ASCII representation
2
- # of a tree of entities.
1
+ # Visualization of entities in ASCII tree format.
3
2
  class Treat::Workers::Formatters::Visualizers::Tree
4
3
 
5
4
  # Start out with an indent at 0.
6
- DefaultOptions = { :indent => 0 }
5
+ DefaultOptions = { indent: 0 }
7
6
 
8
7
  # Obtain a plain text tree representation
9
8
  # of the entity.