treat 1.0.6 → 1.1.0

Files changed (210)
  1. data/LICENSE +2 -4
  2. data/README.md +13 -12
  3. data/bin/MANIFEST +1 -0
  4. data/bin/stanford/bridge.jar +0 -0
  5. data/bin/stanford/joda-time.jar +0 -0
  6. data/bin/stanford/stanford-corenlp.jar +0 -0
  7. data/bin/stanford/stanford-parser.jar +0 -0
  8. data/bin/stanford/xom.jar +0 -0
  9. data/files/{www.economist.com/21552208 → 21552208.html} +86 -89
  10. data/files/{guides.rubyonrails.org/3_2_release_notes.html → 3_2_release_notes.html} +0 -0
  11. data/files/{INFO → MANIFEST} +0 -0
  12. data/files/{www.rubyinside.com/nethttp-cheat-sheet-2940.html → nethttp-cheat-sheet-2940.html} +12 -16
  13. data/files/weather-central-canada-heat-wave.html +1370 -0
  14. data/lib/treat/config/core/acronyms.rb +4 -0
  15. data/lib/treat/config/core/encodings.rb +8 -0
  16. data/lib/treat/config/core/entities.rb +2 -0
  17. data/lib/treat/config/core/language.rb +3 -0
  18. data/lib/treat/config/core/paths.rb +8 -0
  19. data/lib/treat/config/core/syntax.rb +1 -0
  20. data/lib/treat/config/core/verbosity.rb +1 -0
  21. data/lib/treat/config/databases/mongo.rb +3 -0
  22. data/lib/treat/config/languages/agnostic.rb +34 -0
  23. data/lib/treat/config/languages/arabic.rb +13 -0
  24. data/lib/treat/config/languages/chinese.rb +13 -0
  25. data/lib/treat/config/languages/dutch.rb +12 -0
  26. data/lib/treat/config/languages/english.rb +60 -0
  27. data/lib/treat/config/languages/french.rb +18 -0
  28. data/lib/treat/config/languages/german.rb +18 -0
  29. data/lib/treat/config/languages/greek.rb +12 -0
  30. data/lib/treat/config/languages/italian.rb +12 -0
  31. data/lib/treat/config/languages/polish.rb +12 -0
  32. data/lib/treat/config/languages/portuguese.rb +12 -0
  33. data/lib/treat/config/languages/russian.rb +12 -0
  34. data/lib/treat/config/languages/spanish.rb +12 -0
  35. data/lib/treat/config/languages/swedish.rb +12 -0
  36. data/lib/treat/config/libraries/stanford.rb +1 -0
  37. data/lib/treat/config/linguistics/categories.rb +4 -0
  38. data/lib/treat/config/linguistics/punctuation.rb +33 -0
  39. data/lib/treat/config/tags/aligned.rb +221 -0
  40. data/lib/treat/config/tags/enju.rb +71 -0
  41. data/lib/treat/config/tags/paris7.rb +17 -0
  42. data/lib/treat/config/tags/ptb.rb +15 -0
  43. data/lib/treat/config/workers/extractors.rb +39 -0
  44. data/lib/treat/config/workers/formatters.rb +20 -0
  45. data/lib/treat/config/workers/inflectors.rb +27 -0
  46. data/lib/treat/config/workers/learners.rb +6 -0
  47. data/lib/treat/config/workers/lexicalizers.rb +18 -0
  48. data/lib/treat/config/workers/list.rb +1 -0
  49. data/lib/treat/config/workers/processors.rb +19 -0
  50. data/lib/treat/config/workers/retrievers.rb +12 -0
  51. data/lib/treat/config.rb +125 -0
  52. data/lib/treat/{classification.rb → core/classification.rb} +1 -1
  53. data/lib/treat/{data_set.rb → core/data_set.rb} +1 -4
  54. data/lib/treat/{tree.rb → core/node.rb} +5 -5
  55. data/lib/treat/core/server.rb +3 -0
  56. data/lib/treat/core.rb +5 -0
  57. data/lib/treat/entities/abilities/buildable.rb +61 -56
  58. data/lib/treat/entities/abilities/checkable.rb +2 -2
  59. data/lib/treat/entities/abilities/comparable.rb +21 -0
  60. data/lib/treat/entities/abilities/copyable.rb +2 -0
  61. data/lib/treat/entities/abilities/countable.rb +1 -1
  62. data/lib/treat/entities/abilities/debuggable.rb +1 -1
  63. data/lib/treat/entities/abilities/delegatable.rb +42 -36
  64. data/lib/treat/entities/abilities/doable.rb +2 -2
  65. data/lib/treat/entities/abilities/exportable.rb +1 -1
  66. data/lib/treat/entities/abilities/iterable.rb +21 -33
  67. data/lib/treat/entities/abilities/magical.rb +8 -8
  68. data/lib/treat/entities/abilities/registrable.rb +0 -38
  69. data/lib/treat/entities/abilities/stringable.rb +19 -19
  70. data/lib/treat/entities/collection.rb +31 -0
  71. data/lib/treat/entities/document.rb +10 -0
  72. data/lib/treat/entities/entity.rb +18 -13
  73. data/lib/treat/entities/group.rb +15 -0
  74. data/lib/treat/entities/section.rb +13 -0
  75. data/lib/treat/entities/token.rb +35 -0
  76. data/lib/treat/entities/zone.rb +11 -0
  77. data/lib/treat/entities.rb +5 -75
  78. data/lib/treat/helpers/didyoumean.rb +57 -0
  79. data/lib/treat/helpers/escaping.rb +15 -0
  80. data/lib/treat/helpers/formatting.rb +41 -0
  81. data/lib/treat/helpers/platform.rb +15 -0
  82. data/lib/treat/helpers/reflection.rb +17 -0
  83. data/lib/treat/helpers/temporary.rb +27 -0
  84. data/lib/treat/helpers/verbosity.rb +19 -0
  85. data/lib/treat/helpers.rb +5 -0
  86. data/lib/treat/installer.rb +46 -165
  87. data/lib/treat/loaders/linguistics.rb +22 -27
  88. data/lib/treat/loaders/stanford.rb +23 -41
  89. data/lib/treat/loaders.rb +10 -0
  90. data/lib/treat/proxies.rb +73 -24
  91. data/lib/treat/version.rb +3 -0
  92. data/lib/treat/{extractors → workers/extractors}/keywords/tf_idf.rb +1 -1
  93. data/lib/treat/{extractors → workers/extractors}/language/what_language.rb +11 -4
  94. data/lib/treat/{extractors → workers/extractors}/name_tag/stanford.rb +3 -4
  95. data/lib/treat/{extractors → workers/extractors}/tf_idf/native.rb +4 -5
  96. data/lib/treat/{extractors → workers/extractors}/time/chronic.rb +1 -1
  97. data/lib/treat/{extractors → workers/extractors}/time/nickel.rb +1 -1
  98. data/lib/treat/{extractors → workers/extractors}/time/ruby.rb +1 -1
  99. data/lib/treat/{extractors → workers/extractors}/topic_words/lda.rb +1 -1
  100. data/lib/treat/{extractors → workers/extractors}/topics/reuters.rb +4 -4
  101. data/lib/treat/{formatters → workers/formatters}/readers/abw.rb +2 -2
  102. data/lib/treat/{formatters → workers/formatters}/readers/autoselect.rb +10 -3
  103. data/lib/treat/{formatters → workers/formatters}/readers/doc.rb +2 -2
  104. data/lib/treat/{formatters → workers/formatters}/readers/html.rb +4 -4
  105. data/lib/treat/{formatters → workers/formatters}/readers/image.rb +2 -2
  106. data/lib/treat/{formatters → workers/formatters}/readers/odt.rb +2 -2
  107. data/lib/treat/{formatters → workers/formatters}/readers/pdf.rb +2 -2
  108. data/lib/treat/{formatters → workers/formatters}/readers/txt.rb +2 -2
  109. data/lib/treat/{formatters → workers/formatters}/readers/xml.rb +2 -2
  110. data/lib/treat/workers/formatters/serializers/mongo.rb +60 -0
  111. data/lib/treat/{formatters → workers/formatters}/serializers/xml.rb +1 -2
  112. data/lib/treat/{formatters → workers/formatters}/serializers/yaml.rb +1 -1
  113. data/lib/treat/{formatters → workers/formatters}/unserializers/autoselect.rb +3 -1
  114. data/lib/treat/workers/formatters/unserializers/mongo.rb +80 -0
  115. data/lib/treat/{formatters → workers/formatters}/unserializers/xml.rb +2 -2
  116. data/lib/treat/{formatters → workers/formatters}/unserializers/yaml.rb +1 -1
  117. data/lib/treat/{formatters → workers/formatters}/visualizers/dot.rb +1 -1
  118. data/lib/treat/{formatters → workers/formatters}/visualizers/standoff.rb +2 -3
  119. data/lib/treat/{formatters → workers/formatters}/visualizers/tree.rb +1 -1
  120. data/lib/treat/{groupable.rb → workers/group.rb} +6 -12
  121. data/lib/treat/{inflectors → workers/inflectors}/cardinalizers/linguistics.rb +7 -2
  122. data/lib/treat/{inflectors → workers/inflectors}/conjugators/linguistics.rb +11 -11
  123. data/lib/treat/{inflectors → workers/inflectors}/declensors/active_support.rb +2 -2
  124. data/lib/treat/{inflectors → workers/inflectors}/declensors/english/inflect.rb +1 -1
  125. data/lib/treat/{inflectors → workers/inflectors}/declensors/english.rb +2 -2
  126. data/lib/treat/{inflectors → workers/inflectors}/declensors/linguistics.rb +4 -4
  127. data/lib/treat/{inflectors → workers/inflectors}/ordinalizers/linguistics.rb +8 -2
  128. data/lib/treat/{inflectors → workers/inflectors}/stemmers/porter.rb +2 -2
  129. data/lib/treat/{inflectors → workers/inflectors}/stemmers/porter_c.rb +1 -1
  130. data/lib/treat/{inflectors → workers/inflectors}/stemmers/uea.rb +1 -1
  131. data/lib/treat/{ai → workers/learners}/classifiers/id3.rb +1 -1
  132. data/lib/treat/{ai → workers/learners}/classifiers/mlp.rb +1 -1
  133. data/lib/treat/{lexicalizers → workers/lexicalizers}/categorizers/from_tag.rb +9 -9
  134. data/lib/treat/{lexicalizers → workers/lexicalizers}/sensers/wordnet/synset.rb +2 -2
  135. data/lib/treat/{lexicalizers → workers/lexicalizers}/sensers/wordnet.rb +4 -4
  136. data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/brill/patch.rb +2 -2
  137. data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/brill.rb +2 -8
  138. data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/lingua.rb +1 -6
  139. data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/stanford.rb +31 -42
  140. data/lib/treat/workers/processors/chunkers/autoselect.rb +19 -0
  141. data/lib/treat/{processors → workers/processors}/chunkers/html.rb +4 -3
  142. data/lib/treat/workers/processors/chunkers/txt.rb +32 -0
  143. data/lib/treat/{processors → workers/processors}/parsers/enju.rb +3 -3
  144. data/lib/treat/{processors → workers/processors}/parsers/stanford.rb +6 -8
  145. data/lib/treat/{processors → workers/processors}/segmenters/punkt.rb +6 -10
  146. data/lib/treat/{processors → workers/processors}/segmenters/stanford.rb +2 -2
  147. data/lib/treat/{processors → workers/processors}/segmenters/tactful.rb +3 -6
  148. data/lib/treat/{processors → workers/processors}/tokenizers/ptb.rb +6 -5
  149. data/lib/treat/{processors → workers/processors}/tokenizers/punkt.rb +1 -1
  150. data/lib/treat/{processors → workers/processors}/tokenizers/stanford.rb +1 -1
  151. data/lib/treat/{processors → workers/processors}/tokenizers/tactful.rb +3 -5
  152. data/lib/treat/{retrievers → workers/retrievers}/indexers/ferret.rb +1 -1
  153. data/lib/treat/{retrievers → workers/retrievers}/searchers/ferret.rb +1 -1
  154. data/lib/treat/workers.rb +96 -0
  155. data/lib/treat.rb +23 -49
  156. data/spec/collection.rb +4 -4
  157. data/spec/document.rb +5 -5
  158. data/spec/entity.rb +33 -32
  159. data/spec/{tree.rb → node.rb} +5 -5
  160. data/spec/phrase.rb +5 -39
  161. data/spec/sandbox.rb +212 -6
  162. data/spec/token.rb +12 -9
  163. data/spec/treat.rb +12 -9
  164. data/spec/word.rb +10 -9
  165. data/spec/zone.rb +6 -2
  166. data/tmp/{INFO → MANIFEST} +0 -0
  167. data/tmp/english.yaml +10340 -0
  168. metadata +149 -139
  169. data/lib/treat/ai.rb +0 -12
  170. data/lib/treat/categories.rb +0 -90
  171. data/lib/treat/categorizable.rb +0 -44
  172. data/lib/treat/configurable.rb +0 -115
  173. data/lib/treat/dependencies.rb +0 -25
  174. data/lib/treat/downloader.rb +0 -87
  175. data/lib/treat/entities/abilities.rb +0 -10
  176. data/lib/treat/entities/entities.rb +0 -102
  177. data/lib/treat/exception.rb +0 -7
  178. data/lib/treat/extractors.rb +0 -79
  179. data/lib/treat/formatters/serializers/mongo.rb +0 -64
  180. data/lib/treat/formatters.rb +0 -41
  181. data/lib/treat/helpers/decimal_point_escaper.rb +0 -22
  182. data/lib/treat/inflectors.rb +0 -52
  183. data/lib/treat/kernel.rb +0 -208
  184. data/lib/treat/languages/arabic.rb +0 -16
  185. data/lib/treat/languages/chinese.rb +0 -16
  186. data/lib/treat/languages/dutch.rb +0 -16
  187. data/lib/treat/languages/english.rb +0 -63
  188. data/lib/treat/languages/french.rb +0 -20
  189. data/lib/treat/languages/german.rb +0 -20
  190. data/lib/treat/languages/greek.rb +0 -16
  191. data/lib/treat/languages/italian.rb +0 -17
  192. data/lib/treat/languages/language.rb +0 -10
  193. data/lib/treat/languages/list.txt +0 -504
  194. data/lib/treat/languages/polish.rb +0 -16
  195. data/lib/treat/languages/portuguese.rb +0 -16
  196. data/lib/treat/languages/russian.rb +0 -16
  197. data/lib/treat/languages/spanish.rb +0 -16
  198. data/lib/treat/languages/swedish.rb +0 -16
  199. data/lib/treat/languages.rb +0 -132
  200. data/lib/treat/lexicalizers.rb +0 -37
  201. data/lib/treat/object.rb +0 -7
  202. data/lib/treat/processors/chunkers/autoselect.rb +0 -16
  203. data/lib/treat/processors/chunkers/txt.rb +0 -21
  204. data/lib/treat/processors.rb +0 -38
  205. data/lib/treat/retrievers.rb +0 -27
  206. data/lib/treat/server.rb +0 -26
  207. data/lib/treat/universalisation/encodings.rb +0 -12
  208. data/lib/treat/universalisation/tags.rb +0 -453
  209. data/lib/treat/universalisation.rb +0 -9
  210. data/spec/languages.rb +0 -25
data/lib/treat/{processors → workers/processors}/chunkers/html.rb CHANGED
@@ -1,4 +1,4 @@
- class Treat::Processors::Chunkers::HTML
+ class Treat::Workers::Processors::Chunkers::HTML

  require 'nokogiri'

@@ -24,6 +24,7 @@ class Treat::Processors::Chunkers::HTML
  node.parent && node.parent.type == :section)

  if $1
+
  lvl = $1.to_i
  if lvl <= level
  node.ancestors_with_type(:section).
@@ -41,13 +42,13 @@ class Treat::Processors::Chunkers::HTML
  node.set :level, level

  end
-
+
  t = node <<
  Treat::Entities::Title.new(txt)
  t.set :level, level

  elsif child.name == 'p'
-
+
  node << Treat::Entities::Zone.
  from_string(txt)

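For orientation, the renamed chunker is still reached through the generic entity API; a minimal usage sketch (the sample file path is illustrative, while `Document.build`, `chunk` and `print_tree` are the calls exercised in the spec diffs further down):

```ruby
require 'treat'

# Build a document from an HTML file and chunk it into
# sections, titles and paragraphs.
doc = Treat::Entities::Document.build('samples/mathematicians/euler.html')
doc.chunk        # dispatches to the HTML chunker for HTML input
doc.print_tree   # inspect the resulting section/title/zone tree
```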
data/lib/treat/workers/processors/chunkers/txt.rb ADDED
@@ -0,0 +1,32 @@
+ class Treat::Workers::Processors::Chunkers::TXT
+
+ # Separates a string into
+ # zones on the basis of newlines.
+ #
+ # Options: none.
+ def self.chunk(entity, options = {})
+
+ entity.check_hasnt_children
+ zones = entity.to_s.split("\n")
+ current = entity
+ zones.each do |zone|
+ zone.strip!
+ next if zone == ''
+ c = Treat::Entities::
+ Zone.from_string(zone)
+ if c.type == :title
+ if current.type == :section
+ current = current.parent
+ current = entity << Treat::
+ Entities::Section.new
+ else
+ current = entity << Treat::
+ Entities::Section.new
+ end
+ end
+ current << c
+ end
+
+ end
+
+ end
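A rough sketch of driving the new TXT chunker directly through the `self.chunk` interface shown above (the file name is illustrative; normally the dispatch happens via `doc.chunk`):

```ruby
require 'treat'

# Build a plain-text document; it has no children yet.
doc = Treat::Entities::Document.build('notes.txt')

# Split the document's text into zones on newlines, grouping
# title-led runs of zones into Section entities.
Treat::Workers::Processors::Chunkers::TXT.chunk(doc)
doc.print_tree
```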
data/lib/treat/{processors → workers/processors}/parsers/enju.rb CHANGED
@@ -10,7 +10,7 @@
  # Takuya Matsuzaki, Yusuke Miyao, and Jun'ichi Tsujii.
  # 2007. Efficient HPSG Parsing with Supertagging and
  # CFG-filtering. In Proceedings of IJCAI 2007.
- module Treat::Processors::Parsers::Enju
+ module Treat::Workers::Processors::Parsers::Enju

  # Require the 'open3' library to connect
  # with the background Enju process.
@@ -23,10 +23,10 @@ module Treat::Processors::Parsers::Enju
  @@parser = nil

  # A hash of Enju cat tags mapped to word categories.
- Ectc = Treat::Universalisation::Tags::EnjuCatToCategory
+ Ectc = Treat.tags.enju.cat_to_category

  # A hash of Enju cat/xcat pairs mapped to PTB tags.
- Ecxtp = Treat::Universalisation::Tags::EnjuCatXcatToPTB
+ Ecxtp = Treat.tags.enju.xcat_to_ptb

  # Parse the entity into its syntactical
  # phrases using Enju.
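The Enju tag tables previously held in Treat::Universalisation now live in the configuration tree (data/lib/treat/config/tags/enju.rb). A small sketch of reading them; only the accessor paths shown in the hunk are taken from the diff, nothing is implied about the hashes' contents:

```ruby
require 'treat'

# Enju "cat" tag => word category, and cat/xcat pair => PTB tag.
cat_map  = Treat.tags.enju.cat_to_category
xcat_map = Treat.tags.enju.xcat_to_ptb

puts cat_map.inspect
puts xcat_map.inspect
```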
data/lib/treat/{processors → workers/processors}/parsers/stanford.rb CHANGED
@@ -1,8 +1,10 @@
  # A wrapper class for the Stanford parser.
- class Treat::Processors::Parsers::Stanford
+ class Treat::Workers::Processors::Parsers::Stanford

  require 'treat/loaders/stanford'

+ Pttc = Treat.tags.aligned.phrase_tags_to_category
+
  # Hold one instance of the pipeline per language.
  @@parsers = {}

@@ -27,9 +29,7 @@ class Treat::Processors::Parsers::Stanford
  lang = entity.language
  init(lang, options)

- tag_set = Treat::Universalisation::Tags::
- StanfordTagSetForLanguage[
- Treat::Languages.describe(lang)]
+ tag_set = StanfordCoreNLP::Config::TagSets[language]

  text = ::StanfordCoreNLP::Text.new(val)
  @@parsers[lang].annotate(text)
@@ -58,8 +58,7 @@ class Treat::Processors::Parsers::Stanford
  def self.init(lang, options)
  return if @@parsers[lang]

- language = Treat::Languages.describe(lang)
- Treat::Loaders::Stanford.load(language)
+ Treat::Loaders::Stanford.load(lang)

  options = DefaultOptions.merge(options)
  StanfordCoreNLP.use(lang)
@@ -117,8 +116,7 @@ class Treat::Processors::Parsers::Stanford
  tag_s, tag_opt = *tag.split('-')
  tag_s ||= ''

- if Treat::Universalisation::Tags::PhraseTagToCategory[tag_s] &&
- Treat::Universalisation::Tags::PhraseTagToCategory[tag_s][tag_set]
+ if Pttc[tag_s] && Pttc[tag_s][tag_set]
  ruby_child = Treat::Entities::Phrase.new
  else
  l = java_child.children[0].to_s
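The wrapper now takes its phrase-tag table from the aligned tag configuration and its per-language tag sets from the stanford-core-nlp gem. A hedged inspection sketch; the constant lookups are the only calls taken from the hunk:

```ruby
require 'treat'

# Aligned phrase-tag table bound to Pttc above.
puts Treat.tags.aligned.phrase_tags_to_category.inspect

# Per-language tag sets, provided by the stanford-core-nlp gem
# (only present once the Stanford loader has run).
puts StanfordCoreNLP::Config::TagSets.inspect if defined?(StanfordCoreNLP)
```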
data/lib/treat/{processors → workers/processors}/segmenters/punkt.rb CHANGED
@@ -5,9 +5,7 @@
  # Original paper: Kiss, Tibor and Strunk, Jan (2006):
  # Unsupervised Multilingual Sentence Boundary Detection.
  # Computational Linguistics 32: 485-525.
- module Treat::Processors::Segmenters::Punkt
-
- require 'treat/helpers/decimal_point_escaper'
+ module Treat::Workers::Processors::Segmenters::Punkt

  # Require silently the punkt-segmenter gem.
  silence_warnings { require 'punkt-segmenter' }
@@ -41,7 +39,8 @@ module Treat::Processors::Segmenters::Punkt
  # Replace the point in all floating-point numbers
  # by ^^; this is a fix since Punkt trips on decimal
  # numbers.
- Treat::Helpers::DecimalPointEscaper.escape!(s)
+
+ escape_floats!(s)
  s.gsub!(/([^\.\?!]\.|\!|\?)([^\s"'])/) { $1 + ' ' + $2 }

  result = @@segmenters[lang].
@@ -50,8 +49,7 @@ module Treat::Processors::Segmenters::Punkt

  result.each do |sentence|
  # Unescape the sentence.
- Treat::Helpers::DecimalPointEscaper.
- unescape!(sentence)
+ unescape_floats!(sentence)
  entity << Treat::Entities::Phrase.
  from_string(sentence)
  end
@@ -65,13 +63,11 @@ module Treat::Processors::Segmenters::Punkt
  if options[:model]
  model = options[:model]
  else
- l = Treat::Languages.describe(lang)
- model = "#{Treat.models}punkt/#{l}.yaml"
-
+ model = "#{Treat.paths.models}punkt/#{lang}.yaml"
  unless File.readable?(model)
  raise Treat::Exception,
  "Could not get the language model " +
- "for the Punkt segmenter for #{l}."
+ "for the Punkt segmenter for #{lang.to_s.capitalize}."
  end
  end

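The DecimalPointEscaper class is replaced by the new escape_floats!/unescape_floats! helpers (from data/lib/treat/helpers), and the model is now resolved under Treat.paths.models. A usage sketch of the segmenter itself; the sample text is made up, and the segment(:punkt) call form follows Treat's usual worker-selection convention rather than anything in this hunk:

```ruby
require 'treat'

paragraph = Treat::Entities::Paragraph.from_string(
  'Version 1.1.0 is out. It keeps 3.14 in one sentence.')

# Decimal points are escaped internally so Punkt does not split on
# "3.14"; the model is read from
# "#{Treat.paths.models}punkt/<language>.yaml" unless :model is given.
paragraph.segment(:punkt)
paragraph.print_tree
```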
data/lib/treat/{processors → workers/processors}/segmenters/stanford.rb CHANGED
@@ -1,6 +1,6 @@
  # A wrapper for the sentence splitter supplied by
  # the Stanford parser.
- class Treat::Processors::Segmenters::Stanford
+ class Treat::Workers::Processors::Segmenters::Stanford

  require 'treat/loaders/stanford'
  Treat::Loaders::Stanford.load
@@ -41,7 +41,7 @@ class Treat::Processors::Segmenters::Stanford
  from_string(sentence, true)
  entity << s
  if options[:also_tokenize]
- Treat::Processors::Tokenizers::Stanford.
+ Treat::Workers::Processors::Tokenizers::Stanford.
  add_tokens(s, sentence.get(:tokens))
  end
  end
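The :also_tokenize option hands each segmented sentence straight to the (renamed) Stanford tokenizer. A sketch of passing that option; only the option name comes from the hunk, the call form is an assumption based on Treat's usual worker signature:

```ruby
require 'treat'

zone = Treat::Entities::Zone.from_string(
  'Stanford splits this text. It can also tokenize each sentence.')

# Segment with the Stanford wrapper and tokenize in the same pass.
zone.segment(:stanford, :also_tokenize => true)
zone.print_tree
```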
data/lib/treat/{processors → workers/processors}/segmenters/tactful.rb CHANGED
@@ -7,7 +7,7 @@
  # Original paper: Dan Gillick. 2009. Sentence Boundary Detection
  # and the Problem with the U.S. University of California, Berkeley.
  # http://dgillick.com/resource/sbd_naacl_2009.pdf
- module Treat::Processors::Segmenters::Tactful
+ module Treat::Workers::Processors::Segmenters::Tactful

  # Require the 'tactful_tokenizer' gem.
  silence_warnings { require 'tactful_tokenizer' }
@@ -15,8 +15,6 @@ module Treat::Processors::Segmenters::Tactful
  # Remove function definition 'tactful_tokenizer' by gem.
  String.class_eval { undef :tokenize }

- require 'treat/helpers/decimal_point_escaper'
-
  # Keep only one copy of the segmenter.
  @@segmenter = nil

@@ -30,7 +28,7 @@ module Treat::Processors::Segmenters::Tactful

  s = entity.to_s

- Treat::Helpers::DecimalPointEscaper.escape!(s)
+ escape_floats!(s)

  s.gsub!(/([^\.\?!]\.|\!|\?)([^\s"'])/) { $1 + ' ' + $2 }

@@ -39,8 +37,7 @@ module Treat::Processors::Segmenters::Tactful
  sentences = @@segmenter.tokenize_text(s)

  sentences.each do |sentence|
- Treat::Helpers::DecimalPointEscaper.unescape!(sentence)
- puts sentence.to_s if sentence.to_s.include?('staff')
+ unescape_floats!(sentence)
  entity << Treat::Entities::Phrase.from_string(sentence)
  end

data/lib/treat/{processors → workers/processors}/tokenizers/ptb.rb CHANGED
@@ -11,7 +11,7 @@
  # All rights reserved. This program is free software;
  # you can redistribute it and/or modify it under the
  # same terms as Ruby itself.
- module Treat::Processors::Tokenizers::PTB
+ module Treat::Workers::Processors::Tokenizers::PTB

  # Tokenize the entity using a native rule-based algorithm.
  def self.tokenize(entity, options = {})
@@ -35,14 +35,11 @@ module Treat::Processors::Tokenizers::PTB

  s = " " + string + " "

- # Translate some common extended ascii
- # characters to quotes
  s.gsub!(/‘/,'`')
  s.gsub!(/’/,"'")
  s.gsub!(/“/,"``")
  s.gsub!(/”/,"''")
-
-
+
  s.gsub!(/\s+/," ")
  s.gsub!(/(\s+)''/,'\1"')
  s.gsub!(/(\s+)``/,'\1"')
@@ -83,6 +80,10 @@ module Treat::Processors::Tokenizers::PTB
  s.gsub!(/\//, ' / ')
  s.gsub!(/\s+/,' ')
  s.strip!
+
+ s.gsub!(/``/,'"')
+ s.gsub!(/''/,'"')
+
  s.split(/\s+/)
  end

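The two new substitutions at the end fold the intermediate `` and '' quote markers back into plain double-quote tokens. A sketch of the tokenizer in use; the sample sentence is made up and tokenize(:ptb) follows Treat's usual worker-selection convention:

```ruby
# encoding: utf-8
require 'treat'

sentence = Treat::Entities::Sentence.from_string(
  'He said “hello” and left.')

# Rule-based PTB tokenization; curly quotes are mapped to ``/'' and,
# with this change, normalized back to plain " tokens.
sentence.tokenize(:ptb)
sentence.print_tree
```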
data/lib/treat/{processors → workers/processors}/tokenizers/punkt.rb CHANGED
@@ -12,7 +12,7 @@
  # (almost rewrite).
  #
  # Project website: https://github.com/lfcipriani/punkt-segmenter
- class Treat::Processors::Tokenizers::Punkt
+ class Treat::Workers::Processors::Tokenizers::Punkt

  SentEndChars = ['.', '?', '!']
  ReSentEndChars = /[.?!]/
data/lib/treat/{processors → workers/processors}/tokenizers/stanford.rb CHANGED
@@ -1,6 +1,6 @@
  # A wrapper for the Stanford parser's
  # Penn-Treebank style tokenizer.
- class Treat::Processors::Tokenizers::Stanford
+ class Treat::Workers::Processors::Tokenizers::Stanford

  require 'treat/loaders/stanford'
  Treat::Loaders::Stanford.load
data/lib/treat/{processors → workers/processors}/tokenizers/tactful.rb CHANGED
@@ -4,10 +4,8 @@
  # Released under the GNU GPL v3. Modified by Louis Mullie.
  #
  # Project website: https://github.com/SlyShy/Tactful_Tokenizer
- class Treat::Processors::Tokenizers::Tactful
+ class Treat::Workers::Processors::Tokenizers::Tactful

- require 'treat/helpers/decimal_point_escaper'
-
  ReTokenize = [
  # Uniform Quotes
  [/''|``/, '"'],
@@ -52,7 +50,7 @@ class Treat::Processors::Tokenizers::Tactful

  s = entity.to_s

- Treat::Helpers::DecimalPointEscaper.escape!(s)
+ escape_floats!(s)

  ReTokenize.each do |rules|
  s.gsub!(rules[0], rules[1])
@@ -60,7 +58,7 @@ class Treat::Processors::Tokenizers::Tactful

  s.split(' ').each do |token|

- Treat::Helpers::DecimalPointEscaper.unescape!(token)
+ unescape_floats!(token)
  entity << Treat::Entities::Token.
  from_string(token)
  end
data/lib/treat/{retrievers → workers/retrievers}/indexers/ferret.rb CHANGED
@@ -3,7 +3,7 @@
  #
  # Documentation:
  # http://rubydoc.info/gems/ferret
- class Treat::Retrievers::Indexers::Ferret
+ class Treat::Workers::Retrievers::Indexers::Ferret

  # Require Ferret and file utilities.
  silence_warnings { require 'ferret' }
data/lib/treat/{retrievers → workers/retrievers}/searchers/ferret.rb CHANGED
@@ -4,7 +4,7 @@
  #
  # Documentation:
  # http://rubydoc.info/gems/ferret
- class Treat::Retrievers::Searchers::Ferret
+ class Treat::Workers::Retrievers::Searchers::Ferret

  silence_warnings { require 'ferret' }
  require 'find'
data/lib/treat/workers.rb ADDED
@@ -0,0 +1,96 @@
+ # This module creates all the worker categories
+ # and the groups within these categories and adds
+ # the relevant hooks on the appropriate entities.
+ module Treat::Workers
+
+ require 'treat/workers/group'
+
+ # A lookup table for entity types.
+ @@lookup = {}
+
+ # Find a worker group based on method.
+ def self.lookup(method)
+ @@lookup[method]
+ end
+
+ def self.create_categories
+ Treat.workers.list.each do |cat|
+ create_category(cat.to_s.
+ capitalize.intern,
+ load_category_conf(cat))
+ end
+ end
+
+ def self.load_category_conf(name)
+ config = Treat.workers[name]
+ if config.nil?
+ raise Treat::Exception,
+ "The configuration file " +
+ "for #{cat_sym} is missing."
+ end
+ config
+ end
+
+ def self.create_category(name, conf)
+ category = self.const_set(name, Module.new)
+ conf.each_pair do |group, worker|
+ name = group.to_s.capitalize.intern
+ category.module_eval do
+ @@methods = []; def methods;
+ @@methods; end; def groups;
+ self.constants; end
+ end
+ self.create_group(name, worker, category)
+ end
+ end
+
+ def self.create_group(name, conf, category)
+ group = category.const_set(name, Module.new)
+ self.set_group_options(group, conf)
+ self.bind_group_targets(group)
+ self.register_group_presets(group, conf)
+ @@methods << group.method
+ @@lookup[group.method] = group
+ end
+
+ def self.bind_group_targets(group)
+ group.targets.each do |entity_type|
+ entity = Treat::Entities.
+ const_get(cc(entity_type))
+ entity.class_eval do
+ add_workers group
+ end
+ end
+ end
+
+ def self.register_group_presets(group, conf)
+ return unless conf.respond_to? :presets
+ conf.presets.each do |m|
+ @@methods << m
+ @@lookup[m] = group
+ end
+ end
+
+ def self.set_group_options(group, conf)
+ group.module_eval do
+ extend Treat::Workers::Group
+ self.type = conf.type
+ self.targets = conf.targets
+ if conf.respond_to?(:default)
+ self.default = conf.default
+ end
+ if conf.respond_to?(:preset_option)
+ self.preset_option = conf.preset_option
+ end
+ if conf.respond_to?(:presets)
+ self.presets = conf.presets
+ end
+ if conf.respond_to?(:recursive)
+ self.recursive = conf.recursive
+ end
+ end
+ end
+
+ self.create_categories
+
+ end
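At load time this builds one module per worker category (e.g. Treat::Workers::Processors) and one module per group, then binds each group onto its target entity classes via add_workers. A small inspection sketch; the :tokenize hook being registered is an assumption about what config/workers/processors.rb defines, everything else uses methods visible in the code above:

```ruby
require 'treat'

# Map a hook method back to the worker group that implements it.
group = Treat::Workers.lookup(:tokenize)

puts group.inspect           # e.g. Treat::Workers::Processors::Tokenizers
puts group.type.inspect      # group type, set from the config
puts group.targets.inspect   # entity types the hook was bound to
```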
data/lib/treat.rb CHANGED
@@ -1,62 +1,36 @@
  module Treat

- # Require custom exception cass.
- require 'treat/exception'
-
- # Treat requires Ruby 1.9 or higher.
- if RUBY_VERSION <= '1.9'
- raise Treat::Exception,
- 'Treat requires Ruby 1.9 or higher.'
- end
-
- # The current version of Treat.
- VERSION = "1.0.6"
-
- # Add methods to handle syntactic sugar,
- # language configuration options, and paths.
- require 'treat/configurable'
- extend Treat::Configurable
-
- # The folders in the library and descriptions.
- Paths = {
- :tmp => 'temporary files',
- :lib => 'class and module definitions',
- :bin => 'binary files',
- :files => 'user-saved files',
- :data => 'data set files',
- :models => 'model files',
- :spec => 'spec test files'
- }
-
- # Add methods to provide access to common paths.
- class << self
- Paths.each do |path, _|
- define_method(path) do
- (File.dirname(__FILE__).
- split('/')[0..-2].join('/') +
- '/' + path.to_s + '/').gsub(
- 'lib/../', '')
- end
- end
+ # Treat requires Ruby >= 1.9.2
+ if RUBY_VERSION < '1.9.2'
+ raise "Treat requires Ruby version 1.9.2 " +
+ "or higher, but current is #{RUBY_VERSION}."
  end
-
- require 'treat/object'
- require 'treat/kernel'
- require 'treat/downloader'
- require 'treat/languages'
- require 'treat/universalisation'
+
+ # Custom exception class.
+ class Exception < ::Exception; end
+
+ # Load configuration options.
+ require 'treat/config'
+ # Load all workers.
+ require 'treat/helpers'
+ # Require library loaders.
+ require 'treat/loaders'
+ # Require all core classes.
+ require 'treat/core'
+ # Require all entity classes.
  require 'treat/entities'
- require 'treat/categories'
- require 'treat/data_set'
+ # Lazy load worker classes.
+ require 'treat/workers'
+ # Require proxies last.
  require 'treat/proxies'

+ # Turn sugar on.
+ Treat::Config.sweeten!
+

  # Install packages for a given language.
  def self.install(language = :english)
  require 'treat/installer'
  Treat::Installer.install(language)
  end

- # Enable syntactic sugar by default.
- Treat.sweeten!
-
  end
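With the new bootstrap, requiring the gem loads the config, helpers, loaders, core classes, entities, workers and proxies in that order and turns the syntactic sugar on automatically. A minimal end-to-end sketch; the file name is illustrative, and every call appears in the spec diffs below:

```ruby
require 'treat'

# Optionally fetch language-specific binaries and models first:
# Treat.install(:english)

doc = Treat::Entities::Document.build('leibniz.txt')
doc.do(:chunk, :segment, :tokenize)
doc.print_tree
```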
data/spec/collection.rb CHANGED
@@ -3,7 +3,7 @@ require_relative '../lib/treat'
  describe Treat::Entities::Collection do

  before :all do
- @file = Treat.spec + 'samples/mathematicians'
+ @file = Treat.paths.spec + 'samples/mathematicians'
  end

  describe "#<<" do
@@ -12,7 +12,7 @@ describe Treat::Entities::Collection do

  it "copies the document to the collection's folder " +
  "and adds the document object to the collection" do
- f = Treat.spec + 'samples/test'
+ f = Treat.paths.spec + 'samples/test'
  ff = '3_2_release_notes.html'
  u = 'http://guides.rubyonrails.org/' + ff
  c = Treat::Entities::Collection.build(f)
@@ -26,7 +26,7 @@ describe Treat::Entities::Collection do

  context "when supplied with anything else" do
  it "adds the object to the collection" do
- f = Treat.spec + 'samples/test'
+ f = Treat.paths.spec + 'samples/test'
  c = Treat::Entities::Collection.build(f)
  c << Treat::Entities::Document.new
  c.size.should eql 2
@@ -53,7 +53,7 @@ describe Treat::Entities::Collection do
  context "when supplied a folder name that doesn't exist" do

  it "creates the directory and opens the collection" do
- f = Treat.spec + 'samples/test'
+ f = Treat.paths.spec + 'samples/test'
  c = Treat::Entities::Collection.build(f)
  FileTest.directory?(f).should eql true
  c.should be_an_instance_of Treat::Entities::Collection
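The old Treat.spec/Treat.models path helpers are replaced throughout by the Treat.paths accessors defined in data/lib/treat/config/core/paths.rb. A quick sketch; the set of keys is inferred from the Paths hash removed in the treat.rb hunk above, so treat it as an assumption:

```ruby
require 'treat'

# Path accessors assumed: tmp, lib, bin, files, data, models, spec.
puts Treat.paths.spec     # e.g. ".../treat/spec/"
puts Treat.paths.models   # used by the Punkt segmenter's model lookup
```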
data/spec/document.rb CHANGED
@@ -8,7 +8,7 @@ describe Treat::Entities::Document do

  it "returns a list of general topics the document belongs to" do
  #doc = Treat::Entities::Document.new(
- #Treat.spec + 'samples/mathematicians/archimedes.abw').read(:abw)
+ #Treat.paths.spec + 'samples/mathematicians/archimedes.abw').read(:abw)
  #doc.do(:chunk, :segment, :tokenize)
  #puts doc.topics.inspect
  end
@@ -24,7 +24,7 @@ describe Treat::Entities::Document do
  context "when supplied with a readable file name" do
  it "opens the file and reads its " +
  "content into a document" do
- f = Treat.spec + 'samples/mathematicians/leibniz.txt'
+ f = Treat.paths.spec + 'samples/mathematicians/leibniz.txt'
  d = Treat::Entities::Document.build(f)
  d.should be_an_instance_of Treat::Entities::Document
  d.to_s.index('Gottfried Leibniz').should_not eql nil
@@ -36,7 +36,7 @@ describe Treat::Entities::Document do
  "a document with the contents of the file" do
  url = 'http://www.rubyinside.com/nethttp-cheat-sheet-2940.html'
  d = Treat::Entities::Document.build(url)
- d.format.should eql :html
+ d.format.should eql 'html'
  d.print_tree
  d.should be_an_instance_of Treat::Entities::Document
  d.to_s.index('Rubyist').should_not eql nil
@@ -75,7 +75,7 @@ describe Treat::Entities::Document do

  context "when called on an HTML document" do
  doc = Treat::Entities::Document.new(
- Treat.spec + 'samples/mathematicians/euler.html').read(:html)
+ Treat.paths.spec + 'samples/mathematicians/euler.html').read(:html)
  it "splits the HTML document into sections, " +
  "titles, paragraphs and lists" do
  doc.chunk
@@ -88,7 +88,7 @@ describe Treat::Entities::Document do

  context "when called on a text document" do

- doc = Treat::Entities::Document.new(Treat.spec +
+ doc = Treat::Entities::Document.new(Treat.paths.spec +
  'samples/mathematicians/leibniz.txt').read(:txt)
  it "splits the document into titles and paragraphs" do
  doc.chunk