treat 0.2.5 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242)
  1. data/LICENSE +3 -3
  2. data/README.md +33 -0
  3. data/files/INFO +1 -0
  4. data/lib/treat.rb +40 -105
  5. data/lib/treat/ai.rb +12 -0
  6. data/lib/treat/ai/classifiers/id3.rb +27 -0
  7. data/lib/treat/categories.rb +82 -35
  8. data/lib/treat/categorizable.rb +44 -0
  9. data/lib/treat/classification.rb +61 -0
  10. data/lib/treat/configurable.rb +115 -0
  11. data/lib/treat/data_set.rb +42 -0
  12. data/lib/treat/dependencies.rb +24 -0
  13. data/lib/treat/downloader.rb +87 -0
  14. data/lib/treat/entities.rb +68 -66
  15. data/lib/treat/entities/abilities.rb +10 -0
  16. data/lib/treat/entities/abilities/buildable.rb +327 -0
  17. data/lib/treat/entities/abilities/checkable.rb +31 -0
  18. data/lib/treat/entities/abilities/copyable.rb +45 -0
  19. data/lib/treat/entities/abilities/countable.rb +51 -0
  20. data/lib/treat/entities/abilities/debuggable.rb +83 -0
  21. data/lib/treat/entities/abilities/delegatable.rb +123 -0
  22. data/lib/treat/entities/abilities/doable.rb +62 -0
  23. data/lib/treat/entities/abilities/exportable.rb +11 -0
  24. data/lib/treat/entities/abilities/iterable.rb +115 -0
  25. data/lib/treat/entities/abilities/magical.rb +83 -0
  26. data/lib/treat/entities/abilities/registrable.rb +74 -0
  27. data/lib/treat/entities/abilities/stringable.rb +91 -0
  28. data/lib/treat/entities/entities.rb +104 -0
  29. data/lib/treat/entities/entity.rb +122 -245
  30. data/lib/treat/exception.rb +4 -4
  31. data/lib/treat/extractors.rb +77 -80
  32. data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
  33. data/lib/treat/extractors/language/what_language.rb +50 -45
  34. data/lib/treat/extractors/name_tag/stanford.rb +55 -0
  35. data/lib/treat/extractors/tf_idf/native.rb +87 -0
  36. data/lib/treat/extractors/time/chronic.rb +55 -0
  37. data/lib/treat/extractors/time/nickel.rb +86 -62
  38. data/lib/treat/extractors/time/ruby.rb +53 -0
  39. data/lib/treat/extractors/topic_words/lda.rb +67 -58
  40. data/lib/treat/extractors/topics/reuters.rb +100 -87
  41. data/lib/treat/formatters.rb +39 -35
  42. data/lib/treat/formatters/readers/abw.rb +49 -29
  43. data/lib/treat/formatters/readers/autoselect.rb +37 -33
  44. data/lib/treat/formatters/readers/doc.rb +19 -13
  45. data/lib/treat/formatters/readers/html.rb +52 -30
  46. data/lib/treat/formatters/readers/image.rb +41 -40
  47. data/lib/treat/formatters/readers/odt.rb +59 -45
  48. data/lib/treat/formatters/readers/pdf.rb +28 -25
  49. data/lib/treat/formatters/readers/txt.rb +12 -15
  50. data/lib/treat/formatters/readers/xml.rb +73 -36
  51. data/lib/treat/formatters/serializers/xml.rb +80 -79
  52. data/lib/treat/formatters/serializers/yaml.rb +19 -18
  53. data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
  54. data/lib/treat/formatters/unserializers/xml.rb +94 -99
  55. data/lib/treat/formatters/unserializers/yaml.rb +20 -19
  56. data/lib/treat/formatters/visualizers/dot.rb +132 -132
  57. data/lib/treat/formatters/visualizers/standoff.rb +52 -44
  58. data/lib/treat/formatters/visualizers/tree.rb +26 -29
  59. data/lib/treat/groupable.rb +153 -0
  60. data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
  61. data/lib/treat/inflectors.rb +50 -45
  62. data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
  63. data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
  64. data/lib/treat/inflectors/declensors/active_support.rb +31 -0
  65. data/lib/treat/inflectors/declensors/english.rb +38 -0
  66. data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
  67. data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
  68. data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
  69. data/lib/treat/inflectors/stemmers/porter.rb +160 -0
  70. data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
  71. data/lib/treat/inflectors/stemmers/uea.rb +28 -0
  72. data/lib/treat/installer.rb +308 -0
  73. data/lib/treat/kernel.rb +105 -27
  74. data/lib/treat/languages.rb +122 -88
  75. data/lib/treat/languages/arabic.rb +15 -15
  76. data/lib/treat/languages/chinese.rb +15 -15
  77. data/lib/treat/languages/dutch.rb +15 -15
  78. data/lib/treat/languages/english.rb +61 -62
  79. data/lib/treat/languages/french.rb +19 -19
  80. data/lib/treat/languages/german.rb +20 -20
  81. data/lib/treat/languages/greek.rb +15 -15
  82. data/lib/treat/languages/italian.rb +16 -16
  83. data/lib/treat/languages/polish.rb +15 -15
  84. data/lib/treat/languages/portuguese.rb +15 -15
  85. data/lib/treat/languages/russian.rb +15 -15
  86. data/lib/treat/languages/spanish.rb +16 -16
  87. data/lib/treat/languages/swedish.rb +16 -16
  88. data/lib/treat/lexicalizers.rb +34 -55
  89. data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
  90. data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
  91. data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
  92. data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
  93. data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
  94. data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
  95. data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
  96. data/lib/treat/linguistics.rb +9 -0
  97. data/lib/treat/linguistics/categories.rb +11 -0
  98. data/lib/treat/linguistics/tags.rb +422 -0
  99. data/lib/treat/loaders/linguistics.rb +30 -0
  100. data/lib/treat/loaders/stanford.rb +27 -0
  101. data/lib/treat/object.rb +1 -0
  102. data/lib/treat/processors.rb +37 -44
  103. data/lib/treat/processors/chunkers/autoselect.rb +16 -0
  104. data/lib/treat/processors/chunkers/html.rb +71 -0
  105. data/lib/treat/processors/chunkers/txt.rb +18 -24
  106. data/lib/treat/processors/parsers/enju.rb +253 -208
  107. data/lib/treat/processors/parsers/stanford.rb +130 -131
  108. data/lib/treat/processors/segmenters/punkt.rb +79 -45
  109. data/lib/treat/processors/segmenters/stanford.rb +46 -48
  110. data/lib/treat/processors/segmenters/tactful.rb +43 -36
  111. data/lib/treat/processors/tokenizers/perl.rb +124 -92
  112. data/lib/treat/processors/tokenizers/ptb.rb +81 -0
  113. data/lib/treat/processors/tokenizers/punkt.rb +48 -42
  114. data/lib/treat/processors/tokenizers/stanford.rb +39 -38
  115. data/lib/treat/processors/tokenizers/tactful.rb +64 -55
  116. data/lib/treat/proxies.rb +52 -35
  117. data/lib/treat/retrievers.rb +26 -16
  118. data/lib/treat/retrievers/indexers/ferret.rb +47 -26
  119. data/lib/treat/retrievers/searchers/ferret.rb +69 -50
  120. data/lib/treat/tree.rb +241 -183
  121. data/spec/collection.rb +123 -0
  122. data/spec/document.rb +93 -0
  123. data/spec/entity.rb +408 -0
  124. data/spec/languages.rb +25 -0
  125. data/spec/phrase.rb +146 -0
  126. data/spec/samples/mathematicians/archimedes.abw +34 -0
  127. data/spec/samples/mathematicians/euler.html +21 -0
  128. data/spec/samples/mathematicians/gauss.pdf +0 -0
  129. data/spec/samples/mathematicians/leibniz.txt +13 -0
  130. data/spec/samples/mathematicians/newton.doc +0 -0
  131. data/spec/sandbox.rb +5 -0
  132. data/spec/token.rb +109 -0
  133. data/spec/treat.rb +52 -0
  134. data/spec/tree.rb +117 -0
  135. data/spec/word.rb +110 -0
  136. data/spec/zone.rb +66 -0
  137. data/tmp/INFO +1 -1
  138. metadata +100 -201
  139. data/INSTALL +0 -1
  140. data/README +0 -3
  141. data/TODO +0 -28
  142. data/lib/economist/half_cocked_basel.txt +0 -16
  143. data/lib/economist/hungarys_troubles.txt +0 -46
  144. data/lib/economist/indias_slowdown.txt +0 -15
  145. data/lib/economist/merkozy_rides_again.txt +0 -24
  146. data/lib/economist/prada_is_not_walmart.txt +0 -9
  147. data/lib/economist/to_infinity_and_beyond.txt +0 -15
  148. data/lib/ferret/_11.cfs +0 -0
  149. data/lib/ferret/_14.cfs +0 -0
  150. data/lib/ferret/_p.cfs +0 -0
  151. data/lib/ferret/_s.cfs +0 -0
  152. data/lib/ferret/_v.cfs +0 -0
  153. data/lib/ferret/_y.cfs +0 -0
  154. data/lib/ferret/segments +0 -0
  155. data/lib/ferret/segments_15 +0 -0
  156. data/lib/treat/buildable.rb +0 -157
  157. data/lib/treat/category.rb +0 -33
  158. data/lib/treat/delegatable.rb +0 -116
  159. data/lib/treat/doable.rb +0 -45
  160. data/lib/treat/entities/collection.rb +0 -14
  161. data/lib/treat/entities/document.rb +0 -12
  162. data/lib/treat/entities/phrases.rb +0 -17
  163. data/lib/treat/entities/tokens.rb +0 -61
  164. data/lib/treat/entities/zones.rb +0 -41
  165. data/lib/treat/extractors/coreferences/stanford.rb +0 -69
  166. data/lib/treat/extractors/date/chronic.rb +0 -32
  167. data/lib/treat/extractors/date/ruby.rb +0 -25
  168. data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
  169. data/lib/treat/extractors/language/language_extractor.rb +0 -27
  170. data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
  171. data/lib/treat/extractors/roles/naive.rb +0 -73
  172. data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
  173. data/lib/treat/extractors/statistics/position_in.rb +0 -14
  174. data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
  175. data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
  176. data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
  177. data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
  178. data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
  179. data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
  180. data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
  181. data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
  182. data/lib/treat/feature.rb +0 -58
  183. data/lib/treat/features.rb +0 -7
  184. data/lib/treat/formatters/visualizers/short_value.rb +0 -29
  185. data/lib/treat/formatters/visualizers/txt.rb +0 -45
  186. data/lib/treat/group.rb +0 -106
  187. data/lib/treat/helpers/linguistics_loader.rb +0 -18
  188. data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
  189. data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
  190. data/lib/treat/inflectors/declensions/english.rb +0 -319
  191. data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
  192. data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
  193. data/lib/treat/inflectors/stem/porter.rb +0 -162
  194. data/lib/treat/inflectors/stem/porter_c.rb +0 -26
  195. data/lib/treat/inflectors/stem/uea.rb +0 -30
  196. data/lib/treat/install.rb +0 -59
  197. data/lib/treat/languages/tags.rb +0 -377
  198. data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
  199. data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
  200. data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
  201. data/lib/treat/lexicalizers/tag/brill.rb +0 -91
  202. data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
  203. data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
  204. data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
  205. data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
  206. data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
  207. data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
  208. data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
  209. data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
  210. data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
  211. data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
  212. data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
  213. data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
  214. data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
  215. data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
  216. data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
  217. data/lib/treat/registrable.rb +0 -28
  218. data/lib/treat/sugar.rb +0 -50
  219. data/lib/treat/viewable.rb +0 -29
  220. data/lib/treat/visitable.rb +0 -28
  221. data/test/profile.rb +0 -2
  222. data/test/tc_entity.rb +0 -117
  223. data/test/tc_extractors.rb +0 -73
  224. data/test/tc_formatters.rb +0 -41
  225. data/test/tc_inflectors.rb +0 -34
  226. data/test/tc_lexicalizers.rb +0 -32
  227. data/test/tc_processors.rb +0 -50
  228. data/test/tc_resources.rb +0 -22
  229. data/test/tc_treat.rb +0 -60
  230. data/test/tc_tree.rb +0 -60
  231. data/test/tests.rb +0 -20
  232. data/test/texts.rb +0 -19
  233. data/test/texts/english/half_cocked_basel.txt +0 -16
  234. data/test/texts/english/hose_and_dry.doc +0 -0
  235. data/test/texts/english/hungarys_troubles.abw +0 -70
  236. data/test/texts/english/long.html +0 -24
  237. data/test/texts/english/long.txt +0 -22
  238. data/test/texts/english/medium.txt +0 -5
  239. data/test/texts/english/republican_nomination.pdf +0 -0
  240. data/test/texts/english/saving_the_euro.odt +0 -0
  241. data/test/texts/english/short.txt +0 -3
  242. data/test/texts/english/zero_sum.html +0 -111
@@ -1,137 +1,136 @@
1
- module Treat
2
- module Processors
3
- module Parsers
4
- # A wrapper class for the Stanford parser.
5
- class Stanford
6
- require 'stanford-core-nlp'
7
- @@parser = {}
8
- DefaultOptions = {
9
- :silence => false,
10
- :log_to_file => nil,
11
- :parser_model => nil,
12
- :tagger_model => nil
13
- }
14
- # Parse the entity using the Stanford parser.
15
- #
16
- # Options:
17
- # - (String) :log_to_file => a filename to log output to
18
- # instead of displaying it.
19
- def self.parse(entity, options = {})
20
- val = entity.to_s
21
- entity.remove_all! if entity.has_children?
22
- options = DefaultOptions.merge(options)
23
- lang = entity.language
24
- StanfordCoreNLP.use(lang)
25
- if options[:tagger_model]
26
- ::StanfordCoreNLP.set_model(
27
- 'pos.model', options[:tagger_model]
28
- )
29
- end
30
- if options[:parser_model]
31
- ::StanfordCoreNLP.set_model(
32
- 'parser.model', options[:parser_model]
33
- )
34
- end
35
- if options[:silence]
36
- options[:log_to_file] = '/dev/null'
37
- end
38
- if options[:log_to_file]
39
- ::StanfordCoreNLP.log_file =
40
- options[:log_to_file]
41
- end
42
- @@parser[lang] ||=
43
- ::StanfordCoreNLP.load(
44
- :tokenize, :ssplit, :pos, :lemma, :parse
45
- )
46
- text = ::StanfordCoreNLP::Text.new(val)
47
- @@parser[lang].annotate(text)
48
-
49
- text.get(:sentences).each do |s|
50
- if entity.is_a?(Treat::Entities::Sentence) ||
51
- entity.is_a?(Treat::Entities::Phrase)
52
- tag = s.get(:category).to_s
53
- tag_s, tag_opt = *tag.split('-')
54
- tag_s ||= 'S'
55
- entity.set :tag_set, :penn
56
- entity.set :tag, tag_s
57
- entity.set :tag_opt, tag_opt if tag_opt
58
- recurse(s.get(:tree), entity)
59
- break
60
- else
61
- recurse(s.get(:tree), entity)
62
- end
63
- end
1
+ # A wrapper class for the Stanford parser.
2
+ class Treat::Processors::Parsers::Stanford
3
+
4
+ require 'treat/loaders/stanford'
5
+
6
+ # Hold one instance of the pipeline per language.
7
+ @@parsers = {}
8
+
9
+ DefaultOptions = {
10
+ :parser_model => nil,
11
+ :tagger_model => nil
12
+ }
13
+
14
+ # Parse the entity using the Stanford parser.
15
+ #
16
+ # Options:
17
+ #
18
+ # - (Boolean) :silent => whether to silence the output
19
+ # of the JVM.
20
+ # - (String) :log_file => a filename to log output to
21
+ # instead of displaying it.
22
+ def self.parse(entity, options = {})
23
+
24
+ entity.check_hasnt_children
25
+
26
+ val = entity.to_s
27
+ lang = entity.language
28
+ init(lang, options)
29
+
30
+ text = ::StanfordCoreNLP::Text.new(val)
31
+ @@parsers[lang].annotate(text)
32
+
33
+ text.get(:sentences).each do |s|
34
+
35
+ if entity.is_a?(Treat::Entities::Sentence) ||
36
+ entity.is_a?(Treat::Entities::Phrase)
37
+ tag = s.get(:category).to_s
38
+ tag_s, tag_opt = *tag.split('-')
39
+ tag_s ||= 'S'
40
+ entity.set :tag_set, :penn
41
+ entity.set :tag, tag_s
42
+ entity.set :tag_opt, tag_opt if tag_opt
43
+ recurse(s.get(:tree).children[0], entity)
44
+ break
45
+ else
46
+ recurse(s.get(:tree), entity)
47
+ end
48
+
49
+ end
50
+
51
+ end
52
+
53
+ def self.init(lang, options)
54
+ return if @@parsers[lang]
55
+ options = DefaultOptions.merge(options)
56
+ StanfordCoreNLP.use(lang)
57
+ if options[:tagger_model]
58
+ ::StanfordCoreNLP.set_model(
59
+ 'pos.model', options[:tagger_model]
60
+ )
61
+ end
62
+ if options[:parser_model]
63
+ ::StanfordCoreNLP.set_model(
64
+ 'parser.model', options[:parser_model]
65
+ )
66
+ end
67
+ @@parsers[lang] ||=
68
+ ::StanfordCoreNLP.load(
69
+ :tokenize, :ssplit, :pos, :lemma, :parse
70
+ )
71
+ end
72
+
73
+ # Helper method which recurses the tree supplied by
74
+ # the Stanford parser.
75
+ def self.recurse(java_node, ruby_node, additional_tags = [])
76
+
77
+ if java_node.num_children == 0
78
+
79
+ label = java_node.label
80
+ tag = label.get(:part_of_speech).to_s
81
+ tag_s, tag_opt = *tag.split('-')
82
+ tag_s ||= ''
83
+ ruby_node.value = java_node.value.to_s.strip
84
+ ruby_node.set :tag_set, :penn
85
+ ruby_node.set :tag, tag_s
86
+ ruby_node.set :tag_opt, tag_opt if tag_opt
87
+ ruby_node.set :tag_set, :penn
88
+ ruby_node.set :lemma, label.get(:lemma).to_s
89
+
90
+ additional_tags.each do |t|
91
+ lt = label.get(t)
92
+ ruby_node.set t, lt.to_s if lt
93
+ end
94
+
95
+ ruby_node
96
+
97
+ else
98
+
99
+ if java_node.num_children == 1 &&
100
+ java_node.children[0].num_children == 0
101
+ recurse(java_node.children[0],
102
+ ruby_node, additional_tags)
103
+ return
104
+ end
105
+
106
+ java_node.children.each do |java_child|
107
+ label = java_child.label
108
+ tag = label.get(:category).to_s
109
+ tag_s, tag_opt = *tag.split('-')
110
+ tag_s ||= ''
111
+
112
+ if Treat::Linguistics::Tags::PhraseTagToCategory[tag_s]
113
+ ruby_child = Treat::Entities::Phrase.new
114
+ else
115
+ l = java_child.children[0].to_s
116
+ v = java_child.children[0].value.to_s.strip
117
+ # Mhmhmhmhmhm
118
+ val = (l == v) ? v : l.split(' ')[-1].gsub(')', '')
119
+ ruby_child = Treat::Entities::Token.from_string(val)
64
120
  end
65
-
66
- # Helper method which recurses the tree supplied by
67
- # the Stanford parser.
68
- def self.recurse(java_node, ruby_node, additional_tags = [])
69
- # Leaf
70
- if java_node.num_children == 0
71
- label = java_node.label
72
- tag = label.get(:part_of_speech).to_s
73
- tag_s, tag_opt = *tag.split('-')
74
- tag_s ||= ''
75
- ruby_node.value = java_node.value.to_s.strip
76
- ruby_node.set :tag_set, :penn
77
- ruby_node.set :tag, tag_s
78
- ruby_node.set :tag_opt, tag_opt if tag_opt
79
- ruby_node.set :tag_set, :penn
80
- ruby_node.set :lemma, label.get(:lemma).to_s
81
-
82
- ruby_node.set :character_offset_begin,
83
- label.get(:character_offset_begin).to_s
84
-
85
- ruby_node.set :character_offset_end,
86
- label.get(:character_offset_end).to_s
87
-
88
- ruby_node.set :begin_index,
89
- label.get(:begin_index).to_s
90
-
91
- ruby_node.set :end_index,
92
- label.get(:end_index).to_s
93
-
94
- additional_tags.each do |t|
95
- lt = label.get(t)
96
- ruby_node.set t, lt.to_s if lt
97
- end
98
- return ruby_node
99
- else
100
-
101
- if java_node.num_children == 1 &&
102
- java_node.children[0].num_children == 0
103
- recurse(java_node.children[0], ruby_node, additional_tags)
104
- return
105
- end
106
-
107
- java_node.children.each do |java_child|
108
- label = java_child.label
109
- tag = label.get(:category).to_s
110
- tag_s, tag_opt = *tag.split('-')
111
- tag_s ||= ''
112
-
113
- if Treat::Languages::Tags::PhraseTagToCategory[tag_s]
114
- ruby_child = Treat::Entities::Phrase.new
115
- else
116
- l = java_child.children[0].to_s
117
- v = java_child.children[0].value.to_s.strip
118
- # Mhmhmhmhmhm
119
- val = (l == v) ? v : l.split(' ')[-1].gsub(')', '')
120
- ruby_child = Treat::Entities::Token.from_string(val)
121
- end
122
-
123
- ruby_child.set :tag_set, :penn
124
- ruby_child.set :tag, tag_s
125
- ruby_child.set :tag_opt, tag_opt if tag_opt
126
- ruby_node << ruby_child
127
-
128
- unless java_child.children.empty?
129
- recurse(java_child, ruby_child, additional_tags)
130
- end
131
- end
132
- end
121
+
122
+ ruby_child.set :tag_set, :penn
123
+ ruby_child.set :tag, tag_s
124
+ ruby_child.set :tag_opt, tag_opt if tag_opt
125
+ ruby_node << ruby_child
126
+
127
+ unless java_child.children.empty?
128
+ recurse(java_child, ruby_child, additional_tags)
133
129
  end
130
+
134
131
  end
132
+
135
133
  end
134
+
136
135
  end
137
136
  end
@@ -1,48 +1,82 @@
1
- module Treat
2
- module Processors
3
- module Segmenters
4
- # An adapter for the 'punk-segmenter' gem, which segments
5
- # texts into sentences based on an unsupervised, language
6
- # independent algorithm.
7
- #
8
- # Original paper: Kiss, Tibor and Strunk, Jan (2006):
9
- # Unsupervised Multilingual Sentence Boundary Detection.
10
- # Computational Linguistics 32: 485-525.
11
- class Punkt
12
- silence_warnings { require 'punkt-segmenter' }
13
- require 'psych'
14
- # Hold one copy of the segmenter per language.
15
- @@segmenters = {}
16
- # Hold only one trainer per language.
17
- @@trainers = {}
18
- # Segment a text using the Punkt segmenter gem.
19
- #
20
- # Options:
21
- #
22
- # :training_text => (String) Text to train the segmenter on.
23
- def self.segment(entity, options = {})
24
- lang = entity.language
25
- if options[:model]
26
- model = options[:model]
27
- else
28
- l = Treat::Languages.describe(lang)
29
- model = "#{Treat.lib}/treat/processors/segmenters/punkt/#{l}.yaml"
30
- unless File.readable?(model)
31
- raise Treat::Exception,
32
- "Could not get the language model for the Punkt segmenter for #{l}."
33
- end
34
- end
35
- t = ::Psych.load(File.read(model))
36
- @@segmenters[lang] ||= ::Punkt::SentenceTokenizer.new(t)
37
- s = entity.to_s
38
- s.gsub!(/([^\.\?!]\.|\!|\?)([^\s])/) { $1 + ' ' + $2 }
39
- result = @@segmenters[lang].sentences_from_text(
40
- s, :output => :sentences_text)
41
- result.each do |sentence|
42
- entity << Treat::Entities::Phrase.from_string(sentence)
43
- end
44
- end
1
+ # An adapter for the 'punk-segmenter' gem, which segments
2
+ # texts into sentences based on an unsupervised, language
3
+ # independent algorithm.
4
+ #
5
+ # Original paper: Kiss, Tibor and Strunk, Jan (2006):
6
+ # Unsupervised Multilingual Sentence Boundary Detection.
7
+ # Computational Linguistics 32: 485-525.
8
+ module Treat::Processors::Segmenters::Punkt
9
+
10
+ require 'treat/helpers/decimal_point_escaper'
11
+
12
+ # Require silently the punkt-segmenter gem.
13
+ silence_warnings { require 'punkt-segmenter' }
14
+
15
+ # Require the YAML parser.
16
+ silence_warnings { require 'psych' }
17
+
18
+ # Hold one copy of the segmenter per language.
19
+ @@segmenters = {}
20
+
21
+ # Hold only one trainer per language.
22
+ @@trainers = {}
23
+
24
+ # Segment a text using the Punkt segmenter gem.
25
+ # The included models for this segmenter have
26
+ # been trained on one or two lengthy books
27
+ # from the corresponding language.
28
+ #
29
+ # Options:
30
+ #
31
+ # (String) :training_text => Text to train on.
32
+ def self.segment(entity, options = {})
33
+
34
+ entity.check_hasnt_children
35
+
36
+ lang = entity.language
37
+ set_options(lang, options)
38
+
39
+ s = entity.to_s
40
+
41
+ # Replace all decimal points by ^^
42
+ Treat::Helpers::DecimalPointEscaper.escape!(s)
43
+ s.gsub!(/([^\.\?!]\.|\!|\?)([^\s])/) { $1 + ' ' + $2 }
44
+
45
+ result = @@segmenters[lang].
46
+ sentences_from_text(s,
47
+ :output => :sentences_text)
48
+
49
+ result.each do |sentence|
50
+ Treat::Helpers::DecimalPointEscaper.
51
+ unescape!(sentence)
52
+ entity << Treat::Entities::Phrase.
53
+ from_string(sentence)
54
+ end
55
+
56
+ end
57
+
58
+ def self.set_options(lang, options)
59
+
60
+ return @@segmenters[lang] if @@segmenters[lang]
61
+
62
+ if options[:model]
63
+ model = options[:model]
64
+ else
65
+ l = Treat::Languages.describe(lang)
66
+ model = "#{Treat.models}punkt/#{l}.yaml"
67
+
68
+ unless File.readable?(model)
69
+ raise Treat::Exception,
70
+ "Could not get the language model " +
71
+ "for the Punkt segmenter for #{l}."
45
72
  end
46
73
  end
74
+
75
+ t = ::Psych.load(File.read(model))
76
+
77
+ @@segmenters[lang] =
78
+ ::Punkt::SentenceTokenizer.new(t)
79
+
47
80
  end
48
- end
81
+
82
+ end
@@ -1,52 +1,50 @@
1
- module Treat
2
- module Processors
3
- module Segmenters
4
- # A wrapper for the sentence splitter supplied by
5
- # the Stanford parser.
6
- class Stanford
7
- require 'stanford-core-nlp'
8
- DefaultOptions = {
9
- :silence => false,
10
- :log_to_file => false,
11
- :also_tokenize => false
12
- }
13
- # Segment sentences using the sentence splitter supplied by
14
- # the Stanford parser. By default, this segmenter also adds
15
- # the tokens as children of the sentences.
16
- #
17
- # Options:
18
- # - (Boolean) :also_tokenize - Whether to also add the tokens
19
- # as children of the sentence.
20
- # - (String) :log_to_file => a filename to log output to
21
- # instead of displaying it.
22
- # - (String) :silence => send
23
- def self.segment(entity, options = {})
24
- options = DefaultOptions.merge(options)
25
- options[:log_to_file] = '/dev/null' if options[:silence]
26
- if options[:log_to_file]
27
- ::StanfordCoreNLP.log_file = options[:log_to_file]
28
- end
29
- options = DefaultOptions.merge(options)
30
- pipeline = ::StanfordCoreNLP.load(:tokenize, :ssplit)
31
- text = ::StanfordCoreNLP::Text.new(entity.to_s)
32
- pipeline.annotate(text)
33
- text.get(:sentences).each do |sentence|
34
- s = Treat::Entities::Sentence.from_string(sentence.to_s, true)
35
- entity << s
36
- if options[:also_tokenize]
37
- sentence.get(:tokens).each do |token|
38
- t = Treat::Entities::Phrase.from_string(token.value)
39
- s << t
40
- t.set :character_offset_begin,
41
- token.get(:character_offset_begin)
42
-
43
- t.set :character_offset_end,
44
- token.get(:character_offset_end)
45
- end
46
- end
47
- end
48
- end
1
+ # A wrapper for the sentence splitter supplied by
2
+ # the Stanford parser.
3
+ class Treat::Processors::Segmenters::Stanford
4
+
5
+ require 'treat/loaders/stanford'
6
+
7
+ DefaultOptions = {
8
+ :also_tokenize => false
9
+ }
10
+
11
+ # Keep one copy of the Stanford Core NLP pipeline.
12
+ @@segmenter = nil
13
+
14
+ # Segment sentences using the sentence splitter
15
+ # supplied by the Stanford parser. For better
16
+ # performance, set the option :also_tokenize
17
+ # to true, and this segmenter will also add
18
+ # the tokens as children of the sentences.
19
+ #
20
+ # Options:
21
+ #
22
+ # - (Boolean) :also_tokenize - Whether to also
23
+ # add the tokens as children of the sentence.
24
+ def self.segment(entity, options = {})
25
+
26
+ options = DefaultOptions.merge(options)
27
+ entity.check_hasnt_children
28
+
29
+ @@segmenter ||=
30
+ ::StanfordCoreNLP.load(:tokenize, :ssplit)
31
+
32
+ s = entity.to_s
33
+
34
+ text = ::StanfordCoreNLP::Text.new(entity.to_s)
35
+
36
+ @@segmenter.annotate(text)
37
+ text.get(:sentences).each do |sentence|
38
+ sentence = sentence.to_s
39
+ s = Treat::Entities::Sentence.
40
+ from_string(sentence, true)
41
+ entity << s
42
+ if options[:also_tokenize]
43
+ Treat::Processors::Tokenizers::Stanford.
44
+ add_tokens(s, sentence.get(:tokens))
49
45
  end
50
46
  end
47
+
51
48
  end
49
+
52
50
  end