treat 0.2.5 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (242)
  1. data/LICENSE +3 -3
  2. data/README.md +33 -0
  3. data/files/INFO +1 -0
  4. data/lib/treat.rb +40 -105
  5. data/lib/treat/ai.rb +12 -0
  6. data/lib/treat/ai/classifiers/id3.rb +27 -0
  7. data/lib/treat/categories.rb +82 -35
  8. data/lib/treat/categorizable.rb +44 -0
  9. data/lib/treat/classification.rb +61 -0
  10. data/lib/treat/configurable.rb +115 -0
  11. data/lib/treat/data_set.rb +42 -0
  12. data/lib/treat/dependencies.rb +24 -0
  13. data/lib/treat/downloader.rb +87 -0
  14. data/lib/treat/entities.rb +68 -66
  15. data/lib/treat/entities/abilities.rb +10 -0
  16. data/lib/treat/entities/abilities/buildable.rb +327 -0
  17. data/lib/treat/entities/abilities/checkable.rb +31 -0
  18. data/lib/treat/entities/abilities/copyable.rb +45 -0
  19. data/lib/treat/entities/abilities/countable.rb +51 -0
  20. data/lib/treat/entities/abilities/debuggable.rb +83 -0
  21. data/lib/treat/entities/abilities/delegatable.rb +123 -0
  22. data/lib/treat/entities/abilities/doable.rb +62 -0
  23. data/lib/treat/entities/abilities/exportable.rb +11 -0
  24. data/lib/treat/entities/abilities/iterable.rb +115 -0
  25. data/lib/treat/entities/abilities/magical.rb +83 -0
  26. data/lib/treat/entities/abilities/registrable.rb +74 -0
  27. data/lib/treat/entities/abilities/stringable.rb +91 -0
  28. data/lib/treat/entities/entities.rb +104 -0
  29. data/lib/treat/entities/entity.rb +122 -245
  30. data/lib/treat/exception.rb +4 -4
  31. data/lib/treat/extractors.rb +77 -80
  32. data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
  33. data/lib/treat/extractors/language/what_language.rb +50 -45
  34. data/lib/treat/extractors/name_tag/stanford.rb +55 -0
  35. data/lib/treat/extractors/tf_idf/native.rb +87 -0
  36. data/lib/treat/extractors/time/chronic.rb +55 -0
  37. data/lib/treat/extractors/time/nickel.rb +86 -62
  38. data/lib/treat/extractors/time/ruby.rb +53 -0
  39. data/lib/treat/extractors/topic_words/lda.rb +67 -58
  40. data/lib/treat/extractors/topics/reuters.rb +100 -87
  41. data/lib/treat/formatters.rb +39 -35
  42. data/lib/treat/formatters/readers/abw.rb +49 -29
  43. data/lib/treat/formatters/readers/autoselect.rb +37 -33
  44. data/lib/treat/formatters/readers/doc.rb +19 -13
  45. data/lib/treat/formatters/readers/html.rb +52 -30
  46. data/lib/treat/formatters/readers/image.rb +41 -40
  47. data/lib/treat/formatters/readers/odt.rb +59 -45
  48. data/lib/treat/formatters/readers/pdf.rb +28 -25
  49. data/lib/treat/formatters/readers/txt.rb +12 -15
  50. data/lib/treat/formatters/readers/xml.rb +73 -36
  51. data/lib/treat/formatters/serializers/xml.rb +80 -79
  52. data/lib/treat/formatters/serializers/yaml.rb +19 -18
  53. data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
  54. data/lib/treat/formatters/unserializers/xml.rb +94 -99
  55. data/lib/treat/formatters/unserializers/yaml.rb +20 -19
  56. data/lib/treat/formatters/visualizers/dot.rb +132 -132
  57. data/lib/treat/formatters/visualizers/standoff.rb +52 -44
  58. data/lib/treat/formatters/visualizers/tree.rb +26 -29
  59. data/lib/treat/groupable.rb +153 -0
  60. data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
  61. data/lib/treat/inflectors.rb +50 -45
  62. data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
  63. data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
  64. data/lib/treat/inflectors/declensors/active_support.rb +31 -0
  65. data/lib/treat/inflectors/declensors/english.rb +38 -0
  66. data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
  67. data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
  68. data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
  69. data/lib/treat/inflectors/stemmers/porter.rb +160 -0
  70. data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
  71. data/lib/treat/inflectors/stemmers/uea.rb +28 -0
  72. data/lib/treat/installer.rb +308 -0
  73. data/lib/treat/kernel.rb +105 -27
  74. data/lib/treat/languages.rb +122 -88
  75. data/lib/treat/languages/arabic.rb +15 -15
  76. data/lib/treat/languages/chinese.rb +15 -15
  77. data/lib/treat/languages/dutch.rb +15 -15
  78. data/lib/treat/languages/english.rb +61 -62
  79. data/lib/treat/languages/french.rb +19 -19
  80. data/lib/treat/languages/german.rb +20 -20
  81. data/lib/treat/languages/greek.rb +15 -15
  82. data/lib/treat/languages/italian.rb +16 -16
  83. data/lib/treat/languages/polish.rb +15 -15
  84. data/lib/treat/languages/portuguese.rb +15 -15
  85. data/lib/treat/languages/russian.rb +15 -15
  86. data/lib/treat/languages/spanish.rb +16 -16
  87. data/lib/treat/languages/swedish.rb +16 -16
  88. data/lib/treat/lexicalizers.rb +34 -55
  89. data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
  90. data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
  91. data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
  92. data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
  93. data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
  94. data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
  95. data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
  96. data/lib/treat/linguistics.rb +9 -0
  97. data/lib/treat/linguistics/categories.rb +11 -0
  98. data/lib/treat/linguistics/tags.rb +422 -0
  99. data/lib/treat/loaders/linguistics.rb +30 -0
  100. data/lib/treat/loaders/stanford.rb +27 -0
  101. data/lib/treat/object.rb +1 -0
  102. data/lib/treat/processors.rb +37 -44
  103. data/lib/treat/processors/chunkers/autoselect.rb +16 -0
  104. data/lib/treat/processors/chunkers/html.rb +71 -0
  105. data/lib/treat/processors/chunkers/txt.rb +18 -24
  106. data/lib/treat/processors/parsers/enju.rb +253 -208
  107. data/lib/treat/processors/parsers/stanford.rb +130 -131
  108. data/lib/treat/processors/segmenters/punkt.rb +79 -45
  109. data/lib/treat/processors/segmenters/stanford.rb +46 -48
  110. data/lib/treat/processors/segmenters/tactful.rb +43 -36
  111. data/lib/treat/processors/tokenizers/perl.rb +124 -92
  112. data/lib/treat/processors/tokenizers/ptb.rb +81 -0
  113. data/lib/treat/processors/tokenizers/punkt.rb +48 -42
  114. data/lib/treat/processors/tokenizers/stanford.rb +39 -38
  115. data/lib/treat/processors/tokenizers/tactful.rb +64 -55
  116. data/lib/treat/proxies.rb +52 -35
  117. data/lib/treat/retrievers.rb +26 -16
  118. data/lib/treat/retrievers/indexers/ferret.rb +47 -26
  119. data/lib/treat/retrievers/searchers/ferret.rb +69 -50
  120. data/lib/treat/tree.rb +241 -183
  121. data/spec/collection.rb +123 -0
  122. data/spec/document.rb +93 -0
  123. data/spec/entity.rb +408 -0
  124. data/spec/languages.rb +25 -0
  125. data/spec/phrase.rb +146 -0
  126. data/spec/samples/mathematicians/archimedes.abw +34 -0
  127. data/spec/samples/mathematicians/euler.html +21 -0
  128. data/spec/samples/mathematicians/gauss.pdf +0 -0
  129. data/spec/samples/mathematicians/leibniz.txt +13 -0
  130. data/spec/samples/mathematicians/newton.doc +0 -0
  131. data/spec/sandbox.rb +5 -0
  132. data/spec/token.rb +109 -0
  133. data/spec/treat.rb +52 -0
  134. data/spec/tree.rb +117 -0
  135. data/spec/word.rb +110 -0
  136. data/spec/zone.rb +66 -0
  137. data/tmp/INFO +1 -1
  138. metadata +100 -201
  139. data/INSTALL +0 -1
  140. data/README +0 -3
  141. data/TODO +0 -28
  142. data/lib/economist/half_cocked_basel.txt +0 -16
  143. data/lib/economist/hungarys_troubles.txt +0 -46
  144. data/lib/economist/indias_slowdown.txt +0 -15
  145. data/lib/economist/merkozy_rides_again.txt +0 -24
  146. data/lib/economist/prada_is_not_walmart.txt +0 -9
  147. data/lib/economist/to_infinity_and_beyond.txt +0 -15
  148. data/lib/ferret/_11.cfs +0 -0
  149. data/lib/ferret/_14.cfs +0 -0
  150. data/lib/ferret/_p.cfs +0 -0
  151. data/lib/ferret/_s.cfs +0 -0
  152. data/lib/ferret/_v.cfs +0 -0
  153. data/lib/ferret/_y.cfs +0 -0
  154. data/lib/ferret/segments +0 -0
  155. data/lib/ferret/segments_15 +0 -0
  156. data/lib/treat/buildable.rb +0 -157
  157. data/lib/treat/category.rb +0 -33
  158. data/lib/treat/delegatable.rb +0 -116
  159. data/lib/treat/doable.rb +0 -45
  160. data/lib/treat/entities/collection.rb +0 -14
  161. data/lib/treat/entities/document.rb +0 -12
  162. data/lib/treat/entities/phrases.rb +0 -17
  163. data/lib/treat/entities/tokens.rb +0 -61
  164. data/lib/treat/entities/zones.rb +0 -41
  165. data/lib/treat/extractors/coreferences/stanford.rb +0 -69
  166. data/lib/treat/extractors/date/chronic.rb +0 -32
  167. data/lib/treat/extractors/date/ruby.rb +0 -25
  168. data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
  169. data/lib/treat/extractors/language/language_extractor.rb +0 -27
  170. data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
  171. data/lib/treat/extractors/roles/naive.rb +0 -73
  172. data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
  173. data/lib/treat/extractors/statistics/position_in.rb +0 -14
  174. data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
  175. data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
  176. data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
  177. data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
  178. data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
  179. data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
  180. data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
  181. data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
  182. data/lib/treat/feature.rb +0 -58
  183. data/lib/treat/features.rb +0 -7
  184. data/lib/treat/formatters/visualizers/short_value.rb +0 -29
  185. data/lib/treat/formatters/visualizers/txt.rb +0 -45
  186. data/lib/treat/group.rb +0 -106
  187. data/lib/treat/helpers/linguistics_loader.rb +0 -18
  188. data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
  189. data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
  190. data/lib/treat/inflectors/declensions/english.rb +0 -319
  191. data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
  192. data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
  193. data/lib/treat/inflectors/stem/porter.rb +0 -162
  194. data/lib/treat/inflectors/stem/porter_c.rb +0 -26
  195. data/lib/treat/inflectors/stem/uea.rb +0 -30
  196. data/lib/treat/install.rb +0 -59
  197. data/lib/treat/languages/tags.rb +0 -377
  198. data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
  199. data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
  200. data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
  201. data/lib/treat/lexicalizers/tag/brill.rb +0 -91
  202. data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
  203. data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
  204. data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
  205. data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
  206. data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
  207. data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
  208. data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
  209. data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
  210. data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
  211. data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
  212. data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
  213. data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
  214. data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
  215. data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
  216. data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
  217. data/lib/treat/registrable.rb +0 -28
  218. data/lib/treat/sugar.rb +0 -50
  219. data/lib/treat/viewable.rb +0 -29
  220. data/lib/treat/visitable.rb +0 -28
  221. data/test/profile.rb +0 -2
  222. data/test/tc_entity.rb +0 -117
  223. data/test/tc_extractors.rb +0 -73
  224. data/test/tc_formatters.rb +0 -41
  225. data/test/tc_inflectors.rb +0 -34
  226. data/test/tc_lexicalizers.rb +0 -32
  227. data/test/tc_processors.rb +0 -50
  228. data/test/tc_resources.rb +0 -22
  229. data/test/tc_treat.rb +0 -60
  230. data/test/tc_tree.rb +0 -60
  231. data/test/tests.rb +0 -20
  232. data/test/texts.rb +0 -19
  233. data/test/texts/english/half_cocked_basel.txt +0 -16
  234. data/test/texts/english/hose_and_dry.doc +0 -0
  235. data/test/texts/english/hungarys_troubles.abw +0 -70
  236. data/test/texts/english/long.html +0 -24
  237. data/test/texts/english/long.txt +0 -22
  238. data/test/texts/english/medium.txt +0 -5
  239. data/test/texts/english/republican_nomination.pdf +0 -0
  240. data/test/texts/english/saving_the_euro.odt +0 -0
  241. data/test/texts/english/short.txt +0 -3
  242. data/test/texts/english/zero_sum.html +0 -111
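
The dominant pattern across the code hunks below is structural: 0.2.5's deeply nested module blocks become 1.0.0's compact, fully qualified definitions, and each processor gains an entity.check_hasnt_children guard plus decimal-point escaping. As a reading aid, here is the difference between the two module styles; this is a general Ruby note, not code from this diff:

    # 0.2.5 style: each constant is created as the blocks nest.
    module Treat
      module Processors
        module Segmenters
        end
      end
    end

    # 1.0.0 style: shorter, but raises NameError unless
    # Treat::Processors::Segmenters already exists, and only the
    # innermost module (not Treat) is in lexical scope inside.
    module Treat::Processors::Segmenters::Tactful
    end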
--- a/data/lib/treat/processors/segmenters/tactful.rb
+++ b/data/lib/treat/processors/segmenters/tactful.rb
@@ -1,38 +1,45 @@
-module Treat
-  module Processors
-    module Segmenters
-      # An adapter for the 'tactful_tokenizer' gem, which
-      # detects sentence boundaries (the name is a misnomer;
-      # it isn't a tokenizer, but a sentence boundary detector).
-      # It uses a Naive Bayesian statistical model, and is
-      # based on Splitta, but has support for ‘?’ and ‘!’
-      # as well as primitive handling of XHTML markup.
-      #
-      # Project website: https://github.com/SlyShy/Tackful-Tokenizer
-      # Original paper: Dan Gillick. 2009. Sentence Boundary Detection
-      # and the Problem with the U.S. University of California, Berkeley.
-      # http://dgillick.com/resource/sbd_naacl_2009.pdf
-      class Tactful
-        # Require the 'tactful_tokenizer' gem.
-        silence_warnings { require 'tactful_tokenizer' }
-        # Remove function definition 'tactful_tokenizer' by gem.
-        String.class_eval { undef :tokenize }
-        # Keep only one copy of the segmenter.
-        @@segmenter = nil
-        # Segment a text or zone into sentences
-        # using the 'tactful_tokenizer' gem.
-        #
-        # Options: none.
-        def self.segment(entity, options = {})
-          @@segmenter ||= TactfulTokenizer::Model.new
-          s = entity.to_s
-          s.gsub!(/([^\.\?!]\.|\!|\?)([^\s])/) { $1 + ' ' + $2 }
-          sentences = @@segmenter.tokenize_text(s)
-          sentences.each do |sentence|
-            entity << Entities::Phrase.from_string(sentence)
-          end
-        end
+# An adapter for the 'tactful_tokenizer' gem, which
+# detects sentence boundaries based on a Naive Bayesian
+# statistical model.
+#
+# Project website: https://github.com/SlyShy/Tackful-Tokenizer
+#
+# Original paper: Dan Gillick. 2009. Sentence Boundary Detection
+# and the Problem with the U.S. University of California, Berkeley.
+# http://dgillick.com/resource/sbd_naacl_2009.pdf
+module Treat::Processors::Segmenters::Tactful
+
+  # Require the 'tactful_tokenizer' gem.
+  silence_warnings { require 'tactful_tokenizer' }
+
+  # Remove function definition 'tactful_tokenizer' by gem.
+  String.class_eval { undef :tokenize }
+
+  require 'treat/helpers/decimal_point_escaper'
+
+  # Keep only one copy of the segmenter.
+  @@segmenter = nil
+
+  # Segment a text or zone into sentences
+  # using the 'tactful_tokenizer' gem.
+  #
+  # Options: none.
+  def self.segment(entity, options = {})
+
+    entity.check_hasnt_children
+
+    s = entity.to_s
+    Treat::Helpers::DecimalPointEscaper.escape!(s)
+
+    s.gsub!(/([^\.\?!]\.|\!|\?)([^\s])/) { $1 + ' ' + $2 }
+
+    @@segmenter ||= TactfulTokenizer::Model.new
+
+    sentences = @@segmenter.tokenize_text(s)
+    sentences.each do |sentence|
+      Treat::Helpers::DecimalPointEscaper.unescape!(sentence)
+      entity << Treat::Entities::Phrase.from_string(sentence)
     end
   end
-end
+
+end
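
This segmenter and the tokenizers below now escape decimal points before applying their boundary rules (the segmenter unescapes each resulting sentence afterwards), so that the period in a number is never mistaken for a sentence or token boundary. The diff for Treat::Helpers::DecimalPointEscaper itself (file 60 above) is not reproduced on this page; the following is a hypothetical stand-in inferred from its escape!/unescape! call sites and from the "replace all decimal points by ^^" comment in the PTB tokenizer further down, not the gem's actual implementation:

    # Hypothetical sketch of Treat::Helpers::DecimalPointEscaper.
    module DecimalPointEscaperSketch
      # "3.14" => "3^^14", so boundary detection cannot
      # split inside a number.
      def self.escape!(s)
        s.gsub!(/([0-9])\.([0-9])/, '\1^^\2')
      end

      # Restore the periods once boundaries have been decided.
      def self.unescape!(s)
        s.gsub!(/([0-9])\^\^([0-9])/, '\1.\2')
      end
    end

    s = "Pi is 3.14. Indeed."
    DecimalPointEscaperSketch.escape!(s)   # s => "Pi is 3^^14. Indeed."
    DecimalPointEscaperSketch.unescape!(s) # s => "Pi is 3.14. Indeed."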
--- a/data/lib/treat/processors/tokenizers/perl.rb
+++ b/data/lib/treat/processors/tokenizers/perl.rb
@@ -1,96 +1,128 @@
 # encoding: utf-8
-module Treat
-  module Processors
-    module Tokenizers
-      # Tokenize the entity using a native rule-based algorithm.
-      # This tokenizer is a port from an unknown Perl module,
-      # which I have lifted from the 'rbtagger' gem.
-      #
-      # Author: Todd A. Fisher
-      # This code is free to use under the terms of the MIT license.
-      #
-      # Original project website:
-      # https://github.com/taf2/rb-brill-tagger
-      class Perl
-        # Tokenize the entity using a native rule-based algorithm.
-        # Options: none.
-        def self.tokenize(entity, options = {})
-          # Normalize all whitespace
-          text = entity.to_s.gsub(/\s+/,' ')
-          # Translate some common extended ascii characters to quotes
-          text.gsub!(/‘/,'`')
-          text.gsub!(/’/,"'")
-          text.gsub!(/“/,"``")
-          text.gsub!(/”/,"''")
-          # Attempt to get correct directional quotes
-          # s{\"\b} { `` }g;
-          text.gsub!(/\"\b/,' `` ')
-          # s{\b\"} { '' }g;
-          text.gsub!(/\b\"/," '' ")
-          #s{\"(?=\s)} { '' }g;
-          text.gsub!(/\"(?=\s)/," '' ")
-          #s{\"} { `` }g;
-          text.gsub!(/\"(?=\s)/," `` ")
-          # Isolate ellipses
-          # s{\.\.\.} { ... }g;
-          text.gsub!(/\.\.\./,' ... ')
-          # Isolate any embedded punctuation chars
-          # s{([,;:\@\#\$\%&])} { $1 }g;
-          text.gsub!(/([,;:\@\#\$\%&])/, ' \1 ')
-          # Assume sentence tokenization has been done first, so split FINAL
-          # periods only.
-          # s/ ([^.]) \. ([\]\)\}\>\"\']*) [ \t]* $ /$1 .$2 /gx;
-          text.gsub!(/ ([^.]) \. ([\]\)\}\>\"\']*) [ \t]* $ /x, '\1 .\2 ')
-          # however, we may as well split ALL question marks and exclamation points,
-          # since they shouldn't have the abbrev.-marker ambiguity problem
-          #s{([?!])} { $1 }g;
-          text.gsub!(/([?!])/, ' \1 ')
-          # parentheses, brackets, etc.
-          #s{([\]\[\(\)\{\}\<\>])} { $1 }g;
-          text.gsub!(/([\]\[\(\)\{\}\<\>])/,' \1 ')
-          #s/(-{2,})/ $1 /g;
-          text.gsub!(/(-{2,})/,' \1 ')
-          # Add a space to the beginning and end of each line, to reduce
-          # necessary number of regexps below.
-          #s/$/ /;
-          text.gsub!(/$/," ")
-          #s/^/ /;
-          text.gsub!(/^/," ")
-          # possessive or close-single-quote
-          #s/\([^\']\)\' /$1 \' /g;
-          text.gsub!(/\([^\']\)\' /,%q(\1 ' ))
-          # as in it's, I'm, we'd
-          #s/\'([smd]) / \'$1 /ig;
-          text.gsub!(/\'([smd]) /i,%q( '\1 ))
-          #s/\'(ll|re|ve) / \'$1 /ig;
-          text.gsub!(/\'(ll|re|ve) /i,%q( '\1 ))
-          #s/n\'t / n\'t /ig;
-          text.gsub!(/n\'t /i," n't ")
-          #s/ (can)(not) / $1 $2 /ig;
-          text.gsub!(/ (can)(not) /i,' \1 \2 ')
-          #s/ (d\')(ye) / $1 $2 /ig;
-          text.gsub!(/ (d\')(ye) /i,' \1 \2 ')
-          #s/ (gim)(me) / $1 $2 /ig;
-          text.gsub!(/ (gim)(me) /i,' \1 \2 ')
-          #s/ (gon)(na) / $1 $2 /ig;
-          text.gsub!(/ (gon)(na) /i,' \1 \2 ')
-          #s/ (got)(ta) / $1 $2 /ig;
-          text.gsub!(/ (got)(ta) /i,' \1 \2 ')
-          #s/ (lem)(me) / $1 $2 /ig;
-          text.gsub!(/ (lem)(me) /i,' \1 \2 ')
-          #s/ (more)(\'n) / $1 $2 /ig;
-          text.gsub!(/ (more)(\'n) /i,' \1 \2 ')
-          #s/ (\'t)(is|was) / $1 $2 /ig;
-          text.gsub!(/ (\'t)(is|was) /i,' \1 \2 ')
-          #s/ (wan)(na) / $1 $2 /ig;
-          text.gsub!(/ (wan)(na) /i,' \1 \2 ')
-          tokens = text.split(/\s/)
-          tokens[1..-1].each do |token|
-            next if token =~ /([[:space:]]+)/
-            entity << Treat::Entities::Token.from_string(token)
-          end
-        end
-      end
+#
+# Tokenize the entity using a native rule-based
+# algorithm. This tokenizer is a port from an
+# unknown Perl module, which I have lifted from
+# the 'rbtagger' gem.
+#
+# Author: Todd A. Fisher
+#
+# This code is free to use under the terms of
+# the MIT license.
+#
+# Original project website:
+#
+# https://github.com/taf2/rb-brill-tagger
+module Treat::Processors::Tokenizers::Perl
+
+  require 'treat/helpers/decimal_point_escaper'
+
+  # Tokenize the entity using a rule-based algorithm
+  # ported from Perl by Todd A. Fisher.
+  #
+  # Options: none.
+  def self.tokenize(entity, options = {})
+
+    entity.check_hasnt_children
+    s = entity.to_s
+
+    tokens = get_tokens(entity.to_s)
+    tokens[1..-1].each do |token|
+      next if token =~ /^\s*$/
+      entity << Treat::Entities::Token.
+        from_string(token)
     end
+
   end
+
+  # Helper method to perform the tokenization.
+  def self.get_tokens(string)
+
+    # Normalize all whitespace
+    text = string.gsub(/\s+/,' ')
+
+    # Replace all decimal points by ^^
+    Treat::Helpers::DecimalPointEscaper.escape!(text)
+
+    # Translate some common extended ascii
+    # characters to quotes
+    text.gsub!(/‘/,'`')
+    text.gsub!(/’/,"'")
+    text.gsub!(/“/,"``")
+    text.gsub!(/”/,"''")
+
+    # Attempt to get correct directional quotes
+    # s{\"\b} { `` }g;
+    text.gsub!(/\"\b/,' `` ')
+    # s{\b\"} { '' }g;
+    text.gsub!(/\b\"/," '' ")
+    #s{\"(?=\s)} { '' }g;
+    text.gsub!(/\"(?=\s)/," '' ")
+    #s{\"} { `` }g;
+    text.gsub!(/\"(?=\s)/," `` ")
+    # Isolate ellipses
+    # s{\.\.\.} { ... }g;
+    text.gsub!(/\.\.\./,' ... ')
+    # Isolate any embedded punctuation chars
+    # s{([,;:\@\#\$\%&])} { $1 }g;
+    text.gsub!(/([,;:\@\#\$\%&])/, ' \1 ')
+
+    # Assume sentence tokenization has been
+    # done first, so split FINAL
+    # periods only.
+    # s/ ([^.]) \. ([\]\)\}\>\"\']*)
+    # [ \t]* $ /$1 .$2 /gx;
+    text.gsub!(/ ([^.]) \. ([\]\)\}\>\"\']*) [ \t]* $ /x, '\1 .\2 ')
+    # however, we may as well split ALL
+    # question marks and exclamation points,
+    # since they shouldn't have the abbrev.
+    # -marker ambiguity problem
+    #s{([?!])} { $1 }g;
+    text.gsub!(/([?!])/, ' \1 ')
+    # parentheses, brackets, etc.
+    #s{([\]\[\(\)\{\}\<\>])} { $1 }g;
+    text.gsub!(/([\]\[\(\)\{\}\<\>])/,' \1 ')
+    #s/(-{2,})/ $1 /g;
+    text.gsub!(/(-{2,})/,' \1 ')
+
+    # Add a space to the beginning and end of
+    # each line, to reduce # of regexps below.
+    #s/$/ /;
+    text.gsub!(/$/," ")
+    #s/^/ /;
+    text.gsub!(/^/," ")
+
+    # possessive or close-single-quote
+    #s/\([^\']\)\' /$1 \' /g;
+    text.gsub!(/\([^\']\)\' /,%q(\1 ' ))
+    # as in it's, I'm, we'd
+    #s/\'([smd]) / \'$1 /ig;
+    text.gsub!(/\'([smd]) /i,%q( '\1 ))
+    #s/\'(ll|re|ve) / \'$1 /ig;
+    text.gsub!(/\'(ll|re|ve) /i,%q( '\1 ))
+    #s/n\'t / n\'t /ig;
+    text.gsub!(/n\'t /i," n't ")
+
+    #s/ (can)(not) / $1 $2 /ig;
+    text.gsub!(/ (can)(not) /i,' \1 \2 ')
+    #s/ (d\')(ye) / $1 $2 /ig;
+    text.gsub!(/ (d\')(ye) /i,' \1 \2 ')
+    #s/ (gim)(me) / $1 $2 /ig;
+    text.gsub!(/ (gim)(me) /i,' \1 \2 ')
+    #s/ (gon)(na) / $1 $2 /ig;
+    text.gsub!(/ (gon)(na) /i,' \1 \2 ')
+    #s/ (got)(ta) / $1 $2 /ig;
+    text.gsub!(/ (got)(ta) /i,' \1 \2 ')
+    #s/ (lem)(me) / $1 $2 /ig;
+    text.gsub!(/ (lem)(me) /i,' \1 \2 ')
+    #s/ (more)(\'n) / $1 $2 /ig;
+    text.gsub!(/ (more)(\'n) /i,' \1 \2 ')
+    #s/ (\'t)(is|was) / $1 $2 /ig;
+    text.gsub!(/ (\'t)(is|was) /i,' \1 \2 ')
+    #s/ (wan)(na) / $1 $2 /ig;
+    text.gsub!(/ (wan)(na) /i,' \1 \2 ')
+    text.split(/\s/)
+
+  end
+
 end
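
The rewrite splits the old monolithic tokenize into a thin entity-facing method and a pure string-in, array-out helper, get_tokens, which makes the rule cascade testable without building entities. Because the text is padded with a leading space partway through the cascade, the first element of the resulting array is always empty, which is why tokenize iterates over tokens[1..-1] and skips blank tokens. Traced by hand against the substitutions above (an expectation, not captured output, assuming the gem is loaded):

    tokens = Treat::Processors::Tokenizers::Perl.
      get_tokens("I can't do it.")
    # Drop the padding artifacts, as tokenize does:
    tokens.reject { |t| t =~ /^\s*$/ }
    # expected => ["I", "ca", "n't", "do", "it", "."]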
--- /dev/null
+++ b/data/lib/treat/processors/tokenizers/ptb.rb
@@ -0,0 +1,81 @@
+# A native rule-basd tokenizer based on the one
+# developped by Robert Macyntyre in 1995 for the Penn
+# Treebank project. This tokenizer follows the
+# conventions used by the Penn Treebank.
+#
+# Original script:
+# http://www.cis.upenn.edu/~treebank/tokenizer.sed
+#
+# Copyright (c) 2004 UTIYAMA Masao <mutiyama@nict.go.jp>
+# All rights reserved. This program is free software;
+# you can redistribute it and/or modify it under the
+# same terms as Ruby itself.
+module Treat::Processors::Tokenizers::PTB
+
+  require 'treat/helpers/decimal_point_escaper'
+
+  # Tokenize the entity using a native rule-based algorithm.
+  def self.tokenize(entity, options = {})
+
+    entity.check_hasnt_children
+
+    if entity.has_children?
+      raise Treat::Exception,
+        "Cannot tokenize an #{entity.class} " +
+        "that already has children."
+    end
+    chunks = split(entity.to_s)
+    chunks.each do |chunk|
+      next if chunk =~ /([[:space:]]+)/
+      entity << Treat::Entities::Token.from_string(chunk)
+    end
+  end
+
+  # Helper method to split the string into tokens.
+  def self.split(string)
+    s = " " + string + " "
+    Treat::Helpers::DecimalPointEscaper.escape!(s)
+    s.gsub!(/\s+/," ")
+    s.gsub!(/(\s+)''/,'\1"')
+    s.gsub!(/(\s+)``/,'\1"')
+    s.gsub!(/''(\s+)/,'"\1')
+    s.gsub!(/``(\s+)/,'"\1')
+    s.gsub!(/ (['`]+)([^0-9].+) /,' \1 \2 ')
+    s.gsub!(/([ (\[{<])"/,'\1 `` ')
+    s.gsub!(/\.\.\./,' ... ')
+    s.gsub!(/[,;:@\#$%&]/,' \& ')
+    s.gsub!(/([^.])([.])([\])}>"']*)[ ]*$/,'\1 \2\3 ')
+    s.gsub!(/[?!]/,' \& ')
+    s.gsub!(/[\]\[(){}<>]/,' \& ')
+    s.gsub!(/--/,' -- ')
+    s.sub!(/$/,' ')
+    s.sub!(/^/,' ')
+    s.gsub!(/"/,' \'\' ')
+    s.gsub!(/([^'])' /,'\1 \' ')
+    s.gsub!(/'([sSmMdD]) /,' \'\1 ')
+    s.gsub!(/'ll /,' \'ll ')
+    s.gsub!(/'re /,' \'re ')
+    s.gsub!(/'ve /,' \'ve ')
+    s.gsub!(/n't /,' n\'t ')
+    s.gsub!(/'LL /,' \'LL ')
+    s.gsub!(/'RE /,' \'RE ')
+    s.gsub!(/'VE /,' \'VE ')
+    s.gsub!(/N'T /,' N\'T ')
+    s.gsub!(/ ([Cc])annot /,' \1an not ')
+    s.gsub!(/ ([Dd])'ye /,' \1\' ye ')
+    s.gsub!(/ ([Gg])imme /,' \1im me ')
+    s.gsub!(/ ([Gg])onna /,' \1on na ')
+    s.gsub!(/ ([Gg])otta /,' \1ot ta ')
+    s.gsub!(/ ([Ll])emme /,' \1em me ')
+    s.gsub!(/ ([Mm])ore'n /,' \1ore \'n ')
+    s.gsub!(/ '([Tt])is /,' \'\1 is ')
+    s.gsub!(/ '([Tt])was /,' \'\1 was ')
+    s.gsub!(/ ([Ww])anna /,' \1an na ')
+    while s.sub!(/(\s)([0-9]+) , ([0-9]+)(\s)/, '\1\2,\3\4'); end
+    s.gsub!(/\//, ' / ')
+    s.gsub!(/\s+/,' ')
+    s.strip!
+    s.split(/\s+/)
+  end
+
+end
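
Two details worth noting in this new file. First, the explicit has_children? guard appears redundant, since entity.check_hasnt_children on the preceding line should already raise in that case. Second, the while loop near the end of split undoes collateral damage from the earlier [,;:@#$%&] rule, which pads every comma with spaces and would otherwise shred numbers like 1,000: each pass stitches one pair of digit groups back together until no further match is found. A minimal trace of that line:

    s = " 1 , 000 "   # what the comma-isolation rule leaves behind
    while s.sub!(/(\s)([0-9]+) , ([0-9]+)(\s)/, '\1\2,\3\4'); end
    s # => " 1,000 "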
--- a/data/lib/treat/processors/tokenizers/punkt.rb
+++ b/data/lib/treat/processors/tokenizers/punkt.rb
@@ -1,45 +1,51 @@
-module Treat
-  module Processors
-    module Tokenizers
-      # A tokenizer that was lifted from the 'punkt-segmenter'
-      # Ruby gem.
-      #
-      # This code follows the terms and conditions of Apache
-      # License v2 (http://www.apache.org/licenses/LICENSE-2.0)
-      #
-      # Authors: Willy <willy@csse.unimelb.edu.au>
-      # (original Python port), Steven Bird
-      # <sb@csse.unimelb.edu.au> (additions),
-      # Edward Loper <edloper@gradient.cis.upenn.edu>
-      # (rewrite), Joel Nothman <jnothman@student.usyd.edu.au>
-      # (almost rewrite).
-      #
-      # Project website: https://github.com/lfcipriani/punkt-segmenter
-      class Punkt
-        SentEndChars = ['.', '?', '!']
-        ReSentEndChars = /[.?!]/
-        InternalPunctuation = [',', ':', ';']
-        ReBoundaryRealignment = /^["\')\]}]+?(?:\s+|(?=--)|$)/m
-        ReWordStart = /[^\(\"\`{\[:;&\#\*@\)}\]\-,]/
-        ReNonWordChars = /(?:[?!)\";}\]\*:@\'\({\[])/
-        ReMultiCharPunct = /(?:\-{2,}|\.{2,}|(?:\.\s){2,}\.)/
-        ReWordTokenizer = /#{ReMultiCharPunct}|(?=#{ReWordStart})\S+?(?=\s|$|#{ReNonWordChars}|#{ReMultiCharPunct}|,(?=$|\s|#{ReNonWordChars}|#{ReMultiCharPunct}))|\S/
-        RePeriodContext = /\S*#{ReSentEndChars}(?=(?<after_tok>#{ReNonWordChars}|\s+(?<next_tok>\S+)))/
-        # Tokenize the text using the algorithm lifted from
-        # the Punkt tokenizer gem.
-        #
-        # Options: none.
-        def self.tokenize(entity, options = {})
-          entity.to_s.scan(ReWordTokenizer).each do |token|
-            if SentEndChars.include?(token[-1])
-              entity << Treat::Entities::Token.from_string(token[0..-2])
-              entity << Treat::Entities::Token.from_string(token[-1..-1])
-            else
-              entity << Treat::Entities::Token.from_string(token)
-            end
-          end
-        end
+# A tokenizer that was lifted from the 'punkt-segmenter'
+# Ruby gem.
+#
+# This code follows the terms and conditions of Apache
+# License v2 (http://www.apache.org/licenses/LICENSE-2.0)
+#
+# Authors: Willy <willy@csse.unimelb.edu.au>
+# (original Python port), Steven Bird
+# <sb@csse.unimelb.edu.au> (additions),
+# Edward Loper <edloper@gradient.cis.upenn.edu>
+# (rewrite), Joel Nothman <jnothman@student.usyd.edu.au>
+# (almost rewrite).
+#
+# Project website: https://github.com/lfcipriani/punkt-segmenter
+class Treat::Processors::Tokenizers::Punkt
+
+  require 'treat/helpers/decimal_point_escaper'
+
+  SentEndChars = ['.', '?', '!']
+  ReSentEndChars = /[.?!]/
+  InternalPunctuation = [',', ':', ';']
+  ReBoundaryRealignment = /^["\')\]}]+?(?:\s+|(?=--)|$)/m
+  ReWordStart = /[^\(\"\`{\[:;&\#\*@\)}\]\-,]/
+  ReNonWordChars = /(?:[?!)\";}\]\*:@\'\({\[])/
+  ReMultiCharPunct = /(?:\-{2,}|\.{2,}|(?:\.\s){2,}\.)/
+  ReWordTokenizer = /#{ReMultiCharPunct}|(?=#{ReWordStart})\S+?(?=\s|$|#{ReNonWordChars}|#{ReMultiCharPunct}|,(?=$|\s|#{ReNonWordChars}|#{ReMultiCharPunct}))|\S/
+  RePeriodContext = /\S*#{ReSentEndChars}(?=(?<after_tok>#{ReNonWordChars}|\s+(?<next_tok>\S+)))/
+
+  # Tokenize the text using the algorithm lifted from
+  # the Punkt tokenizer gem.
+  #
+  # Options: none.
+  def self.tokenize(entity, options = {})
+
+    entity.check_hasnt_children
+
+    s = entity.to_s
+    Treat::Helpers::DecimalPointEscaper.escape!(s)
+
+    s.scan(ReWordTokenizer).each do |token|
+      if SentEndChars.include?(token[-1])
+        entity << Treat::Entities::Token.from_string(token[0..-2])
+        entity << Treat::Entities::Token.from_string(token[-1..-1])
+      else
+        entity << Treat::Entities::Token.from_string(token)
       end
     end
+
   end
-end
+
+end
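
Here tokenize scans the escaped string with ReWordTokenizer and then peels a trailing sentence-end character off any token that carries one, emitting the word and the punctuation as separate token entities. Traced by hand against the regexes above (an expectation, not captured output):

    re = Treat::Processors::Tokenizers::Punkt::ReWordTokenizer
    "Hello, world.".scan(re)
    # expected => ["Hello", ",", "world."]
    # tokenize then adds "world" and "." as separate tokens,
    # since SentEndChars.include?('.') is true.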