treat 1.2.0 → 2.0.0rc1

Files changed (217)
  1. data/LICENSE +2 -2
  2. data/README.md +12 -21
  3. data/lib/treat/autoload.rb +44 -0
  4. data/lib/treat/config/config.rb +38 -0
  5. data/lib/treat/config/configurable.rb +51 -0
  6. data/lib/treat/config/data/config.rb +50 -0
  7. data/lib/treat/config/data/core.rb +52 -0
  8. data/lib/treat/config/data/databases.rb +10 -0
  9. data/lib/treat/config/data/entities.rb +15 -0
  10. data/lib/treat/config/data/languages/agnostic.rb +31 -0
  11. data/lib/treat/config/{languages → data/languages}/arabic.rb +0 -0
  12. data/lib/treat/config/{languages → data/languages}/chinese.rb +0 -0
  13. data/lib/treat/config/{languages → data/languages}/dutch.rb +1 -1
  14. data/lib/treat/config/data/languages/english.rb +95 -0
  15. data/lib/treat/config/data/languages/french.rb +148 -0
  16. data/lib/treat/config/data/languages/german.rb +135 -0
  17. data/lib/treat/config/{languages → data/languages}/greek.rb +1 -1
  18. data/lib/treat/config/data/languages/italian.rb +162 -0
  19. data/lib/treat/config/data/languages/polish.rb +11 -0
  20. data/lib/treat/config/{languages → data/languages}/portuguese.rb +1 -1
  21. data/lib/treat/config/{languages → data/languages}/russian.rb +1 -1
  22. data/lib/treat/config/data/languages/spanish.rb +291 -0
  23. data/lib/treat/config/data/languages/swedish.rb +289 -0
  24. data/lib/treat/config/data/libraries.rb +12 -0
  25. data/lib/treat/config/data/linguistics.rb +44 -0
  26. data/lib/treat/config/data/tags.rb +328 -0
  27. data/lib/treat/config/{workers → data/workers}/extractors.rb +2 -10
  28. data/lib/treat/config/{workers → data/workers}/formatters.rb +0 -0
  29. data/lib/treat/config/{workers → data/workers}/inflectors.rb +0 -0
  30. data/lib/treat/config/{workers → data/workers}/learners.rb +0 -0
  31. data/lib/treat/config/{workers → data/workers}/lexicalizers.rb +4 -3
  32. data/lib/treat/config/{workers → data/workers}/processors.rb +3 -3
  33. data/lib/treat/config/{workers → data/workers}/retrievers.rb +0 -0
  34. data/lib/treat/config/importable.rb +31 -0
  35. data/lib/treat/config/paths.rb +23 -0
  36. data/lib/treat/config/tags.rb +37 -0
  37. data/lib/treat/core/dsl.rb +55 -0
  38. data/lib/treat/{installer.rb → core/installer.rb} +10 -12
  39. data/lib/treat/core/server.rb +40 -0
  40. data/lib/treat/entities/entities.rb +101 -0
  41. data/lib/treat/entities/{abilities/doable.rb → entity/applicable.rb} +5 -3
  42. data/lib/treat/entities/{abilities → entity}/buildable.rb +118 -63
  43. data/lib/treat/entities/{abilities → entity}/checkable.rb +2 -2
  44. data/lib/treat/entities/{abilities → entity}/comparable.rb +6 -6
  45. data/lib/treat/entities/{abilities → entity}/countable.rb +2 -1
  46. data/lib/treat/entities/entity/debuggable.rb +86 -0
  47. data/lib/treat/entities/{abilities → entity}/delegatable.rb +16 -26
  48. data/lib/treat/entities/{abilities → entity}/exportable.rb +2 -2
  49. data/lib/treat/entities/{abilities → entity}/iterable.rb +4 -16
  50. data/lib/treat/entities/{abilities → entity}/magical.rb +22 -17
  51. data/lib/treat/entities/entity/registrable.rb +36 -0
  52. data/lib/treat/entities/{abilities → entity}/stringable.rb +18 -15
  53. data/lib/treat/entities/entity.rb +86 -77
  54. data/lib/treat/exception.rb +3 -0
  55. data/lib/treat/helpers/hash.rb +29 -0
  56. data/lib/treat/helpers/help.rb +35 -0
  57. data/lib/treat/helpers/object.rb +55 -0
  58. data/lib/treat/helpers/string.rb +124 -0
  59. data/lib/treat/{core → learning}/data_set.rb +11 -11
  60. data/lib/treat/{core → learning}/export.rb +3 -3
  61. data/lib/treat/{core → learning}/problem.rb +26 -16
  62. data/lib/treat/{core → learning}/question.rb +5 -9
  63. data/lib/treat/loaders/linguistics.rb +8 -9
  64. data/lib/treat/loaders/stanford.rb +5 -11
  65. data/lib/treat/modules.rb +33 -0
  66. data/lib/treat/proxies/array.rb +27 -0
  67. data/lib/treat/proxies/language.rb +47 -0
  68. data/lib/treat/proxies/number.rb +18 -0
  69. data/lib/treat/proxies/proxy.rb +25 -0
  70. data/lib/treat/proxies/string.rb +18 -0
  71. data/lib/treat/version.rb +10 -1
  72. data/lib/treat/{workers.rb → workers/categorizable.rb} +18 -19
  73. data/lib/treat/workers/extractors/keywords/tf_idf.rb +11 -11
  74. data/lib/treat/workers/extractors/language/what_language.rb +8 -6
  75. data/lib/treat/workers/extractors/name_tag/stanford.rb +10 -4
  76. data/lib/treat/workers/extractors/similarity/levenshtein.rb +36 -0
  77. data/lib/treat/workers/extractors/similarity/tf_idf.rb +27 -0
  78. data/lib/treat/workers/extractors/tf_idf/native.rb +4 -4
  79. data/lib/treat/workers/extractors/time/chronic.rb +2 -4
  80. data/lib/treat/workers/extractors/time/nickel.rb +19 -20
  81. data/lib/treat/workers/extractors/time/ruby.rb +2 -1
  82. data/lib/treat/workers/extractors/topic_words/lda.rb +12 -12
  83. data/lib/treat/workers/extractors/topics/reuters.rb +9 -13
  84. data/lib/treat/workers/formatters/readers/autoselect.rb +1 -1
  85. data/lib/treat/workers/formatters/readers/image.rb +19 -9
  86. data/lib/treat/workers/formatters/readers/odt.rb +2 -1
  87. data/lib/treat/workers/formatters/readers/pdf.rb +20 -3
  88. data/lib/treat/workers/formatters/readers/xml.rb +0 -1
  89. data/lib/treat/workers/formatters/serializers/mongo.rb +10 -20
  90. data/lib/treat/workers/formatters/serializers/xml.rb +17 -26
  91. data/lib/treat/workers/formatters/serializers/yaml.rb +5 -4
  92. data/lib/treat/workers/formatters/unserializers/mongo.rb +4 -4
  93. data/lib/treat/workers/formatters/unserializers/xml.rb +3 -4
  94. data/lib/treat/workers/formatters/unserializers/yaml.rb +3 -4
  95. data/lib/treat/workers/formatters/visualizers/dot.rb +1 -0
  96. data/lib/treat/workers/formatters/visualizers/standoff.rb +2 -3
  97. data/lib/treat/workers/formatters/visualizers/tree.rb +2 -3
  98. data/lib/treat/workers/{group.rb → groupable.rb} +9 -9
  99. data/lib/treat/workers/inflectors/cardinalizers/linguistics.rb +1 -3
  100. data/lib/treat/workers/inflectors/conjugators/linguistics.rb +5 -7
  101. data/lib/treat/workers/inflectors/declensors/english.rb +13 -20
  102. data/lib/treat/workers/inflectors/declensors/linguistics.rb +29 -28
  103. data/lib/treat/workers/inflectors/ordinalizers/linguistics.rb +0 -2
  104. data/lib/treat/workers/inflectors/stemmers/porter.rb +8 -10
  105. data/lib/treat/workers/inflectors/stemmers/porter_c.rb +7 -7
  106. data/lib/treat/workers/inflectors/stemmers/uea.rb +3 -8
  107. data/lib/treat/workers/learners/classifiers/id3.rb +17 -14
  108. data/lib/treat/workers/learners/classifiers/linear.rb +15 -27
  109. data/lib/treat/workers/learners/classifiers/mlp.rb +32 -19
  110. data/lib/treat/workers/learners/classifiers/svm.rb +28 -21
  111. data/lib/treat/workers/lexicalizers/categorizers/from_tag.rb +19 -3
  112. data/lib/treat/workers/lexicalizers/sensers/wordnet.rb +15 -7
  113. data/lib/treat/workers/lexicalizers/taggers/brill/patch.rb +4 -1
  114. data/lib/treat/workers/lexicalizers/taggers/brill.rb +8 -19
  115. data/lib/treat/workers/lexicalizers/taggers/lingua.rb +4 -15
  116. data/lib/treat/workers/lexicalizers/taggers/stanford.rb +22 -13
  117. data/lib/treat/workers/processors/chunkers/autoselect.rb +2 -3
  118. data/lib/treat/workers/processors/chunkers/html.rb +1 -6
  119. data/lib/treat/workers/processors/parsers/enju.rb +2 -4
  120. data/lib/treat/workers/processors/parsers/stanford.rb +13 -7
  121. data/lib/treat/workers/processors/segmenters/punkt.rb +25 -11
  122. data/lib/treat/workers/processors/segmenters/scalpel.rb +20 -0
  123. data/lib/treat/workers/processors/segmenters/srx.rb +42 -0
  124. data/lib/treat/workers/processors/segmenters/stanford.rb +5 -5
  125. data/lib/treat/workers/processors/segmenters/tactful.rb +21 -11
  126. data/lib/treat/workers/processors/tokenizers/ptb.rb +40 -30
  127. data/lib/treat/workers/processors/tokenizers/punkt.rb +14 -19
  128. data/lib/treat/workers/processors/tokenizers/stanford.rb +38 -22
  129. data/lib/treat/workers/retrievers/indexers/ferret.rb +6 -3
  130. data/lib/treat/workers/retrievers/searchers/ferret.rb +2 -2
  131. data/lib/treat/workers/workers.rb +6 -0
  132. data/lib/treat.rb +18 -32
  133. data/models/MANIFEST +1 -0
  134. data/spec/core/data_set.rb +174 -0
  135. data/spec/core/export.rb +52 -0
  136. data/spec/core/problem.rb +144 -0
  137. data/spec/core/question.rb +52 -0
  138. data/spec/{collection.rb → entities/collection.rb} +20 -35
  139. data/spec/{document.rb → entities/document.rb} +3 -54
  140. data/spec/{entity.rb → entities/entity.rb} +10 -9
  141. data/spec/entities/phrase.rb +33 -0
  142. data/spec/{token.rb → entities/token.rb} +0 -57
  143. data/spec/entities/word.rb +3 -0
  144. data/spec/{zone.rb → entities/zone.rb} +0 -26
  145. data/spec/helper.rb +116 -32
  146. data/spec/sandbox.rb +258 -25
  147. data/spec/treat.rb +26 -34
  148. data/spec/workers/agnostic.rb +137 -0
  149. data/spec/workers/english.rb +194 -0
  150. data/spec/workers/examples/english/economist/hungarys_troubles.txt +46 -0
  151. data/spec/workers/examples/english/economist/saving_the_euro.odt +0 -0
  152. data/spec/{samples → workers/examples/english}/mathematicians/archimedes.abw +0 -0
  153. data/spec/{samples → workers/examples/english}/mathematicians/euler.html +0 -0
  154. data/spec/{samples → workers/examples/english}/mathematicians/gauss.pdf +0 -0
  155. data/spec/{samples → workers/examples/english}/mathematicians/leibniz.txt +0 -0
  156. data/spec/{samples → workers/examples/english}/mathematicians/newton.doc +0 -0
  157. data/spec/workers/examples/english/phrase.xml +5 -0
  158. data/spec/workers/examples/english/test.txt +1 -0
  159. data/spec/workers/language.rb +280 -0
  160. data/spec/workers.rb +28 -0
  161. metadata +122 -105
  162. data/lib/treat/config/core/acronyms.rb +0 -5
  163. data/lib/treat/config/core/encodings.rb +0 -8
  164. data/lib/treat/config/core/entities.rb +0 -2
  165. data/lib/treat/config/core/language.rb +0 -3
  166. data/lib/treat/config/core/paths.rb +0 -8
  167. data/lib/treat/config/core/syntax.rb +0 -1
  168. data/lib/treat/config/core/verbosity.rb +0 -1
  169. data/lib/treat/config/databases/default.rb +0 -1
  170. data/lib/treat/config/databases/mongo.rb +0 -1
  171. data/lib/treat/config/languages/agnostic.rb +0 -34
  172. data/lib/treat/config/languages/english.rb +0 -60
  173. data/lib/treat/config/languages/french.rb +0 -18
  174. data/lib/treat/config/languages/german.rb +0 -18
  175. data/lib/treat/config/languages/italian.rb +0 -12
  176. data/lib/treat/config/languages/polish.rb +0 -12
  177. data/lib/treat/config/languages/spanish.rb +0 -12
  178. data/lib/treat/config/languages/swedish.rb +0 -12
  179. data/lib/treat/config/libraries/punkt.rb +0 -1
  180. data/lib/treat/config/libraries/reuters.rb +0 -1
  181. data/lib/treat/config/libraries/stanford.rb +0 -1
  182. data/lib/treat/config/linguistics/categories.rb +0 -4
  183. data/lib/treat/config/linguistics/punctuation.rb +0 -33
  184. data/lib/treat/config/tags/aligned.rb +0 -221
  185. data/lib/treat/config/tags/enju.rb +0 -71
  186. data/lib/treat/config/tags/paris7.rb +0 -17
  187. data/lib/treat/config/tags/ptb.rb +0 -15
  188. data/lib/treat/config/workers/list.rb +0 -1
  189. data/lib/treat/config.rb +0 -135
  190. data/lib/treat/core.rb +0 -5
  191. data/lib/treat/entities/abilities/copyable.rb +0 -47
  192. data/lib/treat/entities/abilities/debuggable.rb +0 -83
  193. data/lib/treat/entities/abilities/registrable.rb +0 -46
  194. data/lib/treat/entities/collection.rb +0 -40
  195. data/lib/treat/entities/document.rb +0 -10
  196. data/lib/treat/entities/group.rb +0 -18
  197. data/lib/treat/entities/section.rb +0 -13
  198. data/lib/treat/entities/token.rb +0 -47
  199. data/lib/treat/entities/zone.rb +0 -12
  200. data/lib/treat/entities.rb +0 -6
  201. data/lib/treat/helpers/didyoumean.rb +0 -57
  202. data/lib/treat/helpers/escaping.rb +0 -15
  203. data/lib/treat/helpers/formatting.rb +0 -41
  204. data/lib/treat/helpers/objtohash.rb +0 -8
  205. data/lib/treat/helpers/platform.rb +0 -15
  206. data/lib/treat/helpers/reflection.rb +0 -17
  207. data/lib/treat/helpers/temporary.rb +0 -27
  208. data/lib/treat/helpers/verbosity.rb +0 -19
  209. data/lib/treat/helpers.rb +0 -5
  210. data/lib/treat/loaders.rb +0 -10
  211. data/lib/treat/proxies.rb +0 -106
  212. data/lib/treat/workers/formatters/unserializers/autoselect.rb +0 -17
  213. data/lib/treat/workers/inflectors/declensors/active_support.rb +0 -31
  214. data/lib/treat/workers/processors/tokenizers/tactful.rb +0 -68
  215. data/spec/core.rb +0 -441
  216. data/spec/phrase.rb +0 -112
  217. data/spec/word.rb +0 -111
data/lib/treat/workers/lexicalizers/taggers/brill.rb:

@@ -1,23 +1,13 @@
- # Adapter class for the 'rbtagger' gem, a port
- # of the Perl Lingua::BrillTagger class, based
- # on the rule-based tagger developped by Eric Brill.
+ # POS tagging using a set of rules developped by Eric Brill.
  #
- # Original paper:
- #
- # Eric Brill. 1992. A simple rule-based part of speech tagger.
- # In Proceedings of the third conference on Applied natural
- # language processing (ANLC '92). Association for Computational
- # Linguistics, Stroudsburg, PA, USA, 152-155.
- # DOI=10.3115/974499.974526 http://dx.doi.org/10.3115/974499.974526
- #
- # Project website:
- #
- # http://rbtagger.rubyforge.org/
- module Treat::Workers::Lexicalizers::Taggers::Brill
+ # Original paper: Eric Brill. 1992. A simple rule-based
+ # part of speech tagger. In Proceedings of the third
+ # conference on Applied natural language processing.
+ class Treat::Workers::Lexicalizers::Taggers::Brill

  require 'rbtagger'

- require 'treat/workers/lexicalizers/taggers/brill/patch'
+ require_relative 'brill/patch'

  # Hold one instance of the tagger.
  @@tagger = nil
@@ -50,9 +40,8 @@ module Treat::Workers::Lexicalizers::Taggers::Brill
  return pair[1] if isolated_token
  end

- if entity.is_a?(Treat::Entities::Sentence) ||
- (entity.is_a?(Treat::Entities::Phrase) &&
- !entity.parent_sentence)
+ if entity.is_a?(Treat::Entities::Group) &&
+ !entity.parent_sentence
  entity.set :tag_set, :penn
  end

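Note: the worker is still reached the same way despite the comment rewrite. A minimal usage sketch, assuming treat 2.0.0rc1 and the 'rbtagger' gem are installed and the DSL from data/lib/treat/core/dsl.rb is loaded; the :brill symbol selects this worker explicitly, and an isolated token takes the early-return path shown in the hunk above:

    require 'treat'
    include Treat::Core::DSL

    # Tag a single word, explicitly selecting the Brill worker.
    puts word('running').tag(:brill)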
data/lib/treat/workers/lexicalizers/taggers/lingua.rb:

@@ -1,17 +1,7 @@
- # An adapter for the 'engtagger' gem, which
- # is a port of the Perl Lingua::EN::Tagger module.
- #
- # "This module uses part-of-speech statistics from
+ # POS tagging using part-of-speech statistics from
  # the Penn Treebank to assign POS tags to English text.
  # The tagger applies a bigram (two-word) Hidden Markov
  # Model to guess the appropriate POS tag for a word.
- # That means that the tagger will try to assign a POS
- # tag based on the known POS tags for a given word and
- # the POS tag assigned to its predecessor.
- #
- # Project website: http://engtagger.rubyforge.org/
- # Original Perl module site:
- # http://cpansearch.perl.org/src/ACOBURN/Lingua-EN-Tagger-0.15/
  class Treat::Workers::Lexicalizers::Taggers::Lingua

  # Require the 'engtagger' gem.
@@ -71,15 +61,14 @@ class Treat::Workers::Lexicalizers::Taggers::Lingua
  end


- if entity.is_a?(Treat::Entities::Sentence) ||
- (entity.is_a?(Treat::Entities::Phrase) &&
- !entity.parent_sentence)
+ if entity.is_a?(Treat::Entities::Group) &&
+ !entity.parent_sentence
  entity.set :tag_set, :penn
  end

  return 'S' if entity.is_a?(Treat::Entities::Sentence)
  return 'P' if entity.is_a?(Treat::Entities::Phrase)
-
+
  end

  end
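Note: the 'engtagger' gem that this worker wraps can also be driven directly. A minimal sketch, assuming the gem is installed; EngTagger#add_tags returns the input text annotated with lowercase pseudo-XML tags:

    require 'engtagger'

    # Bigram HMM tagging over Penn Treebank statistics.
    tagger = EngTagger.new
    puts tagger.add_tags('The dog runs quickly.')
    # e.g. "<det>The</det> <nn>dog</nn> <vbz>runs</vbz> ..."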
data/lib/treat/workers/lexicalizers/taggers/stanford.rb:

@@ -1,8 +1,18 @@
- # Wrapper for the Stanford POS tagger.
+ # POS tagging using (i) explicit use of both preceding
+ # and following tag contexts via a dependency network
+ # representation, (ii) broad use of lexical features,
+ # including jointly conditioning on multiple consecutive
+ # words, (iii) effective use of priors in conditional
+ # loglinear models, and (iv) fine-grained modeling of
+ # unknown word features.
+ #
+ # Original paper: Toutanova, Manning, Klein and Singer.
+ # 2003. Feature-Rich Part-of-Speech Tagging with a
+ # Cyclic Dependency Network. In Proceedings of the
+ # Conference of the North American Chapter of the
+ # Association for Computational Linguistics.
  class Treat::Workers::Lexicalizers::Taggers::Stanford

- require 'treat/loaders/stanford'
-
  # Hold one tagger per language.
  @@taggers = {}

@@ -15,9 +25,8 @@ class Treat::Workers::Lexicalizers::Taggers::Stanford
  def self.tag(entity, options = {})

  # Handle tags for sentences and phrases.
- if entity.is_a?(Treat::Entities::Sentence) ||
- (entity.is_a?(Treat::Entities::Phrase) &&
- !entity.parent_sentence)
+ if entity.is_a?(Treat::Entities::Group) &&
+ !entity.parent_sentence

  tag_set = options[:tag_set]
  entity.set :tag_set, tag_set
@@ -31,18 +40,18 @@ class Treat::Workers::Lexicalizers::Taggers::Stanford

  # Handle options and initialize the tagger.
  lang = entity.language
- options = get_options(options, lang)
  init_tagger(lang) unless @@taggers[lang]
- tokens, list = get_token_list(entity)
+ options = get_options(options, lang)
+ tokens, t_list = get_token_list(entity)

  # Do the tagging.
  i = 0
  isolated_token = entity.is_a?(Treat::Entities::Token)
-
- @@taggers[lang].apply(list).each do |tok|
- tokens[i].set :tag, tok.tag
- tokens[i].set :tag_set,
- options[:tag_set] if isolated_token
+
+ @@taggers[lang].apply(t_list).each do |tok|
+ tokens[i].set(:tag, tok.tag)
+ tokens[i].set(:tag_set,
+ options[:tag_set]) if isolated_token
  return tok.tag if isolated_token
  i += 1
  end
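Note: the loop above sets a :tag on every token and returns early when the entity is itself an isolated token. A usage sketch through the DSL, assuming the stanford-core-nlp gem and its model files are installed:

    require 'treat'
    include Treat::Core::DSL

    s = sentence('I was walking to the store.')
    s.apply(:tokenize)
    s.tokens.each { |t| puts "#{t} => #{t.tag(:stanford)}" }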
data/lib/treat/workers/processors/chunkers/autoselect.rb:

@@ -2,12 +2,11 @@ class Treat::Workers::Processors::Chunkers::Autoselect

  def self.chunk(entity, options = {})
  unless entity.has?(:format)
- raise Treat::Exception,
- "Must have a format to autoselect chunker."
+ entity.set :format, 'txt'
  end
  begin
  k = Treat::Workers::Processors::
- Chunkers.const_get(cc(entity.format))
+ Chunkers.const_get(entity.format.cc)
  k.chunk(entity, options)
  rescue Treat::Exception
  Treat::Workers::Processors::
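Note: entities without a format now fall back to 'txt' instead of raising, and the chunker class is resolved at runtime from the format string. A standalone sketch of the same const_get dispatch idea, with a hypothetical Chunkers namespace; plain upcase stands in for treat's cc camel-casing helper:

    module Chunkers
      module TXT
        # Treat plain text as paragraph-separated chunks.
        def self.chunk(text)
          text.split(/\n{2,}/)
        end
      end
    end

    format = 'txt'  # the fallback value set above
    worker = Chunkers.const_get(format.upcase)
    p worker.chunk("First block.\n\nSecond block.")
    # => ["First block.", "Second block."]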
data/lib/treat/workers/processors/chunkers/html.rb:

@@ -3,12 +3,9 @@ class Treat::Workers::Processors::Chunkers::HTML

  require 'nokogiri'

  def self.chunk(entity, options = {})
-
  entity.check_hasnt_children
-
  doc = Nokogiri::HTML(entity.value)
- recurse(entity, doc)
-
+ self.recurse(entity, doc)
  end

  def self.recurse(node, html_node, level = 1)
@@ -16,7 +13,6 @@ class Treat::Workers::Processors::Chunkers::HTML
  html_node.children.each do |child|

  next if child.name == 'text'
  txt = child.inner_text

  if child.name =~ /^h([0-9]{1})$/ ||
@@ -45,7 +41,6 @@ class Treat::Workers::Processors::Chunkers::HTML

  t = node <<
  Treat::Entities::Title.new(txt)
- t.set :level, level

  elsif child.name == 'p'

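Note: the recursion walks Nokogiri's node tree, mapping h1-h6 elements to titles and p elements to paragraphs. A standalone sketch of that detection, runnable with only the nokogiri gem:

    require 'nokogiri'

    doc = Nokogiri::HTML('<h1>Intro</h1><p>Some text.</p>')
    doc.at('body').children.each do |child|
      next if child.name == 'text'  # skip bare text nodes
      kind = child.name =~ /^h[0-9]$/ ? 'Title' : 'Paragraph'
      puts "#{kind}: #{child.inner_text}"
    end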
data/lib/treat/workers/processors/parsers/enju.rb:

@@ -5,12 +5,10 @@
  # XML reader. It creates wrappers for the sentences,
  # syntactical phrases and tokens that Enju identified.
  #
- # Original paper:
- #
- # Takuya Matsuzaki, Yusuke Miyao, and Jun'ichi Tsujii.
+ # Original paper: Takuya M., Yusuke M., and Jun'ichi T.
  # 2007. Efficient HPSG Parsing with Supertagging and
  # CFG-filtering. In Proceedings of IJCAI 2007.
- module Treat::Workers::Processors::Parsers::Enju
+ class Treat::Workers::Processors::Parsers::Enju

  # Require the 'open3' library to connect
  # with the background Enju process.
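Note: as the comment says, this parser streams text to a long-running Enju process over pipes. A sketch of that Open3 pattern; 'cat' stands in for the actual enju command, whose flags are not shown in this hunk:

    require 'open3'

    stdin, stdout, stderr, thread = Open3.popen3('cat')
    stdin.puts 'He has been living here for ten years.'
    stdin.close
    puts stdout.read  # with Enju, this would be the parse as XML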
data/lib/treat/workers/processors/parsers/stanford.rb:

@@ -1,8 +1,14 @@
- # A wrapper class for the Stanford parser.
+ # Parsing using an interface to a Java implementation
+ # of probabilistic natural language parsers, both
+ # optimized PCFG and lexicalized dependency parsers,
+ # and a lexicalized PCFG parser.
+ #
+ # Original paper: Dan Klein and Christopher D.
+ # Manning. 2003. Accurate Unlexicalized Parsing.
+ # Proceedings of the 41st Meeting of the Association
+ # for Computational Linguistics, pp. 423-430.
  class Treat::Workers::Processors::Parsers::Stanford
-
- require 'treat/loaders/stanford'
-
+
  Pttc = Treat.tags.aligned.phrase_tags_to_category

  # Hold one instance of the pipeline per language.
@@ -23,12 +29,12 @@ class Treat::Workers::Processors::Parsers::Stanford
  # instead of displaying it.
  def self.parse(entity, options = {})

- entity.check_hasnt_children
-
  val = entity.to_s
  lang = entity.language
  init(lang, options)

+ entity.check_hasnt_children
+
  tag_set = StanfordCoreNLP::Config::TagSets[lang]

  text = ::StanfordCoreNLP::Text.new(val)
@@ -46,7 +52,7 @@ class Treat::Workers::Processors::Parsers::Stanford
  recurse(s.get(:tree).children[0], entity, tag_set)
  break #######
  else
- recurse(s.get(:tree), entity)
+ recurse(s.get(:tree), entity, tag_set)
  end

  end
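Note: the second hunk also fixes a bug, since recurse was previously called without the tag_set argument outside the single-sentence branch. A usage sketch via the DSL, assuming the stanford-core-nlp gem and its model files are installed; visualize(:tree) uses the tree visualizer listed in the files above:

    require 'treat'
    include Treat::Core::DSL

    phr = phrase('A sentence to parse')
    phr.parse(:stanford)
    puts phr.visualize(:tree)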
data/lib/treat/workers/processors/segmenters/punkt.rb:

@@ -1,17 +1,19 @@
- # An adapter for the 'punk-segmenter' gem, which segments
- # texts into sentences based on an unsupervised, language
- # independent algorithm.
+ # Sentence segmentation based on a set of log-
+ # likelihood-based heuristics to infer abbreviations
+ # and common sentence starters from a large text
+ # corpus. Easily adaptable but requires a large
+ # (unlabeled) indomain corpus for assembling statistics.
  #
- # Original paper: Kiss, Tibor and Strunk, Jan (2006):
+ # Original paper: Kiss, Tibor and Strunk, Jan. 2006.
  # Unsupervised Multilingual Sentence Boundary Detection.
- # Computational Linguistics 32: 485-525.
- module Treat::Workers::Processors::Segmenters::Punkt
+ # Computational Linguistics 32:485-525.
+ class Treat::Workers::Processors::Segmenters::Punkt

  # Require silently the punkt-segmenter gem.
  silence_warnings { require 'punkt-segmenter' }

  # Require the YAML parser.
- silence_warnings { require 'psych' }
+ # silence_warnings { require 'psych' }

  # Hold one copy of the segmenter per language.
  @@segmenters = {}
@@ -34,13 +36,21 @@ module Treat::Workers::Processors::Segmenters::Punkt
  lang = entity.language
  set_options(lang, options)

+
  s = entity.to_s

  # Replace the point in all floating-point numbers
  # by ^^; this is a fix since Punkt trips on decimal
  # numbers.
-
- escape_floats!(s)
+ s.escape_floats!
+
+ # Take out suspension points temporarily.
+ s.gsub!('...', '&;&.')
+ # Remove abbreviations.
+ s.scan(/(?:[A-Za-z]\.){2,}/).each do |abbr|
+ s.gsub!(abbr, abbr.gsub(' ', '').gsub('.', '&-&'))
+ end
+ # Unstick sentences from each other.
  s.gsub!(/([^\.\?!]\.|\!|\?)([^\s"'])/) { $1 + ' ' + $2 }

  result = @@segmenters[lang].
@@ -49,7 +59,11 @@ module Treat::Workers::Processors::Segmenters::Punkt

  result.each do |sentence|
  # Unescape the sentence.
- unescape_floats!(sentence)
+ sentence.unescape_floats!
+ # Repair abbreviations in sentences.
+ sentence.gsub!('&-&', '.')
+ # Repair suspension points.
+ sentence.gsub!('&;&.', '...')
  entity << Treat::Entities::Phrase.
  from_string(sentence)
  end
@@ -73,7 +87,7 @@ module Treat::Workers::Processors::Segmenters::Punkt
  end
  end

- t = ::Psych.load(File.read(model))
+ t = ::YAML.load(File.read(model))

  @@segmenters[lang] =
  ::Punkt::SentenceTokenizer.new(t)
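Note: the marker strings and the abbreviation regex below are taken verbatim from the hunk above. A pure-Ruby sketch of the protect/segment/restore round trip, with the actual Punkt call (and treat's escape_floats! helper) elided:

    s = 'Dr. J.R.R. Tolkien wrote 3.5 drafts... Then he stopped.'

    # Protect suspension points and dotted abbreviations.
    s.gsub!('...', '&;&.')
    s.scan(/(?:[A-Za-z]\.){2,}/).each do |abbr|
      s.gsub!(abbr, abbr.gsub('.', '&-&'))
    end

    # ... the segmenter would split s into sentences here ...

    # Restore the original punctuation.
    s.gsub!('&-&', '.')
    s.gsub!('&;&.', '...')
    puts s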
data/lib/treat/workers/processors/segmenters/scalpel.rb (new file):

@@ -0,0 +1,20 @@
+ # Sentence segmentation based on a set of predefined
+ # rules that handle a large number of usage cases of
+ # sentence enders. The idea is to remove all cases of
+ # .!? being used for other purposes than marking a
+ # full stop before naively segmenting the text.
+ class Treat::Workers::Processors::Segmenters::Scalpel
+
+ require 'scalpel'
+
+ # Segment a text using the Scalpel algorithm.
+ def self.segment(entity, options = {})
+ sentences = Scalpel.cut(entity.to_s)
+ sentences.each do |sentence|
+ entity << Treat::Entities::Phrase.
+ from_string(sentence.strip)
+ end
+ entity
+ end
+
+ end
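Note: Scalpel.cut is essentially the gem's whole public surface, which is why this worker is so short. A usage sketch, assuming the scalpel gem is installed:

    require 'scalpel'

    Scalpel.cut('Mr. Smith arrived. He sat down, i.e. he rested.').each do |sentence|
      puts sentence.strip
    end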
data/lib/treat/workers/processors/segmenters/srx.rb (new file):

@@ -0,0 +1,42 @@
+ # Sentence segmentation based on a set of predefined
+ # rules defined in SRX (Segmentation Rules eXchange)
+ # format and developped by Marcin Milkowski.
+ #
+ # Original paper: Marcin Miłkowski, Jarosław Lipski,
+ # 2009. Using SRX standard for sentence segmentation
+ # in LanguageTool, in: Human Language Technologies
+ # as a Challenge for Computer Science and Linguistics.
+ class Treat::Workers::Processors::Segmenters::SRX
+
+ @@segmenters = {}
+
+ # Require the srx-english library.
+ # Segment a text using the SRX algorithm
+ def self.segment(entity, options = {})
+
+ lang = entity.language
+ entity.check_hasnt_children
+ text = entity.to_s
+ text.escape_floats!
+
+ unless @@segmenters[lang]
+ # Require the appropriate gem.
+ require "srx/#{lang}/sentence_splitter"
+ @@segmenters[lang] = SRX.const_get(
+ lang.capitalize).const_get(
+ 'SentenceSplitter')
+ end
+
+ sentences = @@segmenters[lang].new(text)
+
+ sentences.each do |sentence|
+ sentence.unescape_floats!
+ entity << Treat::Entities::Phrase.
+ from_string(sentence.strip)
+ end
+
+ entity
+
+ end
+
+ end
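Note: the worker lazily requires srx/<lang>/sentence_splitter and caches one splitter class per language. Driving the English splitter directly, assuming the srx-english gem is installed; per the code above, the splitter is enumerable:

    require 'srx/english/sentence_splitter'

    splitter = SRX::English::SentenceSplitter.new('First rule applies. Second rule applies.')
    splitter.each { |sentence| puts sentence.strip }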
data/lib/treat/workers/processors/segmenters/stanford.rb:

@@ -1,8 +1,9 @@
- # A wrapper for the sentence splitter supplied by
- # the Stanford parser.
+ # Detects sentence boundaries by first tokenizing the
+ # text and deciding whether periods are sentence ending
+ # or used for other purposes (abreviations, etc.). The
+ # obtained tokens are then grouped into sentences.
  class Treat::Workers::Processors::Segmenters::Stanford

- require 'treat/loaders/stanford'
  Treat::Loaders::Stanford.load

  DefaultOptions = {
@@ -31,8 +32,7 @@ class Treat::Workers::Processors::Segmenters::Stanford
  ::StanfordCoreNLP.load(:tokenize, :ssplit)

  s = entity.to_s
-
- text = ::StanfordCoreNLP::Text.new(entity.to_s)
+ text = ::StanfordCoreNLP::Text.new(s)

  @@segmenter.annotate(text)
  text.get(:sentences).each do |sentence|
data/lib/treat/workers/processors/segmenters/tactful.rb:

@@ -1,13 +1,12 @@
- # An adapter for the 'tactful_tokenizer' gem, which
- # detects sentence boundaries based on a Naive Bayesian
- # statistical model.
+ # Sentence segmentation based on a Naive Bayesian
+ # statistical model. Trained on Wall Street Journal
+ # news combined with the Brown Corpus, which is
+ # intended to be widely representative of written English.
  #
- # Project website: https://github.com/SlyShy/Tackful-Tokenizer
- #
- # Original paper: Dan Gillick. 2009. Sentence Boundary Detection
- # and the Problem with the U.S. University of California, Berkeley.
- # http://dgillick.com/resource/sbd_naacl_2009.pdf
- module Treat::Workers::Processors::Segmenters::Tactful
+ # Original paper: Dan Gillick. 2009. Sentence Boundary
+ # Detection and the Problem with the U.S. University
+ # of California, Berkeley.
+ class Treat::Workers::Processors::Segmenters::Tactful

  # Require the 'tactful_tokenizer' gem.
  silence_warnings { require 'tactful_tokenizer' }
@@ -27,9 +26,16 @@ module Treat::Workers::Processors::Segmenters::Tactful
  entity.check_hasnt_children

  s = entity.to_s
+ s.escape_floats!

- escape_floats!(s)
+ # Remove abbreviations.
+ s.scan(/(?:[A-Za-z]\.){2,}/).each do |abbr|
+ s.gsub!(abbr, abbr.gsub(' ', '').gsub('.', '&-&'))
+ end

+ # Take out suspension points temporarily.
+ s.gsub!('...', '&;&.')
+ # Unstick sentences from each other.
  s.gsub!(/([^\.\?!]\.|\!|\?)([^\s"'])/) { $1 + ' ' + $2 }

  @@segmenter ||= TactfulTokenizer::Model.new
@@ -37,7 +43,11 @@ module Treat::Workers::Processors::Segmenters::Tactful
  sentences = @@segmenter.tokenize_text(s)

  sentences.each do |sentence|
- unescape_floats!(sentence)
+ sentence.unescape_floats!
+ # Repair abbreviations.
+ sentence.gsub!('&-&', '.')
+ # Repair suspension points.
+ sentence.gsub!('&;&.', '...')
  entity << Treat::Entities::Phrase.from_string(sentence)
  end

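Note: the "unstick" substitution is easy to check in isolation; the regex below is copied verbatim from the hunk:

    s = 'First sentence.Second one!Third?Done.'
    s.gsub!(/([^\.\?!]\.|\!|\?)([^\s"'])/) { $1 + ' ' + $2 }
    puts s  # => "First sentence. Second one! Third? Done."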
data/lib/treat/workers/processors/tokenizers/ptb.rb:

@@ -1,45 +1,52 @@
  # encoding: utf-8
- # A native rule-basd tokenizer based on the one
- # developped by Robert Macyntyre in 1995 for the Penn
- # Treebank project. This tokenizer follows the
- # conventions used by the Penn Treebank.
+ # Tokenization based on the tokenizer developped by
+ # Robert Macyntyre in 1995 for the Penn Treebank
+ # project. This tokenizer mostly follows the conventions
+ # used by the Penn Treebank. N.B. Contrary to the
+ # standard PTB tokenization, double quotes (") are
+ # NOT changed to doubled single forward- and
+ # backward- quotes (`` and '') by default.
  #
- # Original script:
- # http://www.cis.upenn.edu/~treebank/tokenizer.sed
- #
- # Copyright (c) 2004 UTIYAMA Masao <mutiyama@nict.go.jp>
- # All rights reserved. This program is free software;
- # you can redistribute it and/or modify it under the
- # same terms as Ruby itself.
- module Treat::Workers::Processors::Tokenizers::PTB
-
- # Tokenize the entity using a native rule-based algorithm.
+ # Authors: Utiyama Masao (mutiyama@nict.go.jp).
+ # License: Ruby License.
+ class Treat::Workers::Processors::Tokenizers::PTB
+
+ # Default options for the tokenizer.
+ DefaultOptions = {
+ directional_quotes: false
+ }
+
+ # Perform tokenization of the entity and add
+ # the resulting tokens as its children.
+ #
+ # Options:
+ # - (Boolean) => :directional_quotes whether to
+ # replace double quotes by `` and '' or not.
  def self.tokenize(entity, options = {})
-
+ options = DefaultOptions.merge(options)
  entity.check_hasnt_children
-
  if entity.has_children?
  raise Treat::Exception,
  "Cannot tokenize an #{entity.class} " +
  "that already has children."
  end
- chunks = split(entity.to_s)
+ chunks = split(entity.to_s, options)
  chunks.each do |chunk|
  next if chunk =~ /([[:space:]]+)/
- entity << Treat::Entities::Token.from_string(chunk)
+ entity << Treat::Entities::Token.
+ from_string(chunk)
  end
  end
-
- # Helper method to split the string into tokens.
- def self.split(string)
-
+
+ def self.split(string, options)
+
  s = " " + string + " "
-
- s.gsub!(/‘/,'`')
+
+ s.gsub!(/‘/,"'")
  s.gsub!(/’/,"'")
  s.gsub!(/“/,"``")
  s.gsub!(/”/,"''")
-
+
  s.gsub!(/\s+/," ")
  s.gsub!(/(\s+)''/,'\1"')
  s.gsub!(/(\s+)``/,'\1"')
@@ -80,11 +87,14 @@ module Treat::Workers::Processors::Tokenizers::PTB
  s.gsub!(/\//, ' / ')
  s.gsub!(/\s+/,' ')
  s.strip!
-
- s.gsub!(/``/,'"')
- s.gsub!(/''/,'"')

+ # Remove directional quotes.
+ unless options[:directional_quotes]
+ s.gsub!(/``/,'"')
+ s.gsub!(/''/,'"')
+ end
+
  s.split(/\s+/)
  end
-
- end
+
+ end
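Note: a usage sketch of the new option through the DSL, assuming treat 2.0.0rc1 is installed; with directional_quotes: true, double quotes come out as `` and '' instead of being normalized back to plain double quotes:

    require 'treat'
    include Treat::Core::DSL

    s = sentence('She said "yes" and left.')
    s.tokenize(:ptb, directional_quotes: true)
    puts s.tokens.map(&:to_s).inspect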
data/lib/treat/workers/processors/tokenizers/punkt.rb:

@@ -1,19 +1,11 @@
- # A tokenizer that was lifted from the 'punkt-segmenter'
- # Ruby gem.
+ # Tokenization script from the 'punkt-segmenter' Ruby gem.
  #
- # This code follows the terms and conditions of Apache
- # License v2 (http://www.apache.org/licenses/LICENSE-2.0)
- #
- # Authors: Willy <willy@csse.unimelb.edu.au>
- # (original Python port), Steven Bird
- # <sb@csse.unimelb.edu.au> (additions),
- # Edward Loper <edloper@gradient.cis.upenn.edu>
- # (rewrite), Joel Nothman <jnothman@student.usyd.edu.au>
- # (almost rewrite).
- #
- # Project website: https://github.com/lfcipriani/punkt-segmenter
+ # Authors: Willy (willy@csse.unimelb.edu.au>),
+ # Steven Bird (sb@csse.unimelb.edu.au), Edward Loper
+ # (edloper@gradient.cis.upenn.edu), Joel Nothman
+ # (jnothman@student.usyd.edu.au).
+ # License: Apache License v2.
  class Treat::Workers::Processors::Tokenizers::Punkt
-
  SentEndChars = ['.', '?', '!']
  ReSentEndChars = /[.?!]/
  InternalPunctuation = [',', ':', ';']
@@ -24,8 +16,8 @@ class Treat::Workers::Processors::Tokenizers::Punkt
  ReWordTokenizer = /#{ReMultiCharPunct}|(?=#{ReWordStart})\S+?(?=\s|$|#{ReNonWordChars}|#{ReMultiCharPunct}|,(?=$|\s|#{ReNonWordChars}|#{ReMultiCharPunct}))|\S/
  RePeriodContext = /\S*#{ReSentEndChars}(?=(?<after_tok>#{ReNonWordChars}|\s+(?<next_tok>\S+)))/

- # Tokenize the text using the algorithm lifted from
- # the Punkt tokenizer gem.
+ # Perform tokenization of the entity and add
+ # the resulting tokens as its children.
  #
  # Options: none.
  def self.tokenize(entity, options = {})
@@ -36,10 +28,13 @@ class Treat::Workers::Processors::Tokenizers::Punkt

  s.scan(ReWordTokenizer).each do |token|
  if SentEndChars.include?(token[-1])
- entity << Treat::Entities::Token.from_string(token[0..-2])
- entity << Treat::Entities::Token.from_string(token[-1..-1])
+ entity << Treat::Entities::
+ Token.from_string(token[0..-2])
+ entity << Treat::Entities::
+ Token.from_string(token[-1..-1])
  else
- entity << Treat::Entities::Token.from_string(token)
+ entity << Treat::Entities::
+ Token.from_string(token)
  end
  end
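Note: a simplified sketch of the final-punctuation split in the loop above, substituting whitespace splitting for ReWordTokenizer:

    sent_end_chars = ['.', '?', '!']

    'Stop right here.'.split(/\s+/).each do |token|
      if sent_end_chars.include?(token[-1])
        puts token[0..-2]   # the word without its trailing punctuation
        puts token[-1..-1]  # the punctuation as a separate token
      else
        puts token
      end
    end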