treat 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147)
  1. data/INSTALL +0 -0
  2. data/LICENSE +28 -0
  3. data/README +0 -0
  4. data/TODO +67 -0
  5. data/bin/INFO +1 -0
  6. data/examples/benchmark.rb +81 -0
  7. data/examples/keywords.rb +60 -0
  8. data/examples/texts/bugged_out.txt +26 -0
  9. data/examples/texts/half_cocked_basel.txt +16 -0
  10. data/examples/texts/hedge_funds.txt +24 -0
  11. data/examples/texts/hose_and_dry.txt +19 -0
  12. data/examples/texts/hungarys_troubles.txt +46 -0
  13. data/examples/texts/indias_slowdown.txt +15 -0
  14. data/examples/texts/merkozy_rides_again.txt +24 -0
  15. data/examples/texts/prada_is_not_walmart.txt +9 -0
  16. data/examples/texts/republican_nomination.txt +26 -0
  17. data/examples/texts/to_infinity_and_beyond.txt +15 -0
  18. data/lib/treat.rb +91 -0
  19. data/lib/treat/buildable.rb +115 -0
  20. data/lib/treat/categories.rb +29 -0
  21. data/lib/treat/category.rb +28 -0
  22. data/lib/treat/delegatable.rb +90 -0
  23. data/lib/treat/detectors.rb +28 -0
  24. data/lib/treat/detectors/encoding/native.rb +12 -0
  25. data/lib/treat/detectors/encoding/r_chardet19.rb +24 -0
  26. data/lib/treat/detectors/format/file.rb +36 -0
  27. data/lib/treat/detectors/language/language_detector.rb +19 -0
  28. data/lib/treat/detectors/language/what_language.rb +29 -0
  29. data/lib/treat/entities.rb +52 -0
  30. data/lib/treat/entities/collection.rb +19 -0
  31. data/lib/treat/entities/constituents.rb +15 -0
  32. data/lib/treat/entities/document.rb +11 -0
  33. data/lib/treat/entities/entity.rb +242 -0
  34. data/lib/treat/entities/sentence.rb +8 -0
  35. data/lib/treat/entities/text.rb +7 -0
  36. data/lib/treat/entities/tokens.rb +37 -0
  37. data/lib/treat/entities/zones.rb +17 -0
  38. data/lib/treat/exception.rb +5 -0
  39. data/lib/treat/extractors.rb +41 -0
  40. data/lib/treat/extractors/key_sentences/topics_frequency.rb +49 -0
  41. data/lib/treat/extractors/named_entity/abner.rb +20 -0
  42. data/lib/treat/extractors/named_entity/stanford.rb +174 -0
  43. data/lib/treat/extractors/statistics/frequency.rb +22 -0
  44. data/lib/treat/extractors/statistics/frequency_of.rb +17 -0
  45. data/lib/treat/extractors/statistics/position_in.rb +13 -0
  46. data/lib/treat/extractors/statistics/transition_matrix.rb +105 -0
  47. data/lib/treat/extractors/statistics/transition_probability.rb +53 -0
  48. data/lib/treat/extractors/time/chronic.rb +12 -0
  49. data/lib/treat/extractors/time/native.rb +12 -0
  50. data/lib/treat/extractors/time/nickel.rb +45 -0
  51. data/lib/treat/extractors/topic_words/lda.rb +71 -0
  52. data/lib/treat/extractors/topic_words/lda/data.dat +46 -0
  53. data/lib/treat/extractors/topic_words/lda/wiki.yml +121 -0
  54. data/lib/treat/extractors/topics/reuters.rb +91 -0
  55. data/lib/treat/extractors/topics/reuters/industry.xml +2717 -0
  56. data/lib/treat/extractors/topics/reuters/region.xml +13585 -0
  57. data/lib/treat/extractors/topics/reuters/topics.xml +17977 -0
  58. data/lib/treat/feature.rb +53 -0
  59. data/lib/treat/formatters.rb +44 -0
  60. data/lib/treat/formatters/cleaners/html.rb +17 -0
  61. data/lib/treat/formatters/readers/autoselect.rb +35 -0
  62. data/lib/treat/formatters/readers/gocr.rb +24 -0
  63. data/lib/treat/formatters/readers/html.rb +13 -0
  64. data/lib/treat/formatters/readers/ocropus.rb +31 -0
  65. data/lib/treat/formatters/readers/pdf.rb +17 -0
  66. data/lib/treat/formatters/readers/txt.rb +15 -0
  67. data/lib/treat/formatters/serializers/xml.rb +48 -0
  68. data/lib/treat/formatters/serializers/yaml.rb +15 -0
  69. data/lib/treat/formatters/serializers/yaml/helper.rb +96 -0
  70. data/lib/treat/formatters/unserializers/autoselect.rb +19 -0
  71. data/lib/treat/formatters/unserializers/xml.rb +79 -0
  72. data/lib/treat/formatters/unserializers/yaml.rb +15 -0
  73. data/lib/treat/formatters/visualizers/dot.rb +73 -0
  74. data/lib/treat/formatters/visualizers/html.rb +12 -0
  75. data/lib/treat/formatters/visualizers/inspect.rb +16 -0
  76. data/lib/treat/formatters/visualizers/short_value.rb +14 -0
  77. data/lib/treat/formatters/visualizers/standoff.rb +41 -0
  78. data/lib/treat/formatters/visualizers/tree.rb +28 -0
  79. data/lib/treat/formatters/visualizers/txt.rb +31 -0
  80. data/lib/treat/group.rb +96 -0
  81. data/lib/treat/inflectors.rb +50 -0
  82. data/lib/treat/inflectors/cardinal_words/linguistics.rb +45 -0
  83. data/lib/treat/inflectors/conjugators/linguistics.rb +30 -0
  84. data/lib/treat/inflectors/declensors/en.rb +18 -0
  85. data/lib/treat/inflectors/declensors/linguistics.rb +30 -0
  86. data/lib/treat/inflectors/lemmatizers/e_lemma.rb +12 -0
  87. data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +213 -0
  88. data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +68 -0
  89. data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +6 -0
  90. data/lib/treat/inflectors/ordinal_words/linguistics.rb +21 -0
  91. data/lib/treat/inflectors/stemmers/porter.rb +158 -0
  92. data/lib/treat/inflectors/stemmers/porter_c.rb +23 -0
  93. data/lib/treat/inflectors/stemmers/uea.rb +30 -0
  94. data/lib/treat/lexicalizers.rb +49 -0
  95. data/lib/treat/lexicalizers/category/from_tag.rb +30 -0
  96. data/lib/treat/lexicalizers/linkages/naive.rb +63 -0
  97. data/lib/treat/lexicalizers/synsets/rita_wn.rb +23 -0
  98. data/lib/treat/lexicalizers/synsets/wordnet.rb +72 -0
  99. data/lib/treat/lexicalizers/tag/brill.rb +101 -0
  100. data/lib/treat/lexicalizers/tag/lingua.rb +114 -0
  101. data/lib/treat/lexicalizers/tag/stanford.rb +86 -0
  102. data/lib/treat/processors.rb +45 -0
  103. data/lib/treat/processors/chunkers/txt.rb +27 -0
  104. data/lib/treat/processors/parsers/enju.rb +214 -0
  105. data/lib/treat/processors/parsers/stanford.rb +60 -0
  106. data/lib/treat/processors/segmenters/punkt.rb +48 -0
  107. data/lib/treat/processors/segmenters/stanford.rb +45 -0
  108. data/lib/treat/processors/segmenters/tactful.rb +34 -0
  109. data/lib/treat/processors/tokenizers/macintyre.rb +76 -0
  110. data/lib/treat/processors/tokenizers/multilingual.rb +31 -0
  111. data/lib/treat/processors/tokenizers/perl.rb +96 -0
  112. data/lib/treat/processors/tokenizers/punkt.rb +42 -0
  113. data/lib/treat/processors/tokenizers/stanford.rb +33 -0
  114. data/lib/treat/processors/tokenizers/tactful.rb +59 -0
  115. data/lib/treat/proxies.rb +66 -0
  116. data/lib/treat/registrable.rb +26 -0
  117. data/lib/treat/resources.rb +10 -0
  118. data/lib/treat/resources/categories.rb +18 -0
  119. data/lib/treat/resources/delegates.rb +96 -0
  120. data/lib/treat/resources/dependencies.rb +0 -0
  121. data/lib/treat/resources/edges.rb +8 -0
  122. data/lib/treat/resources/formats.rb +23 -0
  123. data/lib/treat/resources/languages.rb +86 -0
  124. data/lib/treat/resources/languages.txt +504 -0
  125. data/lib/treat/resources/tags.rb +393 -0
  126. data/lib/treat/sugar.rb +43 -0
  127. data/lib/treat/tree.rb +174 -0
  128. data/lib/treat/utilities.rb +127 -0
  129. data/lib/treat/visitable.rb +27 -0
  130. data/test/profile.rb +2 -0
  131. data/test/tc_detectors.rb +27 -0
  132. data/test/tc_entity.rb +105 -0
  133. data/test/tc_extractors.rb +48 -0
  134. data/test/tc_formatters.rb +46 -0
  135. data/test/tc_inflectors.rb +39 -0
  136. data/test/tc_lexicalizers.rb +39 -0
  137. data/test/tc_processors.rb +36 -0
  138. data/test/tc_resources.rb +27 -0
  139. data/test/tc_treat.rb +64 -0
  140. data/test/tc_tree.rb +60 -0
  141. data/test/tests.rb +19 -0
  142. data/test/texts.rb +20 -0
  143. data/test/texts/english/long.html +24 -0
  144. data/test/texts/english/long.txt +22 -0
  145. data/test/texts/english/medium.txt +5 -0
  146. data/test/texts/english/short.txt +3 -0
  147. metadata +412 -0
@@ -0,0 +1,45 @@
module Treat
  module Processors
    module Segmenters
      # Sentence segmenter backed by the Stanford parser's
      # DocumentPreprocessor class, accessed through the
      # Ruby-Java bridge ('rjb' gem).
      class Stanford
        # Require the Ruby-Java bridge and load the needed
        # Stanford parser classes once, at load time.
        silently do
          require 'rjb'
          jar = "#{Treat.bin}/stanford_parser/stanford-parser.jar"
          unless File.readable?(jar)
            raise "Could not find stanford parser JAR file in #{jar}."+
            " You may need to set Treat.bin to a custom value."
          end
          DocumentPreprocessor =
          ::Rjb::import('edu.stanford.nlp.process.DocumentPreprocessor')
          StringReader = ::Rjb::import('java.io.StringReader')
        end
        # Segment the entity into sentences, adding each
        # sentence as a child of the entity.
        #
        # Options:
        # :tokenize => (Boolean) Also add each sentence's
        # tokens as children of that sentence.
        def self.segment(entity, options = {})
          sr = StringReader.new(entity.to_s)
          sit = DocumentPreprocessor.new(sr).iterator
          while sit.has_next
            # FIX: keep a reference to the Java sentence object;
            # it was previously discarded (`sit.next.to_string`),
            # leaving `s` undefined in the :tokenize branch below.
            s = sit.next
            str = s.to_string
            str.gsub!(', ', ' ') # Fix - find better way to implode.
            str.gsub!(' \'s', '\'s')
            str.gsub!(' .', '.')
            str.gsub!(' ,', ',')
            str.gsub!(' ;', ';')
            str.gsub!(/-[A-Z]{3}-/, '')
            # Drop the leading and trailing bracket characters.
            str = str[1..-2]
            sentence = Entities::Entity.from_string(str)
            if options[:tokenize] == true
              # Iterate over the Java sentence object (not the
              # imploded string) to retrieve individual tokens.
              tit = s.iterator
              while tit.has_next
                w = tit.next.word
                # Skip Penn-Treebank-style bracket tags, e.g. -LRB-.
                next if w[0] == '-' && w[-1] == '-'
                sentence << Entities::Entity.from_string(w)
              end
            end
            entity << sentence
          end
          entity
        end
      end
    end
  end
end
@@ -0,0 +1,34 @@
module Treat
  module Processors
    module Segmenters
      # An adapter for the 'tactful_tokenizer' gem, which
      # detects sentence boundaries (the name is a misnomer;
      # it isn't a tokenizer, but a sentence boundary detector).
      # It uses a Naive Bayesian statistical model, and is
      # based on Splitta, but has support for ‘?’ and ‘!’
      # as well as primitive handling of XHTML markup.
      #
      # Project website:
      class Tactful
        # Require the 'tactful_tokenizer' gem.
        silently { require 'tactful_tokenizer' }
        # The gem defines String#tokenize somewhere in its
        # depths; remove it so it doesn't shadow Treat's.
        String.class_eval { undef :tokenize }
        # Cache a single instance of the segmenter model.
        @@segmenter = nil
        # Split a text or zone into sentences using the
        # 'tactful_tokenizer' gem, adding each sentence as
        # a child of the entity.
        #
        # Options: none.
        def self.segment(entity, options = {})
          @@segmenter = TactfulTokenizer::Model.new if @@segmenter.nil?
          @@segmenter.tokenize_text(entity.to_s).each do |s|
            entity << Entities::Entity.from_string(s)
          end
          entity
        end
      end
    end
  end
end
@@ -0,0 +1,76 @@
module Treat
  module Processors
    module Tokenizers
      # A native rule-based tokenizer based on the one
      # developed by Robert MacIntyre in 1995 for the Penn
      # Treebank project. This tokenizer follows the
      # conventions used by the Penn Treebank.
      #
      # Original script:
      # http://www.cis.upenn.edu/~treebank/tokenizer.sed
      #
      # Copyright (c) 2004 UTIYAMA Masao <mutiyama@nict.go.jp>
      # All rights reserved. This program is free software;
      # you can redistribute it and/or modify it under the
      # same terms as Ruby itself.
      class Macintyre
        # Tokenize the entity using a native rule-based algorithm,
        # adding each token as a child of the entity.
        #
        # Options: none.
        def self.tokenize(entity, options = {})
          # FIX: the former message was just 'Error', which gave
          # the caller no clue about what went wrong.
          raise "Cannot tokenize an entity that already has children." if entity.has_children?
          chunks = self.split(entity.to_s)
          chunks.each do |chunk|
            # Skip whitespace-only chunks.
            next if chunk =~ /([[:space:]]+)/
            entity << Treat::Entities::Entity.from_string(chunk)
          end
          entity
        end
        # Helper method to split the string into tokens.
        # Applies the Penn Treebank sed rules in order; the
        # rule ordering is significant and must not change.
        def self.split(string)
          s = " " + string + " "
          s.gsub!(/\s+/," ")
          # Normalize directional quotes.
          s.gsub!(/(\s+)''/,'\1"')
          s.gsub!(/(\s+)``/,'\1"')
          s.gsub!(/''(\s+)/,'"\1')
          s.gsub!(/``(\s+)/,'"\1')
          s.gsub!(/ (['`]+)([^0-9].+) /,' \1 \2 ')
          s.gsub!(/([ (\[{<])"/,'\1 `` ')
          # Isolate ellipses and embedded punctuation.
          s.gsub!(/\.\.\./,' ... ')
          s.gsub!(/[,;:@\#$%&]/,' \& ')
          # Split a FINAL period only (sentence segmentation is
          # assumed to have been done already).
          s.gsub!(/([^.])([.])([\])}>"']*)[ ]*$/,'\1 \2\3 ')
          s.gsub!(/[?!]/,' \& ')
          s.gsub!(/[\]\[(){}<>]/,' \& ')
          s.gsub!(/--/,' -- ')
          s.sub!(/$/,' ')
          s.sub!(/^/,' ')
          # Possessives, close-single-quotes and contractions.
          s.gsub!(/"/,' \'\' ')
          s.gsub!(/([^'])' /,'\1 \' ')
          s.gsub!(/'([sSmMdD]) /,' \'\1 ')
          s.gsub!(/'ll /,' \'ll ')
          s.gsub!(/'re /,' \'re ')
          s.gsub!(/'ve /,' \'ve ')
          s.gsub!(/n't /,' n\'t ')
          s.gsub!(/'LL /,' \'LL ')
          s.gsub!(/'RE /,' \'RE ')
          s.gsub!(/'VE /,' \'VE ')
          s.gsub!(/N'T /,' N\'T ')
          # Treebank special words.
          s.gsub!(/ ([Cc])annot /,' \1an not ')
          s.gsub!(/ ([Dd])'ye /,' \1\' ye ')
          s.gsub!(/ ([Gg])imme /,' \1im me ')
          s.gsub!(/ ([Gg])onna /,' \1on na ')
          s.gsub!(/ ([Gg])otta /,' \1ot ta ')
          s.gsub!(/ ([Ll])emme /,' \1em me ')
          s.gsub!(/ ([Mm])ore'n /,' \1ore \'n ')
          s.gsub!(/ '([Tt])is /,' \'\1 is ')
          s.gsub!(/ '([Tt])was /,' \'\1 was ')
          s.gsub!(/ ([Ww])anna /,' \1an na ')
          # Re-glue thousands separators split by the comma rule.
          while s.sub!(/(\s)([0-9]+) , ([0-9]+)(\s)/, '\1\2,\3\4')
          end
          s.gsub!(/\//, ' / ')
          s.gsub!(/\s+/,' ')
          s.strip!
          s.split(/\s+/)
        end
      end
    end
  end
end
@@ -0,0 +1,31 @@
module Treat
  module Processors
    module Tokenizers
      # An adapter for the 'tokenizer' gem, which performs
      # rule-based tokenizing of texts in English, German
      # or French.
      class Multilingual
        # Require the 'tokenizer' gem.
        silently { require 'tokenizer' }
        # Cache one tokenizer instance per language.
        @@tokenizers = {}
        # Tokenize English, German or French text, adding
        # each token as a child of the entity.
        #
        # Options:
        # :language => (Symbol) Force a language for the tokenizer.
        def self.tokenize(entity, options = {})
          language = options[:language] ? options[:language] : entity.language
          code = Treat::Resources::Languages.find(language, 1)
          tokenizer = (@@tokenizers[code] ||= ::Tokenizer::Tokenizer.new(code))
          tokenizer.tokenize(entity.to_s).each do |token|
            # Skip whitespace-only tokens.
            next if token =~ /([[:space:]]+)/
            entity << Treat::Entities::Entity.from_string(token)
          end
          entity
        end
      end
    end
  end
end
@@ -0,0 +1,96 @@
# encoding: utf-8
module Treat
  module Processors
    module Tokenizers
      # Tokenize the entity using a native rule-based algorithm.
      # This tokenizer is a port from an unknown Perl module,
      # which I have lifted from the 'rbtagger' gem.
      #
      # Author: Todd A. Fisher
      # This code is free to use under the terms of the MIT license.
      #
      # Original project website:
      # https://github.com/taf2/rb-brill-tagger
      class Perl
        # Tokenize the entity using a native rule-based algorithm,
        # adding each token as a child of the entity.
        # Options: none.
        def self.tokenize(entity, options = {})
          # Normalize all whitespace
          text = entity.to_s.gsub(/\s+/,' ')
          # Translate some common extended ascii characters to quotes
          text.gsub!(/‘/,'`')
          text.gsub!(/’/,"'")
          text.gsub!(/“/,"``")
          text.gsub!(/”/,"''")
          # Attempt to get correct directional quotes
          # s{\"\b} { `` }g;
          text.gsub!(/\"\b/,' `` ')
          # s{\b\"} { '' }g;
          text.gsub!(/\b\"/," '' ")
          #s{\"(?=\s)} { '' }g;
          text.gsub!(/\"(?=\s)/," '' ")
          #s{\"} { `` }g;
          # FIX: this rule was a duplicate of the previous one
          # (/\"(?=\s)/), so quotes not followed by whitespace were
          # never converted; per the Perl original, any remaining
          # quote becomes an opening ``.
          text.gsub!(/\"/," `` ")
          # Isolate ellipses
          # s{\.\.\.} { ... }g;
          text.gsub!(/\.\.\./,' ... ')
          # Isolate any embedded punctuation chars
          # s{([,;:\@\#\$\%&])} { $1 }g;
          text.gsub!(/([,;:\@\#\$\%&])/, ' \1 ')
          # Assume sentence tokenization has been done first, so split FINAL
          # periods only.
          # s/ ([^.]) \. ([\]\)\}\>\"\']*) [ \t]* $ /$1 .$2 /gx;
          text.gsub!(/ ([^.]) \. ([\]\)\}\>\"\']*) [ \t]* $ /x, '\1 .\2 ')
          # however, we may as well split ALL question marks and exclamation points,
          # since they shouldn't have the abbrev.-marker ambiguity problem
          #s{([?!])} { $1 }g;
          text.gsub!(/([?!])/, ' \1 ')
          # parentheses, brackets, etc.
          #s{([\]\[\(\)\{\}\<\>])} { $1 }g;
          text.gsub!(/([\]\[\(\)\{\}\<\>])/,' \1 ')
          #s/(-{2,})/ $1 /g;
          text.gsub!(/(-{2,})/,' \1 ')
          # Add a space to the beginning and end of each line, to reduce
          # necessary number of regexps below.
          #s/$/ /;
          text.gsub!(/$/," ")
          #s/^/ /;
          text.gsub!(/^/," ")
          # possessive or close-single-quote
          #s/([^\'])\' /$1 \' /g;
          # FIX: the parentheses were escaped (/\([^\']\)\' /), which
          # matched literal parentheses and left the capture group
          # unset; use a real capture group as in the Perl original.
          text.gsub!(/([^\'])\' /,%q(\1 ' ))
          # as in it's, I'm, we'd
          #s/\'([smd]) / \'$1 /ig;
          text.gsub!(/\'([smd]) /i,%q( '\1 ))
          #s/\'(ll|re|ve) / \'$1 /ig;
          text.gsub!(/\'(ll|re|ve) /i,%q( '\1 ))
          #s/n\'t / n\'t /ig;
          text.gsub!(/n\'t /i," n't ")
          #s/ (can)(not) / $1 $2 /ig;
          text.gsub!(/ (can)(not) /i,' \1 \2 ')
          #s/ (d\')(ye) / $1 $2 /ig;
          text.gsub!(/ (d\')(ye) /i,' \1 \2 ')
          #s/ (gim)(me) / $1 $2 /ig;
          text.gsub!(/ (gim)(me) /i,' \1 \2 ')
          #s/ (gon)(na) / $1 $2 /ig;
          text.gsub!(/ (gon)(na) /i,' \1 \2 ')
          #s/ (got)(ta) / $1 $2 /ig;
          text.gsub!(/ (got)(ta) /i,' \1 \2 ')
          #s/ (lem)(me) / $1 $2 /ig;
          text.gsub!(/ (lem)(me) /i,' \1 \2 ')
          #s/ (more)(\'n) / $1 $2 /ig;
          text.gsub!(/ (more)(\'n) /i,' \1 \2 ')
          #s/ (\'t)(is|was) / $1 $2 /ig;
          text.gsub!(/ (\'t)(is|was) /i,' \1 \2 ')
          #s/ (wan)(na) / $1 $2 /ig;
          text.gsub!(/ (wan)(na) /i,' \1 \2 ')
          tokens = text.split(/\s/)
          tokens.each do |token|
            # Skip whitespace-only tokens.
            next if token =~ /([[:space:]]+)/
            entity << Treat::Entities::Entity.from_string(token)
          end
          # FIX: return the entity, for consistency with the other
          # tokenizers (the method previously returned the token array).
          entity
        end
      end
    end
  end
end
@@ -0,0 +1,42 @@
module Treat
  module Processors
    module Tokenizers
      # A tokenizer that was lifted from the 'punkt-segmenter'
      # Ruby gem.
      #
      # This code follows the terms and conditions of Apache
      # License v2 (http://www.apache.org/licenses/LICENSE-2.0)
      #
      # Authors: Willy <willy@csse.unimelb.edu.au>
      # (original Python port), Steven Bird
      # <sb@csse.unimelb.edu.au> (additions),
      # Edward Loper <edloper@gradient.cis.upenn.edu>
      # (rewrite), Joel Nothman <jnothman@student.usyd.edu.au>
      # (almost rewrite).
      #
      # Project website: https://github.com/lfcipriani/punkt-segmenter
      class Punkt
        # Characters that may terminate a sentence.
        SentEndChars = ['.', '?', '!']
        ReSentEndChars = /[.?!]/
        # Sentence-internal punctuation.
        InternalPunctuation = [',', ':', ';']
        ReBoundaryRealignment = /^["\')\]}]+?(?:\s+|(?=--)|$)/m
        ReWordStart = /[^\(\"\`{\[:;&\#\*@\)}\]\-,]/
        ReNonWordChars = /(?:[?!)\";}\]\*:@\'\({\[])/
        ReMultiCharPunct = /(?:\-{2,}|\.{2,}|(?:\.\s){2,}\.)/
        # The main word-tokenization pattern, built from the pieces above.
        ReWordTokenizer = /#{ReMultiCharPunct}|(?=#{ReWordStart})\S+?(?=\s|$|#{ReNonWordChars}|#{ReMultiCharPunct}|,(?=$|\s|#{ReNonWordChars}|#{ReMultiCharPunct}))|\S/
        RePeriodContext = /\S*#{ReSentEndChars}(?=(?<after_tok>#{ReNonWordChars}|\s+(?<next_tok>\S+)))/
        # Tokenize the text using the algorithm lifted from
        # the Punkt tokenizer, adding each token as a child
        # of the entity.
        #
        # Options: none.
        def self.tokenize(entity, options = {})
          entity.to_s.scan(ReWordTokenizer).each do |token|
            # FIX: removed a stray `puts token` left over from
            # debugging, which printed every token to stdout.
            entity << Treat::Entities::Entity.from_string(token)
          end
          entity
        end
      end
    end
  end
end
@@ -0,0 +1,33 @@
module Treat
  module Processors
    module Tokenizers
      # A tokenizer that delegates the work to the Stanford
      # parser's PTBTokenizer, accessed through the Ruby-Java
      # bridge ('rjb' gem). Raises at load time if the parser
      # JAR cannot be found under Treat.bin.
      class Stanford
        # Require the Ruby-Java bridge.
        silently do
          require 'rjb'
          # Load the Stanford Parser Java files.
          jar = "#{Treat.bin}/stanford_parser/stanford-parser.jar"
          unless File.readable?(jar)
            raise "Could not find stanford parser JAR file in #{jar}."+
            " You may need to set Treat.bin to a custom value."
          end
          # Load the Stanford Parser classes.
          PTBTokenizer = ::Rjb::import('edu.stanford.nlp.process.PTBTokenizer')
          CoreLabelTokenFactory = ::Rjb::import('edu.stanford.nlp.process.CoreLabelTokenFactory')
          StringReader = ::Rjb::import('java.io.StringReader')
        end
        # Tokenize the entity with PTBTokenizer, adding each
        # token as a child of the entity.
        #
        # Options: none.
        def self.tokenize(entity, options = {})
          ptbt = PTBTokenizer.new(
          StringReader.new(entity.to_s),
          CoreLabelTokenFactory.new, '')
          while ptbt.has_next
            w = ptbt.next.word
            # Skip Penn-Treebank-style bracket tags, e.g. -LRB-.
            next if w[0] == '-' && w[-1] == '-'
            entity << Treat::Entities::Entity.from_string(w)
          end
          entity
        end
      end
    end
  end
end
@@ -0,0 +1,59 @@
module Treat
  module Processors
    module Tokenizers
      # A tokenizer class lifted from the 'tactful-tokenizer' gem.
      #
      # Copyright © 2010 Matthew Bunday. All rights reserved.
      # Released under the GNU GPL v3. Modified by Louis Mullie.
      #
      # Project website: https://github.com/SlyShy/Tactful_Tokenizer
      class Tactful
        # Ordered list of [pattern, replacement] rewrite rules;
        # applied in sequence, so the ordering is significant.
        ReTokenize = [
          # Uniform Quotes
          [/''|``/, '"'],
          # Separate punctuation from words.
          [/(^|\s)(')/, '\1\2'],
          [/(?=[\("`{\[:;&#*@\.])(.)/, '\1 '],
          [/(.)(?=[?!\)";}\]*:@\.'])|(?=[\)}\]])(.)|(.)(?=[({\[])|((^|\s)-)(?=[^-])/, '\1 '],
          # Treat double-hyphen as a single token.
          [/([^-])(--+)([^-])/, '\1 \2 \3'],
          [/(\s|^)(,)(?=(\S))/, '\1\2 '],
          # Only separate a comma if a space follows.
          [/(.)(,)(\s|$)/, '\1 \2\3'],
          # Combine dots separated by whitespace to be a single token.
          [/\.\s\.\s\./, '...'],
          # Separate "No.6"
          [/([\W]\.)(\d+)/, '\1 \2'],
          # Separate words from ellipses
          [/([^\.]|^)(\.{2,})(.?)/, '\1 \2 \3'],
          [/(^|\s)(\.{2,})([^\.\s])/, '\1\2 \3'],
          [/(^|\s)(\.{2,})([^\.\s])/, '\1 \2\3'],
          ##### Some additional fixes.
          # Fix %, $, &
          [/(\d)%/, '\1 %'],
          [/\$(\.?\d)/, '$ \1'],
          [/(\W)& (\W)/, '\1&\2'],
          [/(\W\W+)&(\W\W+)/, '\1 & \2'],
          # Fix (n 't) -> ( n't)
          [/n 't( |$)/, " n't\\1"],
          [/N 'T( |$)/, " N'T\\1"],
          # Treebank tokenizer special words
          [/([Cc])annot/, '\1an not']
        ]
        # Tokenize the entity using the rule-based algorithm
        # lifted from the 'tactful-tokenizer' gem, adding each
        # token as a child of the entity.
        def self.tokenize(entity, options = {})
          s = entity.to_s
          # Apply each rewrite rule in place, in order.
          ReTokenize.each { |pattern, replacement| s.gsub!(pattern, replacement) }
          s.split(' ').each do |token|
            entity << Entities::Entity.from_string(token)
          end
          entity
        end
      end
    end
  end
end
@@ -0,0 +1,66 @@
1
+ module Treat
2
+ # Proxies install Treat functions on Rubycore classes.
3
+ module Proxies
4
+ # The module proxy provides functionanaty common
5
+ # to the different types of proxies.
6
+ module Proxy
7
+ def method_missing(sym, *args, &block)
8
+ if Categories.have_method?(sym)
9
+ to_entity.send(sym, *args)
10
+ else
11
+ super(sym, *args, &block)
12
+ end
13
+ end
14
+ def to_entity(builder = nil)
15
+ Treat::Entities::Unknown(self.to_s)
16
+ end
17
+ end
18
+ # Install Treat functions on String objects.
19
+ module StringProxy
20
+ include Proxy
21
+ # Save the string to the specified file.
22
+ def save(file)
23
+ File.open(file, 'w') { |f| f.write(self) }
24
+ end
25
+ # Return the entity corresponding to the string.
26
+ def to_entity
27
+ Treat::Entities::Entity.from_string(self.to_s)
28
+ end
29
+ end
30
+ # Install Treat functions on Numeric objects.
31
+ module NumericProxy
32
+ include Proxy
33
+ # Return the entity corresponding to the number.
34
+ def to_entity(builder = nil)
35
+ Treat::Entities::Entity.from_numeric(self)
36
+ end
37
+ end
38
+ # Install Treat functions on Array objects.
39
+ module ArrayProxy
40
+ include Proxy
41
+ def method_missing(sym, *args, &block)
42
+ if Category.has_method?(sym)
43
+ array = []
44
+ each do |element|
45
+ if element.is_a? Treat::Entities::Entity
46
+ array << element.send(sym, *args)
47
+ else
48
+ unless [Numeric, String, Array].include?(element.class)
49
+ raise Treat::Exception "Cannot convert object with type " +
50
+ "#{element.class} into an entity."
51
+ end
52
+ array << element.to_entity.send(sym, *args)
53
+ end
54
+ end
55
+ array
56
+ else
57
+ super(sym, *args, &block)
58
+ end
59
+ end
60
+ end
61
+ # Include the proxies in the core classes.
62
+ String.class_eval { include StringProxy }
63
+ Numeric.class_eval { include NumericProxy }
64
+ Array.class_eval { include ArrayProxy }
65
+ end
66
+ end