treat 0.1.1

Files changed (147)
  1. data/INSTALL +0 -0
  2. data/LICENSE +28 -0
  3. data/README +0 -0
  4. data/TODO +67 -0
  5. data/bin/INFO +1 -0
  6. data/examples/benchmark.rb +81 -0
  7. data/examples/keywords.rb +60 -0
  8. data/examples/texts/bugged_out.txt +26 -0
  9. data/examples/texts/half_cocked_basel.txt +16 -0
  10. data/examples/texts/hedge_funds.txt +24 -0
  11. data/examples/texts/hose_and_dry.txt +19 -0
  12. data/examples/texts/hungarys_troubles.txt +46 -0
  13. data/examples/texts/indias_slowdown.txt +15 -0
  14. data/examples/texts/merkozy_rides_again.txt +24 -0
  15. data/examples/texts/prada_is_not_walmart.txt +9 -0
  16. data/examples/texts/republican_nomination.txt +26 -0
  17. data/examples/texts/to_infinity_and_beyond.txt +15 -0
  18. data/lib/treat.rb +91 -0
  19. data/lib/treat/buildable.rb +115 -0
  20. data/lib/treat/categories.rb +29 -0
  21. data/lib/treat/category.rb +28 -0
  22. data/lib/treat/delegatable.rb +90 -0
  23. data/lib/treat/detectors.rb +28 -0
  24. data/lib/treat/detectors/encoding/native.rb +12 -0
  25. data/lib/treat/detectors/encoding/r_chardet19.rb +24 -0
  26. data/lib/treat/detectors/format/file.rb +36 -0
  27. data/lib/treat/detectors/language/language_detector.rb +19 -0
  28. data/lib/treat/detectors/language/what_language.rb +29 -0
  29. data/lib/treat/entities.rb +52 -0
  30. data/lib/treat/entities/collection.rb +19 -0
  31. data/lib/treat/entities/constituents.rb +15 -0
  32. data/lib/treat/entities/document.rb +11 -0
  33. data/lib/treat/entities/entity.rb +242 -0
  34. data/lib/treat/entities/sentence.rb +8 -0
  35. data/lib/treat/entities/text.rb +7 -0
  36. data/lib/treat/entities/tokens.rb +37 -0
  37. data/lib/treat/entities/zones.rb +17 -0
  38. data/lib/treat/exception.rb +5 -0
  39. data/lib/treat/extractors.rb +41 -0
  40. data/lib/treat/extractors/key_sentences/topics_frequency.rb +49 -0
  41. data/lib/treat/extractors/named_entity/abner.rb +20 -0
  42. data/lib/treat/extractors/named_entity/stanford.rb +174 -0
  43. data/lib/treat/extractors/statistics/frequency.rb +22 -0
  44. data/lib/treat/extractors/statistics/frequency_of.rb +17 -0
  45. data/lib/treat/extractors/statistics/position_in.rb +13 -0
  46. data/lib/treat/extractors/statistics/transition_matrix.rb +105 -0
  47. data/lib/treat/extractors/statistics/transition_probability.rb +53 -0
  48. data/lib/treat/extractors/time/chronic.rb +12 -0
  49. data/lib/treat/extractors/time/native.rb +12 -0
  50. data/lib/treat/extractors/time/nickel.rb +45 -0
  51. data/lib/treat/extractors/topic_words/lda.rb +71 -0
  52. data/lib/treat/extractors/topic_words/lda/data.dat +46 -0
  53. data/lib/treat/extractors/topic_words/lda/wiki.yml +121 -0
  54. data/lib/treat/extractors/topics/reuters.rb +91 -0
  55. data/lib/treat/extractors/topics/reuters/industry.xml +2717 -0
  56. data/lib/treat/extractors/topics/reuters/region.xml +13585 -0
  57. data/lib/treat/extractors/topics/reuters/topics.xml +17977 -0
  58. data/lib/treat/feature.rb +53 -0
  59. data/lib/treat/formatters.rb +44 -0
  60. data/lib/treat/formatters/cleaners/html.rb +17 -0
  61. data/lib/treat/formatters/readers/autoselect.rb +35 -0
  62. data/lib/treat/formatters/readers/gocr.rb +24 -0
  63. data/lib/treat/formatters/readers/html.rb +13 -0
  64. data/lib/treat/formatters/readers/ocropus.rb +31 -0
  65. data/lib/treat/formatters/readers/pdf.rb +17 -0
  66. data/lib/treat/formatters/readers/txt.rb +15 -0
  67. data/lib/treat/formatters/serializers/xml.rb +48 -0
  68. data/lib/treat/formatters/serializers/yaml.rb +15 -0
  69. data/lib/treat/formatters/serializers/yaml/helper.rb +96 -0
  70. data/lib/treat/formatters/unserializers/autoselect.rb +19 -0
  71. data/lib/treat/formatters/unserializers/xml.rb +79 -0
  72. data/lib/treat/formatters/unserializers/yaml.rb +15 -0
  73. data/lib/treat/formatters/visualizers/dot.rb +73 -0
  74. data/lib/treat/formatters/visualizers/html.rb +12 -0
  75. data/lib/treat/formatters/visualizers/inspect.rb +16 -0
  76. data/lib/treat/formatters/visualizers/short_value.rb +14 -0
  77. data/lib/treat/formatters/visualizers/standoff.rb +41 -0
  78. data/lib/treat/formatters/visualizers/tree.rb +28 -0
  79. data/lib/treat/formatters/visualizers/txt.rb +31 -0
  80. data/lib/treat/group.rb +96 -0
  81. data/lib/treat/inflectors.rb +50 -0
  82. data/lib/treat/inflectors/cardinal_words/linguistics.rb +45 -0
  83. data/lib/treat/inflectors/conjugators/linguistics.rb +30 -0
  84. data/lib/treat/inflectors/declensors/en.rb +18 -0
  85. data/lib/treat/inflectors/declensors/linguistics.rb +30 -0
  86. data/lib/treat/inflectors/lemmatizers/e_lemma.rb +12 -0
  87. data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +213 -0
  88. data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +68 -0
  89. data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +6 -0
  90. data/lib/treat/inflectors/ordinal_words/linguistics.rb +21 -0
  91. data/lib/treat/inflectors/stemmers/porter.rb +158 -0
  92. data/lib/treat/inflectors/stemmers/porter_c.rb +23 -0
  93. data/lib/treat/inflectors/stemmers/uea.rb +30 -0
  94. data/lib/treat/lexicalizers.rb +49 -0
  95. data/lib/treat/lexicalizers/category/from_tag.rb +30 -0
  96. data/lib/treat/lexicalizers/linkages/naive.rb +63 -0
  97. data/lib/treat/lexicalizers/synsets/rita_wn.rb +23 -0
  98. data/lib/treat/lexicalizers/synsets/wordnet.rb +72 -0
  99. data/lib/treat/lexicalizers/tag/brill.rb +101 -0
  100. data/lib/treat/lexicalizers/tag/lingua.rb +114 -0
  101. data/lib/treat/lexicalizers/tag/stanford.rb +86 -0
  102. data/lib/treat/processors.rb +45 -0
  103. data/lib/treat/processors/chunkers/txt.rb +27 -0
  104. data/lib/treat/processors/parsers/enju.rb +214 -0
  105. data/lib/treat/processors/parsers/stanford.rb +60 -0
  106. data/lib/treat/processors/segmenters/punkt.rb +48 -0
  107. data/lib/treat/processors/segmenters/stanford.rb +45 -0
  108. data/lib/treat/processors/segmenters/tactful.rb +34 -0
  109. data/lib/treat/processors/tokenizers/macintyre.rb +76 -0
  110. data/lib/treat/processors/tokenizers/multilingual.rb +31 -0
  111. data/lib/treat/processors/tokenizers/perl.rb +96 -0
  112. data/lib/treat/processors/tokenizers/punkt.rb +42 -0
  113. data/lib/treat/processors/tokenizers/stanford.rb +33 -0
  114. data/lib/treat/processors/tokenizers/tactful.rb +59 -0
  115. data/lib/treat/proxies.rb +66 -0
  116. data/lib/treat/registrable.rb +26 -0
  117. data/lib/treat/resources.rb +10 -0
  118. data/lib/treat/resources/categories.rb +18 -0
  119. data/lib/treat/resources/delegates.rb +96 -0
  120. data/lib/treat/resources/dependencies.rb +0 -0
  121. data/lib/treat/resources/edges.rb +8 -0
  122. data/lib/treat/resources/formats.rb +23 -0
  123. data/lib/treat/resources/languages.rb +86 -0
  124. data/lib/treat/resources/languages.txt +504 -0
  125. data/lib/treat/resources/tags.rb +393 -0
  126. data/lib/treat/sugar.rb +43 -0
  127. data/lib/treat/tree.rb +174 -0
  128. data/lib/treat/utilities.rb +127 -0
  129. data/lib/treat/visitable.rb +27 -0
  130. data/test/profile.rb +2 -0
  131. data/test/tc_detectors.rb +27 -0
  132. data/test/tc_entity.rb +105 -0
  133. data/test/tc_extractors.rb +48 -0
  134. data/test/tc_formatters.rb +46 -0
  135. data/test/tc_inflectors.rb +39 -0
  136. data/test/tc_lexicalizers.rb +39 -0
  137. data/test/tc_processors.rb +36 -0
  138. data/test/tc_resources.rb +27 -0
  139. data/test/tc_treat.rb +64 -0
  140. data/test/tc_tree.rb +60 -0
  141. data/test/tests.rb +19 -0
  142. data/test/texts.rb +20 -0
  143. data/test/texts/english/long.html +24 -0
  144. data/test/texts/english/long.txt +22 -0
  145. data/test/texts/english/medium.txt +5 -0
  146. data/test/texts/english/short.txt +3 -0
  147. metadata +412 -0
data/lib/treat/processors/segmenters/stanford.rb
@@ -0,0 +1,45 @@
+ module Treat
+   module Processors
+     module Segmenters
+       # A sentence segmenter relying on the Stanford parser's
+       # DocumentPreprocessor class.
+       class Stanford
+         # Require the Ruby-Java bridge.
+         silently do
+           require 'rjb'
+           jar = "#{Treat.bin}/stanford_parser/stanford-parser.jar"
+           unless File.readable?(jar)
+             raise "Could not find Stanford parser JAR file in #{jar}." +
+                   " You may need to set Treat.bin to a custom value."
+           end
+           DocumentPreprocessor =
+             ::Rjb::import('edu.stanford.nlp.process.DocumentPreprocessor')
+           StringReader = ::Rjb::import('java.io.StringReader')
+         end
+         # Segment the entity into sentences, optionally
+         # adding each token as a child of its sentence.
+         def self.segment(entity, options = {})
+           sr = StringReader.new(entity.to_s)
+           sit = DocumentPreprocessor.new(sr).iterator
+           while sit.has_next
+             s = sit.next
+             str = s.to_string
+             str.gsub!(', ', ' ') # Fix - find better way to implode.
+             str.gsub!(' \'s', '\'s')
+             str.gsub!(' .', '.')
+             str.gsub!(' ,', ',')
+             str.gsub!(' ;', ';')
+             str.gsub!(/-[A-Z]{3}-/, '')
+             str = str[1..-2] # Strip the enclosing brackets.
+             sentence = Entities::Entity.from_string(str)
+             if options[:tokenize] == true
+               tit = s.iterator
+               while tit.has_next
+                 w = tit.next.word
+                 next if w[0] == '-' && w[-1] == '-'
+                 sentence << Entities::Entity.from_string(w)
+               end
+             end
+             entity << sentence
+           end
+           entity
+         end
+       end
+     end
+   end
+ end
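
A minimal usage sketch for this segmenter; the sample string and the String-to-entity conversion (provided by the proxies at the end of this changeset) are illustrative assumptions:

    # Build an entity from a plain string, then segment it into
    # sentences, also attaching each token as a sentence child.
    text = 'Mr. Smith arrived. Then he sat down.'.to_entity
    Treat::Processors::Segmenters::Stanford.segment(text, :tokenize => true)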
data/lib/treat/processors/segmenters/tactful.rb
@@ -0,0 +1,34 @@
+ module Treat
+   module Processors
+     module Segmenters
+       # An adapter for the 'tactful_tokenizer' gem, which
+       # detects sentence boundaries (the name is a misnomer;
+       # it isn't a tokenizer, but a sentence boundary detector).
+       # It uses a naive Bayesian statistical model, and is
+       # based on Splitta, but adds support for '?' and '!'
+       # as well as primitive handling of XHTML markup.
+       #
+       # Project website: https://github.com/SlyShy/Tactful_Tokenizer
+       class Tactful
+         # Require the 'tactful_tokenizer' gem.
+         silently { require 'tactful_tokenizer' }
+         # Somewhere in the depths of the code this is defined...
+         String.class_eval { undef :tokenize }
+         # Keep only one copy of the segmenter.
+         @@segmenter = nil
+         # Segment a text or zone into sentences
+         # using the 'tactful_tokenizer' gem.
+         #
+         # Options: none.
+         def self.segment(entity, options = {})
+           @@segmenter ||= TactfulTokenizer::Model.new
+           sentences = @@segmenter.tokenize_text(entity.to_s)
+           sentences.each do |sentence|
+             entity << Entities::Entity.from_string(sentence)
+           end
+           entity
+         end
+       end
+     end
+   end
+ end
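
For reference, a sketch of the gem call this adapter wraps; tokenize_text is the API used above, and the sample strings and output are illustrative:

    require 'tactful_tokenizer'
    model = TactfulTokenizer::Model.new
    # tokenize_text returns an array of sentence strings.
    model.tokenize_text('Hello world. This is a test!')
    # => ["Hello world.", "This is a test!"]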
data/lib/treat/processors/tokenizers/macintyre.rb
@@ -0,0 +1,76 @@
+ module Treat
+   module Processors
+     module Tokenizers
+       # A native rule-based tokenizer based on the one
+       # developed by Robert MacIntyre in 1995 for the Penn
+       # Treebank project. This tokenizer follows the
+       # conventions used by the Penn Treebank.
+       #
+       # Original script:
+       # http://www.cis.upenn.edu/~treebank/tokenizer.sed
+       #
+       # Copyright (c) 2004 UTIYAMA Masao <mutiyama@nict.go.jp>
+       # All rights reserved. This program is free software;
+       # you can redistribute it and/or modify it under the
+       # same terms as Ruby itself.
+       class Macintyre
+         # Tokenize the entity using a native rule-based algorithm.
+         def self.tokenize(entity, options = {})
+           raise 'Cannot tokenize an entity that already has children.' if entity.has_children?
+           chunks = self.split(entity.to_s)
+           chunks.each do |chunk|
+             next if chunk =~ /([[:space:]]+)/
+             entity << Treat::Entities::Entity.from_string(chunk)
+           end
+           entity
+         end
+         # Helper method to split the string into tokens.
+         def self.split(string)
+           s = " " + string + " "
+           s.gsub!(/\s+/," ")
+           s.gsub!(/(\s+)''/,'\1"')
+           s.gsub!(/(\s+)``/,'\1"')
+           s.gsub!(/''(\s+)/,'"\1')
+           s.gsub!(/``(\s+)/,'"\1')
+           s.gsub!(/ (['`]+)([^0-9].+) /,' \1 \2 ')
+           s.gsub!(/([ (\[{<])"/,'\1 `` ')
+           s.gsub!(/\.\.\./,' ... ')
+           s.gsub!(/[,;:@\#$%&]/,' \& ')
+           s.gsub!(/([^.])([.])([\])}>"']*)[ ]*$/,'\1 \2\3 ')
+           s.gsub!(/[?!]/,' \& ')
+           s.gsub!(/[\]\[(){}<>]/,' \& ')
+           s.gsub!(/--/,' -- ')
+           s.sub!(/$/,' ')
+           s.sub!(/^/,' ')
+           s.gsub!(/"/,' \'\' ')
+           s.gsub!(/([^'])' /,'\1 \' ')
+           s.gsub!(/'([sSmMdD]) /,' \'\1 ')
+           s.gsub!(/'ll /,' \'ll ')
+           s.gsub!(/'re /,' \'re ')
+           s.gsub!(/'ve /,' \'ve ')
+           s.gsub!(/n't /,' n\'t ')
+           s.gsub!(/'LL /,' \'LL ')
+           s.gsub!(/'RE /,' \'RE ')
+           s.gsub!(/'VE /,' \'VE ')
+           s.gsub!(/N'T /,' N\'T ')
+           s.gsub!(/ ([Cc])annot /,' \1an not ')
+           s.gsub!(/ ([Dd])'ye /,' \1\' ye ')
+           s.gsub!(/ ([Gg])imme /,' \1im me ')
+           s.gsub!(/ ([Gg])onna /,' \1on na ')
+           s.gsub!(/ ([Gg])otta /,' \1ot ta ')
+           s.gsub!(/ ([Ll])emme /,' \1em me ')
+           s.gsub!(/ ([Mm])ore'n /,' \1ore \'n ')
+           s.gsub!(/ '([Tt])is /,' \'\1 is ')
+           s.gsub!(/ '([Tt])was /,' \'\1 was ')
+           s.gsub!(/ ([Ww])anna /,' \1an na ')
+           while s.sub!(/(\s)([0-9]+) , ([0-9]+)(\s)/, '\1\2,\3\4')
+           end
+           s.gsub!(/\//, ' / ')
+           s.gsub!(/\s+/,' ')
+           s.strip!
+           s.split(/\s+/)
+         end
+       end
+     end
+   end
+ end
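
The split helper can be exercised directly; a quick illustration, where the expected output is an assumption read off the rules above:

    chunks = Treat::Processors::Tokenizers::Macintyre.split(
      %q{"We can't go," he said (quietly).})
    # Expected, given the rules above:
    # ["``", "We", "ca", "n't", "go", ",", "''", "he", "said",
    #  "(", "quietly", ")", "."]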
data/lib/treat/processors/tokenizers/multilingual.rb
@@ -0,0 +1,31 @@
+ module Treat
+   module Processors
+     module Tokenizers
+       # An adapter for the 'tokenizer' gem, which performs
+       # rule-based tokenizing of texts in English, German
+       # or French.
+       class Multilingual
+         # Hold one tokenizer per language.
+         @@tokenizers = {}
+         # Require the 'tokenizer' gem.
+         silently { require 'tokenizer' }
+         # Perform the tokenization of English, German or French text.
+         #
+         # Options:
+         # :language => (Symbol) Force a language for the tokenizer.
+         def self.tokenize(entity, options = {})
+           lang = options[:language] || entity.language
+           lang = Treat::Resources::Languages.find(lang, 1)
+           if @@tokenizers[lang].nil?
+             @@tokenizers[lang] = ::Tokenizer::Tokenizer.new(lang)
+           end
+           tokens = @@tokenizers[lang].tokenize(entity.to_s)
+           tokens.each do |token|
+             next if token =~ /([[:space:]]+)/
+             entity << Treat::Entities::Entity.from_string(token)
+           end
+           entity
+         end
+       end
+     end
+   end
+ end
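
A short sketch of calling the adapter with the :language option documented above; the sample sentence is illustrative, and the exact symbol format accepted depends on what Treat::Resources::Languages.find normalizes:

    # Force German rules instead of the entity's detected language.
    sentence = 'Ich ging gestern nach Hause.'.to_entity
    Treat::Processors::Tokenizers::Multilingual.tokenize(
      sentence, :language => :german)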
data/lib/treat/processors/tokenizers/perl.rb
@@ -0,0 +1,96 @@
+ # encoding: utf-8
+ module Treat
+   module Processors
+     module Tokenizers
+       # Tokenize the entity using a native rule-based algorithm.
+       # This tokenizer is a port of an unknown Perl module,
+       # lifted from the 'rbtagger' gem.
+       #
+       # Author: Todd A. Fisher
+       # This code is free to use under the terms of the MIT license.
+       #
+       # Original project website:
+       # https://github.com/taf2/rb-brill-tagger
+       class Perl
+         # Tokenize the entity using a native rule-based algorithm.
+         # Options: none.
+         def self.tokenize(entity, options = {})
+           # Normalize all whitespace.
+           text = entity.to_s.gsub(/\s+/,' ')
+           # Translate some common extended ASCII characters to quotes.
+           text.gsub!(/‘/,'`')
+           text.gsub!(/’/,"'")
+           text.gsub!(/“/,"``")
+           text.gsub!(/”/,"''")
+           # Attempt to get correct directional quotes.
+           # s{\"\b} { `` }g;
+           text.gsub!(/\"\b/,' `` ')
+           # s{\b\"} { '' }g;
+           text.gsub!(/\b\"/," '' ")
+           # s{\"(?=\s)} { '' }g;
+           text.gsub!(/\"(?=\s)/," '' ")
+           # s{\"} { `` }g;
+           text.gsub!(/\"/," `` ")
+           # Isolate ellipses.
+           # s{\.\.\.} { ... }g;
+           text.gsub!(/\.\.\./,' ... ')
+           # Isolate any embedded punctuation chars.
+           # s{([,;:\@\#\$\%&])} { $1 }g;
+           text.gsub!(/([,;:\@\#\$\%&])/, ' \1 ')
+           # Assume sentence tokenization has been done first,
+           # so split FINAL periods only.
+           # s/ ([^.]) \. ([\]\)\}\>\"\']*) [ \t]* $ /$1 .$2 /gx;
+           text.gsub!(/ ([^.]) \. ([\]\)\}\>\"\']*) [ \t]* $ /x, '\1 .\2 ')
+           # However, we may as well split ALL question marks and
+           # exclamation points, since they shouldn't have the
+           # abbrev.-marker ambiguity problem.
+           # s{([?!])} { $1 }g;
+           text.gsub!(/([?!])/, ' \1 ')
+           # Parentheses, brackets, etc.
+           # s{([\]\[\(\)\{\}\<\>])} { $1 }g;
+           text.gsub!(/([\]\[\(\)\{\}\<\>])/,' \1 ')
+           # s/(-{2,})/ $1 /g;
+           text.gsub!(/(-{2,})/,' \1 ')
+           # Add a space to the beginning and end of each line,
+           # to reduce the necessary number of regexps below.
+           # s/$/ /; s/^/ /;
+           text.gsub!(/$/," ")
+           text.gsub!(/^/," ")
+           # Possessive or close-single-quote.
+           # s/\([^\']\)\' /$1 \' /g;
+           text.gsub!(/([^\'])\' /,%q(\1 ' ))
+           # As in it's, I'm, we'd.
+           # s/\'([smd]) / \'$1 /ig;
+           text.gsub!(/\'([smd]) /i,%q( '\1 ))
+           # s/\'(ll|re|ve) / \'$1 /ig;
+           text.gsub!(/\'(ll|re|ve) /i,%q( '\1 ))
+           # s/n\'t / n\'t /ig;
+           text.gsub!(/n\'t /i," n't ")
+           # s/ (can)(not) / $1 $2 /ig;
+           text.gsub!(/ (can)(not) /i,' \1 \2 ')
+           # s/ (d\')(ye) / $1 $2 /ig;
+           text.gsub!(/ (d\')(ye) /i,' \1 \2 ')
+           # s/ (gim)(me) / $1 $2 /ig;
+           text.gsub!(/ (gim)(me) /i,' \1 \2 ')
+           # s/ (gon)(na) / $1 $2 /ig;
+           text.gsub!(/ (gon)(na) /i,' \1 \2 ')
+           # s/ (got)(ta) / $1 $2 /ig;
+           text.gsub!(/ (got)(ta) /i,' \1 \2 ')
+           # s/ (lem)(me) / $1 $2 /ig;
+           text.gsub!(/ (lem)(me) /i,' \1 \2 ')
+           # s/ (more)(\'n) / $1 $2 /ig;
+           text.gsub!(/ (more)(\'n) /i,' \1 \2 ')
+           # s/ (\'t)(is|was) / $1 $2 /ig;
+           text.gsub!(/ (\'t)(is|was) /i,' \1 \2 ')
+           # s/ (wan)(na) / $1 $2 /ig;
+           text.gsub!(/ (wan)(na) /i,' \1 \2 ')
+           tokens = text.split(/\s/)
+           tokens.each do |token|
+             next if token =~ /([[:space:]]+)/
+             entity << Treat::Entities::Entity.from_string(token)
+           end
+           entity
+         end
+       end
+     end
+   end
+ end
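
A small sketch of the quote and contraction handling on one input; the expected token stream is an assumption read off the substitutions above:

    entity = %q{"Don't," she said.}.to_entity
    Treat::Processors::Tokenizers::Perl.tokenize(entity)
    # Expected children, given the rules above:
    # `` Do n't , '' she said .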
data/lib/treat/processors/tokenizers/punkt.rb
@@ -0,0 +1,42 @@
+ module Treat
+   module Processors
+     module Tokenizers
+       # A tokenizer that was lifted from the 'punkt-segmenter'
+       # Ruby gem.
+       #
+       # This code follows the terms and conditions of the Apache
+       # License v2 (http://www.apache.org/licenses/LICENSE-2.0).
+       #
+       # Authors: Willy <willy@csse.unimelb.edu.au>
+       # (original Python port), Steven Bird
+       # <sb@csse.unimelb.edu.au> (additions),
+       # Edward Loper <edloper@gradient.cis.upenn.edu>
+       # (rewrite), Joel Nothman <jnothman@student.usyd.edu.au>
+       # (almost rewrite).
+       #
+       # Project website: https://github.com/lfcipriani/punkt-segmenter
+       class Punkt
+         SentEndChars = ['.', '?', '!']
+         ReSentEndChars = /[.?!]/
+         InternalPunctuation = [',', ':', ';']
+         ReBoundaryRealignment = /^["\')\]}]+?(?:\s+|(?=--)|$)/m
+         ReWordStart = /[^\(\"\`{\[:;&\#\*@\)}\]\-,]/
+         ReNonWordChars = /(?:[?!)\";}\]\*:@\'\({\[])/
+         ReMultiCharPunct = /(?:\-{2,}|\.{2,}|(?:\.\s){2,}\.)/
+         ReWordTokenizer = /#{ReMultiCharPunct}|(?=#{ReWordStart})\S+?(?=\s|$|#{ReNonWordChars}|#{ReMultiCharPunct}|,(?=$|\s|#{ReNonWordChars}|#{ReMultiCharPunct}))|\S/
+         RePeriodContext = /\S*#{ReSentEndChars}(?=(?<after_tok>#{ReNonWordChars}|\s+(?<next_tok>\S+)))/
+         # Tokenize the text using the algorithm lifted from
+         # the Punkt tokenizer.
+         #
+         # Options: none.
+         def self.tokenize(entity, options = {})
+           entity.to_s.scan(ReWordTokenizer).each do |token|
+             entity << Treat::Entities::Entity.from_string(token)
+           end
+           entity
+         end
+       end
+     end
+   end
+ end
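
The word-tokenizer regexp can also be exercised on its own; a sketch, where the expected output is an assumption derived from the sub-patterns above:

    re = Treat::Processors::Tokenizers::Punkt::ReWordTokenizer
    'Hello, world... (really?)'.scan(re)
    # => ["Hello", ",", "world", "...", "(", "really", "?", ")"]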
data/lib/treat/processors/tokenizers/stanford.rb
@@ -0,0 +1,33 @@
+ module Treat
+   module Processors
+     module Tokenizers
+       # A tokenizer relying on the Stanford parser's
+       # PTBTokenizer class.
+       class Stanford
+         # Require the Ruby-Java bridge.
+         silently do
+           require 'rjb'
+           # Load the Stanford parser JAR file.
+           jar = "#{Treat.bin}/stanford_parser/stanford-parser.jar"
+           unless File.readable?(jar)
+             raise "Could not find Stanford parser JAR file in #{jar}." +
+                   " You may need to set Treat.bin to a custom value."
+           end
+           # Load the Stanford parser classes.
+           PTBTokenizer =
+             ::Rjb::import('edu.stanford.nlp.process.PTBTokenizer')
+           CoreLabelTokenFactory =
+             ::Rjb::import('edu.stanford.nlp.process.CoreLabelTokenFactory')
+           StringReader = ::Rjb::import('java.io.StringReader')
+         end
+         # Tokenize the entity using the Penn Treebank tokenizer
+         # shipped with the Stanford parser.
+         def self.tokenize(entity, options = {})
+           ptbt = PTBTokenizer.new(
+             StringReader.new(entity.to_s),
+             CoreLabelTokenFactory.new, '')
+           while ptbt.has_next
+             w = ptbt.next.word
+             next if w[0] == '-' && w[-1] == '-'
+             entity << Treat::Entities::Entity.from_string(w)
+           end
+           entity
+         end
+       end
+     end
+   end
+ end
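
A short usage sketch; the path is a placeholder, and the Treat.bin setter is implied by the error message above:

    Treat.bin = '/opt/nlp/bin' # must contain stanford_parser/stanford-parser.jar
    entity = 'Mrs. Robinson, please look at me.'.to_entity
    Treat::Processors::Tokenizers::Stanford.tokenize(entity)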
data/lib/treat/processors/tokenizers/tactful.rb
@@ -0,0 +1,59 @@
+ module Treat
+   module Processors
+     module Tokenizers
+       # A tokenizer class lifted from the 'tactful-tokenizer' gem.
+       #
+       # Copyright © 2010 Matthew Bunday. All rights reserved.
+       # Released under the GNU GPL v3. Modified by Louis Mullie.
+       #
+       # Project website: https://github.com/SlyShy/Tactful_Tokenizer
+       class Tactful
+         ReTokenize = [
+           # Uniform quotes.
+           [/''|``/, '"'],
+           # Separate punctuation from words.
+           [/(^|\s)(')/, '\1\2'],
+           [/(?=[\("`{\[:;&#*@\.])(.)/, '\1 '],
+           [/(.)(?=[?!\)";}\]*:@\.'])|(?=[\)}\]])(.)|(.)(?=[({\[])|((^|\s)-)(?=[^-])/, '\1 '],
+           # Treat double-hyphen as a single token.
+           [/([^-])(--+)([^-])/, '\1 \2 \3'],
+           [/(\s|^)(,)(?=(\S))/, '\1\2 '],
+           # Only separate a comma if a space follows.
+           [/(.)(,)(\s|$)/, '\1 \2\3'],
+           # Combine dots separated by whitespace into a single token.
+           [/\.\s\.\s\./, '...'],
+           # Separate "No.6".
+           [/([\W]\.)(\d+)/, '\1 \2'],
+           # Separate words from ellipses.
+           [/([^\.]|^)(\.{2,})(.?)/, '\1 \2 \3'],
+           [/(^|\s)(\.{2,})([^\.\s])/, '\1\2 \3'],
+           [/(^|\s)(\.{2,})([^\.\s])/, '\1 \2\3'],
+           ##### Some additional fixes.
+           # Fix %, $, &.
+           [/(\d)%/, '\1 %'],
+           [/\$(\.?\d)/, '$ \1'],
+           [/(\W)& (\W)/, '\1&\2'],
+           [/(\W\W+)&(\W\W+)/, '\1 & \2'],
+           # Fix (n 't) -> ( n't).
+           [/n 't( |$)/, " n't\\1"],
+           [/N 'T( |$)/, " N'T\\1"],
+           # Treebank tokenizer special words.
+           [/([Cc])annot/, '\1an not']
+         ]
+         # Tokenize the entity using a rule-based algorithm
+         # lifted from the 'tactful-tokenizer' gem.
+         def self.tokenize(entity, options = {})
+           s = entity.to_s
+           ReTokenize.each do |rule, replacement|
+             s.gsub!(rule, replacement)
+           end
+           s.split(' ').each do |token|
+             entity << Entities::Entity.from_string(token)
+           end
+           entity
+         end
+       end
+     end
+   end
+ end
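
The rewrite rules can also be run standalone on a raw string; a sketch, with the sample input illustrative:

    s = "She said ``hello''--then left."
    Treat::Processors::Tokenizers::Tactful::ReTokenize.each do |rule, repl|
      s.gsub!(rule, repl)
    end
    tokens = s.split(' ')
    # The backquote/apostrophe pairs are first normalized to '"',
    # and the double hyphen is isolated as its own token.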
data/lib/treat/proxies.rb
@@ -0,0 +1,66 @@
+ module Treat
+   # Proxies install Treat functions on Ruby core classes.
+   module Proxies
+     # The module proxy provides functionality common
+     # to the different types of proxies.
+     module Proxy
+       # Dispatch any Treat function called on the proxied
+       # object to the corresponding entity.
+       def method_missing(sym, *args, &block)
+         if Categories.have_method?(sym)
+           to_entity.send(sym, *args)
+         else
+           super(sym, *args, &block)
+         end
+       end
+       def to_entity(builder = nil)
+         Treat::Entities::Unknown(self.to_s)
+       end
+     end
+     # Install Treat functions on String objects.
+     module StringProxy
+       include Proxy
+       # Save the string to the specified file.
+       def save(file)
+         File.open(file, 'w') { |f| f.write(self) }
+       end
+       # Return the entity corresponding to the string.
+       def to_entity
+         Treat::Entities::Entity.from_string(self.to_s)
+       end
+     end
+     # Install Treat functions on Numeric objects.
+     module NumericProxy
+       include Proxy
+       # Return the entity corresponding to the number.
+       def to_entity(builder = nil)
+         Treat::Entities::Entity.from_numeric(self)
+       end
+     end
+     # Install Treat functions on Array objects.
+     module ArrayProxy
+       include Proxy
+       # Dispatch a Treat function to each element of the
+       # array, converting elements to entities as needed.
+       def method_missing(sym, *args, &block)
+         if Categories.have_method?(sym)
+           array = []
+           each do |element|
+             if element.is_a? Treat::Entities::Entity
+               array << element.send(sym, *args)
+             else
+               unless element.is_a?(Numeric) ||
+                      element.is_a?(String) || element.is_a?(Array)
+                 raise Treat::Exception, "Cannot convert object of type " +
+                                         "#{element.class} into an entity."
+               end
+               array << element.to_entity.send(sym, *args)
+             end
+           end
+           array
+         else
+           super(sym, *args, &block)
+         end
+       end
+     end
+     # Include the proxies in the core classes.
+     String.class_eval { include StringProxy }
+     Numeric.class_eval { include NumericProxy }
+     Array.class_eval { include ArrayProxy }
+   end
+ end
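
With the proxies included, Treat functions hang directly off core objects; a small sketch, assuming 'tokenize' is registered as a Treat function by the groups in this gem:

    # String proxy: the string is converted to an entity,
    # then the call is dispatched via method_missing.
    'A sentence to tokenize.'.tokenize
    # Array proxy: the call is mapped over each convertible element.
    ['First sentence.', 'Second sentence.'].tokenize
    # Helper added by the String proxy:
    'some output'.save('/tmp/output.txt')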