treat 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/INSTALL +0 -0
- data/LICENSE +28 -0
- data/README +0 -0
- data/TODO +67 -0
- data/bin/INFO +1 -0
- data/examples/benchmark.rb +81 -0
- data/examples/keywords.rb +60 -0
- data/examples/texts/bugged_out.txt +26 -0
- data/examples/texts/half_cocked_basel.txt +16 -0
- data/examples/texts/hedge_funds.txt +24 -0
- data/examples/texts/hose_and_dry.txt +19 -0
- data/examples/texts/hungarys_troubles.txt +46 -0
- data/examples/texts/indias_slowdown.txt +15 -0
- data/examples/texts/merkozy_rides_again.txt +24 -0
- data/examples/texts/prada_is_not_walmart.txt +9 -0
- data/examples/texts/republican_nomination.txt +26 -0
- data/examples/texts/to_infinity_and_beyond.txt +15 -0
- data/lib/treat.rb +91 -0
- data/lib/treat/buildable.rb +115 -0
- data/lib/treat/categories.rb +29 -0
- data/lib/treat/category.rb +28 -0
- data/lib/treat/delegatable.rb +90 -0
- data/lib/treat/detectors.rb +28 -0
- data/lib/treat/detectors/encoding/native.rb +12 -0
- data/lib/treat/detectors/encoding/r_chardet19.rb +24 -0
- data/lib/treat/detectors/format/file.rb +36 -0
- data/lib/treat/detectors/language/language_detector.rb +19 -0
- data/lib/treat/detectors/language/what_language.rb +29 -0
- data/lib/treat/entities.rb +52 -0
- data/lib/treat/entities/collection.rb +19 -0
- data/lib/treat/entities/constituents.rb +15 -0
- data/lib/treat/entities/document.rb +11 -0
- data/lib/treat/entities/entity.rb +242 -0
- data/lib/treat/entities/sentence.rb +8 -0
- data/lib/treat/entities/text.rb +7 -0
- data/lib/treat/entities/tokens.rb +37 -0
- data/lib/treat/entities/zones.rb +17 -0
- data/lib/treat/exception.rb +5 -0
- data/lib/treat/extractors.rb +41 -0
- data/lib/treat/extractors/key_sentences/topics_frequency.rb +49 -0
- data/lib/treat/extractors/named_entity/abner.rb +20 -0
- data/lib/treat/extractors/named_entity/stanford.rb +174 -0
- data/lib/treat/extractors/statistics/frequency.rb +22 -0
- data/lib/treat/extractors/statistics/frequency_of.rb +17 -0
- data/lib/treat/extractors/statistics/position_in.rb +13 -0
- data/lib/treat/extractors/statistics/transition_matrix.rb +105 -0
- data/lib/treat/extractors/statistics/transition_probability.rb +53 -0
- data/lib/treat/extractors/time/chronic.rb +12 -0
- data/lib/treat/extractors/time/native.rb +12 -0
- data/lib/treat/extractors/time/nickel.rb +45 -0
- data/lib/treat/extractors/topic_words/lda.rb +71 -0
- data/lib/treat/extractors/topic_words/lda/data.dat +46 -0
- data/lib/treat/extractors/topic_words/lda/wiki.yml +121 -0
- data/lib/treat/extractors/topics/reuters.rb +91 -0
- data/lib/treat/extractors/topics/reuters/industry.xml +2717 -0
- data/lib/treat/extractors/topics/reuters/region.xml +13585 -0
- data/lib/treat/extractors/topics/reuters/topics.xml +17977 -0
- data/lib/treat/feature.rb +53 -0
- data/lib/treat/formatters.rb +44 -0
- data/lib/treat/formatters/cleaners/html.rb +17 -0
- data/lib/treat/formatters/readers/autoselect.rb +35 -0
- data/lib/treat/formatters/readers/gocr.rb +24 -0
- data/lib/treat/formatters/readers/html.rb +13 -0
- data/lib/treat/formatters/readers/ocropus.rb +31 -0
- data/lib/treat/formatters/readers/pdf.rb +17 -0
- data/lib/treat/formatters/readers/txt.rb +15 -0
- data/lib/treat/formatters/serializers/xml.rb +48 -0
- data/lib/treat/formatters/serializers/yaml.rb +15 -0
- data/lib/treat/formatters/serializers/yaml/helper.rb +96 -0
- data/lib/treat/formatters/unserializers/autoselect.rb +19 -0
- data/lib/treat/formatters/unserializers/xml.rb +79 -0
- data/lib/treat/formatters/unserializers/yaml.rb +15 -0
- data/lib/treat/formatters/visualizers/dot.rb +73 -0
- data/lib/treat/formatters/visualizers/html.rb +12 -0
- data/lib/treat/formatters/visualizers/inspect.rb +16 -0
- data/lib/treat/formatters/visualizers/short_value.rb +14 -0
- data/lib/treat/formatters/visualizers/standoff.rb +41 -0
- data/lib/treat/formatters/visualizers/tree.rb +28 -0
- data/lib/treat/formatters/visualizers/txt.rb +31 -0
- data/lib/treat/group.rb +96 -0
- data/lib/treat/inflectors.rb +50 -0
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +45 -0
- data/lib/treat/inflectors/conjugators/linguistics.rb +30 -0
- data/lib/treat/inflectors/declensors/en.rb +18 -0
- data/lib/treat/inflectors/declensors/linguistics.rb +30 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma.rb +12 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +213 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +68 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +6 -0
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +21 -0
- data/lib/treat/inflectors/stemmers/porter.rb +158 -0
- data/lib/treat/inflectors/stemmers/porter_c.rb +23 -0
- data/lib/treat/inflectors/stemmers/uea.rb +30 -0
- data/lib/treat/lexicalizers.rb +49 -0
- data/lib/treat/lexicalizers/category/from_tag.rb +30 -0
- data/lib/treat/lexicalizers/linkages/naive.rb +63 -0
- data/lib/treat/lexicalizers/synsets/rita_wn.rb +23 -0
- data/lib/treat/lexicalizers/synsets/wordnet.rb +72 -0
- data/lib/treat/lexicalizers/tag/brill.rb +101 -0
- data/lib/treat/lexicalizers/tag/lingua.rb +114 -0
- data/lib/treat/lexicalizers/tag/stanford.rb +86 -0
- data/lib/treat/processors.rb +45 -0
- data/lib/treat/processors/chunkers/txt.rb +27 -0
- data/lib/treat/processors/parsers/enju.rb +214 -0
- data/lib/treat/processors/parsers/stanford.rb +60 -0
- data/lib/treat/processors/segmenters/punkt.rb +48 -0
- data/lib/treat/processors/segmenters/stanford.rb +45 -0
- data/lib/treat/processors/segmenters/tactful.rb +34 -0
- data/lib/treat/processors/tokenizers/macintyre.rb +76 -0
- data/lib/treat/processors/tokenizers/multilingual.rb +31 -0
- data/lib/treat/processors/tokenizers/perl.rb +96 -0
- data/lib/treat/processors/tokenizers/punkt.rb +42 -0
- data/lib/treat/processors/tokenizers/stanford.rb +33 -0
- data/lib/treat/processors/tokenizers/tactful.rb +59 -0
- data/lib/treat/proxies.rb +66 -0
- data/lib/treat/registrable.rb +26 -0
- data/lib/treat/resources.rb +10 -0
- data/lib/treat/resources/categories.rb +18 -0
- data/lib/treat/resources/delegates.rb +96 -0
- data/lib/treat/resources/dependencies.rb +0 -0
- data/lib/treat/resources/edges.rb +8 -0
- data/lib/treat/resources/formats.rb +23 -0
- data/lib/treat/resources/languages.rb +86 -0
- data/lib/treat/resources/languages.txt +504 -0
- data/lib/treat/resources/tags.rb +393 -0
- data/lib/treat/sugar.rb +43 -0
- data/lib/treat/tree.rb +174 -0
- data/lib/treat/utilities.rb +127 -0
- data/lib/treat/visitable.rb +27 -0
- data/test/profile.rb +2 -0
- data/test/tc_detectors.rb +27 -0
- data/test/tc_entity.rb +105 -0
- data/test/tc_extractors.rb +48 -0
- data/test/tc_formatters.rb +46 -0
- data/test/tc_inflectors.rb +39 -0
- data/test/tc_lexicalizers.rb +39 -0
- data/test/tc_processors.rb +36 -0
- data/test/tc_resources.rb +27 -0
- data/test/tc_treat.rb +64 -0
- data/test/tc_tree.rb +60 -0
- data/test/tests.rb +19 -0
- data/test/texts.rb +20 -0
- data/test/texts/english/long.html +24 -0
- data/test/texts/english/long.txt +22 -0
- data/test/texts/english/medium.txt +5 -0
- data/test/texts/english/short.txt +3 -0
- metadata +412 -0
data/lib/treat/processors/segmenters/stanford.rb
@@ -0,0 +1,45 @@
+module Treat
+  module Processors
+    module Segmenters
+      class Stanford
+        # Require the Ruby-Java bridge.
+        silently do
+          require 'rjb'
+          jar = "#{Treat.bin}/stanford_parser/stanford-parser.jar"
+          unless File.readable?(jar)
+            raise "Could not find stanford parser JAR file in #{jar}." +
+                  " You may need to set Treat.bin to a custom value."
+          end
+          DocumentPreprocessor =
+            ::Rjb::import('edu.stanford.nlp.process.DocumentPreprocessor')
+          StringReader = ::Rjb::import('java.io.StringReader')
+        end
+        def self.segment(entity, options = {})
+          sr = StringReader.new(entity.to_s)
+          sit = DocumentPreprocessor.new(sr).iterator
+          while sit.has_next
+            s = sit.next; str = s.to_string # keep the sentence for optional tokenization below
+            str.gsub!(', ', ' ') # Fix - find better way to implode.
+            str.gsub!(' \'s', '\'s')
+            str.gsub!(' .', '.')
+            str.gsub!(' ,', ',')
+            str.gsub!(' ;', ';')
+            str.gsub!(/-[A-Z]{3}-/, '')
+            str = str[1..-2]
+            sentence = Entities::Entity.from_string(str)
+            if options[:tokenize] == true
+              tit = s.iterator
+              while tit.has_next
+                w = tit.next.word
+                next if w[0] == '-' && w[-1] == '-'
+                sentence << Entities::Entity.from_string(w)
+              end
+            end
+            entity << sentence
+          end
+          entity
+        end
+      end
+    end
+  end
+end
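
A minimal usage sketch (editorial, not part of the diff; it assumes a Java runtime, the 'rjb' gem, and the Stanford Parser JAR under Treat.bin, and uses the String#to_entity proxy defined later in this diff):

    require 'treat'
    text = "Mr. Smith arrived. Then he sat down.".to_entity
    # Returns the entity with one sentence child per detected boundary;
    # :tokenize => true also appends one token child per word.
    Treat::Processors::Segmenters::Stanford.segment(text, :tokenize => true)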
data/lib/treat/processors/segmenters/tactful.rb
@@ -0,0 +1,34 @@
+module Treat
+  module Processors
+    module Segmenters
+      # An adapter for the 'tactful_tokenizer' gem, which
+      # detects sentence boundaries (the name is a misnomer;
+      # it isn't a tokenizer, but a sentence boundary detector).
+      # It uses a Naive Bayesian statistical model, and is
+      # based on Splitta, but has support for '?' and '!'
+      # as well as primitive handling of XHTML markup.
+      #
+      # Project website:
+      class Tactful
+        # Require the 'tactful_tokenizer' gem.
+        silently { require 'tactful_tokenizer' }
+        # Somewhere in the depths of the code this is defined...
+        String.class_eval { undef :tokenize }
+        # Keep only one copy of the segmenter.
+        @@segmenter = nil
+        # Segment a text or zone into sentences
+        # using the 'tactful_tokenizer' gem.
+        #
+        # Options: none.
+        def self.segment(entity, options = {})
+          @@segmenter ||= TactfulTokenizer::Model.new
+          sentences = @@segmenter.tokenize_text(entity.to_s)
+          sentences.each do |sentence|
+            entity << Entities::Entity.from_string(sentence)
+          end
+          entity
+        end
+      end
+    end
+  end
+end
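
The adapter memoizes a single model in @@segmenter because loading the trained Naive Bayes data is the expensive step. A sketch of the underlying gem call it wraps (the output shape is inferred from the adapter's sentences.each loop above):

    require 'tactful_tokenizer'
    model = TactfulTokenizer::Model.new
    model.tokenize_text("Hi there. How are you?")
    # => ["Hi there.", "How are you?"]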
data/lib/treat/processors/tokenizers/macintyre.rb
@@ -0,0 +1,76 @@
+module Treat
+  module Processors
+    module Tokenizers
+      # A native rule-based tokenizer based on the one
+      # developed by Robert MacIntyre in 1995 for the Penn
+      # Treebank project. This tokenizer follows the
+      # conventions used by the Penn Treebank.
+      #
+      # Original script:
+      # http://www.cis.upenn.edu/~treebank/tokenizer.sed
+      #
+      # Copyright (c) 2004 UTIYAMA Masao <mutiyama@nict.go.jp>
+      # All rights reserved. This program is free software;
+      # you can redistribute it and/or modify it under the
+      # same terms as Ruby itself.
+      class Macintyre
+        # Tokenize the entity using a native rule-based algorithm.
+        def self.tokenize(entity, options = {})
+          raise 'Cannot tokenize an entity that already has children.' if entity.has_children?
+          chunks = self.split(entity.to_s)
+          chunks.each do |chunk|
+            next if chunk =~ /([[:space:]]+)/
+            entity << Treat::Entities::Entity.from_string(chunk)
+          end
+          entity
+        end
+        # Helper method to split the string into tokens.
+        def self.split(string)
+          s = " " + string + " "
+          s.gsub!(/\s+/," ")
+          s.gsub!(/(\s+)''/,'\1"')
+          s.gsub!(/(\s+)``/,'\1"')
+          s.gsub!(/''(\s+)/,'"\1')
+          s.gsub!(/``(\s+)/,'"\1')
+          s.gsub!(/ (['`]+)([^0-9].+) /,' \1 \2 ')
+          s.gsub!(/([ (\[{<])"/,'\1 `` ')
+          s.gsub!(/\.\.\./,' ... ')
+          s.gsub!(/[,;:@\#$%&]/,' \& ')
+          s.gsub!(/([^.])([.])([\])}>"']*)[ ]*$/,'\1 \2\3 ')
+          s.gsub!(/[?!]/,' \& ')
+          s.gsub!(/[\]\[(){}<>]/,' \& ')
+          s.gsub!(/--/,' -- ')
+          s.sub!(/$/,' ')
+          s.sub!(/^/,' ')
+          s.gsub!(/"/,' \'\' ')
+          s.gsub!(/([^'])' /,'\1 \' ')
+          s.gsub!(/'([sSmMdD]) /,' \'\1 ')
+          s.gsub!(/'ll /,' \'ll ')
+          s.gsub!(/'re /,' \'re ')
+          s.gsub!(/'ve /,' \'ve ')
+          s.gsub!(/n't /,' n\'t ')
+          s.gsub!(/'LL /,' \'LL ')
+          s.gsub!(/'RE /,' \'RE ')
+          s.gsub!(/'VE /,' \'VE ')
+          s.gsub!(/N'T /,' N\'T ')
+          s.gsub!(/ ([Cc])annot /,' \1an not ')
+          s.gsub!(/ ([Dd])'ye /,' \1\' ye ')
+          s.gsub!(/ ([Gg])imme /,' \1im me ')
+          s.gsub!(/ ([Gg])onna /,' \1on na ')
+          s.gsub!(/ ([Gg])otta /,' \1ot ta ')
+          s.gsub!(/ ([Ll])emme /,' \1em me ')
+          s.gsub!(/ ([Mm])ore'n /,' \1ore \'n ')
+          s.gsub!(/ '([Tt])is /,' \'\1 is ')
+          s.gsub!(/ '([Tt])was /,' \'\1 was ')
+          s.gsub!(/ ([Ww])anna /,' \1an na ')
+          while s.sub!(/(\s)([0-9]+) , ([0-9]+)(\s)/, '\1\2,\3\4')
+          end
+          s.gsub!(/\//, ' / ')
+          s.gsub!(/\s+/,' ')
+          s.strip!
+          s.split(/\s+/)
+        end
+      end
+    end
+  end
+end
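
The split helper is pure string manipulation, so it can be exercised without any entity setup; per the Penn Treebank rules above, contractions come apart at the clitic and final periods are detached (the expected output shown is an editorial reading of the rules, not from the diff):

    Treat::Processors::Tokenizers::Macintyre.split("I can't go.")
    # => ["I", "ca", "n't", "go", "."]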
data/lib/treat/processors/tokenizers/multilingual.rb
@@ -0,0 +1,31 @@
+module Treat
+  module Processors
+    module Tokenizers
+      # An adapter for the 'tokenizer' gem, which performs
+      # rule-based tokenizing of texts in English, German
+      # or French.
+      class Multilingual
+        # Hold one tokenizer per language.
+        @@tokenizers = {}
+        # Require the 'tokenizer' gem.
+        silently { require 'tokenizer' }
+        # Perform the tokenization of English, German or French text.
+        # Options:
+        # :language => (Symbol) Force a language for the tokenizer.
+        def self.tokenize(entity, options = {})
+          lang = options[:language] ? options[:language] : entity.language
+          lang = Treat::Resources::Languages.find(lang, 1)
+          if @@tokenizers[lang].nil?
+            @@tokenizers[lang] = ::Tokenizer::Tokenizer.new(lang)
+          end
+          tokens = @@tokenizers[lang].tokenize(entity.to_s)
+          tokens.each do |token|
+            next if token =~ /([[:space:]]+)/
+            entity << Treat::Entities::Entity.from_string(token)
+          end
+          entity
+        end
+      end
+    end
+  end
+end
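
A usage sketch for the adapter (editorial; the exact symbol accepted by :language depends on Treat::Resources::Languages.find, so :german here is an assumption):

    require 'treat'
    sentence = "Ich gehe nach Hause.".to_entity
    # Forces German rules instead of detecting the entity's language;
    # the adapter caches one ::Tokenizer instance per language.
    Treat::Processors::Tokenizers::Multilingual.tokenize(sentence, :language => :german)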
data/lib/treat/processors/tokenizers/perl.rb
@@ -0,0 +1,96 @@
+# encoding: utf-8
+module Treat
+  module Processors
+    module Tokenizers
+      # Tokenize the entity using a native rule-based algorithm.
+      # This tokenizer is a port from an unknown Perl module,
+      # which I have lifted from the 'rbtagger' gem.
+      #
+      # Author: Todd A. Fisher
+      # This code is free to use under the terms of the MIT license.
+      #
+      # Original project website:
+      # https://github.com/taf2/rb-brill-tagger
+      class Perl
+        # Tokenize the entity using a native rule-based algorithm.
+        # Options: none.
+        def self.tokenize(entity, options = {})
+          # Normalize all whitespace
+          text = entity.to_s.gsub(/\s+/,' ')
+          # Translate some common extended ascii characters to quotes
+          text.gsub!(/‘/,'`')
+          text.gsub!(/’/,"'")
+          text.gsub!(/“/,"``")
+          text.gsub!(/”/,"''")
+          # Attempt to get correct directional quotes
+          # s{\"\b} { `` }g;
+          text.gsub!(/\"\b/,' `` ')
+          # s{\b\"} { '' }g;
+          text.gsub!(/\b\"/," '' ")
+          #s{\"(?=\s)} { '' }g;
+          text.gsub!(/\"(?=\s)/," '' ")
+          #s{\"} { `` }g;
+          text.gsub!(/\"/," `` ")
+          # Isolate ellipses
+          # s{\.\.\.} { ... }g;
+          text.gsub!(/\.\.\./,' ... ')
+          # Isolate any embedded punctuation chars
+          # s{([,;:\@\#\$\%&])} { $1 }g;
+          text.gsub!(/([,;:\@\#\$\%&])/, ' \1 ')
+          # Assume sentence tokenization has been done first, so split FINAL
+          # periods only.
+          # s/ ([^.]) \. ([\]\)\}\>\"\']*) [ \t]* $ /$1 .$2 /gx;
+          text.gsub!(/ ([^.]) \. ([\]\)\}\>\"\']*) [ \t]* $ /x, '\1 .\2 ')
+          # however, we may as well split ALL question marks and exclamation points,
+          # since they shouldn't have the abbrev.-marker ambiguity problem
+          #s{([?!])} { $1 }g;
+          text.gsub!(/([?!])/, ' \1 ')
+          # parentheses, brackets, etc.
+          #s{([\]\[\(\)\{\}\<\>])} { $1 }g;
+          text.gsub!(/([\]\[\(\)\{\}\<\>])/,' \1 ')
+          #s/(-{2,})/ $1 /g;
+          text.gsub!(/(-{2,})/,' \1 ')
+          # Add a space to the beginning and end of each line, to reduce
+          # necessary number of regexps below.
+          #s/$/ /;
+          text.gsub!(/$/," ")
+          #s/^/ /;
+          text.gsub!(/^/," ")
+          # possessive or close-single-quote
+          #s/([^\'])\' /$1 \' /g;
+          text.gsub!(/([^\'])\' /,%q(\1 ' ))
+          # as in it's, I'm, we'd
+          #s/\'([smd]) / \'$1 /ig;
+          text.gsub!(/\'([smd]) /i,%q( '\1 ))
+          #s/\'(ll|re|ve) / \'$1 /ig;
+          text.gsub!(/\'(ll|re|ve) /i,%q( '\1 ))
+          #s/n\'t / n\'t /ig;
+          text.gsub!(/n\'t /i," n't ")
+          #s/ (can)(not) / $1 $2 /ig;
+          text.gsub!(/ (can)(not) /i,' \1 \2 ')
+          #s/ (d\')(ye) / $1 $2 /ig;
+          text.gsub!(/ (d\')(ye) /i,' \1 \2 ')
+          #s/ (gim)(me) / $1 $2 /ig;
+          text.gsub!(/ (gim)(me) /i,' \1 \2 ')
+          #s/ (gon)(na) / $1 $2 /ig;
+          text.gsub!(/ (gon)(na) /i,' \1 \2 ')
+          #s/ (got)(ta) / $1 $2 /ig;
+          text.gsub!(/ (got)(ta) /i,' \1 \2 ')
+          #s/ (lem)(me) / $1 $2 /ig;
+          text.gsub!(/ (lem)(me) /i,' \1 \2 ')
+          #s/ (more)(\'n) / $1 $2 /ig;
+          text.gsub!(/ (more)(\'n) /i,' \1 \2 ')
+          #s/ (\'t)(is|was) / $1 $2 /ig;
+          text.gsub!(/ (\'t)(is|was) /i,' \1 \2 ')
+          #s/ (wan)(na) / $1 $2 /ig;
+          text.gsub!(/ (wan)(na) /i,' \1 \2 ')
+          tokens = text.split(/\s/)
+          tokens.each do |token|
+            next if token =~ /([[:space:]]+)/
+            entity << Treat::Entities::Entity.from_string(token)
+          end
+        end
+      end
+    end
+  end
+end
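
The distinctive step in this port is mapping straight double quotes onto the Penn Treebank `` / '' convention; the first few rules can be traced in isolation (an editorial sketch, not from the diff):

    text = '"Hi," she said.'
    text.gsub!(/\"\b/, ' `` ')     # opening quote before a word
    text.gsub!(/\b\"/, " '' ")     # closing quote right after a word
    text.gsub!(/\"(?=\s)/, " '' ") # closing quote before whitespace
    text # => roughly " `` Hi, ''  she said."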
data/lib/treat/processors/tokenizers/punkt.rb
@@ -0,0 +1,42 @@
+module Treat
+  module Processors
+    module Tokenizers
+      # A tokenizer that was lifted from the 'punkt-segmenter'
+      # Ruby gem.
+      #
+      # This code follows the terms and conditions of Apache
+      # License v2 (http://www.apache.org/licenses/LICENSE-2.0)
+      #
+      # Authors: Willy <willy@csse.unimelb.edu.au>
+      # (original Python port), Steven Bird
+      # <sb@csse.unimelb.edu.au> (additions),
+      # Edward Loper <edloper@gradient.cis.upenn.edu>
+      # (rewrite), Joel Nothman <jnothman@student.usyd.edu.au>
+      # (almost rewrite).
+      #
+      # Project website: https://github.com/lfcipriani/punkt-segmenter
+      class Punkt
+        SentEndChars = ['.', '?', '!']
+        ReSentEndChars = /[.?!]/
+        InternalPunctuation = [',', ':', ';']
+        ReBoundaryRealignment = /^["\')\]}]+?(?:\s+|(?=--)|$)/m
+        ReWordStart = /[^\(\"\`{\[:;&\#\*@\)}\]\-,]/
+        ReNonWordChars = /(?:[?!)\";}\]\*:@\'\({\[])/
+        ReMultiCharPunct = /(?:\-{2,}|\.{2,}|(?:\.\s){2,}\.)/
+        ReWordTokenizer = /#{ReMultiCharPunct}|(?=#{ReWordStart})\S+?(?=\s|$|#{ReNonWordChars}|#{ReMultiCharPunct}|,(?=$|\s|#{ReNonWordChars}|#{ReMultiCharPunct}))|\S/
+        RePeriodContext = /\S*#{ReSentEndChars}(?=(?<after_tok>#{ReNonWordChars}|\s+(?<next_tok>\S+)))/
+        # Tokenize the text using the algorithm lifted from
+        # the Punkt tokenizer.
+        #
+        # Options: none.
+        def self.tokenize(entity, options = {})
+          entity.to_s.scan(ReWordTokenizer).each do |token|
+            # puts token
+            entity << Treat::Entities::Entity.from_string(token)
+          end
+          entity
+        end
+      end
+    end
+  end
+end
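
ReWordTokenizer can be applied directly with String#scan, which is all self.tokenize does; the expected behaviour (an editorial reading of the regex, not from the diff) is that punctuation splits off as its own tokens:

    "Hello, world!".scan(Treat::Processors::Tokenizers::Punkt::ReWordTokenizer)
    # => ["Hello", ",", "world", "!"]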
data/lib/treat/processors/tokenizers/stanford.rb
@@ -0,0 +1,33 @@
+module Treat
+  module Processors
+    module Tokenizers
+      class Stanford
+        # Require the Ruby-Java bridge.
+        silently do
+          require 'rjb'
+          # Load the Stanford Parser Java files.
+          jar = "#{Treat.bin}/stanford_parser/stanford-parser.jar"
+          unless File.readable?(jar)
+            raise "Could not find stanford parser JAR file in #{jar}." +
+                  " You may need to set Treat.bin to a custom value."
+          end
+          # Load the Stanford Parser classes.
+          PTBTokenizer = ::Rjb::import('edu.stanford.nlp.process.PTBTokenizer')
+          CoreLabelTokenFactory = ::Rjb::import('edu.stanford.nlp.process.CoreLabelTokenFactory')
+          StringReader = ::Rjb::import('java.io.StringReader')
+        end
+        def self.tokenize(entity, options = {})
+          ptbt = PTBTokenizer.new(
+            StringReader.new(entity.to_s),
+            CoreLabelTokenFactory.new, '')
+          while ptbt.has_next
+            w = ptbt.next.word
+            next if w[0] == '-' && w[-1] == '-'
+            entity << Treat::Entities::Entity.from_string(w)
+          end
+          entity
+        end
+      end
+    end
+  end
+end
data/lib/treat/processors/tokenizers/tactful.rb
@@ -0,0 +1,59 @@
+module Treat
+  module Processors
+    module Tokenizers
+      # A tokenizer class lifted from the 'tactful-tokenizer' gem.
+      #
+      # Copyright © 2010 Matthew Bunday. All rights reserved.
+      # Released under the GNU GPL v3. Modified by Louis Mullie.
+      #
+      # Project website: https://github.com/SlyShy/Tactful_Tokenizer
+      class Tactful
+        ReTokenize = [
+          # Uniform Quotes
+          [/''|``/, '"'],
+          # Separate punctuation from words.
+          [/(^|\s)(')/, '\1\2'],
+          [/(?=[\("`{\[:;&#*@\.])(.)/, '\1 '],
+          [/(.)(?=[?!\)";}\]*:@\.'])|(?=[\)}\]])(.)|(.)(?=[({\[])|((^|\s)-)(?=[^-])/, '\1 '],
+          # Treat double-hyphen as a single token.
+          [/([^-])(--+)([^-])/, '\1 \2 \3'],
+          [/(\s|^)(,)(?=(\S))/, '\1\2 '],
+          # Only separate a comma if a space follows.
+          [/(.)(,)(\s|$)/, '\1 \2\3'],
+          # Combine dots separated by whitespace to be a single token.
+          [/\.\s\.\s\./, '...'],
+          # Separate "No.6"
+          [/([\W]\.)(\d+)/, '\1 \2'],
+          # Separate words from ellipses
+          [/([^\.]|^)(\.{2,})(.?)/, '\1 \2 \3'],
+          [/(^|\s)(\.{2,})([^\.\s])/, '\1\2 \3'],
+          [/(^|\s)(\.{2,})([^\.\s])/, '\1 \2\3'],
+          ##### Some additional fixes.
+          # Fix %, $, &
+          [/(\d)%/, '\1 %'],
+          [/\$(\.?\d)/, '$ \1'],
+          [/(\W)& (\W)/, '\1&\2'],
+          [/(\W\W+)&(\W\W+)/, '\1 & \2'],
+          # Fix (n 't) -> ( n't)
+          [/n 't( |$)/, " n't\\1"],
+          [/N 'T( |$)/, " N'T\\1"],
+          # Treebank tokenizer special words
+          [/([Cc])annot/, '\1an not']
+        ]
+        # Tokenize the entity using a rule-based algorithm
+        # which has been lifted from the 'tactful-tokenizer'
+        # gem.
+        def self.tokenize(entity, options = {})
+          s = entity.to_s
+          ReTokenize.each do |rules|
+            s.gsub!(rules[0], rules[1])
+          end
+          s.split(' ').each do |token|
+            entity << Entities::Entity.from_string(token)
+          end
+          entity
+        end
+      end
+    end
+  end
+end
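
Because ReTokenize is an ordered list of [pattern, replacement] pairs, the pipeline can be replayed on a plain string; the n't and double-hyphen rules give the characteristic Treebank shape (the expected result is an editorial reading of the rules):

    s = "Don't stop--ever."
    Treat::Processors::Tokenizers::Tactful::ReTokenize.each do |pattern, replacement|
      s.gsub!(pattern, replacement)
    end
    s.split(' ') # => something like ["Do", "n't", "stop", "--", "ever", "."]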
data/lib/treat/proxies.rb
@@ -0,0 +1,66 @@
+module Treat
+  # Proxies install Treat functions on Ruby core classes.
+  module Proxies
+    # The module proxy provides functionality common
+    # to the different types of proxies.
+    module Proxy
+      def method_missing(sym, *args, &block)
+        if Categories.have_method?(sym)
+          to_entity.send(sym, *args)
+        else
+          super(sym, *args, &block)
+        end
+      end
+      def to_entity(builder = nil)
+        Treat::Entities::Unknown(self.to_s)
+      end
+    end
+    # Install Treat functions on String objects.
+    module StringProxy
+      include Proxy
+      # Save the string to the specified file.
+      def save(file)
+        File.open(file, 'w') { |f| f.write(self) }
+      end
+      # Return the entity corresponding to the string.
+      def to_entity
+        Treat::Entities::Entity.from_string(self.to_s)
+      end
+    end
+    # Install Treat functions on Numeric objects.
+    module NumericProxy
+      include Proxy
+      # Return the entity corresponding to the number.
+      def to_entity(builder = nil)
+        Treat::Entities::Entity.from_numeric(self)
+      end
+    end
+    # Install Treat functions on Array objects.
+    module ArrayProxy
+      include Proxy
+      def method_missing(sym, *args, &block)
+        if Category.has_method?(sym)
+          array = []
+          each do |element|
+            if element.is_a? Treat::Entities::Entity
+              array << element.send(sym, *args)
+            else
+              unless [Numeric, String, Array].include?(element.class)
+                raise Treat::Exception, "Cannot convert object with type " +
+                      "#{element.class} into an entity."
+              end
+              array << element.to_entity.send(sym, *args)
+            end
+          end
+          array
+        else
+          super(sym, *args, &block)
+        end
+      end
+    end
+    # Include the proxies in the core classes.
+    String.class_eval { include StringProxy }
+    Numeric.class_eval { include NumericProxy }
+    Array.class_eval { include ArrayProxy }
+  end
+end
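
With the proxies included, core objects gain Treat entry points directly: #to_entity and #save come from StringProxy above, and method_missing forwards any worker function registered in Categories to the converted entity. A short sketch (editorial; the forwarded method names depend on which workers are registered):

    require 'treat'
    "A short test sentence.".to_entity # => a Treat::Entities::Entity
    "A short test sentence.".save('sentence.txt')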