treat 0.1.1
- data/INSTALL +0 -0
- data/LICENSE +28 -0
- data/README +0 -0
- data/TODO +67 -0
- data/bin/INFO +1 -0
- data/examples/benchmark.rb +81 -0
- data/examples/keywords.rb +60 -0
- data/examples/texts/bugged_out.txt +26 -0
- data/examples/texts/half_cocked_basel.txt +16 -0
- data/examples/texts/hedge_funds.txt +24 -0
- data/examples/texts/hose_and_dry.txt +19 -0
- data/examples/texts/hungarys_troubles.txt +46 -0
- data/examples/texts/indias_slowdown.txt +15 -0
- data/examples/texts/merkozy_rides_again.txt +24 -0
- data/examples/texts/prada_is_not_walmart.txt +9 -0
- data/examples/texts/republican_nomination.txt +26 -0
- data/examples/texts/to_infinity_and_beyond.txt +15 -0
- data/lib/treat.rb +91 -0
- data/lib/treat/buildable.rb +115 -0
- data/lib/treat/categories.rb +29 -0
- data/lib/treat/category.rb +28 -0
- data/lib/treat/delegatable.rb +90 -0
- data/lib/treat/detectors.rb +28 -0
- data/lib/treat/detectors/encoding/native.rb +12 -0
- data/lib/treat/detectors/encoding/r_chardet19.rb +24 -0
- data/lib/treat/detectors/format/file.rb +36 -0
- data/lib/treat/detectors/language/language_detector.rb +19 -0
- data/lib/treat/detectors/language/what_language.rb +29 -0
- data/lib/treat/entities.rb +52 -0
- data/lib/treat/entities/collection.rb +19 -0
- data/lib/treat/entities/constituents.rb +15 -0
- data/lib/treat/entities/document.rb +11 -0
- data/lib/treat/entities/entity.rb +242 -0
- data/lib/treat/entities/sentence.rb +8 -0
- data/lib/treat/entities/text.rb +7 -0
- data/lib/treat/entities/tokens.rb +37 -0
- data/lib/treat/entities/zones.rb +17 -0
- data/lib/treat/exception.rb +5 -0
- data/lib/treat/extractors.rb +41 -0
- data/lib/treat/extractors/key_sentences/topics_frequency.rb +49 -0
- data/lib/treat/extractors/named_entity/abner.rb +20 -0
- data/lib/treat/extractors/named_entity/stanford.rb +174 -0
- data/lib/treat/extractors/statistics/frequency.rb +22 -0
- data/lib/treat/extractors/statistics/frequency_of.rb +17 -0
- data/lib/treat/extractors/statistics/position_in.rb +13 -0
- data/lib/treat/extractors/statistics/transition_matrix.rb +105 -0
- data/lib/treat/extractors/statistics/transition_probability.rb +53 -0
- data/lib/treat/extractors/time/chronic.rb +12 -0
- data/lib/treat/extractors/time/native.rb +12 -0
- data/lib/treat/extractors/time/nickel.rb +45 -0
- data/lib/treat/extractors/topic_words/lda.rb +71 -0
- data/lib/treat/extractors/topic_words/lda/data.dat +46 -0
- data/lib/treat/extractors/topic_words/lda/wiki.yml +121 -0
- data/lib/treat/extractors/topics/reuters.rb +91 -0
- data/lib/treat/extractors/topics/reuters/industry.xml +2717 -0
- data/lib/treat/extractors/topics/reuters/region.xml +13585 -0
- data/lib/treat/extractors/topics/reuters/topics.xml +17977 -0
- data/lib/treat/feature.rb +53 -0
- data/lib/treat/formatters.rb +44 -0
- data/lib/treat/formatters/cleaners/html.rb +17 -0
- data/lib/treat/formatters/readers/autoselect.rb +35 -0
- data/lib/treat/formatters/readers/gocr.rb +24 -0
- data/lib/treat/formatters/readers/html.rb +13 -0
- data/lib/treat/formatters/readers/ocropus.rb +31 -0
- data/lib/treat/formatters/readers/pdf.rb +17 -0
- data/lib/treat/formatters/readers/txt.rb +15 -0
- data/lib/treat/formatters/serializers/xml.rb +48 -0
- data/lib/treat/formatters/serializers/yaml.rb +15 -0
- data/lib/treat/formatters/serializers/yaml/helper.rb +96 -0
- data/lib/treat/formatters/unserializers/autoselect.rb +19 -0
- data/lib/treat/formatters/unserializers/xml.rb +79 -0
- data/lib/treat/formatters/unserializers/yaml.rb +15 -0
- data/lib/treat/formatters/visualizers/dot.rb +73 -0
- data/lib/treat/formatters/visualizers/html.rb +12 -0
- data/lib/treat/formatters/visualizers/inspect.rb +16 -0
- data/lib/treat/formatters/visualizers/short_value.rb +14 -0
- data/lib/treat/formatters/visualizers/standoff.rb +41 -0
- data/lib/treat/formatters/visualizers/tree.rb +28 -0
- data/lib/treat/formatters/visualizers/txt.rb +31 -0
- data/lib/treat/group.rb +96 -0
- data/lib/treat/inflectors.rb +50 -0
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +45 -0
- data/lib/treat/inflectors/conjugators/linguistics.rb +30 -0
- data/lib/treat/inflectors/declensors/en.rb +18 -0
- data/lib/treat/inflectors/declensors/linguistics.rb +30 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma.rb +12 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +213 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +68 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +6 -0
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +21 -0
- data/lib/treat/inflectors/stemmers/porter.rb +158 -0
- data/lib/treat/inflectors/stemmers/porter_c.rb +23 -0
- data/lib/treat/inflectors/stemmers/uea.rb +30 -0
- data/lib/treat/lexicalizers.rb +49 -0
- data/lib/treat/lexicalizers/category/from_tag.rb +30 -0
- data/lib/treat/lexicalizers/linkages/naive.rb +63 -0
- data/lib/treat/lexicalizers/synsets/rita_wn.rb +23 -0
- data/lib/treat/lexicalizers/synsets/wordnet.rb +72 -0
- data/lib/treat/lexicalizers/tag/brill.rb +101 -0
- data/lib/treat/lexicalizers/tag/lingua.rb +114 -0
- data/lib/treat/lexicalizers/tag/stanford.rb +86 -0
- data/lib/treat/processors.rb +45 -0
- data/lib/treat/processors/chunkers/txt.rb +27 -0
- data/lib/treat/processors/parsers/enju.rb +214 -0
- data/lib/treat/processors/parsers/stanford.rb +60 -0
- data/lib/treat/processors/segmenters/punkt.rb +48 -0
- data/lib/treat/processors/segmenters/stanford.rb +45 -0
- data/lib/treat/processors/segmenters/tactful.rb +34 -0
- data/lib/treat/processors/tokenizers/macintyre.rb +76 -0
- data/lib/treat/processors/tokenizers/multilingual.rb +31 -0
- data/lib/treat/processors/tokenizers/perl.rb +96 -0
- data/lib/treat/processors/tokenizers/punkt.rb +42 -0
- data/lib/treat/processors/tokenizers/stanford.rb +33 -0
- data/lib/treat/processors/tokenizers/tactful.rb +59 -0
- data/lib/treat/proxies.rb +66 -0
- data/lib/treat/registrable.rb +26 -0
- data/lib/treat/resources.rb +10 -0
- data/lib/treat/resources/categories.rb +18 -0
- data/lib/treat/resources/delegates.rb +96 -0
- data/lib/treat/resources/dependencies.rb +0 -0
- data/lib/treat/resources/edges.rb +8 -0
- data/lib/treat/resources/formats.rb +23 -0
- data/lib/treat/resources/languages.rb +86 -0
- data/lib/treat/resources/languages.txt +504 -0
- data/lib/treat/resources/tags.rb +393 -0
- data/lib/treat/sugar.rb +43 -0
- data/lib/treat/tree.rb +174 -0
- data/lib/treat/utilities.rb +127 -0
- data/lib/treat/visitable.rb +27 -0
- data/test/profile.rb +2 -0
- data/test/tc_detectors.rb +27 -0
- data/test/tc_entity.rb +105 -0
- data/test/tc_extractors.rb +48 -0
- data/test/tc_formatters.rb +46 -0
- data/test/tc_inflectors.rb +39 -0
- data/test/tc_lexicalizers.rb +39 -0
- data/test/tc_processors.rb +36 -0
- data/test/tc_resources.rb +27 -0
- data/test/tc_treat.rb +64 -0
- data/test/tc_tree.rb +60 -0
- data/test/tests.rb +19 -0
- data/test/texts.rb +20 -0
- data/test/texts/english/long.html +24 -0
- data/test/texts/english/long.txt +22 -0
- data/test/texts/english/medium.txt +5 -0
- data/test/texts/english/short.txt +3 -0
- metadata +412 -0
+++ data/lib/treat/processors/segmenters/stanford.rb
@@ -0,0 +1,45 @@
+module Treat
+  module Processors
+    module Segmenters
+      class Stanford
+        # Require the Ruby-Java bridge.
+        silently do
+          require 'rjb'
+          jar = "#{Treat.bin}/stanford_parser/stanford-parser.jar"
+          unless File.readable?(jar)
+            raise "Could not find stanford parser JAR file in #{jar}." +
+                  " You may need to set Treat.bin to a custom value."
+          end
+          DocumentPreprocessor =
+            ::Rjb::import('edu.stanford.nlp.process.DocumentPreprocessor')
+          StringReader = ::Rjb::import('java.io.StringReader')
+        end
+        def self.segment(entity, options = {})
+          sr = StringReader.new(entity.to_s)
+          sit = DocumentPreprocessor.new(sr).iterator
+          while sit.has_next
+            s = sit.next; str = s.to_string # keep the Java sentence for tokenizing below
+            str.gsub!(', ', ' ') # Fix - find better way to implode.
+            str.gsub!(' \'s', '\'s')
+            str.gsub!(' .', '.')
+            str.gsub!(' ,', ',')
+            str.gsub!(' ;', ';')
+            str.gsub!(/-[A-Z]{3}-/, '')
+            str = str[1..-2]
+            sentence = Entities::Entity.from_string(str)
+            if options[:tokenize] == true
+              tit = s.iterator
+              while tit.has_next
+                w = tit.next.word
+                next if w[0] == '-' && w[-1] == '-'
+                sentence << Entities::Entity.from_string(w)
+              end
+            end
+            entity << sentence
+          end
+          entity
+        end
+      end
+    end
+  end
+end
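For reference, a minimal usage sketch for the segmenter above, assuming a Java runtime, the 'rjb' gem, and the Stanford parser JAR reachable under Treat.bin; the sample text is made up, and the #children accessor is assumed to come from Treat's tree API (tree.rb):

    require 'treat'
    # Build an entity from a plain string, then segment it into sentences.
    paragraph = Treat::Entities::Entity.from_string(
      'Mr. Smith arrived in Washington. He left the next day.')
    Treat::Processors::Segmenters::Stanford.segment(paragraph, :tokenize => true)
    paragraph.children.each { |sentence| puts sentence.to_s }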
+++ data/lib/treat/processors/segmenters/tactful.rb
@@ -0,0 +1,34 @@
+module Treat
+  module Processors
+    module Segmenters
+      # An adapter for the 'tactful_tokenizer' gem, which
+      # detects sentence boundaries (the name is a misnomer;
+      # it isn't a tokenizer, but a sentence boundary detector).
+      # It uses a Naive Bayesian statistical model, and is
+      # based on Splitta, but has support for '?' and '!'
+      # as well as primitive handling of XHTML markup.
+      #
+      # Project website: https://github.com/SlyShy/Tactful_Tokenizer
+      class Tactful
+        # Require the 'tactful_tokenizer' gem.
+        silently { require 'tactful_tokenizer' }
+        # The gem defines String#tokenize somewhere in the depths of its code; remove it.
+        String.class_eval { undef :tokenize }
+        # Keep only one copy of the segmenter.
+        @@segmenter = nil
+        # Segment a text or zone into sentences
+        # using the 'tactful_tokenizer' gem.
+        #
+        # Options: none.
+        def self.segment(entity, options = {})
+          @@segmenter ||= TactfulTokenizer::Model.new
+          sentences = @@segmenter.tokenize_text(entity.to_s)
+          sentences.each do |sentence|
+            entity << Entities::Entity.from_string(sentence)
+          end
+          entity
+        end
+      end
+    end
+  end
+end
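The adapter above boils down to a single call into the gem; a sketch of the underlying API it wraps (the sample input and the output shown are indicative only):

    require 'tactful_tokenizer'
    # Train-once model; tokenize_text returns an array of sentence strings.
    model = TactfulTokenizer::Model.new
    model.tokenize_text("Hello world. It's a nice day, isn't it?")
    # => ["Hello world.", "It's a nice day, isn't it?"]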
+++ data/lib/treat/processors/tokenizers/macintyre.rb
@@ -0,0 +1,76 @@
+module Treat
+  module Processors
+    module Tokenizers
+      # A native rule-based tokenizer based on the one
+      # developed by Robert MacIntyre in 1995 for the Penn
+      # Treebank project. This tokenizer follows the
+      # conventions used by the Penn Treebank.
+      #
+      # Original script:
+      # http://www.cis.upenn.edu/~treebank/tokenizer.sed
+      #
+      # Copyright (c) 2004 UTIYAMA Masao <mutiyama@nict.go.jp>
+      # All rights reserved. This program is free software;
+      # you can redistribute it and/or modify it under the
+      # same terms as Ruby itself.
+      class Macintyre
+        # Tokenize the entity using a native rule-based algorithm.
+        def self.tokenize(entity, options = {})
+          raise Treat::Exception, 'Cannot tokenize an entity that already has children.' if entity.has_children?
+          chunks = self.split(entity.to_s)
+          chunks.each do |chunk|
+            next if chunk =~ /([[:space:]]+)/
+            entity << Treat::Entities::Entity.from_string(chunk)
+          end
+          entity
+        end
+        # Helper method to split the string into tokens.
+        def self.split(string)
+          s = " " + string + " "
+          s.gsub!(/\s+/, " ")
+          s.gsub!(/(\s+)''/, '\1"')
+          s.gsub!(/(\s+)``/, '\1"')
+          s.gsub!(/''(\s+)/, '"\1')
+          s.gsub!(/``(\s+)/, '"\1')
+          s.gsub!(/ (['`]+)([^0-9].+) /, ' \1 \2 ')
+          s.gsub!(/([ (\[{<])"/, '\1 `` ')
+          s.gsub!(/\.\.\./, ' ... ')
+          s.gsub!(/[,;:@\#$%&]/, ' \& ')
+          s.gsub!(/([^.])([.])([\])}>"']*)[ ]*$/, '\1 \2\3 ')
+          s.gsub!(/[?!]/, ' \& ')
+          s.gsub!(/[\]\[(){}<>]/, ' \& ')
+          s.gsub!(/--/, ' -- ')
+          s.sub!(/$/, ' ')
+          s.sub!(/^/, ' ')
+          s.gsub!(/"/, ' \'\' ')
+          s.gsub!(/([^'])' /, '\1 \' ')
+          s.gsub!(/'([sSmMdD]) /, ' \'\1 ')
+          s.gsub!(/'ll /, ' \'ll ')
+          s.gsub!(/'re /, ' \'re ')
+          s.gsub!(/'ve /, ' \'ve ')
+          s.gsub!(/n't /, ' n\'t ')
+          s.gsub!(/'LL /, ' \'LL ')
+          s.gsub!(/'RE /, ' \'RE ')
+          s.gsub!(/'VE /, ' \'VE ')
+          s.gsub!(/N'T /, ' N\'T ')
+          s.gsub!(/ ([Cc])annot /, ' \1an not ')
+          s.gsub!(/ ([Dd])'ye /, ' \1\' ye ')
+          s.gsub!(/ ([Gg])imme /, ' \1im me ')
+          s.gsub!(/ ([Gg])onna /, ' \1on na ')
+          s.gsub!(/ ([Gg])otta /, ' \1ot ta ')
+          s.gsub!(/ ([Ll])emme /, ' \1em me ')
+          s.gsub!(/ ([Mm])ore'n /, ' \1ore \'n ')
+          s.gsub!(/ '([Tt])is /, ' \'\1 is ')
+          s.gsub!(/ '([Tt])was /, ' \'\1 was ')
+          s.gsub!(/ ([Ww])anna /, ' \1an na ')
+          while s.sub!(/(\s)([0-9]+) , ([0-9]+)(\s)/, '\1\2,\3\4')
+          end
+          s.gsub!(/\//, ' / ')
+          s.gsub!(/\s+/, ' ')
+          s.strip!
+          s.split(/\s+/)
+        end
+      end
+    end
+  end
+end
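Since the rules above implement the Penn Treebank conventions, the split helper can be exercised directly; a sketch with approximate output (quotes become `` and '', contractions split PTB-style):

    # The input string is contrived; the output shown is approximate.
    Treat::Processors::Tokenizers::Macintyre.split(%q{"I can't stay," he said.})
    # => ["``", "I", "ca", "n't", "stay", ",", "''", "he", "said", "."]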
+++ data/lib/treat/processors/tokenizers/multilingual.rb
@@ -0,0 +1,31 @@
+module Treat
+  module Processors
+    module Tokenizers
+      # An adapter for the 'tokenizer' gem, which performs
+      # rule-based tokenizing of texts in English, German
+      # or French.
+      class Multilingual
+        # Hold one tokenizer per language.
+        @@tokenizers = {}
+        # Require the 'tokenizer' gem.
+        silently { require 'tokenizer' }
+        # Perform the tokenization of English, German or French text.
+        # Options:
+        # :language => (Symbol) Force a language for the tokenizer.
+        def self.tokenize(entity, options = {})
+          lang = options[:language] || entity.language
+          lang = Treat::Resources::Languages.find(lang, 1)
+          if @@tokenizers[lang].nil?
+            @@tokenizers[lang] = ::Tokenizer::Tokenizer.new(lang)
+          end
+          tokens = @@tokenizers[lang].tokenize(entity.to_s)
+          tokens.each do |token|
+            next if token =~ /([[:space:]]+)/
+            entity << Treat::Entities::Entity.from_string(token)
+          end
+          entity
+        end
+      end
+    end
+  end
+end
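A sketch of the 'tokenizer' gem API the adapter wraps, assuming the gem version this release targets (:de selects German; the output shown is indicative):

    require 'tokenizer'
    # One tokenizer instance per language, as cached in @@tokenizers above.
    de = ::Tokenizer::Tokenizer.new(:de)
    de.tokenize('Ich kam, sah und siegte.')
    # => ["Ich", "kam", ",", "sah", "und", "siegte", "."]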
+++ data/lib/treat/processors/tokenizers/perl.rb
@@ -0,0 +1,96 @@
+# encoding: utf-8
+module Treat
+  module Processors
+    module Tokenizers
+      # Tokenize the entity using a native rule-based algorithm.
+      # This tokenizer is a port from an unknown Perl module,
+      # which I have lifted from the 'rbtagger' gem.
+      #
+      # Author: Todd A. Fisher
+      # This code is free to use under the terms of the MIT license.
+      #
+      # Original project website:
+      # https://github.com/taf2/rb-brill-tagger
+      class Perl
+        # Tokenize the entity using a native rule-based algorithm.
+        # Options: none.
+        def self.tokenize(entity, options = {})
+          # Normalize all whitespace.
+          text = entity.to_s.gsub(/\s+/, ' ')
+          # Translate some common extended ASCII characters to quotes.
+          text.gsub!(/‘/, '`')
+          text.gsub!(/’/, "'")
+          text.gsub!(/“/, '``')
+          text.gsub!(/”/, "''")
+          # Attempt to get correct directional quotes.
+          # s{\"\b} { `` }g;
+          text.gsub!(/\"\b/, ' `` ')
+          # s{\b\"} { '' }g;
+          text.gsub!(/\b\"/, " '' ")
+          # s{\"(?=\s)} { '' }g;
+          text.gsub!(/\"(?=\s)/, " '' ")
+          # s{\"} { `` }g;
+          text.gsub!(/\"/, ' `` ')
+          # Isolate ellipses.
+          # s{\.\.\.} { ... }g;
+          text.gsub!(/\.\.\./, ' ... ')
+          # Isolate any embedded punctuation chars.
+          # s{([,;:\@\#\$\%&])} { $1 }g;
+          text.gsub!(/([,;:\@\#\$\%&])/, ' \1 ')
+          # Assume sentence tokenization has been done first, so split FINAL
+          # periods only.
+          # s/ ([^.]) \. ([\]\)\}\>\"\']*) [ \t]* $ /$1 .$2 /gx;
+          text.gsub!(/ ([^.]) \. ([\]\)\}\>\"\']*) [ \t]* $ /x, '\1 .\2 ')
+          # However, we may as well split ALL question marks and exclamation points,
+          # since they shouldn't have the abbrev.-marker ambiguity problem.
+          # s{([?!])} { $1 }g;
+          text.gsub!(/([?!])/, ' \1 ')
+          # Parentheses, brackets, etc.
+          # s{([\]\[\(\)\{\}\<\>])} { $1 }g;
+          text.gsub!(/([\]\[\(\)\{\}\<\>])/, ' \1 ')
+          # s/(-{2,})/ $1 /g;
+          text.gsub!(/(-{2,})/, ' \1 ')
+          # Add a space to the beginning and end of each line, to reduce
+          # the necessary number of regexps below.
+          # s/$/ /;
+          text.gsub!(/$/, ' ')
+          # s/^/ /;
+          text.gsub!(/^/, ' ')
+          # Possessive or close-single-quote.
+          # s/\([^\']\)\' /$1 \' /g;
+          text.gsub!(/([^\'])\' /, %q(\1 ' ))
+          # as in it's, I'm, we'd
+          # s/\'([smd]) / \'$1 /ig;
+          text.gsub!(/\'([smd]) /i, %q( '\1 ))
+          # s/\'(ll|re|ve) / \'$1 /ig;
+          text.gsub!(/\'(ll|re|ve) /i, %q( '\1 ))
+          # s/n\'t / n\'t /ig;
+          text.gsub!(/n\'t /i, " n't ")
+          # s/ (can)(not) / $1 $2 /ig;
+          text.gsub!(/ (can)(not) /i, ' \1 \2 ')
+          # s/ (d\')(ye) / $1 $2 /ig;
+          text.gsub!(/ (d\')(ye) /i, ' \1 \2 ')
+          # s/ (gim)(me) / $1 $2 /ig;
+          text.gsub!(/ (gim)(me) /i, ' \1 \2 ')
+          # s/ (gon)(na) / $1 $2 /ig;
+          text.gsub!(/ (gon)(na) /i, ' \1 \2 ')
+          # s/ (got)(ta) / $1 $2 /ig;
+          text.gsub!(/ (got)(ta) /i, ' \1 \2 ')
+          # s/ (lem)(me) / $1 $2 /ig;
+          text.gsub!(/ (lem)(me) /i, ' \1 \2 ')
+          # s/ (more)(\'n) / $1 $2 /ig;
+          text.gsub!(/ (more)(\'n) /i, ' \1 \2 ')
+          # s/ (\'t)(is|was) / $1 $2 /ig;
+          text.gsub!(/ (\'t)(is|was) /i, ' \1 \2 ')
+          # s/ (wan)(na) / $1 $2 /ig;
+          text.gsub!(/ (wan)(na) /i, ' \1 \2 ')
+          tokens = text.split(/\s/)
+          tokens.each do |token|
+            entity << Treat::Entities::Entity.from_string(token) unless token =~ /([[:space:]]+)/
+          end
+          entity
+        end
+      end
+    end
+  end
+end
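A usage sketch for the Perl-port tokenizer above, using the String#to_entity proxy defined later in this release; the #children accessor is assumed from Treat's tree API, and the output is approximate:

    require 'treat'
    sentence = '"Stop," she said.'.to_entity
    Treat::Processors::Tokenizers::Perl.tokenize(sentence)
    sentence.children.map(&:to_s)
    # => ["``", "Stop", ",", "''", "she", "said", "."]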
+++ data/lib/treat/processors/tokenizers/punkt.rb
@@ -0,0 +1,42 @@
+module Treat
+  module Processors
+    module Tokenizers
+      # A tokenizer that was lifted from the 'punkt-segmenter'
+      # Ruby gem.
+      #
+      # This code follows the terms and conditions of the Apache
+      # License v2 (http://www.apache.org/licenses/LICENSE-2.0).
+      #
+      # Authors: Willy <willy@csse.unimelb.edu.au>
+      # (original Python port), Steven Bird
+      # <sb@csse.unimelb.edu.au> (additions),
+      # Edward Loper <edloper@gradient.cis.upenn.edu>
+      # (rewrite), Joel Nothman <jnothman@student.usyd.edu.au>
+      # (almost rewrite).
+      #
+      # Project website: https://github.com/lfcipriani/punkt-segmenter
+      class Punkt
+        SentEndChars = ['.', '?', '!']
+        ReSentEndChars = /[.?!]/
+        InternalPunctuation = [',', ':', ';']
+        ReBoundaryRealignment = /^["\')\]}]+?(?:\s+|(?=--)|$)/m
+        ReWordStart = /[^\(\"\`{\[:;&\#\*@\)}\]\-,]/
+        ReNonWordChars = /(?:[?!)\";}\]\*:@\'\({\[])/
+        ReMultiCharPunct = /(?:\-{2,}|\.{2,}|(?:\.\s){2,}\.)/
+        ReWordTokenizer = /#{ReMultiCharPunct}|(?=#{ReWordStart})\S+?(?=\s|$|#{ReNonWordChars}|#{ReMultiCharPunct}|,(?=$|\s|#{ReNonWordChars}|#{ReMultiCharPunct}))|\S/
+        RePeriodContext = /\S*#{ReSentEndChars}(?=(?<after_tok>#{ReNonWordChars}|\s+(?<next_tok>\S+)))/
+        # Tokenize the text using the algorithm lifted from
+        # the Punkt tokenizer.
+        #
+        # Options: none.
+        def self.tokenize(entity, options = {})
+          entity.to_s.scan(ReWordTokenizer).each do |token|
+            # Add an entity for each word token matched by the regexp.
+            entity << Treat::Entities::Entity.from_string(token)
+          end
+          entity
+        end
+      end
+    end
+  end
+end
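The heavy lifting above is done by ReWordTokenizer; a sketch of what scanning a contrived string with it yields (the output shown is approximate):

    s = 'Wait... what?! (Really?)'
    # Multi-char punctuation, words, and single non-word chars each match.
    s.scan(Treat::Processors::Tokenizers::Punkt::ReWordTokenizer)
    # => ["Wait", "...", "what", "?", "!", "(", "Really", "?", ")"]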
+++ data/lib/treat/processors/tokenizers/stanford.rb
@@ -0,0 +1,33 @@
+module Treat
+  module Processors
+    module Tokenizers
+      class Stanford
+        # Require the Ruby-Java bridge.
+        silently do
+          require 'rjb'
+          # Load the Stanford Parser JAR file.
+          jar = "#{Treat.bin}/stanford_parser/stanford-parser.jar"
+          unless File.readable?(jar)
+            raise "Could not find stanford parser JAR file in #{jar}." +
+                  " You may need to set Treat.bin to a custom value."
+          end
+          # Load the Stanford Parser classes.
+          PTBTokenizer = ::Rjb::import('edu.stanford.nlp.process.PTBTokenizer')
+          CoreLabelTokenFactory = ::Rjb::import('edu.stanford.nlp.process.CoreLabelTokenFactory')
+          StringReader = ::Rjb::import('java.io.StringReader')
+        end
+        def self.tokenize(entity, options = {})
+          ptbt = PTBTokenizer.new(
+            StringReader.new(entity.to_s),
+            CoreLabelTokenFactory.new, '')
+          while ptbt.has_next
+            w = ptbt.next.word
+            next if w[0] == '-' && w[-1] == '-'
+            entity << Treat::Entities::Entity.from_string(w)
+          end
+          entity
+        end
+      end
+    end
+  end
+end
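A setup sketch for the PTBTokenizer wrapper above, assuming Treat.bin is assignable (as the error message suggests) and that workers load lazily, so it can be set after require; the path is hypothetical:

    require 'treat'
    Treat.bin = '/opt/treat/bin' # must contain stanford_parser/stanford-parser.jar
    sentence = 'He bought 2,000 shares of I.B.M. stock.'.to_entity
    Treat::Processors::Tokenizers::Stanford.tokenize(sentence)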
+++ data/lib/treat/processors/tokenizers/tactful.rb
@@ -0,0 +1,59 @@
+module Treat
+  module Processors
+    module Tokenizers
+      # A tokenizer class lifted from the 'tactful-tokenizer' gem.
+      #
+      # Copyright © 2010 Matthew Bunday. All rights reserved.
+      # Released under the GNU GPL v3. Modified by Louis Mullie.
+      #
+      # Project website: https://github.com/SlyShy/Tactful_Tokenizer
+      class Tactful
+        ReTokenize = [
+          # Uniform quotes.
+          [/''|``/, '"'],
+          # Separate punctuation from words.
+          [/(^|\s)(')/, '\1\2'],
+          [/(?=[\("`{\[:;&#*@\.])(.)/, '\1 '],
+          [/(.)(?=[?!\)";}\]*:@\.'])|(?=[\)}\]])(.)|(.)(?=[({\[])|((^|\s)-)(?=[^-])/, '\1 '],
+          # Treat double-hyphen as a single token.
+          [/([^-])(--+)([^-])/, '\1 \2 \3'],
+          [/(\s|^)(,)(?=(\S))/, '\1\2 '],
+          # Only separate a comma if a space follows.
+          [/(.)(,)(\s|$)/, '\1 \2\3'],
+          # Combine dots separated by whitespace into a single token.
+          [/\.\s\.\s\./, '...'],
+          # Separate "No.6".
+          [/([\W]\.)(\d+)/, '\1 \2'],
+          # Separate words from ellipses.
+          [/([^\.]|^)(\.{2,})(.?)/, '\1 \2 \3'],
+          [/(^|\s)(\.{2,})([^\.\s])/, '\1\2 \3'],
+          [/(^|\s)(\.{2,})([^\.\s])/, '\1 \2\3'],
+          ##### Some additional fixes.
+          # Fix %, $, &.
+          [/(\d)%/, '\1 %'],
+          [/\$(\.?\d)/, '$ \1'],
+          [/(\W)& (\W)/, '\1&\2'],
+          [/(\W\W+)&(\W\W+)/, '\1 & \2'],
+          # Fix (n 't) -> ( n't).
+          [/n 't( |$)/, " n't\\1"],
+          [/N 'T( |$)/, " N'T\\1"],
+          # Treebank tokenizer special words.
+          [/([Cc])annot/, '\1an not']
+        ]
+        # Tokenize the entity using a rule-based algorithm
+        # which has been lifted from the 'tactful-tokenizer'
+        # gem.
+        def self.tokenize(entity, options = {})
+          s = entity.to_s
+          ReTokenize.each do |pattern, replacement|
+            s.gsub!(pattern, replacement)
+          end
+          s.split(' ').each do |token|
+            entity << Entities::Entity.from_string(token)
+          end
+          entity
+        end
+      end
+    end
+  end
+end
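Because of the (n 't) rules near the end of the table, contractions come out PTB-style; a self-contained sketch applying the rules directly (the output shown is indicative):

    s = "Don't do that."
    # Apply each [pattern, replacement] pair in order, as #tokenize does.
    Treat::Processors::Tokenizers::Tactful::ReTokenize.each do |pattern, replacement|
      s.gsub!(pattern, replacement)
    end
    s.split(' ')
    # => ["Do", "n't", "do", "that", "."]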
+++ data/lib/treat/proxies.rb
@@ -0,0 +1,66 @@
+module Treat
+  # Proxies install Treat functions on Ruby core classes.
+  module Proxies
+    # The Proxy module provides functionality common
+    # to the different types of proxies.
+    module Proxy
+      def method_missing(sym, *args, &block)
+        if Categories.have_method?(sym)
+          to_entity.send(sym, *args)
+        else
+          super(sym, *args, &block)
+        end
+      end
+      def to_entity(builder = nil)
+        Treat::Entities::Unknown(self.to_s)
+      end
+    end
+    # Install Treat functions on String objects.
+    module StringProxy
+      include Proxy
+      # Save the string to the specified file.
+      def save(file)
+        File.open(file, 'w') { |f| f.write(self) }
+      end
+      # Return the entity corresponding to the string.
+      def to_entity
+        Treat::Entities::Entity.from_string(self.to_s)
+      end
+    end
+    # Install Treat functions on Numeric objects.
+    module NumericProxy
+      include Proxy
+      # Return the entity corresponding to the number.
+      def to_entity(builder = nil)
+        Treat::Entities::Entity.from_numeric(self)
+      end
+    end
+    # Install Treat functions on Array objects.
+    module ArrayProxy
+      include Proxy
+      def method_missing(sym, *args, &block)
+        if Categories.have_method?(sym)
+          array = []
+          each do |element|
+            if element.is_a? Treat::Entities::Entity
+              array << element.send(sym, *args)
+            else
+              unless [Numeric, String, Array].any? { |type| element.is_a?(type) }
+                raise Treat::Exception, "Cannot convert object with type " +
+                      "#{element.class} into an entity."
+              end
+              array << element.to_entity.send(sym, *args)
+            end
+          end
+          array
+        else
+          super(sym, *args, &block)
+        end
+      end
+    end
+    # Include the proxies in the core classes.
+    String.class_eval { include StringProxy }
+    Numeric.class_eval { include NumericProxy }
+    Array.class_eval { include ArrayProxy }
+  end
+end
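With the proxies included, plain Ruby objects gain Treat entry points; a short sketch, where #save and #to_entity are defined above, and the last line assumes the stemmers group exposes a #stem method backed by an installed worker:

    require 'treat'
    'A sentence to keep.'.save('sentence.txt') # StringProxy#save
    entity = 'A short sentence.'.to_entity    # StringProxy#to_entity
    ['ran', 'swam'].stem                      # ArrayProxy maps the call over each element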