treat 0.2.5 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +3 -3
- data/README.md +33 -0
- data/files/INFO +1 -0
- data/lib/treat.rb +40 -105
- data/lib/treat/ai.rb +12 -0
- data/lib/treat/ai/classifiers/id3.rb +27 -0
- data/lib/treat/categories.rb +82 -35
- data/lib/treat/categorizable.rb +44 -0
- data/lib/treat/classification.rb +61 -0
- data/lib/treat/configurable.rb +115 -0
- data/lib/treat/data_set.rb +42 -0
- data/lib/treat/dependencies.rb +24 -0
- data/lib/treat/downloader.rb +87 -0
- data/lib/treat/entities.rb +68 -66
- data/lib/treat/entities/abilities.rb +10 -0
- data/lib/treat/entities/abilities/buildable.rb +327 -0
- data/lib/treat/entities/abilities/checkable.rb +31 -0
- data/lib/treat/entities/abilities/copyable.rb +45 -0
- data/lib/treat/entities/abilities/countable.rb +51 -0
- data/lib/treat/entities/abilities/debuggable.rb +83 -0
- data/lib/treat/entities/abilities/delegatable.rb +123 -0
- data/lib/treat/entities/abilities/doable.rb +62 -0
- data/lib/treat/entities/abilities/exportable.rb +11 -0
- data/lib/treat/entities/abilities/iterable.rb +115 -0
- data/lib/treat/entities/abilities/magical.rb +83 -0
- data/lib/treat/entities/abilities/registrable.rb +74 -0
- data/lib/treat/entities/abilities/stringable.rb +91 -0
- data/lib/treat/entities/entities.rb +104 -0
- data/lib/treat/entities/entity.rb +122 -245
- data/lib/treat/exception.rb +4 -4
- data/lib/treat/extractors.rb +77 -80
- data/lib/treat/extractors/keywords/tf_idf.rb +56 -22
- data/lib/treat/extractors/language/what_language.rb +50 -45
- data/lib/treat/extractors/name_tag/stanford.rb +55 -0
- data/lib/treat/extractors/tf_idf/native.rb +87 -0
- data/lib/treat/extractors/time/chronic.rb +55 -0
- data/lib/treat/extractors/time/nickel.rb +86 -62
- data/lib/treat/extractors/time/ruby.rb +53 -0
- data/lib/treat/extractors/topic_words/lda.rb +67 -58
- data/lib/treat/extractors/topics/reuters.rb +100 -87
- data/lib/treat/formatters.rb +39 -35
- data/lib/treat/formatters/readers/abw.rb +49 -29
- data/lib/treat/formatters/readers/autoselect.rb +37 -33
- data/lib/treat/formatters/readers/doc.rb +19 -13
- data/lib/treat/formatters/readers/html.rb +52 -30
- data/lib/treat/formatters/readers/image.rb +41 -40
- data/lib/treat/formatters/readers/odt.rb +59 -45
- data/lib/treat/formatters/readers/pdf.rb +28 -25
- data/lib/treat/formatters/readers/txt.rb +12 -15
- data/lib/treat/formatters/readers/xml.rb +73 -36
- data/lib/treat/formatters/serializers/xml.rb +80 -79
- data/lib/treat/formatters/serializers/yaml.rb +19 -18
- data/lib/treat/formatters/unserializers/autoselect.rb +12 -22
- data/lib/treat/formatters/unserializers/xml.rb +94 -99
- data/lib/treat/formatters/unserializers/yaml.rb +20 -19
- data/lib/treat/formatters/visualizers/dot.rb +132 -132
- data/lib/treat/formatters/visualizers/standoff.rb +52 -44
- data/lib/treat/formatters/visualizers/tree.rb +26 -29
- data/lib/treat/groupable.rb +153 -0
- data/lib/treat/helpers/decimal_point_escaper.rb +22 -0
- data/lib/treat/inflectors.rb +50 -45
- data/lib/treat/inflectors/cardinalizers/linguistics.rb +40 -0
- data/lib/treat/inflectors/conjugators/linguistics.rb +55 -0
- data/lib/treat/inflectors/declensors/active_support.rb +31 -0
- data/lib/treat/inflectors/declensors/english.rb +38 -0
- data/lib/treat/inflectors/declensors/english/inflect.rb +288 -0
- data/lib/treat/inflectors/declensors/linguistics.rb +49 -0
- data/lib/treat/inflectors/ordinalizers/linguistics.rb +17 -0
- data/lib/treat/inflectors/stemmers/porter.rb +160 -0
- data/lib/treat/inflectors/stemmers/porter_c.rb +24 -0
- data/lib/treat/inflectors/stemmers/uea.rb +28 -0
- data/lib/treat/installer.rb +308 -0
- data/lib/treat/kernel.rb +105 -27
- data/lib/treat/languages.rb +122 -88
- data/lib/treat/languages/arabic.rb +15 -15
- data/lib/treat/languages/chinese.rb +15 -15
- data/lib/treat/languages/dutch.rb +15 -15
- data/lib/treat/languages/english.rb +61 -62
- data/lib/treat/languages/french.rb +19 -19
- data/lib/treat/languages/german.rb +20 -20
- data/lib/treat/languages/greek.rb +15 -15
- data/lib/treat/languages/italian.rb +16 -16
- data/lib/treat/languages/polish.rb +15 -15
- data/lib/treat/languages/portuguese.rb +15 -15
- data/lib/treat/languages/russian.rb +15 -15
- data/lib/treat/languages/spanish.rb +16 -16
- data/lib/treat/languages/swedish.rb +16 -16
- data/lib/treat/lexicalizers.rb +34 -55
- data/lib/treat/lexicalizers/categorizers/from_tag.rb +54 -0
- data/lib/treat/lexicalizers/sensers/wordnet.rb +57 -0
- data/lib/treat/lexicalizers/sensers/wordnet/synset.rb +71 -0
- data/lib/treat/lexicalizers/taggers/brill.rb +70 -0
- data/lib/treat/lexicalizers/taggers/brill/patch.rb +61 -0
- data/lib/treat/lexicalizers/taggers/lingua.rb +90 -0
- data/lib/treat/lexicalizers/taggers/stanford.rb +97 -0
- data/lib/treat/linguistics.rb +9 -0
- data/lib/treat/linguistics/categories.rb +11 -0
- data/lib/treat/linguistics/tags.rb +422 -0
- data/lib/treat/loaders/linguistics.rb +30 -0
- data/lib/treat/loaders/stanford.rb +27 -0
- data/lib/treat/object.rb +1 -0
- data/lib/treat/processors.rb +37 -44
- data/lib/treat/processors/chunkers/autoselect.rb +16 -0
- data/lib/treat/processors/chunkers/html.rb +71 -0
- data/lib/treat/processors/chunkers/txt.rb +18 -24
- data/lib/treat/processors/parsers/enju.rb +253 -208
- data/lib/treat/processors/parsers/stanford.rb +130 -131
- data/lib/treat/processors/segmenters/punkt.rb +79 -45
- data/lib/treat/processors/segmenters/stanford.rb +46 -48
- data/lib/treat/processors/segmenters/tactful.rb +43 -36
- data/lib/treat/processors/tokenizers/perl.rb +124 -92
- data/lib/treat/processors/tokenizers/ptb.rb +81 -0
- data/lib/treat/processors/tokenizers/punkt.rb +48 -42
- data/lib/treat/processors/tokenizers/stanford.rb +39 -38
- data/lib/treat/processors/tokenizers/tactful.rb +64 -55
- data/lib/treat/proxies.rb +52 -35
- data/lib/treat/retrievers.rb +26 -16
- data/lib/treat/retrievers/indexers/ferret.rb +47 -26
- data/lib/treat/retrievers/searchers/ferret.rb +69 -50
- data/lib/treat/tree.rb +241 -183
- data/spec/collection.rb +123 -0
- data/spec/document.rb +93 -0
- data/spec/entity.rb +408 -0
- data/spec/languages.rb +25 -0
- data/spec/phrase.rb +146 -0
- data/spec/samples/mathematicians/archimedes.abw +34 -0
- data/spec/samples/mathematicians/euler.html +21 -0
- data/spec/samples/mathematicians/gauss.pdf +0 -0
- data/spec/samples/mathematicians/leibniz.txt +13 -0
- data/spec/samples/mathematicians/newton.doc +0 -0
- data/spec/sandbox.rb +5 -0
- data/spec/token.rb +109 -0
- data/spec/treat.rb +52 -0
- data/spec/tree.rb +117 -0
- data/spec/word.rb +110 -0
- data/spec/zone.rb +66 -0
- data/tmp/INFO +1 -1
- metadata +100 -201
- data/INSTALL +0 -1
- data/README +0 -3
- data/TODO +0 -28
- data/lib/economist/half_cocked_basel.txt +0 -16
- data/lib/economist/hungarys_troubles.txt +0 -46
- data/lib/economist/indias_slowdown.txt +0 -15
- data/lib/economist/merkozy_rides_again.txt +0 -24
- data/lib/economist/prada_is_not_walmart.txt +0 -9
- data/lib/economist/to_infinity_and_beyond.txt +0 -15
- data/lib/ferret/_11.cfs +0 -0
- data/lib/ferret/_14.cfs +0 -0
- data/lib/ferret/_p.cfs +0 -0
- data/lib/ferret/_s.cfs +0 -0
- data/lib/ferret/_v.cfs +0 -0
- data/lib/ferret/_y.cfs +0 -0
- data/lib/ferret/segments +0 -0
- data/lib/ferret/segments_15 +0 -0
- data/lib/treat/buildable.rb +0 -157
- data/lib/treat/category.rb +0 -33
- data/lib/treat/delegatable.rb +0 -116
- data/lib/treat/doable.rb +0 -45
- data/lib/treat/entities/collection.rb +0 -14
- data/lib/treat/entities/document.rb +0 -12
- data/lib/treat/entities/phrases.rb +0 -17
- data/lib/treat/entities/tokens.rb +0 -61
- data/lib/treat/entities/zones.rb +0 -41
- data/lib/treat/extractors/coreferences/stanford.rb +0 -69
- data/lib/treat/extractors/date/chronic.rb +0 -32
- data/lib/treat/extractors/date/ruby.rb +0 -25
- data/lib/treat/extractors/keywords/topics_tf_idf.rb +0 -48
- data/lib/treat/extractors/language/language_extractor.rb +0 -27
- data/lib/treat/extractors/named_entity_tag/stanford.rb +0 -53
- data/lib/treat/extractors/roles/naive.rb +0 -73
- data/lib/treat/extractors/statistics/frequency_in.rb +0 -16
- data/lib/treat/extractors/statistics/position_in.rb +0 -14
- data/lib/treat/extractors/statistics/tf_idf.rb +0 -104
- data/lib/treat/extractors/statistics/transition_matrix.rb +0 -105
- data/lib/treat/extractors/statistics/transition_probability.rb +0 -57
- data/lib/treat/extractors/topic_words/lda/data.dat +0 -46
- data/lib/treat/extractors/topic_words/lda/wiki.yml +0 -121
- data/lib/treat/extractors/topics/reuters/industry.xml +0 -2717
- data/lib/treat/extractors/topics/reuters/region.xml +0 -13586
- data/lib/treat/extractors/topics/reuters/topics.xml +0 -17977
- data/lib/treat/feature.rb +0 -58
- data/lib/treat/features.rb +0 -7
- data/lib/treat/formatters/visualizers/short_value.rb +0 -29
- data/lib/treat/formatters/visualizers/txt.rb +0 -45
- data/lib/treat/group.rb +0 -106
- data/lib/treat/helpers/linguistics_loader.rb +0 -18
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +0 -42
- data/lib/treat/inflectors/conjugations/linguistics.rb +0 -36
- data/lib/treat/inflectors/declensions/english.rb +0 -319
- data/lib/treat/inflectors/declensions/linguistics.rb +0 -42
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +0 -20
- data/lib/treat/inflectors/stem/porter.rb +0 -162
- data/lib/treat/inflectors/stem/porter_c.rb +0 -26
- data/lib/treat/inflectors/stem/uea.rb +0 -30
- data/lib/treat/install.rb +0 -59
- data/lib/treat/languages/tags.rb +0 -377
- data/lib/treat/lexicalizers/category/from_tag.rb +0 -49
- data/lib/treat/lexicalizers/linkages/naive.rb +0 -63
- data/lib/treat/lexicalizers/synsets/wordnet.rb +0 -76
- data/lib/treat/lexicalizers/tag/brill.rb +0 -91
- data/lib/treat/lexicalizers/tag/lingua.rb +0 -123
- data/lib/treat/lexicalizers/tag/stanford.rb +0 -70
- data/lib/treat/processors/segmenters/punkt/dutch.yaml +0 -9716
- data/lib/treat/processors/segmenters/punkt/english.yaml +0 -10340
- data/lib/treat/processors/segmenters/punkt/french.yaml +0 -43159
- data/lib/treat/processors/segmenters/punkt/german.yaml +0 -9572
- data/lib/treat/processors/segmenters/punkt/greek.yaml +0 -6050
- data/lib/treat/processors/segmenters/punkt/italian.yaml +0 -14748
- data/lib/treat/processors/segmenters/punkt/polish.yaml +0 -9751
- data/lib/treat/processors/segmenters/punkt/portuguese.yaml +0 -13662
- data/lib/treat/processors/segmenters/punkt/russian.yaml +0 -4237
- data/lib/treat/processors/segmenters/punkt/spanish.yaml +0 -24034
- data/lib/treat/processors/segmenters/punkt/swedish.yaml +0 -10001
- data/lib/treat/processors/tokenizers/macintyre.rb +0 -77
- data/lib/treat/processors/tokenizers/multilingual.rb +0 -30
- data/lib/treat/registrable.rb +0 -28
- data/lib/treat/sugar.rb +0 -50
- data/lib/treat/viewable.rb +0 -29
- data/lib/treat/visitable.rb +0 -28
- data/test/profile.rb +0 -2
- data/test/tc_entity.rb +0 -117
- data/test/tc_extractors.rb +0 -73
- data/test/tc_formatters.rb +0 -41
- data/test/tc_inflectors.rb +0 -34
- data/test/tc_lexicalizers.rb +0 -32
- data/test/tc_processors.rb +0 -50
- data/test/tc_resources.rb +0 -22
- data/test/tc_treat.rb +0 -60
- data/test/tc_tree.rb +0 -60
- data/test/tests.rb +0 -20
- data/test/texts.rb +0 -19
- data/test/texts/english/half_cocked_basel.txt +0 -16
- data/test/texts/english/hose_and_dry.doc +0 -0
- data/test/texts/english/hungarys_troubles.abw +0 -70
- data/test/texts/english/long.html +0 -24
- data/test/texts/english/long.txt +0 -22
- data/test/texts/english/medium.txt +0 -5
- data/test/texts/english/republican_nomination.pdf +0 -0
- data/test/texts/english/saving_the_euro.odt +0 -0
- data/test/texts/english/short.txt +0 -3
- data/test/texts/english/zero_sum.html +0 -111
@@ -1,38 +1,45 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
1
|
+
# An adapter for the 'tactful_tokenizer' gem, which
|
2
|
+
# detects sentence boundaries based on a Naive Bayesian
|
3
|
+
# statistical model.
|
4
|
+
#
|
5
|
+
# Project website: https://github.com/SlyShy/Tackful-Tokenizer
|
6
|
+
#
|
7
|
+
# Original paper: Dan Gillick. 2009. Sentence Boundary Detection
|
8
|
+
# and the Problem with the U.S. University of California, Berkeley.
|
9
|
+
# http://dgillick.com/resource/sbd_naacl_2009.pdf
|
10
|
+
module Treat::Processors::Segmenters::Tactful
|
11
|
+
|
12
|
+
# Require the 'tactful_tokenizer' gem.
|
13
|
+
silence_warnings { require 'tactful_tokenizer' }
|
14
|
+
|
15
|
+
# Remove the String#tokenize method defined by the 'tactful_tokenizer' gem.
|
16
|
+
String.class_eval { undef :tokenize }
|
17
|
+
|
18
|
+
require 'treat/helpers/decimal_point_escaper'
|
19
|
+
|
20
|
+
# Keep only one copy of the segmenter.
|
21
|
+
@@segmenter = nil
|
22
|
+
|
23
|
+
# Break the text of +entity+ into sentences with the
# 'tactful_tokenizer' gem and append one phrase per
# sentence to +entity+.
#
# Options: none (+options+ is not read).
def self.segment(entity, options = {})
  entity.check_hasnt_children

  text = entity.to_s
  escaper = Treat::Helpers::DecimalPointEscaper
  escaper.escape!(text)

  # Insert a space wherever '!', '?', or a '.' preceded by a
  # character other than '.', '?' and '!', is directly followed
  # by a non-whitespace character.
  text.gsub!(/([^\.\?!]\.|\!|\?)([^\s])/, '\1 \2')

  # The model is created on first use and kept for later calls.
  @@segmenter = TactfulTokenizer::Model.new unless @@segmenter

  @@segmenter.tokenize_text(text).each do |sentence|
    # Undo the escaping applied above before building the phrase.
    escaper.unescape!(sentence)
    entity << Treat::Entities::Phrase.from_string(sentence)
  end
end
|
38
|
-
|
44
|
+
|
45
|
+
end
|
@@ -1,96 +1,128 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
# Isolate ellipses
|
35
|
-
# s{\.\.\.} { ... }g;
|
36
|
-
text.gsub!(/\.\.\./,' ... ')
|
37
|
-
# Isolate any embedded punctuation chars
|
38
|
-
# s{([,;:\@\#\$\%&])} { $1 }g;
|
39
|
-
text.gsub!(/([,;:\@\#\$\%&])/, ' \1 ')
|
40
|
-
# Assume sentence tokenization has been done first, so split FINAL
|
41
|
-
# periods only.
|
42
|
-
# s/ ([^.]) \. ([\]\)\}\>\"\']*) [ \t]* $ /$1 .$2 /gx;
|
43
|
-
text.gsub!(/ ([^.]) \. ([\]\)\}\>\"\']*) [ \t]* $ /x, '\1 .\2 ')
|
44
|
-
# however, we may as well split ALL question marks and exclamation points,
|
45
|
-
# since they shouldn't have the abbrev.-marker ambiguity problem
|
46
|
-
#s{([?!])} { $1 }g;
|
47
|
-
text.gsub!(/([?!])/, ' \1 ')
|
48
|
-
# parentheses, brackets, etc.
|
49
|
-
#s{([\]\[\(\)\{\}\<\>])} { $1 }g;
|
50
|
-
text.gsub!(/([\]\[\(\)\{\}\<\>])/,' \1 ')
|
51
|
-
#s/(-{2,})/ $1 /g;
|
52
|
-
text.gsub!(/(-{2,})/,' \1 ')
|
53
|
-
# Add a space to the beginning and end of each line, to reduce
|
54
|
-
# necessary number of regexps below.
|
55
|
-
#s/$/ /;
|
56
|
-
text.gsub!(/$/," ")
|
57
|
-
#s/^/ /;
|
58
|
-
text.gsub!(/^/," ")
|
59
|
-
# possessive or close-single-quote
|
60
|
-
#s/\([^\']\)\' /$1 \' /g;
|
61
|
-
text.gsub!(/\([^\']\)\' /,%q(\1 ' ))
|
62
|
-
# as in it's, I'm, we'd
|
63
|
-
#s/\'([smd]) / \'$1 /ig;
|
64
|
-
text.gsub!(/\'([smd]) /i,%q( '\1 ))
|
65
|
-
#s/\'(ll|re|ve) / \'$1 /ig;
|
66
|
-
text.gsub!(/\'(ll|re|ve) /i,%q( '\1 ))
|
67
|
-
#s/n\'t / n\'t /ig;
|
68
|
-
text.gsub!(/n\'t /i," n't ")
|
69
|
-
#s/ (can)(not) / $1 $2 /ig;
|
70
|
-
text.gsub!(/ (can)(not) /i,' \1 \2 ')
|
71
|
-
#s/ (d\')(ye) / $1 $2 /ig;
|
72
|
-
text.gsub!(/ (d\')(ye) /i,' \1 \2 ')
|
73
|
-
#s/ (gim)(me) / $1 $2 /ig;
|
74
|
-
text.gsub!(/ (gim)(me) /i,' \1 \2 ')
|
75
|
-
#s/ (gon)(na) / $1 $2 /ig;
|
76
|
-
text.gsub!(/ (gon)(na) /i,' \1 \2 ')
|
77
|
-
#s/ (got)(ta) / $1 $2 /ig;
|
78
|
-
text.gsub!(/ (got)(ta) /i,' \1 \2 ')
|
79
|
-
#s/ (lem)(me) / $1 $2 /ig;
|
80
|
-
text.gsub!(/ (lem)(me) /i,' \1 \2 ')
|
81
|
-
#s/ (more)(\'n) / $1 $2 /ig;
|
82
|
-
text.gsub!(/ (more)(\'n) /i,' \1 \2 ')
|
83
|
-
#s/ (\'t)(is|was) / $1 $2 /ig;
|
84
|
-
text.gsub!(/ (\'t)(is|was) /i,' \1 \2 ')
|
85
|
-
#s/ (wan)(na) / $1 $2 /ig;
|
86
|
-
text.gsub!(/ (wan)(na) /i,' \1 \2 ')
|
87
|
-
tokens = text.split(/\s/)
|
88
|
-
tokens[1..-1].each do |token|
|
89
|
-
next if token =~ /([[:space:]]+)/
|
90
|
-
entity << Treat::Entities::Token.from_string(token)
|
91
|
-
end
|
92
|
-
end
|
93
|
-
end
|
2
|
+
#
|
3
|
+
# Tokenize the entity using a native rule-based
|
4
|
+
# algorithm. This tokenizer is a port from an
|
5
|
+
# unknown Perl module, which I have lifted from
|
6
|
+
# the 'rbtagger' gem.
|
7
|
+
#
|
8
|
+
# Author: Todd A. Fisher
|
9
|
+
#
|
10
|
+
# This code is free to use under the terms of
|
11
|
+
# the MIT license.
|
12
|
+
#
|
13
|
+
# Original project website:
|
14
|
+
#
|
15
|
+
# https://github.com/taf2/rb-brill-tagger
|
16
|
+
module Treat::Processors::Tokenizers::Perl
|
17
|
+
|
18
|
+
require 'treat/helpers/decimal_point_escaper'
|
19
|
+
|
20
|
+
# Tokenize the entity using a rule-based algorithm
# ported from Perl by Todd A. Fisher.
#
# The entity's text is handed to get_tokens; every
# non-blank string after the first element of the
# result is appended to +entity+ as a token built by
# Token.from_string.
#
# Options: none (+options+ is not read).
def self.tokenize(entity, options = {})
  entity.check_hasnt_children

  # get_tokens pads the text with a leading space, so the
  # first element of its result is an empty string. drop(1)
  # skips it and, unlike tokens[1..-1], gives [] rather than
  # nil when get_tokens returns an empty array (which it does
  # for empty or blank text), so no call is made on nil.
  get_tokens(entity.to_s).drop(1).each do |token|
    next if token =~ /^\s*$/
    entity << Treat::Entities::Token.from_string(token)
  end
end
|
37
|
+
|
38
|
+
# Helper method to perform the tokenization.
#
# Runs the substitution rules below over a whitespace-normalized
# copy of +string+ (the argument itself is not modified) and
# returns the result of splitting on single whitespace characters.
# The text is padded with a leading space before the split, so a
# non-empty result starts with an empty string, and doubled spaces
# produce further empty strings. Empty or blank input gives [].
#
# The commented-out s{...}{...} lines are the Perl rules each
# statement was ported from.
def self.get_tokens(string)

  # Normalize all whitespace
  text = string.gsub(/\s+/, ' ')

  # Replace all decimal points by ^^
  Treat::Helpers::DecimalPointEscaper.escape!(text)

  # Translate some common extended ascii
  # characters to quotes
  text.gsub!(/‘/, '`')
  text.gsub!(/’/, "'")
  text.gsub!(/“/, "``")
  text.gsub!(/”/, "''")

  # Attempt to get correct directional quotes
  # s{\"\b} { `` }g;
  text.gsub!(/\"\b/, ' `` ')
  # s{\b\"} { '' }g;
  text.gsub!(/\b\"/, " '' ")
  #s{\"(?=\s)} { '' }g;
  text.gsub!(/\"(?=\s)/, " '' ")
  #s{\"} { `` }g;
  # NOTE(review): the port repeated the previous pattern here
  # instead of /\"/. After the statement above no '"' followed by
  # whitespace is left, so that statement could never match and
  # has been dropped. Porting the Perl rule as written would
  # change how a quote after sentence-final punctuation is
  # tokenized -- decide separately.

  # Isolate ellipses
  # s{\.\.\.} { ... }g;
  text.gsub!(/\.\.\./, ' ... ')
  # Isolate any embedded punctuation chars
  # s{([,;:\@\#\$\%&])} { $1 }g;
  text.gsub!(/([,;:\@\#\$\%&])/, ' \1 ')

  # Assume sentence tokenization has been
  # done first, so split FINAL
  # periods only.
  # s/ ([^.]) \. ([\]\)\}\>\"\']*)
  # [ \t]* $ /$1 .$2 /gx;
  text.gsub!(/ ([^.]) \. ([\]\)\}\>\"\']*) [ \t]* $ /x, '\1 .\2 ')
  # however, we may as well split ALL
  # question marks and exclamation points,
  # since they shouldn't have the abbrev.
  # -marker ambiguity problem
  #s{([?!])} { $1 }g;
  text.gsub!(/([?!])/, ' \1 ')
  # parentheses, brackets, etc.
  #s{([\]\[\(\)\{\}\<\>])} { $1 }g;
  text.gsub!(/([\]\[\(\)\{\}\<\>])/, ' \1 ')
  #s/(-{2,})/ $1 /g;
  text.gsub!(/(-{2,})/, ' \1 ')

  # Add a space to the beginning and end of
  # each line, to reduce # of regexps below.
  #s/$/ /;
  text.gsub!(/$/, " ")
  #s/^/ /;
  text.gsub!(/^/, " ")

  # possessive or close-single-quote
  #s/\([^\']\)\' /$1 \' /g;
  # The \( \) in the rule above are sed-style group markers. The
  # port had kept the backslashes, which made the parentheses
  # literal, so the rule could never fire (every parenthesis has
  # been surrounded by spaces by this point). With a real capture
  # group "dogs' " becomes "dogs ' ".
  text.gsub!(/([^\'])\' /, %q(\1 ' ))
  # as in it's, I'm, we'd
  #s/\'([smd]) / \'$1 /ig;
  text.gsub!(/\'([smd]) /i, %q( '\1 ))
  #s/\'(ll|re|ve) / \'$1 /ig;
  text.gsub!(/\'(ll|re|ve) /i, %q( '\1 ))
  #s/n\'t / n\'t /ig;
  text.gsub!(/n\'t /i, " n't ")

  # Split fused words in two, in the order of the Perl rules:
  #s/ (can)(not) / $1 $2 /ig;
  #s/ (d\')(ye) / $1 $2 /ig;
  #s/ (gim)(me) / $1 $2 /ig;
  #s/ (gon)(na) / $1 $2 /ig;
  #s/ (got)(ta) / $1 $2 /ig;
  #s/ (lem)(me) / $1 $2 /ig;
  #s/ (more)(\'n) / $1 $2 /ig;
  #s/ (\'t)(is|was) / $1 $2 /ig;
  #s/ (wan)(na) / $1 $2 /ig;
  [
    / (can)(not) /i,
    / (d\')(ye) /i,
    / (gim)(me) /i,
    / (gon)(na) /i,
    / (got)(ta) /i,
    / (lem)(me) /i,
    / (more)(\'n) /i,
    / (\'t)(is|was) /i,
    / (wan)(na) /i
  ].each { |rule| text.gsub!(rule, ' \1 \2 ') }

  text.split(/\s/)

end
|
127
|
+
|
96
128
|
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
# A native rule-based tokenizer based on the one
|
2
|
+
# developed by Robert MacIntyre in 1995 for the Penn
|
3
|
+
# Treebank project. This tokenizer follows the
|
4
|
+
# conventions used by the Penn Treebank.
|
5
|
+
#
|
6
|
+
# Original script:
|
7
|
+
# http://www.cis.upenn.edu/~treebank/tokenizer.sed
|
8
|
+
#
|
9
|
+
# Copyright (c) 2004 UTIYAMA Masao <mutiyama@nict.go.jp>
|
10
|
+
# All rights reserved. This program is free software;
|
11
|
+
# you can redistribute it and/or modify it under the
|
12
|
+
# same terms as Ruby itself.
|
13
|
+
module Treat::Processors::Tokenizers::PTB
|
14
|
+
|
15
|
+
require 'treat/helpers/decimal_point_escaper'
|
16
|
+
|
17
|
+
# Tokenize the entity using a native rule-based algorithm.
#
# The entity's text is cut into pieces by split, and each piece
# that contains no whitespace is appended to +entity+ as a token
# built by Token.from_string. Raises Treat::Exception when the
# entity reports that it already has children.
#
# Options: none (+options+ is not read).
def self.tokenize(entity, options = {})
  entity.check_hasnt_children

  if entity.has_children?
    message = "Cannot tokenize an #{entity.class} that already has children."
    raise Treat::Exception, message
  end

  split(entity.to_s).each do |piece|
    unless piece =~ /([[:space:]]+)/
      entity << Treat::Entities::Token.from_string(piece)
    end
  end
end
|
33
|
+
|
34
|
+
# Helper method to split the string into tokens.
#
# Works on a space-padded copy of +string+ (the argument is left
# untouched), applies the substitution rules below in order, and
# returns the whitespace-separated pieces as an array of strings.
def self.split(string)
  text = ' ' + string + ' '
  Treat::Helpers::DecimalPointEscaper.escape!(text)

  # Applies a list of [pattern, replacement] pairs in order.
  substitute = lambda do |rules|
    rules.each { |pattern, replacement| text.gsub!(pattern, replacement) }
  end

  # Whitespace, quotes, punctuation, brackets and dashes.
  substitute.call([
    [/\s+/, ' '],
    [/(\s+)''/, '\1"'],
    [/(\s+)``/, '\1"'],
    [/''(\s+)/, '"\1'],
    [/``(\s+)/, '"\1'],
    [/ (['`]+)([^0-9].+) /, ' \1 \2 '],
    [/([ (\[{<])"/, '\1 `` '],
    [/\.\.\./, ' ... '],
    [/[,;:@\#$%&]/, ' \& '],
    [/([^.])([.])([\])}>"']*)[ ]*$/, '\1 \2\3 '],
    [/[?!]/, ' \& '],
    [/[\]\[(){}<>]/, ' \& '],
    [/--/, ' -- ']
  ])

  # Pad both ends once more so the rules below can rely on a
  # space before and after every word.
  text.sub!(/$/, ' ')
  text.sub!(/^/, ' ')

  # Remaining double quotes, possessives and 's / 'm / 'd.
  substitute.call([
    [/"/, %q( '' )],
    [/([^'])' /, %q(\1 ' )],
    [/'([sSmMdD]) /, %q( '\1 )]
  ])

  # Contracted endings, lower case first, then upper case.
  %w['ll 're 've n't 'LL 'RE 'VE N'T].each do |ending|
    text.gsub!("#{ending} ", " #{ending} ")
  end

  # Fused words that are written as two tokens.
  substitute.call([
    [/ ([Cc])annot /, ' \1an not '],
    [/ ([Dd])'ye /, %q( \1' ye )],
    [/ ([Gg])imme /, ' \1im me '],
    [/ ([Gg])onna /, ' \1on na '],
    [/ ([Gg])otta /, ' \1ot ta '],
    [/ ([Ll])emme /, ' \1em me '],
    [/ ([Mm])ore'n /, %q( \1ore 'n )],
    [/ '([Tt])is /, %q( '\1 is )],
    [/ '([Tt])was /, %q( '\1 was )],
    [/ ([Ww])anna /, ' \1an na ']
  ])

  # Re-join digit groups whose comma was spaced out above, e.g.
  # "1 , 000" becomes "1,000" again. sub! returns nil once nothing
  # is left to replace, which ends the loop.
  loop do
    break unless text.sub!(/(\s)([0-9]+) , ([0-9]+)(\s)/, '\1\2,\3\4')
  end

  text.gsub!(/\//, ' / ')
  text.gsub!(/\s+/, ' ')
  text.strip.split(/\s+/)
end
|
80
|
+
|
81
|
+
end
|
@@ -1,45 +1,51 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
1
|
+
# A tokenizer that was lifted from the 'punkt-segmenter'
|
2
|
+
# Ruby gem.
|
3
|
+
#
|
4
|
+
# This code follows the terms and conditions of Apache
|
5
|
+
# License v2 (http://www.apache.org/licenses/LICENSE-2.0)
|
6
|
+
#
|
7
|
+
# Authors: Willy <willy@csse.unimelb.edu.au>
|
8
|
+
# (original Python port), Steven Bird
|
9
|
+
# <sb@csse.unimelb.edu.au> (additions),
|
10
|
+
# Edward Loper <edloper@gradient.cis.upenn.edu>
|
11
|
+
# (rewrite), Joel Nothman <jnothman@student.usyd.edu.au>
|
12
|
+
# (almost rewrite).
|
13
|
+
#
|
14
|
+
# Project website: https://github.com/lfcipriani/punkt-segmenter
|
15
|
+
class Treat::Processors::Tokenizers::Punkt
|
16
|
+
|
17
|
+
require 'treat/helpers/decimal_point_escaper'
|
18
|
+
|
19
|
+
# The characters '.', '?' and '!'. tokenize checks the last
# character of each match against this list. Frozen so the
# shared list cannot be modified by accident.
SentEndChars = ['.', '?', '!'].freeze
# The same three characters as a character class.
ReSentEndChars = /[.?!]/
# The characters ',', ':' and ';'. Frozen like SentEndChars.
InternalPunctuation = [',', ':', ';'].freeze
# One or more closing quotes or brackets at the start of a line,
# followed by whitespace, by "--", or by the end of the line.
ReBoundaryRealignment = /^["\')\]}]+?(?:\s+|(?=--)|$)/m
# Any single character that may begin a word, i.e. anything but
# the listed quote, bracket and punctuation characters.
ReWordStart = /[^\(\"\`{\[:;&\#\*@\)}\]\-,]/
# A single character that always ends the word before it.
ReNonWordChars = /(?:[?!)\";}\]\*:@\'\({\[])/
# Punctuation made of several characters: two or more hyphens,
# two or more periods, or periods separated by whitespace.
ReMultiCharPunct = /(?:\-{2,}|\.{2,}|(?:\.\s){2,}\.)/
# Matches, in this order of preference: a multi-character
# punctuation mark; a word (the shortest run of non-whitespace
# that starts with a ReWordStart character and is followed by
# whitespace, the end of a line, a ReNonWordChars character,
# multi-character punctuation, or a comma that itself precedes
# one of these); or any single non-whitespace character.
ReWordTokenizer = /#{ReMultiCharPunct}|(?=#{ReWordStart})\S+?(?=\s|$|#{ReNonWordChars}|#{ReMultiCharPunct}|,(?=$|\s|#{ReNonWordChars}|#{ReMultiCharPunct}))|\S/
# A run of non-whitespace ending in one of ReSentEndChars. What
# follows is captured as after_tok (a ReNonWordChars character,
# or whitespace plus the next token), with that next token also
# captured as next_tok.
RePeriodContext = /\S*#{ReSentEndChars}(?=(?<after_tok>#{ReNonWordChars}|\s+(?<next_tok>\S+)))/
|
28
|
+
|
29
|
+
# Tokenize the text using the algorithm lifted from
# the Punkt tokenizer gem.
#
# Every match of ReWordTokenizer in the entity's text is
# appended to +entity+ as a token built by Token.from_string.
# A match that ends in one of SentEndChars is appended as two
# tokens, the stem and the final mark. A match that is nothing
# but punctuation (a lone mark, or a multi-character mark such
# as "..." matched by ReMultiCharPunct) is kept whole, so that
# no empty token is created and an ellipsis is not cut into
# ".." and ".".
#
# Options: none (+options+ is not read).
def self.tokenize(entity, options = {})

  entity.check_hasnt_children

  s = entity.to_s
  # NOTE(review): the text is escaped here but never unescaped
  # in this method -- confirm that is handled elsewhere.
  Treat::Helpers::DecimalPointEscaper.escape!(s)

  # Matches a token that consists only of multi-character
  # punctuation; built once, outside the loop.
  only_punct = /\A#{ReMultiCharPunct}\z/

  s.scan(ReWordTokenizer).each do |token|
    has_final_mark = token.length > 1 &&
      SentEndChars.include?(token[-1]) &&
      token !~ only_punct
    if has_final_mark
      entity << Treat::Entities::Token.from_string(token[0..-2])
      entity << Treat::Entities::Token.from_string(token[-1..-1])
    else
      entity << Treat::Entities::Token.from_string(token)
    end
  end

end
|
45
|
-
|
50
|
+
|
51
|
+
end
|