treat 1.0.6 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +2 -4
- data/README.md +13 -12
- data/bin/MANIFEST +1 -0
- data/bin/stanford/bridge.jar +0 -0
- data/bin/stanford/joda-time.jar +0 -0
- data/bin/stanford/stanford-corenlp.jar +0 -0
- data/bin/stanford/stanford-parser.jar +0 -0
- data/bin/stanford/xom.jar +0 -0
- data/files/{www.economist.com/21552208 → 21552208.html} +86 -89
- data/files/{guides.rubyonrails.org/3_2_release_notes.html → 3_2_release_notes.html} +0 -0
- data/files/{INFO → MANIFEST} +0 -0
- data/files/{www.rubyinside.com/nethttp-cheat-sheet-2940.html → nethttp-cheat-sheet-2940.html} +12 -16
- data/files/weather-central-canada-heat-wave.html +1370 -0
- data/lib/treat/config/core/acronyms.rb +4 -0
- data/lib/treat/config/core/encodings.rb +8 -0
- data/lib/treat/config/core/entities.rb +2 -0
- data/lib/treat/config/core/language.rb +3 -0
- data/lib/treat/config/core/paths.rb +8 -0
- data/lib/treat/config/core/syntax.rb +1 -0
- data/lib/treat/config/core/verbosity.rb +1 -0
- data/lib/treat/config/databases/mongo.rb +3 -0
- data/lib/treat/config/languages/agnostic.rb +34 -0
- data/lib/treat/config/languages/arabic.rb +13 -0
- data/lib/treat/config/languages/chinese.rb +13 -0
- data/lib/treat/config/languages/dutch.rb +12 -0
- data/lib/treat/config/languages/english.rb +60 -0
- data/lib/treat/config/languages/french.rb +18 -0
- data/lib/treat/config/languages/german.rb +18 -0
- data/lib/treat/config/languages/greek.rb +12 -0
- data/lib/treat/config/languages/italian.rb +12 -0
- data/lib/treat/config/languages/polish.rb +12 -0
- data/lib/treat/config/languages/portuguese.rb +12 -0
- data/lib/treat/config/languages/russian.rb +12 -0
- data/lib/treat/config/languages/spanish.rb +12 -0
- data/lib/treat/config/languages/swedish.rb +12 -0
- data/lib/treat/config/libraries/stanford.rb +1 -0
- data/lib/treat/config/linguistics/categories.rb +4 -0
- data/lib/treat/config/linguistics/punctuation.rb +33 -0
- data/lib/treat/config/tags/aligned.rb +221 -0
- data/lib/treat/config/tags/enju.rb +71 -0
- data/lib/treat/config/tags/paris7.rb +17 -0
- data/lib/treat/config/tags/ptb.rb +15 -0
- data/lib/treat/config/workers/extractors.rb +39 -0
- data/lib/treat/config/workers/formatters.rb +20 -0
- data/lib/treat/config/workers/inflectors.rb +27 -0
- data/lib/treat/config/workers/learners.rb +6 -0
- data/lib/treat/config/workers/lexicalizers.rb +18 -0
- data/lib/treat/config/workers/list.rb +1 -0
- data/lib/treat/config/workers/processors.rb +19 -0
- data/lib/treat/config/workers/retrievers.rb +12 -0
- data/lib/treat/config.rb +125 -0
- data/lib/treat/{classification.rb → core/classification.rb} +1 -1
- data/lib/treat/{data_set.rb → core/data_set.rb} +1 -4
- data/lib/treat/{tree.rb → core/node.rb} +5 -5
- data/lib/treat/core/server.rb +3 -0
- data/lib/treat/core.rb +5 -0
- data/lib/treat/entities/abilities/buildable.rb +61 -56
- data/lib/treat/entities/abilities/checkable.rb +2 -2
- data/lib/treat/entities/abilities/comparable.rb +21 -0
- data/lib/treat/entities/abilities/copyable.rb +2 -0
- data/lib/treat/entities/abilities/countable.rb +1 -1
- data/lib/treat/entities/abilities/debuggable.rb +1 -1
- data/lib/treat/entities/abilities/delegatable.rb +42 -36
- data/lib/treat/entities/abilities/doable.rb +2 -2
- data/lib/treat/entities/abilities/exportable.rb +1 -1
- data/lib/treat/entities/abilities/iterable.rb +21 -33
- data/lib/treat/entities/abilities/magical.rb +8 -8
- data/lib/treat/entities/abilities/registrable.rb +0 -38
- data/lib/treat/entities/abilities/stringable.rb +19 -19
- data/lib/treat/entities/collection.rb +31 -0
- data/lib/treat/entities/document.rb +10 -0
- data/lib/treat/entities/entity.rb +18 -13
- data/lib/treat/entities/group.rb +15 -0
- data/lib/treat/entities/section.rb +13 -0
- data/lib/treat/entities/token.rb +35 -0
- data/lib/treat/entities/zone.rb +11 -0
- data/lib/treat/entities.rb +5 -75
- data/lib/treat/helpers/didyoumean.rb +57 -0
- data/lib/treat/helpers/escaping.rb +15 -0
- data/lib/treat/helpers/formatting.rb +41 -0
- data/lib/treat/helpers/platform.rb +15 -0
- data/lib/treat/helpers/reflection.rb +17 -0
- data/lib/treat/helpers/temporary.rb +27 -0
- data/lib/treat/helpers/verbosity.rb +19 -0
- data/lib/treat/helpers.rb +5 -0
- data/lib/treat/installer.rb +46 -165
- data/lib/treat/loaders/linguistics.rb +22 -27
- data/lib/treat/loaders/stanford.rb +23 -41
- data/lib/treat/loaders.rb +10 -0
- data/lib/treat/proxies.rb +73 -24
- data/lib/treat/version.rb +3 -0
- data/lib/treat/{extractors → workers/extractors}/keywords/tf_idf.rb +1 -1
- data/lib/treat/{extractors → workers/extractors}/language/what_language.rb +11 -4
- data/lib/treat/{extractors → workers/extractors}/name_tag/stanford.rb +3 -4
- data/lib/treat/{extractors → workers/extractors}/tf_idf/native.rb +4 -5
- data/lib/treat/{extractors → workers/extractors}/time/chronic.rb +1 -1
- data/lib/treat/{extractors → workers/extractors}/time/nickel.rb +1 -1
- data/lib/treat/{extractors → workers/extractors}/time/ruby.rb +1 -1
- data/lib/treat/{extractors → workers/extractors}/topic_words/lda.rb +1 -1
- data/lib/treat/{extractors → workers/extractors}/topics/reuters.rb +4 -4
- data/lib/treat/{formatters → workers/formatters}/readers/abw.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/autoselect.rb +10 -3
- data/lib/treat/{formatters → workers/formatters}/readers/doc.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/html.rb +4 -4
- data/lib/treat/{formatters → workers/formatters}/readers/image.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/odt.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/pdf.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/txt.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/xml.rb +2 -2
- data/lib/treat/workers/formatters/serializers/mongo.rb +60 -0
- data/lib/treat/{formatters → workers/formatters}/serializers/xml.rb +1 -2
- data/lib/treat/{formatters → workers/formatters}/serializers/yaml.rb +1 -1
- data/lib/treat/{formatters → workers/formatters}/unserializers/autoselect.rb +3 -1
- data/lib/treat/workers/formatters/unserializers/mongo.rb +80 -0
- data/lib/treat/{formatters → workers/formatters}/unserializers/xml.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/unserializers/yaml.rb +1 -1
- data/lib/treat/{formatters → workers/formatters}/visualizers/dot.rb +1 -1
- data/lib/treat/{formatters → workers/formatters}/visualizers/standoff.rb +2 -3
- data/lib/treat/{formatters → workers/formatters}/visualizers/tree.rb +1 -1
- data/lib/treat/{groupable.rb → workers/group.rb} +6 -12
- data/lib/treat/{inflectors → workers/inflectors}/cardinalizers/linguistics.rb +7 -2
- data/lib/treat/{inflectors → workers/inflectors}/conjugators/linguistics.rb +11 -11
- data/lib/treat/{inflectors → workers/inflectors}/declensors/active_support.rb +2 -2
- data/lib/treat/{inflectors → workers/inflectors}/declensors/english/inflect.rb +1 -1
- data/lib/treat/{inflectors → workers/inflectors}/declensors/english.rb +2 -2
- data/lib/treat/{inflectors → workers/inflectors}/declensors/linguistics.rb +4 -4
- data/lib/treat/{inflectors → workers/inflectors}/ordinalizers/linguistics.rb +8 -2
- data/lib/treat/{inflectors → workers/inflectors}/stemmers/porter.rb +2 -2
- data/lib/treat/{inflectors → workers/inflectors}/stemmers/porter_c.rb +1 -1
- data/lib/treat/{inflectors → workers/inflectors}/stemmers/uea.rb +1 -1
- data/lib/treat/{ai → workers/learners}/classifiers/id3.rb +1 -1
- data/lib/treat/{ai → workers/learners}/classifiers/mlp.rb +1 -1
- data/lib/treat/{lexicalizers → workers/lexicalizers}/categorizers/from_tag.rb +9 -9
- data/lib/treat/{lexicalizers → workers/lexicalizers}/sensers/wordnet/synset.rb +2 -2
- data/lib/treat/{lexicalizers → workers/lexicalizers}/sensers/wordnet.rb +4 -4
- data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/brill/patch.rb +2 -2
- data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/brill.rb +2 -8
- data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/lingua.rb +1 -6
- data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/stanford.rb +31 -42
- data/lib/treat/workers/processors/chunkers/autoselect.rb +19 -0
- data/lib/treat/{processors → workers/processors}/chunkers/html.rb +4 -3
- data/lib/treat/workers/processors/chunkers/txt.rb +32 -0
- data/lib/treat/{processors → workers/processors}/parsers/enju.rb +3 -3
- data/lib/treat/{processors → workers/processors}/parsers/stanford.rb +6 -8
- data/lib/treat/{processors → workers/processors}/segmenters/punkt.rb +6 -10
- data/lib/treat/{processors → workers/processors}/segmenters/stanford.rb +2 -2
- data/lib/treat/{processors → workers/processors}/segmenters/tactful.rb +3 -6
- data/lib/treat/{processors → workers/processors}/tokenizers/ptb.rb +6 -5
- data/lib/treat/{processors → workers/processors}/tokenizers/punkt.rb +1 -1
- data/lib/treat/{processors → workers/processors}/tokenizers/stanford.rb +1 -1
- data/lib/treat/{processors → workers/processors}/tokenizers/tactful.rb +3 -5
- data/lib/treat/{retrievers → workers/retrievers}/indexers/ferret.rb +1 -1
- data/lib/treat/{retrievers → workers/retrievers}/searchers/ferret.rb +1 -1
- data/lib/treat/workers.rb +96 -0
- data/lib/treat.rb +23 -49
- data/spec/collection.rb +4 -4
- data/spec/document.rb +5 -5
- data/spec/entity.rb +33 -32
- data/spec/{tree.rb → node.rb} +5 -5
- data/spec/phrase.rb +5 -39
- data/spec/sandbox.rb +212 -6
- data/spec/token.rb +12 -9
- data/spec/treat.rb +12 -9
- data/spec/word.rb +10 -9
- data/spec/zone.rb +6 -2
- data/tmp/{INFO → MANIFEST} +0 -0
- data/tmp/english.yaml +10340 -0
- metadata +149 -139
- data/lib/treat/ai.rb +0 -12
- data/lib/treat/categories.rb +0 -90
- data/lib/treat/categorizable.rb +0 -44
- data/lib/treat/configurable.rb +0 -115
- data/lib/treat/dependencies.rb +0 -25
- data/lib/treat/downloader.rb +0 -87
- data/lib/treat/entities/abilities.rb +0 -10
- data/lib/treat/entities/entities.rb +0 -102
- data/lib/treat/exception.rb +0 -7
- data/lib/treat/extractors.rb +0 -79
- data/lib/treat/formatters/serializers/mongo.rb +0 -64
- data/lib/treat/formatters.rb +0 -41
- data/lib/treat/helpers/decimal_point_escaper.rb +0 -22
- data/lib/treat/inflectors.rb +0 -52
- data/lib/treat/kernel.rb +0 -208
- data/lib/treat/languages/arabic.rb +0 -16
- data/lib/treat/languages/chinese.rb +0 -16
- data/lib/treat/languages/dutch.rb +0 -16
- data/lib/treat/languages/english.rb +0 -63
- data/lib/treat/languages/french.rb +0 -20
- data/lib/treat/languages/german.rb +0 -20
- data/lib/treat/languages/greek.rb +0 -16
- data/lib/treat/languages/italian.rb +0 -17
- data/lib/treat/languages/language.rb +0 -10
- data/lib/treat/languages/list.txt +0 -504
- data/lib/treat/languages/polish.rb +0 -16
- data/lib/treat/languages/portuguese.rb +0 -16
- data/lib/treat/languages/russian.rb +0 -16
- data/lib/treat/languages/spanish.rb +0 -16
- data/lib/treat/languages/swedish.rb +0 -16
- data/lib/treat/languages.rb +0 -132
- data/lib/treat/lexicalizers.rb +0 -37
- data/lib/treat/object.rb +0 -7
- data/lib/treat/processors/chunkers/autoselect.rb +0 -16
- data/lib/treat/processors/chunkers/txt.rb +0 -21
- data/lib/treat/processors.rb +0 -38
- data/lib/treat/retrievers.rb +0 -27
- data/lib/treat/server.rb +0 -26
- data/lib/treat/universalisation/encodings.rb +0 -12
- data/lib/treat/universalisation/tags.rb +0 -453
- data/lib/treat/universalisation.rb +0 -9
- data/spec/languages.rb +0 -25
@@ -1,4 +1,4 @@
|
|
1
|
-
class Treat::Processors::Chunkers::HTML
|
1
|
+
class Treat::Workers::Processors::Chunkers::HTML
|
2
2
|
|
3
3
|
require 'nokogiri'
|
4
4
|
|
@@ -24,6 +24,7 @@ class Treat::Processors::Chunkers::HTML
|
|
24
24
|
node.parent && node.parent.type == :section)
|
25
25
|
|
26
26
|
if $1
|
27
|
+
|
27
28
|
lvl = $1.to_i
|
28
29
|
if lvl <= level
|
29
30
|
node.ancestors_with_type(:section).
|
@@ -41,13 +42,13 @@ class Treat::Processors::Chunkers::HTML
|
|
41
42
|
node.set :level, level
|
42
43
|
|
43
44
|
end
|
44
|
-
|
45
|
+
|
45
46
|
t = node <<
|
46
47
|
Treat::Entities::Title.new(txt)
|
47
48
|
t.set :level, level
|
48
49
|
|
49
50
|
elsif child.name == 'p'
|
50
|
-
|
51
|
+
|
51
52
|
node << Treat::Entities::Zone.
|
52
53
|
from_string(txt)
|
53
54
|
|
@@ -0,0 +1,32 @@
|
|
1
|
+
class Treat::Workers::Processors::Chunkers::TXT
|
2
|
+
|
3
|
+
# Separates a string into
|
4
|
+
# zones on the basis of newlines.
|
5
|
+
#
|
6
|
+
# Options: none.
|
7
|
+
def self.chunk(entity, options = {})
|
8
|
+
|
9
|
+
entity.check_hasnt_children
|
10
|
+
zones = entity.to_s.split("\n")
|
11
|
+
current = entity
|
12
|
+
zones.each do |zone|
|
13
|
+
zone.strip!
|
14
|
+
next if zone == ''
|
15
|
+
c = Treat::Entities::
|
16
|
+
Zone.from_string(zone)
|
17
|
+
if c.type == :title
|
18
|
+
if current.type == :section
|
19
|
+
current = current.parent
|
20
|
+
current = entity << Treat::
|
21
|
+
Entities::Section.new
|
22
|
+
else
|
23
|
+
current = entity << Treat::
|
24
|
+
Entities::Section.new
|
25
|
+
end
|
26
|
+
end
|
27
|
+
current << c
|
28
|
+
end
|
29
|
+
|
30
|
+
end
|
31
|
+
|
32
|
+
end
|
@@ -10,7 +10,7 @@
|
|
10
10
|
# Takuya Matsuzaki, Yusuke Miyao, and Jun'ichi Tsujii.
|
11
11
|
# 2007. Efficient HPSG Parsing with Supertagging and
|
12
12
|
# CFG-filtering. In Proceedings of IJCAI 2007.
|
13
|
-
module Treat::Processors::Parsers::Enju
|
13
|
+
module Treat::Workers::Processors::Parsers::Enju
|
14
14
|
|
15
15
|
# Require the 'open3' library to connect
|
16
16
|
# with the background Enju process.
|
@@ -23,10 +23,10 @@ module Treat::Processors::Parsers::Enju
|
|
23
23
|
@@parser = nil
|
24
24
|
|
25
25
|
# A hash of Enju cat tags mapped to word categories.
|
26
|
-
Ectc = Treat
|
26
|
+
Ectc = Treat.tags.enju.cat_to_category
|
27
27
|
|
28
28
|
# A hash of Enju cat/xcat pairs mapped to PTB tags.
|
29
|
-
Ecxtp = Treat
|
29
|
+
Ecxtp = Treat.tags.enju.xcat_to_ptb
|
30
30
|
|
31
31
|
# Parse the entity into its syntactical
|
32
32
|
# phrases using Enju.
|
@@ -1,8 +1,10 @@
|
|
1
1
|
# A wrapper class for the Stanford parser.
|
2
|
-
class Treat::Processors::Parsers::Stanford
|
2
|
+
class Treat::Workers::Processors::Parsers::Stanford
|
3
3
|
|
4
4
|
require 'treat/loaders/stanford'
|
5
5
|
|
6
|
+
Pttc = Treat.tags.aligned.phrase_tags_to_category
|
7
|
+
|
6
8
|
# Hold one instance of the pipeline per language.
|
7
9
|
@@parsers = {}
|
8
10
|
|
@@ -27,9 +29,7 @@ class Treat::Processors::Parsers::Stanford
|
|
27
29
|
lang = entity.language
|
28
30
|
init(lang, options)
|
29
31
|
|
30
|
-
tag_set =
|
31
|
-
StanfordTagSetForLanguage[
|
32
|
-
Treat::Languages.describe(lang)]
|
32
|
+
tag_set = StanfordCoreNLP::Config::TagSets[language]
|
33
33
|
|
34
34
|
text = ::StanfordCoreNLP::Text.new(val)
|
35
35
|
@@parsers[lang].annotate(text)
|
@@ -58,8 +58,7 @@ class Treat::Processors::Parsers::Stanford
|
|
58
58
|
def self.init(lang, options)
|
59
59
|
return if @@parsers[lang]
|
60
60
|
|
61
|
-
|
62
|
-
Treat::Loaders::Stanford.load(language)
|
61
|
+
Treat::Loaders::Stanford.load(lang)
|
63
62
|
|
64
63
|
options = DefaultOptions.merge(options)
|
65
64
|
StanfordCoreNLP.use(lang)
|
@@ -117,8 +116,7 @@ class Treat::Processors::Parsers::Stanford
|
|
117
116
|
tag_s, tag_opt = *tag.split('-')
|
118
117
|
tag_s ||= ''
|
119
118
|
|
120
|
-
if
|
121
|
-
Treat::Universalisation::Tags::PhraseTagToCategory[tag_s][tag_set]
|
119
|
+
if Pttc[tag_s] && Pttc[tag_s][tag_set]
|
122
120
|
ruby_child = Treat::Entities::Phrase.new
|
123
121
|
else
|
124
122
|
l = java_child.children[0].to_s
|
@@ -5,9 +5,7 @@
|
|
5
5
|
# Original paper: Kiss, Tibor and Strunk, Jan (2006):
|
6
6
|
# Unsupervised Multilingual Sentence Boundary Detection.
|
7
7
|
# Computational Linguistics 32: 485-525.
|
8
|
-
module Treat::Processors::Segmenters::Punkt
|
9
|
-
|
10
|
-
require 'treat/helpers/decimal_point_escaper'
|
8
|
+
module Treat::Workers::Processors::Segmenters::Punkt
|
11
9
|
|
12
10
|
# Require silently the punkt-segmenter gem.
|
13
11
|
silence_warnings { require 'punkt-segmenter' }
|
@@ -41,7 +39,8 @@ module Treat::Processors::Segmenters::Punkt
|
|
41
39
|
# Replace the point in all floating-point numbers
|
42
40
|
# by ^^; this is a fix since Punkt trips on decimal
|
43
41
|
# numbers.
|
44
|
-
|
42
|
+
|
43
|
+
escape_floats!(s)
|
45
44
|
s.gsub!(/([^\.\?!]\.|\!|\?)([^\s"'])/) { $1 + ' ' + $2 }
|
46
45
|
|
47
46
|
result = @@segmenters[lang].
|
@@ -50,8 +49,7 @@ module Treat::Processors::Segmenters::Punkt
|
|
50
49
|
|
51
50
|
result.each do |sentence|
|
52
51
|
# Unescape the sentence.
|
53
|
-
|
54
|
-
unescape!(sentence)
|
52
|
+
unescape_floats!(sentence)
|
55
53
|
entity << Treat::Entities::Phrase.
|
56
54
|
from_string(sentence)
|
57
55
|
end
|
@@ -65,13 +63,11 @@ module Treat::Processors::Segmenters::Punkt
|
|
65
63
|
if options[:model]
|
66
64
|
model = options[:model]
|
67
65
|
else
|
68
|
-
|
69
|
-
model = "#{Treat.models}punkt/#{l}.yaml"
|
70
|
-
|
66
|
+
model = "#{Treat.paths.models}punkt/#{lang}.yaml"
|
71
67
|
unless File.readable?(model)
|
72
68
|
raise Treat::Exception,
|
73
69
|
"Could not get the language model " +
|
74
|
-
"for the Punkt segmenter for #{
|
70
|
+
"for the Punkt segmenter for #{lang.to_s.capitalize}."
|
75
71
|
end
|
76
72
|
end
|
77
73
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
# A wrapper for the sentence splitter supplied by
|
2
2
|
# the Stanford parser.
|
3
|
-
class Treat::Processors::Segmenters::Stanford
|
3
|
+
class Treat::Workers::Processors::Segmenters::Stanford
|
4
4
|
|
5
5
|
require 'treat/loaders/stanford'
|
6
6
|
Treat::Loaders::Stanford.load
|
@@ -41,7 +41,7 @@ class Treat::Processors::Segmenters::Stanford
|
|
41
41
|
from_string(sentence, true)
|
42
42
|
entity << s
|
43
43
|
if options[:also_tokenize]
|
44
|
-
Treat::Processors::Tokenizers::Stanford.
|
44
|
+
Treat::Workers::Processors::Tokenizers::Stanford.
|
45
45
|
add_tokens(s, sentence.get(:tokens))
|
46
46
|
end
|
47
47
|
end
|
@@ -7,7 +7,7 @@
|
|
7
7
|
# Original paper: Dan Gillick. 2009. Sentence Boundary Detection
|
8
8
|
# and the Problem with the U.S. University of California, Berkeley.
|
9
9
|
# http://dgillick.com/resource/sbd_naacl_2009.pdf
|
10
|
-
module Treat::Processors::Segmenters::Tactful
|
10
|
+
module Treat::Workers::Processors::Segmenters::Tactful
|
11
11
|
|
12
12
|
# Require the 'tactful_tokenizer' gem.
|
13
13
|
silence_warnings { require 'tactful_tokenizer' }
|
@@ -15,8 +15,6 @@ module Treat::Processors::Segmenters::Tactful
|
|
15
15
|
# Remove function definition 'tactful_tokenizer' by gem.
|
16
16
|
String.class_eval { undef :tokenize }
|
17
17
|
|
18
|
-
require 'treat/helpers/decimal_point_escaper'
|
19
|
-
|
20
18
|
# Keep only one copy of the segmenter.
|
21
19
|
@@segmenter = nil
|
22
20
|
|
@@ -30,7 +28,7 @@ module Treat::Processors::Segmenters::Tactful
|
|
30
28
|
|
31
29
|
s = entity.to_s
|
32
30
|
|
33
|
-
|
31
|
+
escape_floats!(s)
|
34
32
|
|
35
33
|
s.gsub!(/([^\.\?!]\.|\!|\?)([^\s"'])/) { $1 + ' ' + $2 }
|
36
34
|
|
@@ -39,8 +37,7 @@ module Treat::Processors::Segmenters::Tactful
|
|
39
37
|
sentences = @@segmenter.tokenize_text(s)
|
40
38
|
|
41
39
|
sentences.each do |sentence|
|
42
|
-
|
43
|
-
puts sentence.to_s if sentence.to_s.include?('staff')
|
40
|
+
unescape_floats!(sentence)
|
44
41
|
entity << Treat::Entities::Phrase.from_string(sentence)
|
45
42
|
end
|
46
43
|
|
@@ -11,7 +11,7 @@
|
|
11
11
|
# All rights reserved. This program is free software;
|
12
12
|
# you can redistribute it and/or modify it under the
|
13
13
|
# same terms as Ruby itself.
|
14
|
-
module Treat::Processors::Tokenizers::PTB
|
14
|
+
module Treat::Workers::Processors::Tokenizers::PTB
|
15
15
|
|
16
16
|
# Tokenize the entity using a native rule-based algorithm.
|
17
17
|
def self.tokenize(entity, options = {})
|
@@ -35,14 +35,11 @@ module Treat::Processors::Tokenizers::PTB
|
|
35
35
|
|
36
36
|
s = " " + string + " "
|
37
37
|
|
38
|
-
# Translate some common extended ascii
|
39
|
-
# characters to quotes
|
40
38
|
s.gsub!(/‘/,'`')
|
41
39
|
s.gsub!(/’/,"'")
|
42
40
|
s.gsub!(/“/,"``")
|
43
41
|
s.gsub!(/”/,"''")
|
44
|
-
|
45
|
-
|
42
|
+
|
46
43
|
s.gsub!(/\s+/," ")
|
47
44
|
s.gsub!(/(\s+)''/,'\1"')
|
48
45
|
s.gsub!(/(\s+)``/,'\1"')
|
@@ -83,6 +80,10 @@ module Treat::Processors::Tokenizers::PTB
|
|
83
80
|
s.gsub!(/\//, ' / ')
|
84
81
|
s.gsub!(/\s+/,' ')
|
85
82
|
s.strip!
|
83
|
+
|
84
|
+
s.gsub!(/``/,'"')
|
85
|
+
s.gsub!(/''/,'"')
|
86
|
+
|
86
87
|
s.split(/\s+/)
|
87
88
|
end
|
88
89
|
|
@@ -12,7 +12,7 @@
|
|
12
12
|
# (almost rewrite).
|
13
13
|
#
|
14
14
|
# Project website: https://github.com/lfcipriani/punkt-segmenter
|
15
|
-
class Treat::Processors::Tokenizers::Punkt
|
15
|
+
class Treat::Workers::Processors::Tokenizers::Punkt
|
16
16
|
|
17
17
|
SentEndChars = ['.', '?', '!']
|
18
18
|
ReSentEndChars = /[.?!]/
|
@@ -4,10 +4,8 @@
|
|
4
4
|
# Released under the GNU GPL v3. Modified by Louis Mullie.
|
5
5
|
#
|
6
6
|
# Project website: https://github.com/SlyShy/Tactful_Tokenizer
|
7
|
-
class Treat::Processors::Tokenizers::Tactful
|
7
|
+
class Treat::Workers::Processors::Tokenizers::Tactful
|
8
8
|
|
9
|
-
require 'treat/helpers/decimal_point_escaper'
|
10
|
-
|
11
9
|
ReTokenize = [
|
12
10
|
# Uniform Quotes
|
13
11
|
[/''|``/, '"'],
|
@@ -52,7 +50,7 @@ class Treat::Processors::Tokenizers::Tactful
|
|
52
50
|
|
53
51
|
s = entity.to_s
|
54
52
|
|
55
|
-
|
53
|
+
escape_floats!(s)
|
56
54
|
|
57
55
|
ReTokenize.each do |rules|
|
58
56
|
s.gsub!(rules[0], rules[1])
|
@@ -60,7 +58,7 @@ class Treat::Processors::Tokenizers::Tactful
|
|
60
58
|
|
61
59
|
s.split(' ').each do |token|
|
62
60
|
|
63
|
-
|
61
|
+
unescape_floats!(token)
|
64
62
|
entity << Treat::Entities::Token.
|
65
63
|
from_string(token)
|
66
64
|
end
|
@@ -0,0 +1,96 @@
|
|
1
|
+
# This module creates all the worker categories
|
2
|
+
# and the groups within these categories and adds
|
3
|
+
# the relevant hooks on the appropriate entities.
|
4
|
+
module Treat::Workers
|
5
|
+
|
6
|
+
require 'treat/workers/group'
|
7
|
+
|
8
|
+
# A lookup table for entity types.
|
9
|
+
@@lookup = {}
|
10
|
+
|
11
|
+
# Find a worker group based on method.
|
12
|
+
def self.lookup(method)
|
13
|
+
@@lookup[method]
|
14
|
+
end
|
15
|
+
|
16
|
+
def self.create_categories
|
17
|
+
Treat.workers.list.each do |cat|
|
18
|
+
create_category(cat.to_s.
|
19
|
+
capitalize.intern,
|
20
|
+
load_category_conf(cat))
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
def self.load_category_conf(name)
|
25
|
+
config = Treat.workers[name]
|
26
|
+
if config.nil?
|
27
|
+
raise Treat::Exception,
|
28
|
+
"The configuration file " +
|
29
|
+
"for #{cat_sym} is missing."
|
30
|
+
end
|
31
|
+
config
|
32
|
+
end
|
33
|
+
|
34
|
+
def self.create_category(name, conf)
|
35
|
+
category = self.const_set(name, Module.new)
|
36
|
+
conf.each_pair do |group, worker|
|
37
|
+
name = group.to_s.capitalize.intern
|
38
|
+
category.module_eval do
|
39
|
+
@@methods = []; def methods;
|
40
|
+
@@methods; end; def groups;
|
41
|
+
self.constants; end
|
42
|
+
end
|
43
|
+
self.create_group(name, worker, category)
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
def self.create_group(name, conf, category)
|
48
|
+
group = category.const_set(name, Module.new)
|
49
|
+
self.set_group_options(group, conf)
|
50
|
+
self.bind_group_targets(group)
|
51
|
+
self.register_group_presets(group, conf)
|
52
|
+
@@methods << group.method
|
53
|
+
@@lookup[group.method] = group
|
54
|
+
end
|
55
|
+
|
56
|
+
def self.bind_group_targets(group)
|
57
|
+
group.targets.each do |entity_type|
|
58
|
+
entity = Treat::Entities.
|
59
|
+
const_get(cc(entity_type))
|
60
|
+
entity.class_eval do
|
61
|
+
add_workers group
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
def self.register_group_presets(group, conf)
|
67
|
+
return unless conf.respond_to? :presets
|
68
|
+
conf.presets.each do |m|
|
69
|
+
@@methods << m
|
70
|
+
@@lookup[m] = group
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
74
|
+
def self.set_group_options(group, conf)
|
75
|
+
group.module_eval do
|
76
|
+
extend Treat::Workers::Group
|
77
|
+
self.type = conf.type
|
78
|
+
self.targets = conf.targets
|
79
|
+
if conf.respond_to?(:default)
|
80
|
+
self.default = conf.default
|
81
|
+
end
|
82
|
+
if conf.respond_to?(:preset_option)
|
83
|
+
self.preset_option = conf.preset_option
|
84
|
+
end
|
85
|
+
if conf.respond_to?(:presets)
|
86
|
+
self.presets = conf.presets
|
87
|
+
end
|
88
|
+
if conf.respond_to?(:recursive)
|
89
|
+
self.recursive = conf.recursive
|
90
|
+
end
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
94
|
+
self.create_categories
|
95
|
+
|
96
|
+
end
|
data/lib/treat.rb
CHANGED
@@ -1,62 +1,36 @@
|
|
1
1
|
module Treat
|
2
2
|
|
3
|
-
#
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
if RUBY_VERSION <= '1.9'
|
8
|
-
raise Treat::Exception,
|
9
|
-
'Treat requires Ruby 1.9 or higher.'
|
10
|
-
end
|
11
|
-
|
12
|
-
# The current version of Treat.
|
13
|
-
VERSION = "1.0.6"
|
14
|
-
|
15
|
-
# Add methods to handle syntactic sugar,
|
16
|
-
# language configuration options, and paths.
|
17
|
-
require 'treat/configurable'
|
18
|
-
extend Treat::Configurable
|
19
|
-
|
20
|
-
# The folders in the library and descriptions.
|
21
|
-
Paths = {
|
22
|
-
:tmp => 'temporary files',
|
23
|
-
:lib => 'class and module definitions',
|
24
|
-
:bin => 'binary files',
|
25
|
-
:files => 'user-saved files',
|
26
|
-
:data => 'data set files',
|
27
|
-
:models => 'model files',
|
28
|
-
:spec => 'spec test files'
|
29
|
-
}
|
30
|
-
|
31
|
-
# Add methods to provide access to common paths.
|
32
|
-
class << self
|
33
|
-
Paths.each do |path, _|
|
34
|
-
define_method(path) do
|
35
|
-
(File.dirname(__FILE__).
|
36
|
-
split('/')[0..-2].join('/') +
|
37
|
-
'/' + path.to_s + '/').gsub(
|
38
|
-
'lib/../', '')
|
39
|
-
end
|
40
|
-
end
|
3
|
+
# Treat requires Ruby >= 1.9.2
|
4
|
+
if RUBY_VERSION < '1.9.2'
|
5
|
+
raise "Treat requires Ruby version 1.9.2 " +
|
6
|
+
"or higher, but current is #{RUBY_VERSION}."
|
41
7
|
end
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
require 'treat/
|
8
|
+
|
9
|
+
# Custom exception class.
|
10
|
+
class Exception < ::Exception; end
|
11
|
+
|
12
|
+
# Load configuration options.
|
13
|
+
require 'treat/config'
|
14
|
+
# Load all workers.
|
15
|
+
require 'treat/helpers'
|
16
|
+
# Require library loaders.
|
17
|
+
require 'treat/loaders'
|
18
|
+
# Require all core classes.
|
19
|
+
require 'treat/core'
|
20
|
+
# Require all entity classes.
|
48
21
|
require 'treat/entities'
|
49
|
-
|
50
|
-
require 'treat/
|
22
|
+
# Lazy load worker classes.
|
23
|
+
require 'treat/workers'
|
24
|
+
# Require proxies last.
|
51
25
|
require 'treat/proxies'
|
52
26
|
|
27
|
+
# Turn sugar on.
|
28
|
+
Treat::Config.sweeten!
|
29
|
+
|
53
30
|
# Install packages for a given language.
|
54
31
|
def self.install(language = :english)
|
55
32
|
require 'treat/installer'
|
56
33
|
Treat::Installer.install(language)
|
57
34
|
end
|
58
35
|
|
59
|
-
# Enable syntactic sugar by default.
|
60
|
-
Treat.sweeten!
|
61
|
-
|
62
36
|
end
|
data/spec/collection.rb
CHANGED
@@ -3,7 +3,7 @@ require_relative '../lib/treat'
|
|
3
3
|
describe Treat::Entities::Collection do
|
4
4
|
|
5
5
|
before :all do
|
6
|
-
@file = Treat.spec + 'samples/mathematicians'
|
6
|
+
@file = Treat.paths.spec + 'samples/mathematicians'
|
7
7
|
end
|
8
8
|
|
9
9
|
describe "#<<" do
|
@@ -12,7 +12,7 @@ describe Treat::Entities::Collection do
|
|
12
12
|
|
13
13
|
it "copies the document to the collection's folder " +
|
14
14
|
"and adds the document object to the collection" do
|
15
|
-
f = Treat.spec + 'samples/test'
|
15
|
+
f = Treat.paths.spec + 'samples/test'
|
16
16
|
ff = '3_2_release_notes.html'
|
17
17
|
u = 'http://guides.rubyonrails.org/' + ff
|
18
18
|
c = Treat::Entities::Collection.build(f)
|
@@ -26,7 +26,7 @@ describe Treat::Entities::Collection do
|
|
26
26
|
|
27
27
|
context "when supplied with anything else" do
|
28
28
|
it "adds the object to the collection" do
|
29
|
-
f = Treat.spec + 'samples/test'
|
29
|
+
f = Treat.paths.spec + 'samples/test'
|
30
30
|
c = Treat::Entities::Collection.build(f)
|
31
31
|
c << Treat::Entities::Document.new
|
32
32
|
c.size.should eql 2
|
@@ -53,7 +53,7 @@ describe Treat::Entities::Collection do
|
|
53
53
|
context "when supplied a folder name that doesn't exist" do
|
54
54
|
|
55
55
|
it "creates the directory and opens the collection" do
|
56
|
-
f = Treat.spec + 'samples/test'
|
56
|
+
f = Treat.paths.spec + 'samples/test'
|
57
57
|
c = Treat::Entities::Collection.build(f)
|
58
58
|
FileTest.directory?(f).should eql true
|
59
59
|
c.should be_an_instance_of Treat::Entities::Collection
|
data/spec/document.rb
CHANGED
@@ -8,7 +8,7 @@ describe Treat::Entities::Document do
|
|
8
8
|
|
9
9
|
it "returns a list of general topics the document belongs to" do
|
10
10
|
#doc = Treat::Entities::Document.new(
|
11
|
-
#Treat.spec + 'samples/mathematicians/archimedes.abw').read(:abw)
|
11
|
+
#Treat.paths.spec + 'samples/mathematicians/archimedes.abw').read(:abw)
|
12
12
|
#doc.do(:chunk, :segment, :tokenize)
|
13
13
|
#puts doc.topics.inspect
|
14
14
|
end
|
@@ -24,7 +24,7 @@ describe Treat::Entities::Document do
|
|
24
24
|
context "when supplied with a readable file name" do
|
25
25
|
it "opens the file and reads its " +
|
26
26
|
"content into a document" do
|
27
|
-
f = Treat.spec + 'samples/mathematicians/leibniz.txt'
|
27
|
+
f = Treat.paths.spec + 'samples/mathematicians/leibniz.txt'
|
28
28
|
d = Treat::Entities::Document.build(f)
|
29
29
|
d.should be_an_instance_of Treat::Entities::Document
|
30
30
|
d.to_s.index('Gottfried Leibniz').should_not eql nil
|
@@ -36,7 +36,7 @@ describe Treat::Entities::Document do
|
|
36
36
|
"a document with the contents of the file" do
|
37
37
|
url = 'http://www.rubyinside.com/nethttp-cheat-sheet-2940.html'
|
38
38
|
d = Treat::Entities::Document.build(url)
|
39
|
-
d.format.should eql
|
39
|
+
d.format.should eql 'html'
|
40
40
|
d.print_tree
|
41
41
|
d.should be_an_instance_of Treat::Entities::Document
|
42
42
|
d.to_s.index('Rubyist').should_not eql nil
|
@@ -75,7 +75,7 @@ describe Treat::Entities::Document do
|
|
75
75
|
|
76
76
|
context "when called on an HTML document" do
|
77
77
|
doc = Treat::Entities::Document.new(
|
78
|
-
Treat.spec + 'samples/mathematicians/euler.html').read(:html)
|
78
|
+
Treat.paths.spec + 'samples/mathematicians/euler.html').read(:html)
|
79
79
|
it "splits the HTML document into sections, " +
|
80
80
|
"titles, paragraphs and lists" do
|
81
81
|
doc.chunk
|
@@ -88,7 +88,7 @@ describe Treat::Entities::Document do
|
|
88
88
|
|
89
89
|
context "when called on a text document" do
|
90
90
|
|
91
|
-
doc = Treat::Entities::Document.new(Treat.spec +
|
91
|
+
doc = Treat::Entities::Document.new(Treat.paths.spec +
|
92
92
|
'samples/mathematicians/leibniz.txt').read(:txt)
|
93
93
|
it "splits the document into titles and paragraphs" do
|
94
94
|
doc.chunk
|