treat 1.0.6 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +2 -4
- data/README.md +13 -12
- data/bin/MANIFEST +1 -0
- data/bin/stanford/bridge.jar +0 -0
- data/bin/stanford/joda-time.jar +0 -0
- data/bin/stanford/stanford-corenlp.jar +0 -0
- data/bin/stanford/stanford-parser.jar +0 -0
- data/bin/stanford/xom.jar +0 -0
- data/files/{www.economist.com/21552208 → 21552208.html} +86 -89
- data/files/{guides.rubyonrails.org/3_2_release_notes.html → 3_2_release_notes.html} +0 -0
- data/files/{INFO → MANIFEST} +0 -0
- data/files/{www.rubyinside.com/nethttp-cheat-sheet-2940.html → nethttp-cheat-sheet-2940.html} +12 -16
- data/files/weather-central-canada-heat-wave.html +1370 -0
- data/lib/treat/config/core/acronyms.rb +4 -0
- data/lib/treat/config/core/encodings.rb +8 -0
- data/lib/treat/config/core/entities.rb +2 -0
- data/lib/treat/config/core/language.rb +3 -0
- data/lib/treat/config/core/paths.rb +8 -0
- data/lib/treat/config/core/syntax.rb +1 -0
- data/lib/treat/config/core/verbosity.rb +1 -0
- data/lib/treat/config/databases/mongo.rb +3 -0
- data/lib/treat/config/languages/agnostic.rb +34 -0
- data/lib/treat/config/languages/arabic.rb +13 -0
- data/lib/treat/config/languages/chinese.rb +13 -0
- data/lib/treat/config/languages/dutch.rb +12 -0
- data/lib/treat/config/languages/english.rb +60 -0
- data/lib/treat/config/languages/french.rb +18 -0
- data/lib/treat/config/languages/german.rb +18 -0
- data/lib/treat/config/languages/greek.rb +12 -0
- data/lib/treat/config/languages/italian.rb +12 -0
- data/lib/treat/config/languages/polish.rb +12 -0
- data/lib/treat/config/languages/portuguese.rb +12 -0
- data/lib/treat/config/languages/russian.rb +12 -0
- data/lib/treat/config/languages/spanish.rb +12 -0
- data/lib/treat/config/languages/swedish.rb +12 -0
- data/lib/treat/config/libraries/stanford.rb +1 -0
- data/lib/treat/config/linguistics/categories.rb +4 -0
- data/lib/treat/config/linguistics/punctuation.rb +33 -0
- data/lib/treat/config/tags/aligned.rb +221 -0
- data/lib/treat/config/tags/enju.rb +71 -0
- data/lib/treat/config/tags/paris7.rb +17 -0
- data/lib/treat/config/tags/ptb.rb +15 -0
- data/lib/treat/config/workers/extractors.rb +39 -0
- data/lib/treat/config/workers/formatters.rb +20 -0
- data/lib/treat/config/workers/inflectors.rb +27 -0
- data/lib/treat/config/workers/learners.rb +6 -0
- data/lib/treat/config/workers/lexicalizers.rb +18 -0
- data/lib/treat/config/workers/list.rb +1 -0
- data/lib/treat/config/workers/processors.rb +19 -0
- data/lib/treat/config/workers/retrievers.rb +12 -0
- data/lib/treat/config.rb +125 -0
- data/lib/treat/{classification.rb → core/classification.rb} +1 -1
- data/lib/treat/{data_set.rb → core/data_set.rb} +1 -4
- data/lib/treat/{tree.rb → core/node.rb} +5 -5
- data/lib/treat/core/server.rb +3 -0
- data/lib/treat/core.rb +5 -0
- data/lib/treat/entities/abilities/buildable.rb +61 -56
- data/lib/treat/entities/abilities/checkable.rb +2 -2
- data/lib/treat/entities/abilities/comparable.rb +21 -0
- data/lib/treat/entities/abilities/copyable.rb +2 -0
- data/lib/treat/entities/abilities/countable.rb +1 -1
- data/lib/treat/entities/abilities/debuggable.rb +1 -1
- data/lib/treat/entities/abilities/delegatable.rb +42 -36
- data/lib/treat/entities/abilities/doable.rb +2 -2
- data/lib/treat/entities/abilities/exportable.rb +1 -1
- data/lib/treat/entities/abilities/iterable.rb +21 -33
- data/lib/treat/entities/abilities/magical.rb +8 -8
- data/lib/treat/entities/abilities/registrable.rb +0 -38
- data/lib/treat/entities/abilities/stringable.rb +19 -19
- data/lib/treat/entities/collection.rb +31 -0
- data/lib/treat/entities/document.rb +10 -0
- data/lib/treat/entities/entity.rb +18 -13
- data/lib/treat/entities/group.rb +15 -0
- data/lib/treat/entities/section.rb +13 -0
- data/lib/treat/entities/token.rb +35 -0
- data/lib/treat/entities/zone.rb +11 -0
- data/lib/treat/entities.rb +5 -75
- data/lib/treat/helpers/didyoumean.rb +57 -0
- data/lib/treat/helpers/escaping.rb +15 -0
- data/lib/treat/helpers/formatting.rb +41 -0
- data/lib/treat/helpers/platform.rb +15 -0
- data/lib/treat/helpers/reflection.rb +17 -0
- data/lib/treat/helpers/temporary.rb +27 -0
- data/lib/treat/helpers/verbosity.rb +19 -0
- data/lib/treat/helpers.rb +5 -0
- data/lib/treat/installer.rb +46 -165
- data/lib/treat/loaders/linguistics.rb +22 -27
- data/lib/treat/loaders/stanford.rb +23 -41
- data/lib/treat/loaders.rb +10 -0
- data/lib/treat/proxies.rb +73 -24
- data/lib/treat/version.rb +3 -0
- data/lib/treat/{extractors → workers/extractors}/keywords/tf_idf.rb +1 -1
- data/lib/treat/{extractors → workers/extractors}/language/what_language.rb +11 -4
- data/lib/treat/{extractors → workers/extractors}/name_tag/stanford.rb +3 -4
- data/lib/treat/{extractors → workers/extractors}/tf_idf/native.rb +4 -5
- data/lib/treat/{extractors → workers/extractors}/time/chronic.rb +1 -1
- data/lib/treat/{extractors → workers/extractors}/time/nickel.rb +1 -1
- data/lib/treat/{extractors → workers/extractors}/time/ruby.rb +1 -1
- data/lib/treat/{extractors → workers/extractors}/topic_words/lda.rb +1 -1
- data/lib/treat/{extractors → workers/extractors}/topics/reuters.rb +4 -4
- data/lib/treat/{formatters → workers/formatters}/readers/abw.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/autoselect.rb +10 -3
- data/lib/treat/{formatters → workers/formatters}/readers/doc.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/html.rb +4 -4
- data/lib/treat/{formatters → workers/formatters}/readers/image.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/odt.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/pdf.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/txt.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/readers/xml.rb +2 -2
- data/lib/treat/workers/formatters/serializers/mongo.rb +60 -0
- data/lib/treat/{formatters → workers/formatters}/serializers/xml.rb +1 -2
- data/lib/treat/{formatters → workers/formatters}/serializers/yaml.rb +1 -1
- data/lib/treat/{formatters → workers/formatters}/unserializers/autoselect.rb +3 -1
- data/lib/treat/workers/formatters/unserializers/mongo.rb +80 -0
- data/lib/treat/{formatters → workers/formatters}/unserializers/xml.rb +2 -2
- data/lib/treat/{formatters → workers/formatters}/unserializers/yaml.rb +1 -1
- data/lib/treat/{formatters → workers/formatters}/visualizers/dot.rb +1 -1
- data/lib/treat/{formatters → workers/formatters}/visualizers/standoff.rb +2 -3
- data/lib/treat/{formatters → workers/formatters}/visualizers/tree.rb +1 -1
- data/lib/treat/{groupable.rb → workers/group.rb} +6 -12
- data/lib/treat/{inflectors → workers/inflectors}/cardinalizers/linguistics.rb +7 -2
- data/lib/treat/{inflectors → workers/inflectors}/conjugators/linguistics.rb +11 -11
- data/lib/treat/{inflectors → workers/inflectors}/declensors/active_support.rb +2 -2
- data/lib/treat/{inflectors → workers/inflectors}/declensors/english/inflect.rb +1 -1
- data/lib/treat/{inflectors → workers/inflectors}/declensors/english.rb +2 -2
- data/lib/treat/{inflectors → workers/inflectors}/declensors/linguistics.rb +4 -4
- data/lib/treat/{inflectors → workers/inflectors}/ordinalizers/linguistics.rb +8 -2
- data/lib/treat/{inflectors → workers/inflectors}/stemmers/porter.rb +2 -2
- data/lib/treat/{inflectors → workers/inflectors}/stemmers/porter_c.rb +1 -1
- data/lib/treat/{inflectors → workers/inflectors}/stemmers/uea.rb +1 -1
- data/lib/treat/{ai → workers/learners}/classifiers/id3.rb +1 -1
- data/lib/treat/{ai → workers/learners}/classifiers/mlp.rb +1 -1
- data/lib/treat/{lexicalizers → workers/lexicalizers}/categorizers/from_tag.rb +9 -9
- data/lib/treat/{lexicalizers → workers/lexicalizers}/sensers/wordnet/synset.rb +2 -2
- data/lib/treat/{lexicalizers → workers/lexicalizers}/sensers/wordnet.rb +4 -4
- data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/brill/patch.rb +2 -2
- data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/brill.rb +2 -8
- data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/lingua.rb +1 -6
- data/lib/treat/{lexicalizers → workers/lexicalizers}/taggers/stanford.rb +31 -42
- data/lib/treat/workers/processors/chunkers/autoselect.rb +19 -0
- data/lib/treat/{processors → workers/processors}/chunkers/html.rb +4 -3
- data/lib/treat/workers/processors/chunkers/txt.rb +32 -0
- data/lib/treat/{processors → workers/processors}/parsers/enju.rb +3 -3
- data/lib/treat/{processors → workers/processors}/parsers/stanford.rb +6 -8
- data/lib/treat/{processors → workers/processors}/segmenters/punkt.rb +6 -10
- data/lib/treat/{processors → workers/processors}/segmenters/stanford.rb +2 -2
- data/lib/treat/{processors → workers/processors}/segmenters/tactful.rb +3 -6
- data/lib/treat/{processors → workers/processors}/tokenizers/ptb.rb +6 -5
- data/lib/treat/{processors → workers/processors}/tokenizers/punkt.rb +1 -1
- data/lib/treat/{processors → workers/processors}/tokenizers/stanford.rb +1 -1
- data/lib/treat/{processors → workers/processors}/tokenizers/tactful.rb +3 -5
- data/lib/treat/{retrievers → workers/retrievers}/indexers/ferret.rb +1 -1
- data/lib/treat/{retrievers → workers/retrievers}/searchers/ferret.rb +1 -1
- data/lib/treat/workers.rb +96 -0
- data/lib/treat.rb +23 -49
- data/spec/collection.rb +4 -4
- data/spec/document.rb +5 -5
- data/spec/entity.rb +33 -32
- data/spec/{tree.rb → node.rb} +5 -5
- data/spec/phrase.rb +5 -39
- data/spec/sandbox.rb +212 -6
- data/spec/token.rb +12 -9
- data/spec/treat.rb +12 -9
- data/spec/word.rb +10 -9
- data/spec/zone.rb +6 -2
- data/tmp/{INFO → MANIFEST} +0 -0
- data/tmp/english.yaml +10340 -0
- metadata +149 -139
- data/lib/treat/ai.rb +0 -12
- data/lib/treat/categories.rb +0 -90
- data/lib/treat/categorizable.rb +0 -44
- data/lib/treat/configurable.rb +0 -115
- data/lib/treat/dependencies.rb +0 -25
- data/lib/treat/downloader.rb +0 -87
- data/lib/treat/entities/abilities.rb +0 -10
- data/lib/treat/entities/entities.rb +0 -102
- data/lib/treat/exception.rb +0 -7
- data/lib/treat/extractors.rb +0 -79
- data/lib/treat/formatters/serializers/mongo.rb +0 -64
- data/lib/treat/formatters.rb +0 -41
- data/lib/treat/helpers/decimal_point_escaper.rb +0 -22
- data/lib/treat/inflectors.rb +0 -52
- data/lib/treat/kernel.rb +0 -208
- data/lib/treat/languages/arabic.rb +0 -16
- data/lib/treat/languages/chinese.rb +0 -16
- data/lib/treat/languages/dutch.rb +0 -16
- data/lib/treat/languages/english.rb +0 -63
- data/lib/treat/languages/french.rb +0 -20
- data/lib/treat/languages/german.rb +0 -20
- data/lib/treat/languages/greek.rb +0 -16
- data/lib/treat/languages/italian.rb +0 -17
- data/lib/treat/languages/language.rb +0 -10
- data/lib/treat/languages/list.txt +0 -504
- data/lib/treat/languages/polish.rb +0 -16
- data/lib/treat/languages/portuguese.rb +0 -16
- data/lib/treat/languages/russian.rb +0 -16
- data/lib/treat/languages/spanish.rb +0 -16
- data/lib/treat/languages/swedish.rb +0 -16
- data/lib/treat/languages.rb +0 -132
- data/lib/treat/lexicalizers.rb +0 -37
- data/lib/treat/object.rb +0 -7
- data/lib/treat/processors/chunkers/autoselect.rb +0 -16
- data/lib/treat/processors/chunkers/txt.rb +0 -21
- data/lib/treat/processors.rb +0 -38
- data/lib/treat/retrievers.rb +0 -27
- data/lib/treat/server.rb +0 -26
- data/lib/treat/universalisation/encodings.rb +0 -12
- data/lib/treat/universalisation/tags.rb +0 -453
- data/lib/treat/universalisation.rb +0 -9
- data/spec/languages.rb +0 -25
data/lib/treat/proxies.rb
CHANGED
@@ -1,57 +1,106 @@
|
|
1
|
-
# Proxies install builders on core Ruby objects
|
2
|
-
#
|
3
|
-
#
|
4
|
-
#
|
5
|
-
module Treat::Proxies
|
6
|
-
|
1
|
+
# Proxies install builders on core Ruby objects;
|
2
|
+
# when a method defined by Treat is called on these
|
3
|
+
# objects, the Ruby object is cast to a Treat entity
|
4
|
+
# and the method is called on the resultant type.
|
5
|
+
module Treat::Core::Proxies
|
6
|
+
|
7
7
|
# Provides a base functionality for proxies.
|
8
8
|
module Proxy
|
9
|
-
|
9
|
+
|
10
10
|
# Build the entity corresponding to the proxied
|
11
11
|
# object and send the method call to the entity.
|
12
12
|
def method_missing(sym, *args, &block)
|
13
|
-
if sym == :do || Treat::
|
13
|
+
if sym == :do || Treat::Workers.lookup(sym)
|
14
14
|
to_entity.send(sym, *args)
|
15
15
|
else
|
16
16
|
super(sym, *args, &block)
|
17
17
|
end
|
18
18
|
end
|
19
|
-
|
19
|
+
|
20
20
|
# Create an unknown type of entity by default.
|
21
21
|
def to_entity(builder = nil)
|
22
22
|
Treat::Entities::Unknown(self.to_s)
|
23
23
|
end
|
24
|
-
|
24
|
+
|
25
25
|
end
|
26
|
-
|
26
|
+
|
27
27
|
# Install Treat functions on String objects.
|
28
28
|
module String
|
29
|
-
|
29
|
+
|
30
30
|
# Include base proxy functionality.
|
31
|
-
include Treat::Proxies::Proxy
|
32
|
-
|
31
|
+
include Treat::Core::Proxies::Proxy
|
32
|
+
|
33
33
|
# Return the entity corresponding to the string.
|
34
34
|
def to_entity
|
35
|
-
Treat::Entities::Entity.from_string(self
|
35
|
+
Treat::Entities::Entity.from_string(self)
|
36
36
|
end
|
37
|
-
|
37
|
+
|
38
38
|
end
|
39
|
-
|
39
|
+
|
40
40
|
# Install Treat functions on Numeric objects.
|
41
41
|
module Numeric
|
42
|
-
|
42
|
+
|
43
43
|
# Include base proxy functionality.
|
44
|
-
include Treat::Proxies::Proxy
|
45
|
-
|
44
|
+
include Treat::Core::Proxies::Proxy
|
45
|
+
|
46
46
|
# Return the entity corresponding to the number.
|
47
47
|
def to_entity(builder = nil)
|
48
48
|
Treat::Entities::Number.from_numeric(self)
|
49
49
|
end
|
50
|
+
|
51
|
+
end
|
50
52
|
|
53
|
+
# Include Treat methods on strings.
|
54
|
+
::String.class_eval do
|
55
|
+
include Treat::Core::Proxies::String
|
51
56
|
end
|
52
57
|
|
53
|
-
# Include
|
54
|
-
::
|
55
|
-
|
58
|
+
# Include Treat methods on numerics.
|
59
|
+
::Numeric.class_eval do
|
60
|
+
include Treat::Core::Proxies::Numeric
|
61
|
+
end
|
62
|
+
|
63
|
+
# This is kind of ugly; need to find a
|
64
|
+
# better solution eventually (?)
|
65
|
+
Treat::Entities::Entity.class_eval do
|
66
|
+
|
67
|
+
# Rename the true language detection
|
68
|
+
# method to :language_proxied, and
|
69
|
+
# only call it if language detection
|
70
|
+
# is turned on in the configuration.
|
71
|
+
alias :language_proxied :language
|
72
|
+
|
73
|
+
# Proxy the #language method, defined on
|
74
|
+
# all textual entities, in order to catch
|
75
|
+
# the method call if language detection is
|
76
|
+
# turned off and return the default language
|
77
|
+
# in that case.
|
78
|
+
def language(extractor = nil, options = {})
|
56
79
|
|
57
|
-
|
80
|
+
return Treat.core.language.default if
|
81
|
+
!Treat.core.language.detect
|
82
|
+
|
83
|
+
if is_a?(Treat::Entities::Symbol) ||
|
84
|
+
is_a?(Treat::Entities::Number)
|
85
|
+
return Treat.core.language.default
|
86
|
+
end
|
87
|
+
|
88
|
+
dlvl = Treat.core.language.detect_at
|
89
|
+
dklass = Treat::Entities.const_get(cc(dlvl))
|
90
|
+
|
91
|
+
if self.class.compare_with(
|
92
|
+
dklass) < 1 && has_parent?
|
93
|
+
anc = ancestor_with_type(dlvl)
|
94
|
+
return anc.language if anc
|
95
|
+
end
|
96
|
+
|
97
|
+
extractor ||= Treat.workers.
|
98
|
+
extractors.language.default
|
99
|
+
|
100
|
+
language_proxied(extractor, options)
|
101
|
+
|
102
|
+
end
|
103
|
+
|
104
|
+
end
|
105
|
+
|
106
|
+
end
|
@@ -1,7 +1,7 @@
|
|
1
1
|
# This retrieves a supplied number of keywords
|
2
2
|
# by selecting the N words with the highest TF*IDF
|
3
3
|
# for each document.
|
4
|
-
class Treat::Extractors::Keywords::TfIdf
|
4
|
+
class Treat::Workers::Extractors::Keywords::TfIdf
|
5
5
|
|
6
6
|
# Default options - retrieve 5 keywords.
|
7
7
|
DefaultOptions = { :number => 5 }
|
@@ -1,4 +1,4 @@
|
|
1
|
-
module Treat::Extractors::Language
|
1
|
+
module Treat::Workers::Extractors::Language
|
2
2
|
|
3
3
|
# Adaptor for the 'whatlanguage' gem, which
|
4
4
|
# performs probabilistic language detection.
|
@@ -15,7 +15,7 @@ module Treat::Extractors::Language
|
|
15
15
|
|
16
16
|
# By default, bias towards common languages.
|
17
17
|
DefaultOptions = {
|
18
|
-
:
|
18
|
+
:bias_toward => [:english, :french, :chinese, :german, :arabic, :spanish]
|
19
19
|
}
|
20
20
|
|
21
21
|
# Keep only one instance of the gem class.
|
@@ -32,21 +32,28 @@ module Treat::Extractors::Language
|
|
32
32
|
# toward when more than one language is detected
|
33
33
|
# with equal probability.
|
34
34
|
def self.language(entity, options = {})
|
35
|
+
|
35
36
|
options = DefaultOptions.merge(options)
|
37
|
+
|
36
38
|
@@detector ||= ::WhatLanguage.new(:possibilities)
|
37
39
|
possibilities = @@detector.process_text(entity.to_s)
|
38
40
|
lang = {}
|
41
|
+
|
39
42
|
possibilities.each do |k,v|
|
40
|
-
lang[
|
43
|
+
lang[k.intern] = v
|
41
44
|
end
|
45
|
+
|
42
46
|
max = lang.values.max
|
43
47
|
ordered = lang.select { |i,j| j == max }.keys
|
48
|
+
|
44
49
|
ordered.each do |l|
|
45
|
-
if options[:
|
50
|
+
if options[:bias_toward].include?(l)
|
46
51
|
return l
|
47
52
|
end
|
48
53
|
end
|
54
|
+
|
49
55
|
return ordered.first
|
56
|
+
|
50
57
|
end
|
51
58
|
|
52
59
|
end
|
@@ -1,7 +1,7 @@
|
|
1
1
|
# Detects the named entity tag in sentences by using
|
2
2
|
# the stanford-core-nlp gem, which interfaces with
|
3
3
|
# the Stanford Deterministic Coreference Resolver.
|
4
|
-
class Treat::Extractors::NameTag::Stanford
|
4
|
+
class Treat::Workers::Extractors::NameTag::Stanford
|
5
5
|
|
6
6
|
require 'treat/loaders/stanford'
|
7
7
|
Treat::Loaders::Stanford.load
|
@@ -13,9 +13,8 @@ class Treat::Extractors::NameTag::Stanford
|
|
13
13
|
pp = nil
|
14
14
|
|
15
15
|
lang = entity.language
|
16
|
-
|
17
|
-
|
18
|
-
Treat::Loaders::Stanford.load(language)
|
16
|
+
|
17
|
+
Treat::Loaders::Stanford.load(lang)
|
19
18
|
|
20
19
|
isolated_token = entity.is_a?(Treat::Entities::Token)
|
21
20
|
tokens = isolated_token ? [entity] : entity.tokens
|
@@ -1,5 +1,5 @@
|
|
1
1
|
# Calculates the TF*IDF score of words.
|
2
|
-
module Treat::Extractors::TfIdf::Native
|
2
|
+
module Treat::Workers::Extractors::TfIdf::Native
|
3
3
|
DefaultOptions = {
|
4
4
|
:tf => :natural,
|
5
5
|
:idf => :logarithm,
|
@@ -24,10 +24,9 @@ module Treat::Extractors::TfIdf::Native
|
|
24
24
|
@@wc = {} # Number of words in a given document (word count).
|
25
25
|
@@cw = {} # Common words to filter out.
|
26
26
|
def self.tf_idf(entity, options={})
|
27
|
-
l = Treat
|
28
|
-
if l.
|
29
|
-
@@cw[entity.language] =
|
30
|
-
l.const_get(:CommonWords)
|
27
|
+
l = Treat.languages.send(entity.language)
|
28
|
+
if l.stop_words
|
29
|
+
@@cw[entity.language] = l.stop_words.list
|
31
30
|
return 0 if @@cw[entity.language].include?(entity.value)
|
32
31
|
end
|
33
32
|
return 0 if entity.value.length <= 2
|
@@ -8,7 +8,7 @@
|
|
8
8
|
# Machine Learning Research. 3 (Mar. 2003), 993-1022.
|
9
9
|
#
|
10
10
|
# Project website: https://github.com/ealdent/lda-ruby
|
11
|
-
module Treat::Extractors::TopicWords::LDA
|
11
|
+
module Treat::Workers::Extractors::TopicWords::LDA
|
12
12
|
|
13
13
|
# Require the lda-ruby gem.
|
14
14
|
silence_warnings { require 'lda-ruby' }
|
@@ -6,7 +6,7 @@
|
|
6
6
|
#
|
7
7
|
# Original project website:
|
8
8
|
# http://www.markwatson.com/opensource/
|
9
|
-
module Treat::Extractors::Topics::Reuters
|
9
|
+
module Treat::Workers::Extractors::Topics::Reuters
|
10
10
|
|
11
11
|
# Require the Nokogiri XML parser.
|
12
12
|
require 'nokogiri'
|
@@ -46,11 +46,11 @@ module Treat::Extractors::Topics::Reuters
|
|
46
46
|
# Read the topics from the XML files.
|
47
47
|
def self.get_topics
|
48
48
|
return unless @@industry.size == 0
|
49
|
-
@@industry = read_xml(Treat.models +
|
49
|
+
@@industry = read_xml(Treat.paths.models +
|
50
50
|
'reuters/industry.xml')
|
51
|
-
@@region = read_xml(Treat.models +
|
51
|
+
@@region = read_xml(Treat.paths.models +
|
52
52
|
'reuters/region.xml')
|
53
|
-
@@topics = read_xml(Treat.models +
|
53
|
+
@@topics = read_xml(Treat.paths.models +
|
54
54
|
'reuters/topics.xml')
|
55
55
|
end
|
56
56
|
|
@@ -8,7 +8,7 @@
|
|
8
8
|
# Todo: reimplement with Nokogiri and use
|
9
9
|
# XML node information to better translate
|
10
10
|
# the format of the text.
|
11
|
-
class Treat::Formatters::Readers::ABW
|
11
|
+
class Treat::Workers::Formatters::Readers::ABW
|
12
12
|
|
13
13
|
silence_warnings do
|
14
14
|
require 'rexml/document'
|
@@ -25,7 +25,7 @@ class Treat::Formatters::Readers::ABW
|
|
25
25
|
IO.read(document.file), xml_h)
|
26
26
|
|
27
27
|
document.value = xml_h.plain_text
|
28
|
-
document.set :format,
|
28
|
+
document.set :format, 'abw'
|
29
29
|
document
|
30
30
|
|
31
31
|
end
|
@@ -1,9 +1,9 @@
|
|
1
|
-
class Treat::Formatters::Readers::Autoselect
|
1
|
+
class Treat::Workers::Formatters::Readers::Autoselect
|
2
2
|
|
3
3
|
ExtensionRegexp = /^.*?\.([a-zA-Z0-9]{2,5})$/
|
4
4
|
ImageExtensions = ['gif', 'jpg', 'jpeg', 'png']
|
5
5
|
DefaultOptions = {
|
6
|
-
:default_to =>
|
6
|
+
:default_to => 'txt'
|
7
7
|
}
|
8
8
|
|
9
9
|
# Choose a reader to use.
|
@@ -16,6 +16,7 @@ class Treat::Formatters::Readers::Autoselect
|
|
16
16
|
end
|
17
17
|
|
18
18
|
def self.detect_format(filename, default_to = nil)
|
19
|
+
|
19
20
|
default_to ||= DefaultOptions[:default_to]
|
20
21
|
ext = filename.scan(ExtensionRegexp)
|
21
22
|
ext = (ext.is_a?(Array) && ext[0] && ext[0][0]) ? ext[0][0] : ''
|
@@ -25,7 +26,13 @@ class Treat::Formatters::Readers::Autoselect
|
|
25
26
|
format = 'yaml' if format == 'yml'
|
26
27
|
|
27
28
|
format = default_to if format.to_s == ''
|
28
|
-
|
29
|
+
|
30
|
+
begin
|
31
|
+
Treat::Workers::Formatters::Readers.const_get(cc(format))
|
32
|
+
rescue Treat::Exception
|
33
|
+
format = default_to
|
34
|
+
end
|
35
|
+
|
29
36
|
format.intern
|
30
37
|
|
31
38
|
end
|
@@ -1,5 +1,5 @@
|
|
1
1
|
# A wrapper for the 'antiword' command-line utility.
|
2
|
-
class Treat::Formatters::Readers::DOC
|
2
|
+
class Treat::Workers::Formatters::Readers::DOC
|
3
3
|
|
4
4
|
# Extract the readable text from a DOC file
|
5
5
|
# using the antiword command-line utility.
|
@@ -13,7 +13,7 @@ class Treat::Formatters::Readers::DOC
|
|
13
13
|
f.gsub!('#keep#', "\n\n")
|
14
14
|
|
15
15
|
document.value = f
|
16
|
-
document.set :format,
|
16
|
+
document.set :format, 'doc'
|
17
17
|
document
|
18
18
|
|
19
19
|
end
|
@@ -4,8 +4,8 @@
|
|
4
4
|
#
|
5
5
|
# Project homepage:
|
6
6
|
# https://github.com/iterationlabs/ruby-readability
|
7
|
-
class Treat::Formatters::Readers::HTML
|
8
|
-
|
7
|
+
class Treat::Workers::Formatters::Readers::HTML
|
8
|
+
|
9
9
|
silence_warnings { require 'ruby-readability' }
|
10
10
|
|
11
11
|
# By default, don't backup the original HTML
|
@@ -45,9 +45,9 @@ class Treat::Formatters::Readers::HTML
|
|
45
45
|
html.gsub!(/<!--[^>]*-->/m, '')
|
46
46
|
d = Readability::Document.new(html, options)
|
47
47
|
document.value = "<h1>#{d.title}</h1>\n" + d.content
|
48
|
-
document.set :format,
|
48
|
+
document.set :format, 'html'
|
49
49
|
end
|
50
|
-
|
50
|
+
|
51
51
|
document
|
52
52
|
|
53
53
|
end
|
@@ -11,7 +11,7 @@
|
|
11
11
|
#
|
12
12
|
# Breuel, Thomas M. The Ocropus Open Source OCR System.
|
13
13
|
# DFKI and U. Kaiserslautern, Germany.
|
14
|
-
class Treat::Formatters::Readers::Image
|
14
|
+
class Treat::Workers::Formatters::Readers::Image
|
15
15
|
|
16
16
|
# Read a file using the Google Ocropus reader.
|
17
17
|
#
|
@@ -29,7 +29,7 @@ class Treat::Formatters::Readers::Image
|
|
29
29
|
doc.set :file, "#{tmp}/output.html"
|
30
30
|
doc = doc.read(:html)
|
31
31
|
doc.set :file, f
|
32
|
-
doc.set :format,
|
32
|
+
doc.set :format, 'image'
|
33
33
|
end
|
34
34
|
end
|
35
35
|
|
@@ -10,7 +10,7 @@
|
|
10
10
|
# Todo: reimplement with Nokogiri and use
|
11
11
|
# XML node information to better translate
|
12
12
|
# the format of the text.
|
13
|
-
class Treat::Formatters::Readers::ODT
|
13
|
+
class Treat::Workers::Formatters::Readers::ODT
|
14
14
|
|
15
15
|
# Require the 'zip' gem to unarchive the ODT files
|
16
16
|
silence_warnings { require 'zip' }
|
@@ -30,7 +30,7 @@ class Treat::Formatters::Readers::ODT
|
|
30
30
|
REXML::Document.parse_stream(f, xml_h)
|
31
31
|
|
32
32
|
document.value = xml_h.plain_text
|
33
|
-
document.set :format,
|
33
|
+
document.set :format, 'odt'
|
34
34
|
document
|
35
35
|
|
36
36
|
end
|
@@ -1,7 +1,7 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
# A wrapper for the Poppler pdf2text utility, which
|
3
3
|
# extracts the text from a PDF file.
|
4
|
-
module Treat::Formatters::Readers::PDF
|
4
|
+
module Treat::Workers::Formatters::Readers::PDF
|
5
5
|
|
6
6
|
# Read a PDF file using the Poppler pdf2text utility.
|
7
7
|
#
|
@@ -21,7 +21,7 @@ module Treat::Formatters::Readers::PDF
|
|
21
21
|
f.gsub!('#keep#', "\n\n")
|
22
22
|
|
23
23
|
document.value = f
|
24
|
-
document.set :format,
|
24
|
+
document.set :format, 'pdf'
|
25
25
|
document
|
26
26
|
|
27
27
|
end
|
@@ -1,5 +1,5 @@
|
|
1
1
|
# This class simply reads a plain text file.
|
2
|
-
class Treat::Formatters::Readers::TXT
|
2
|
+
class Treat::Workers::Formatters::Readers::TXT
|
3
3
|
|
4
4
|
# Build an entity from a string
|
5
5
|
# in plain text format.
|
@@ -7,7 +7,7 @@ class Treat::Formatters::Readers::TXT
|
|
7
7
|
# Options: none.
|
8
8
|
def self.read(document, options = {})
|
9
9
|
document.value = File.read(document.file)
|
10
|
-
document.set :format,
|
10
|
+
document.set :format, 'txt'
|
11
11
|
document
|
12
12
|
end
|
13
13
|
|
@@ -1,4 +1,4 @@
|
|
1
|
-
class Treat::Formatters::Readers::XML
|
1
|
+
class Treat::Workers::Formatters::Readers::XML
|
2
2
|
|
3
3
|
require 'treat/loaders/stanford'
|
4
4
|
Treat::Loaders::Stanford.load
|
@@ -70,7 +70,7 @@ class Treat::Formatters::Readers::XML
|
|
70
70
|
|
71
71
|
end
|
72
72
|
|
73
|
-
document.set :format,
|
73
|
+
document.set :format, 'xml'
|
74
74
|
document
|
75
75
|
|
76
76
|
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
# Stores an entity in a Mongo collection.
|
2
|
+
class Treat::Workers::Formatters::Serializers::Mongo
|
3
|
+
|
4
|
+
# Require the Mongo DB
|
5
|
+
require 'mongo'
|
6
|
+
|
7
|
+
DefaultOptions = {
|
8
|
+
:recursive => true,
|
9
|
+
:stop_at => :token
|
10
|
+
}
|
11
|
+
|
12
|
+
def self.serialize(entity, options = {})
|
13
|
+
|
14
|
+
options = DefaultOptions.merge(options)
|
15
|
+
stop_at = options[:stop_at] ?
|
16
|
+
Treat::Entities.const_get(
|
17
|
+
options[:stop_at].to_s.capitalize) :
|
18
|
+
Treat::Entities::Token
|
19
|
+
|
20
|
+
if !Treat.databases.mongo.db && !options[:db]
|
21
|
+
raise Treat::Exception,
|
22
|
+
'Must supply the database name in config. ' +
|
23
|
+
'(Treat.databases.mongo.db = ...) or pass ' +
|
24
|
+
'it as a parameter to #serialize.'
|
25
|
+
end
|
26
|
+
|
27
|
+
@@database ||= Mongo::Connection.
|
28
|
+
new(Treat.databases.mongo.host).
|
29
|
+
db(Treat.databases.mongo.db || options[:db])
|
30
|
+
|
31
|
+
type = cl(entity.class.superclass).downcase
|
32
|
+
type = entity.type.to_s if type == 'entity'
|
33
|
+
types = type + 's'
|
34
|
+
|
35
|
+
coll = @@database.collection(types)
|
36
|
+
|
37
|
+
entity_token = {
|
38
|
+
:id => entity.id,
|
39
|
+
:value => entity.value,
|
40
|
+
:string => entity.to_s,
|
41
|
+
:type => entity.type,
|
42
|
+
:children => entity.children.map { |c| [c.id, c.type] },
|
43
|
+
:parent => (entity.has_parent? ? entity.parent.id : nil),
|
44
|
+
:features => entity.features
|
45
|
+
}
|
46
|
+
|
47
|
+
coll.insert(entity_token)
|
48
|
+
|
49
|
+
if options[:recursive] && entity.has_children?
|
50
|
+
entity.each do |child|
|
51
|
+
next if child.class.compare_with(stop_at) < 0
|
52
|
+
self.serialize(child, options)
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
56
|
+
end
|
57
|
+
|
58
|
+
end
|
59
|
+
|
60
|
+
|
@@ -1,5 +1,5 @@
|
|
1
1
|
# This class converts an entity to a storable XML format.
|
2
|
-
class Treat::Formatters::Serializers::XML
|
2
|
+
class Treat::Workers::Formatters::Serializers::XML
|
3
3
|
|
4
4
|
# Require the Nokogiri XML parser.
|
5
5
|
require 'nokogiri'
|
@@ -74,7 +74,6 @@ class Treat::Formatters::Serializers::XML
|
|
74
74
|
f.write(string)
|
75
75
|
end
|
76
76
|
end
|
77
|
-
# puts string
|
78
77
|
end
|
79
78
|
string
|
80
79
|
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
class Treat::Formatters::Unserializers::Autoselect
|
1
|
+
class Treat::Workers::Formatters::Unserializers::Autoselect
|
2
2
|
|
3
3
|
def self.unserialize(document, options = {})
|
4
4
|
file = document.file
|
@@ -6,6 +6,8 @@ class Treat::Formatters::Unserializers::Autoselect
|
|
6
6
|
document.unserialize(:yaml, options)
|
7
7
|
elsif file.index('xml')
|
8
8
|
document.unserialize(:xml, options)
|
9
|
+
elsif file.index('mongo')
|
10
|
+
document.unserialize(:mongo, options)
|
9
11
|
else
|
10
12
|
raise Treat::Exception,
|
11
13
|
"Unreadable serialized format for file #{file}."
|
@@ -0,0 +1,80 @@
|
|
1
|
+
module Treat::Workers::Formatters::Unserializers::Mongo
|
2
|
+
|
3
|
+
DefaultOptions = {
|
4
|
+
:recursive => true,
|
5
|
+
:stop_at => nil
|
6
|
+
}
|
7
|
+
|
8
|
+
require 'mongo'
|
9
|
+
|
10
|
+
def self.unserialize(entity, options={})
|
11
|
+
|
12
|
+
options = DefaultOptions.merge(options)
|
13
|
+
options[:stop_at] = options[:stop_at] ?
|
14
|
+
Treat::Entities.const_get(
|
15
|
+
options[:stop_at].to_s.capitalize) :
|
16
|
+
Treat::Entities::Token
|
17
|
+
|
18
|
+
if !Treat.databases.mongo.db && !options[:db]
|
19
|
+
raise Treat::Exception,
|
20
|
+
'Must supply the database name in config. ' +
|
21
|
+
'(Treat.databases.mongo.db = ...) or pass ' +
|
22
|
+
'it as a parameter to #unserialize.'
|
23
|
+
end
|
24
|
+
|
25
|
+
@@database ||= Mongo::Connection.
|
26
|
+
new(Treat.databases.mongo.host).
|
27
|
+
db(Treat.databases.mongo.db || options[:db])
|
28
|
+
|
29
|
+
self.do_unserialize(entity, options)
|
30
|
+
|
31
|
+
end
|
32
|
+
|
33
|
+
def self.do_unserialize(entity, options)
|
34
|
+
|
35
|
+
supertype = cl(Treat::Entities.const_get(
|
36
|
+
entity.type.to_s.capitalize.intern).superclass).downcase
|
37
|
+
supertype = entity.type.to_s if supertype == 'entity'
|
38
|
+
supertypes = supertype + 's'
|
39
|
+
|
40
|
+
coll = @@database.collection(supertypes)
|
41
|
+
record = coll.find_one(:id => entity.id)
|
42
|
+
|
43
|
+
unless record
|
44
|
+
raise Treat::Exception,
|
45
|
+
"Couldn't find record ID #{entity.id}."
|
46
|
+
end
|
47
|
+
|
48
|
+
# Convert feature keys to symbols.
|
49
|
+
features = record['features']
|
50
|
+
new_feat = {}
|
51
|
+
features.each do |feature, value|
|
52
|
+
new_feat[feature.intern] = value
|
53
|
+
end
|
54
|
+
entity.features = new_feat
|
55
|
+
|
56
|
+
# Set the entity's value.
|
57
|
+
entity.value = record['value']
|
58
|
+
|
59
|
+
if entity.class.compare_with(
|
60
|
+
options[:stop_at]) == 0
|
61
|
+
entity.value = record['string']
|
62
|
+
end
|
63
|
+
|
64
|
+
return entity unless options[:recursive]
|
65
|
+
|
66
|
+
record['children'].each do |c|
|
67
|
+
cid, ctype = *c
|
68
|
+
cklass = Treat::Entities.const_get(
|
69
|
+
ctype.capitalize.intern)
|
70
|
+
next if cklass.compare_with(
|
71
|
+
options[:stop_at]) < 0
|
72
|
+
entity << self.do_unserialize(
|
73
|
+
cklass.new('', cid), options)
|
74
|
+
end
|
75
|
+
|
76
|
+
entity
|
77
|
+
|
78
|
+
end
|
79
|
+
|
80
|
+
end
|
@@ -1,6 +1,6 @@
|
|
1
1
|
# Recreates the entity tree corresponding to
|
2
2
|
# a serialized XML file.
|
3
|
-
module Treat::Formatters::Unserializers::XML
|
3
|
+
module Treat::Workers::Formatters::Unserializers::XML
|
4
4
|
|
5
5
|
require 'nokogiri'
|
6
6
|
|
@@ -78,7 +78,7 @@ module Treat::Formatters::Unserializers::XML
|
|
78
78
|
current_value = ''
|
79
79
|
type = xml_reader.name.intern
|
80
80
|
|
81
|
-
if Treat
|
81
|
+
if Treat.core.entities.list.include?(type)
|
82
82
|
if !current_element
|
83
83
|
current_element = self.revive(type, current_value, id)
|
84
84
|
else
|