treat 1.2.0 → 2.0.0rc1
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +2 -2
- data/README.md +12 -21
- data/lib/treat/autoload.rb +44 -0
- data/lib/treat/config/config.rb +38 -0
- data/lib/treat/config/configurable.rb +51 -0
- data/lib/treat/config/data/config.rb +50 -0
- data/lib/treat/config/data/core.rb +52 -0
- data/lib/treat/config/data/databases.rb +10 -0
- data/lib/treat/config/data/entities.rb +15 -0
- data/lib/treat/config/data/languages/agnostic.rb +31 -0
- data/lib/treat/config/{languages → data/languages}/arabic.rb +0 -0
- data/lib/treat/config/{languages → data/languages}/chinese.rb +0 -0
- data/lib/treat/config/{languages → data/languages}/dutch.rb +1 -1
- data/lib/treat/config/data/languages/english.rb +95 -0
- data/lib/treat/config/data/languages/french.rb +148 -0
- data/lib/treat/config/data/languages/german.rb +135 -0
- data/lib/treat/config/{languages → data/languages}/greek.rb +1 -1
- data/lib/treat/config/data/languages/italian.rb +162 -0
- data/lib/treat/config/data/languages/polish.rb +11 -0
- data/lib/treat/config/{languages → data/languages}/portuguese.rb +1 -1
- data/lib/treat/config/{languages → data/languages}/russian.rb +1 -1
- data/lib/treat/config/data/languages/spanish.rb +291 -0
- data/lib/treat/config/data/languages/swedish.rb +289 -0
- data/lib/treat/config/data/libraries.rb +12 -0
- data/lib/treat/config/data/linguistics.rb +44 -0
- data/lib/treat/config/data/tags.rb +328 -0
- data/lib/treat/config/{workers → data/workers}/extractors.rb +2 -10
- data/lib/treat/config/{workers → data/workers}/formatters.rb +0 -0
- data/lib/treat/config/{workers → data/workers}/inflectors.rb +0 -0
- data/lib/treat/config/{workers → data/workers}/learners.rb +0 -0
- data/lib/treat/config/{workers → data/workers}/lexicalizers.rb +4 -3
- data/lib/treat/config/{workers → data/workers}/processors.rb +3 -3
- data/lib/treat/config/{workers → data/workers}/retrievers.rb +0 -0
- data/lib/treat/config/importable.rb +31 -0
- data/lib/treat/config/paths.rb +23 -0
- data/lib/treat/config/tags.rb +37 -0
- data/lib/treat/core/dsl.rb +55 -0
- data/lib/treat/{installer.rb → core/installer.rb} +10 -12
- data/lib/treat/core/server.rb +40 -0
- data/lib/treat/entities/entities.rb +101 -0
- data/lib/treat/entities/{abilities/doable.rb → entity/applicable.rb} +5 -3
- data/lib/treat/entities/{abilities → entity}/buildable.rb +118 -63
- data/lib/treat/entities/{abilities → entity}/checkable.rb +2 -2
- data/lib/treat/entities/{abilities → entity}/comparable.rb +6 -6
- data/lib/treat/entities/{abilities → entity}/countable.rb +2 -1
- data/lib/treat/entities/entity/debuggable.rb +86 -0
- data/lib/treat/entities/{abilities → entity}/delegatable.rb +16 -26
- data/lib/treat/entities/{abilities → entity}/exportable.rb +2 -2
- data/lib/treat/entities/{abilities → entity}/iterable.rb +4 -16
- data/lib/treat/entities/{abilities → entity}/magical.rb +22 -17
- data/lib/treat/entities/entity/registrable.rb +36 -0
- data/lib/treat/entities/{abilities → entity}/stringable.rb +18 -15
- data/lib/treat/entities/entity.rb +86 -77
- data/lib/treat/exception.rb +3 -0
- data/lib/treat/helpers/hash.rb +29 -0
- data/lib/treat/helpers/help.rb +35 -0
- data/lib/treat/helpers/object.rb +55 -0
- data/lib/treat/helpers/string.rb +124 -0
- data/lib/treat/{core → learning}/data_set.rb +11 -11
- data/lib/treat/{core → learning}/export.rb +3 -3
- data/lib/treat/{core → learning}/problem.rb +26 -16
- data/lib/treat/{core → learning}/question.rb +5 -9
- data/lib/treat/loaders/linguistics.rb +8 -9
- data/lib/treat/loaders/stanford.rb +5 -11
- data/lib/treat/modules.rb +33 -0
- data/lib/treat/proxies/array.rb +27 -0
- data/lib/treat/proxies/language.rb +47 -0
- data/lib/treat/proxies/number.rb +18 -0
- data/lib/treat/proxies/proxy.rb +25 -0
- data/lib/treat/proxies/string.rb +18 -0
- data/lib/treat/version.rb +10 -1
- data/lib/treat/{workers.rb → workers/categorizable.rb} +18 -19
- data/lib/treat/workers/extractors/keywords/tf_idf.rb +11 -11
- data/lib/treat/workers/extractors/language/what_language.rb +8 -6
- data/lib/treat/workers/extractors/name_tag/stanford.rb +10 -4
- data/lib/treat/workers/extractors/similarity/levenshtein.rb +36 -0
- data/lib/treat/workers/extractors/similarity/tf_idf.rb +27 -0
- data/lib/treat/workers/extractors/tf_idf/native.rb +4 -4
- data/lib/treat/workers/extractors/time/chronic.rb +2 -4
- data/lib/treat/workers/extractors/time/nickel.rb +19 -20
- data/lib/treat/workers/extractors/time/ruby.rb +2 -1
- data/lib/treat/workers/extractors/topic_words/lda.rb +12 -12
- data/lib/treat/workers/extractors/topics/reuters.rb +9 -13
- data/lib/treat/workers/formatters/readers/autoselect.rb +1 -1
- data/lib/treat/workers/formatters/readers/image.rb +19 -9
- data/lib/treat/workers/formatters/readers/odt.rb +2 -1
- data/lib/treat/workers/formatters/readers/pdf.rb +20 -3
- data/lib/treat/workers/formatters/readers/xml.rb +0 -1
- data/lib/treat/workers/formatters/serializers/mongo.rb +10 -20
- data/lib/treat/workers/formatters/serializers/xml.rb +17 -26
- data/lib/treat/workers/formatters/serializers/yaml.rb +5 -4
- data/lib/treat/workers/formatters/unserializers/mongo.rb +4 -4
- data/lib/treat/workers/formatters/unserializers/xml.rb +3 -4
- data/lib/treat/workers/formatters/unserializers/yaml.rb +3 -4
- data/lib/treat/workers/formatters/visualizers/dot.rb +1 -0
- data/lib/treat/workers/formatters/visualizers/standoff.rb +2 -3
- data/lib/treat/workers/formatters/visualizers/tree.rb +2 -3
- data/lib/treat/workers/{group.rb → groupable.rb} +9 -9
- data/lib/treat/workers/inflectors/cardinalizers/linguistics.rb +1 -3
- data/lib/treat/workers/inflectors/conjugators/linguistics.rb +5 -7
- data/lib/treat/workers/inflectors/declensors/english.rb +13 -20
- data/lib/treat/workers/inflectors/declensors/linguistics.rb +29 -28
- data/lib/treat/workers/inflectors/ordinalizers/linguistics.rb +0 -2
- data/lib/treat/workers/inflectors/stemmers/porter.rb +8 -10
- data/lib/treat/workers/inflectors/stemmers/porter_c.rb +7 -7
- data/lib/treat/workers/inflectors/stemmers/uea.rb +3 -8
- data/lib/treat/workers/learners/classifiers/id3.rb +17 -14
- data/lib/treat/workers/learners/classifiers/linear.rb +15 -27
- data/lib/treat/workers/learners/classifiers/mlp.rb +32 -19
- data/lib/treat/workers/learners/classifiers/svm.rb +28 -21
- data/lib/treat/workers/lexicalizers/categorizers/from_tag.rb +19 -3
- data/lib/treat/workers/lexicalizers/sensers/wordnet.rb +15 -7
- data/lib/treat/workers/lexicalizers/taggers/brill/patch.rb +4 -1
- data/lib/treat/workers/lexicalizers/taggers/brill.rb +8 -19
- data/lib/treat/workers/lexicalizers/taggers/lingua.rb +4 -15
- data/lib/treat/workers/lexicalizers/taggers/stanford.rb +22 -13
- data/lib/treat/workers/processors/chunkers/autoselect.rb +2 -3
- data/lib/treat/workers/processors/chunkers/html.rb +1 -6
- data/lib/treat/workers/processors/parsers/enju.rb +2 -4
- data/lib/treat/workers/processors/parsers/stanford.rb +13 -7
- data/lib/treat/workers/processors/segmenters/punkt.rb +25 -11
- data/lib/treat/workers/processors/segmenters/scalpel.rb +20 -0
- data/lib/treat/workers/processors/segmenters/srx.rb +42 -0
- data/lib/treat/workers/processors/segmenters/stanford.rb +5 -5
- data/lib/treat/workers/processors/segmenters/tactful.rb +21 -11
- data/lib/treat/workers/processors/tokenizers/ptb.rb +40 -30
- data/lib/treat/workers/processors/tokenizers/punkt.rb +14 -19
- data/lib/treat/workers/processors/tokenizers/stanford.rb +38 -22
- data/lib/treat/workers/retrievers/indexers/ferret.rb +6 -3
- data/lib/treat/workers/retrievers/searchers/ferret.rb +2 -2
- data/lib/treat/workers/workers.rb +6 -0
- data/lib/treat.rb +18 -32
- data/models/MANIFEST +1 -0
- data/spec/core/data_set.rb +174 -0
- data/spec/core/export.rb +52 -0
- data/spec/core/problem.rb +144 -0
- data/spec/core/question.rb +52 -0
- data/spec/{collection.rb → entities/collection.rb} +20 -35
- data/spec/{document.rb → entities/document.rb} +3 -54
- data/spec/{entity.rb → entities/entity.rb} +10 -9
- data/spec/entities/phrase.rb +33 -0
- data/spec/{token.rb → entities/token.rb} +0 -57
- data/spec/entities/word.rb +3 -0
- data/spec/{zone.rb → entities/zone.rb} +0 -26
- data/spec/helper.rb +116 -32
- data/spec/sandbox.rb +258 -25
- data/spec/treat.rb +26 -34
- data/spec/workers/agnostic.rb +137 -0
- data/spec/workers/english.rb +194 -0
- data/spec/workers/examples/english/economist/hungarys_troubles.txt +46 -0
- data/spec/workers/examples/english/economist/saving_the_euro.odt +0 -0
- data/spec/{samples → workers/examples/english}/mathematicians/archimedes.abw +0 -0
- data/spec/{samples → workers/examples/english}/mathematicians/euler.html +0 -0
- data/spec/{samples → workers/examples/english}/mathematicians/gauss.pdf +0 -0
- data/spec/{samples → workers/examples/english}/mathematicians/leibniz.txt +0 -0
- data/spec/{samples → workers/examples/english}/mathematicians/newton.doc +0 -0
- data/spec/workers/examples/english/phrase.xml +5 -0
- data/spec/workers/examples/english/test.txt +1 -0
- data/spec/workers/language.rb +280 -0
- data/spec/workers.rb +28 -0
- metadata +122 -105
- data/lib/treat/config/core/acronyms.rb +0 -5
- data/lib/treat/config/core/encodings.rb +0 -8
- data/lib/treat/config/core/entities.rb +0 -2
- data/lib/treat/config/core/language.rb +0 -3
- data/lib/treat/config/core/paths.rb +0 -8
- data/lib/treat/config/core/syntax.rb +0 -1
- data/lib/treat/config/core/verbosity.rb +0 -1
- data/lib/treat/config/databases/default.rb +0 -1
- data/lib/treat/config/databases/mongo.rb +0 -1
- data/lib/treat/config/languages/agnostic.rb +0 -34
- data/lib/treat/config/languages/english.rb +0 -60
- data/lib/treat/config/languages/french.rb +0 -18
- data/lib/treat/config/languages/german.rb +0 -18
- data/lib/treat/config/languages/italian.rb +0 -12
- data/lib/treat/config/languages/polish.rb +0 -12
- data/lib/treat/config/languages/spanish.rb +0 -12
- data/lib/treat/config/languages/swedish.rb +0 -12
- data/lib/treat/config/libraries/punkt.rb +0 -1
- data/lib/treat/config/libraries/reuters.rb +0 -1
- data/lib/treat/config/libraries/stanford.rb +0 -1
- data/lib/treat/config/linguistics/categories.rb +0 -4
- data/lib/treat/config/linguistics/punctuation.rb +0 -33
- data/lib/treat/config/tags/aligned.rb +0 -221
- data/lib/treat/config/tags/enju.rb +0 -71
- data/lib/treat/config/tags/paris7.rb +0 -17
- data/lib/treat/config/tags/ptb.rb +0 -15
- data/lib/treat/config/workers/list.rb +0 -1
- data/lib/treat/config.rb +0 -135
- data/lib/treat/core.rb +0 -5
- data/lib/treat/entities/abilities/copyable.rb +0 -47
- data/lib/treat/entities/abilities/debuggable.rb +0 -83
- data/lib/treat/entities/abilities/registrable.rb +0 -46
- data/lib/treat/entities/collection.rb +0 -40
- data/lib/treat/entities/document.rb +0 -10
- data/lib/treat/entities/group.rb +0 -18
- data/lib/treat/entities/section.rb +0 -13
- data/lib/treat/entities/token.rb +0 -47
- data/lib/treat/entities/zone.rb +0 -12
- data/lib/treat/entities.rb +0 -6
- data/lib/treat/helpers/didyoumean.rb +0 -57
- data/lib/treat/helpers/escaping.rb +0 -15
- data/lib/treat/helpers/formatting.rb +0 -41
- data/lib/treat/helpers/objtohash.rb +0 -8
- data/lib/treat/helpers/platform.rb +0 -15
- data/lib/treat/helpers/reflection.rb +0 -17
- data/lib/treat/helpers/temporary.rb +0 -27
- data/lib/treat/helpers/verbosity.rb +0 -19
- data/lib/treat/helpers.rb +0 -5
- data/lib/treat/loaders.rb +0 -10
- data/lib/treat/proxies.rb +0 -106
- data/lib/treat/workers/formatters/unserializers/autoselect.rb +0 -17
- data/lib/treat/workers/inflectors/declensors/active_support.rb +0 -31
- data/lib/treat/workers/processors/tokenizers/tactful.rb +0 -68
- data/spec/core.rb +0 -441
- data/spec/phrase.rb +0 -112
- data/spec/word.rb +0 -111
@@ -2,7 +2,7 @@
|
|
2
2
|
# problem as well as data for entities that
|
3
3
|
# have already been classified, complete with
|
4
4
|
# references to these entities.
|
5
|
-
class Treat::Core::DataSet
|
5
|
+
class Treat::Learning::DataSet
|
6
6
|
|
7
7
|
# The classification problem this
|
8
8
|
# data set holds data for.
|
@@ -13,26 +13,26 @@ class Treat::Core::DataSet
|
|
13
13
|
|
14
14
|
# Initialize the DataSet.
|
15
15
|
def initialize(problem)
|
16
|
-
unless problem.is_a?(Treat::
|
16
|
+
unless problem.is_a?(Treat::Learning::Problem)
|
17
17
|
raise Treat::Exception, "The first argument " +
|
18
18
|
"to initialize should be an instance of " +
|
19
|
-
"Treat::
|
19
|
+
"Treat::Learning::Problem."
|
20
20
|
end
|
21
21
|
@problem, @items = problem, []
|
22
22
|
end
|
23
23
|
|
24
24
|
def self.build(from)
|
25
25
|
if from.is_a?(Hash)
|
26
|
-
Treat::
|
26
|
+
Treat::Learning::DataSet.unserialize(
|
27
27
|
Treat.databases.default.adapter, from)
|
28
28
|
elsif from.is_a?(String)
|
29
29
|
unless File.readable?(from)
|
30
30
|
raise Treat::Exception,
|
31
|
-
"Attempting to initialize data set from "
|
32
|
-
"file #{from}, but it is not readable."
|
31
|
+
"Attempting to initialize data set from " +
|
32
|
+
"file '#{from}', but it is not readable."
|
33
33
|
end
|
34
|
-
Treat::
|
35
|
-
|
34
|
+
Treat::Learning::DataSet.unserialize(
|
35
|
+
File.extname(from)[1..-1], file: from)
|
36
36
|
end
|
37
37
|
end
|
38
38
|
|
@@ -92,7 +92,7 @@ class Treat::Core::DataSet
|
|
92
92
|
next unless tag.proc_string
|
93
93
|
tag.proc = eval(tag.proc_string)
|
94
94
|
end
|
95
|
-
data_set = Treat::
|
95
|
+
data_set = Treat::Learning::DataSet.new(problem)
|
96
96
|
data_set.items = items
|
97
97
|
data_set
|
98
98
|
end
|
@@ -131,7 +131,7 @@ class Treat::Core::DataSet
|
|
131
131
|
raise Treat::Exception,
|
132
132
|
"Couldn't retrieve problem ID #{options[:problem]}."
|
133
133
|
end
|
134
|
-
problem = Treat::
|
134
|
+
problem = Treat::Learning::Problem.from_hash(p_record)
|
135
135
|
data = database.collection('data').find(options).to_a
|
136
136
|
items = []
|
137
137
|
data.each do |datum|
|
@@ -142,7 +142,7 @@ class Treat::Core::DataSet
|
|
142
142
|
item[:id] = datum['id']
|
143
143
|
items << item
|
144
144
|
end
|
145
|
-
data_set = Treat::
|
145
|
+
data_set = Treat::Learning::DataSet.new(problem)
|
146
146
|
data_set.items = items
|
147
147
|
data_set
|
148
148
|
end
|
@@ -1,6 +1,6 @@
|
|
1
1
|
# Represents a feature to be used
|
2
2
|
# in a classification task.
|
3
|
-
class Treat::Core::Export
|
3
|
+
class Treat::Learning::Export
|
4
4
|
|
5
5
|
# The name of the feature. If no
|
6
6
|
# proc is supplied, this assumes
|
@@ -55,5 +55,5 @@ class Treat::Core::Export
|
|
55
55
|
|
56
56
|
end
|
57
57
|
|
58
|
-
class Treat::
|
59
|
-
class Treat::
|
58
|
+
class Treat::Learning::Feature < Treat::Learning::Export; end
|
59
|
+
class Treat::Learning::Tag < Treat::Learning::Export; end
|
@@ -2,7 +2,7 @@
|
|
2
2
|
# - What question are we trying to answer?
|
3
3
|
# - What features are we going to look at
|
4
4
|
# to attempt to answer that question?
|
5
|
-
class Treat::Core::Problem
|
5
|
+
class Treat::Learning::Problem
|
6
6
|
|
7
7
|
# A unique identifier for the problem.
|
8
8
|
attr_accessor :id
|
@@ -20,21 +20,21 @@ class Treat::Core::Problem
|
|
20
20
|
# Initialize the problem with a question
|
21
21
|
# and an arbitrary number of features. # FIXME: init with id!?
|
22
22
|
def initialize(question, *exports)
|
23
|
-
unless question.is_a?(Treat::
|
23
|
+
unless question.is_a?(Treat::Learning::Question)
|
24
24
|
raise Treat::Exception,
|
25
25
|
"The first argument to initialize " +
|
26
26
|
"should be an instance of " +
|
27
|
-
"Treat::
|
27
|
+
"Treat::Learning::Question."
|
28
28
|
end
|
29
|
-
if exports.any? { |f| !f.is_a?(Treat::
|
29
|
+
if exports.any? { |f| !f.is_a?(Treat::Learning::Export) }
|
30
30
|
raise Treat::Exception,
|
31
31
|
"The second argument and all subsequent ones " +
|
32
32
|
"to initialize should be instances of subclasses " +
|
33
|
-
"of Treat::
|
33
|
+
"of Treat::Learning::Export."
|
34
34
|
end
|
35
35
|
@question, @id = question, object_id
|
36
36
|
@features = exports.select do |exp|
|
37
|
-
exp.is_a?(Treat::
|
37
|
+
exp.is_a?(Treat::Learning::Feature)
|
38
38
|
end
|
39
39
|
if @features.size == 0
|
40
40
|
raise Treat::Exception,
|
@@ -42,7 +42,7 @@ class Treat::Core::Problem
|
|
42
42
|
"one feature to work with."
|
43
43
|
end
|
44
44
|
@tags = exports.select do |exp|
|
45
|
-
exp.is_a?(Treat::
|
45
|
+
exp.is_a?(Treat::Learning::Tag)
|
46
46
|
end
|
47
47
|
@feature_labels = @features.map { |f| f.name }
|
48
48
|
@tag_labels = @tags.map { |t| t.name }
|
@@ -63,7 +63,7 @@ class Treat::Core::Problem
|
|
63
63
|
# all of the features.
|
64
64
|
def export_features(e, include_answer = true)
|
65
65
|
features = export(e, @features)
|
66
|
-
return features
|
66
|
+
return features if !include_answer
|
67
67
|
features << (e.has?(@question.name) ?
|
68
68
|
e.get(@question.name) : @question.default)
|
69
69
|
features
|
@@ -80,9 +80,11 @@ class Treat::Core::Problem
|
|
80
80
|
|
81
81
|
def export(entity, exports)
|
82
82
|
unless @question.target == entity.type
|
83
|
+
targ, type = @question.target, entity.type
|
83
84
|
raise Treat::Exception,
|
84
|
-
"This classification problem targets
|
85
|
-
"but a(n) #{
|
85
|
+
"This classification problem targets " +
|
86
|
+
"#{targ}s, but a(n) #{type} " +
|
87
|
+
"was passed to export instead."
|
86
88
|
end
|
87
89
|
ret = []
|
88
90
|
exports.each do |export|
|
@@ -103,28 +105,36 @@ class Treat::Core::Problem
|
|
103
105
|
'id' => @id }
|
104
106
|
end
|
105
107
|
|
108
|
+
def object_to_hash(obj)
|
109
|
+
hash = {}
|
110
|
+
obj.instance_variables.each do |var|
|
111
|
+
val = obj.instance_variable_get(var)
|
112
|
+
hash[var.to_s.delete("@")] = val
|
113
|
+
end
|
114
|
+
hash
|
115
|
+
end
|
116
|
+
|
106
117
|
def self.from_hash(hash)
|
107
|
-
question = Treat::
|
118
|
+
question = Treat::Learning::Question.new(
|
108
119
|
hash['question']['name'],
|
109
120
|
hash['question']['target'],
|
110
|
-
hash['question']['type'],
|
111
121
|
hash['question']['default'],
|
112
|
-
hash['question']['
|
122
|
+
hash['question']['type']
|
113
123
|
)
|
114
124
|
features = []
|
115
125
|
hash['features'].each do |feature|
|
116
|
-
features << Treat::
|
126
|
+
features << Treat::Learning::Feature.new(
|
117
127
|
feature['name'], feature['default'],
|
118
128
|
feature['proc_string'])
|
119
129
|
end
|
120
130
|
tags = []
|
121
131
|
hash['tags'].each do |tag|
|
122
|
-
tags << Treat::
|
132
|
+
tags << Treat::Learning::Tag.new(
|
123
133
|
tag['name'], tag['default'],
|
124
134
|
tag['proc_string'])
|
125
135
|
end
|
126
136
|
features_and_tags = features + tags
|
127
|
-
p = Treat::
|
137
|
+
p = Treat::Learning::Problem.new(question, *features_and_tags)
|
128
138
|
p.id = hash['id']
|
129
139
|
p
|
130
140
|
end
|
@@ -1,6 +1,6 @@
|
|
1
1
|
# Defines a question to answer in the
|
2
2
|
# context of a classification problem.
|
3
|
-
class Treat::Core::Question
|
3
|
+
class Treat::Learning::Question
|
4
4
|
|
5
5
|
# Defines an arbitrary label for the
|
6
6
|
# question we are trying to answer
|
@@ -16,12 +16,9 @@ class Treat::Core::Question
|
|
16
16
|
attr_reader :type
|
17
17
|
# Default for the answer to the question.
|
18
18
|
attr_reader :default
|
19
|
-
# A list of possible answers to the question.
|
20
|
-
attr_reader :labels
|
21
19
|
|
22
20
|
# Initialize the question.
|
23
|
-
def initialize(name, target,
|
24
|
-
type = :continuous, default = nil, labels = [])
|
21
|
+
def initialize(name, target, default = nil, type = :continuous)
|
25
22
|
unless name.is_a?(Symbol)
|
26
23
|
raise Treat::Exception,
|
27
24
|
"Question name should be a symbol."
|
@@ -35,8 +32,8 @@ class Treat::Core::Question
|
|
35
32
|
raise Treat::Exception, "Type should be " +
|
36
33
|
"continuous or discrete."
|
37
34
|
end
|
38
|
-
@name, @target, @type, @default
|
39
|
-
name, target, type, default
|
35
|
+
@name, @target, @type, @default =
|
36
|
+
name, target, type, default
|
40
37
|
end
|
41
38
|
|
42
39
|
# Custom comparison operator for questions.
|
@@ -44,8 +41,7 @@ class Treat::Core::Question
|
|
44
41
|
@name == question.name &&
|
45
42
|
@type == question.type &&
|
46
43
|
@target == question.target &&
|
47
|
-
@default == question.default
|
48
|
-
@labels = question.labels
|
44
|
+
@default == question.default
|
49
45
|
end
|
50
46
|
|
51
47
|
end
|
@@ -1,10 +1,8 @@
|
|
1
1
|
# A helper class to load a language class
|
2
|
-
# registered with the Linguistics gem
|
2
|
+
# registered with the Linguistics gem, for
|
3
|
+
# example Linguistics::EN.
|
3
4
|
class Treat::Loaders::Linguistics
|
4
5
|
|
5
|
-
# Linguistics throws warnings; silence them.
|
6
|
-
silence_warnings { require 'linguistics' }
|
7
|
-
|
8
6
|
# Linguistics classes for each language.
|
9
7
|
@@languages = {}
|
10
8
|
|
@@ -13,16 +11,17 @@ class Treat::Loaders::Linguistics
|
|
13
11
|
# if there is no such language class registered.
|
14
12
|
def self.load(language)
|
15
13
|
silence_warnings do
|
14
|
+
# Linguistics throws warnings; silence them.
|
15
|
+
silence_warnings { require 'linguistics' }
|
16
|
+
code = language.to_s[0..1].upcase
|
16
17
|
@@languages[language] ||=
|
17
|
-
::Linguistics.const_get(
|
18
|
-
language.to_s[0..1].upcase)
|
18
|
+
::Linguistics.const_get(code)
|
19
19
|
end
|
20
20
|
return @@languages[language]
|
21
21
|
rescue RuntimeError
|
22
22
|
raise Treat::Exception,
|
23
|
-
"Ruby Linguistics does " +
|
24
|
-
"
|
25
|
-
"for the #{language} language."
|
23
|
+
"Ruby Linguistics does not have a module " +
|
24
|
+
"installed for the #{language} language."
|
26
25
|
end
|
27
26
|
|
28
27
|
end
|
@@ -1,30 +1,24 @@
|
|
1
1
|
# A helper class to load the CoreNLP package.
|
2
2
|
class Treat::Loaders::Stanford
|
3
|
-
|
4
|
-
require 'stanford-core-nlp'
|
5
3
|
|
4
|
+
# Keep track of whether it's loaded or not.
|
6
5
|
@@loaded = false
|
7
6
|
|
8
7
|
# Load CoreNLP package for a given language.
|
9
8
|
def self.load(language = nil)
|
10
9
|
return if @@loaded
|
10
|
+
require 'stanford-core-nlp'
|
11
11
|
language ||= Treat.core.language.default
|
12
|
-
|
13
12
|
StanfordCoreNLP.jar_path =
|
14
13
|
Treat.libraries.stanford.jar_path ||
|
15
14
|
Treat.paths.bin + 'stanford/'
|
16
|
-
|
17
15
|
StanfordCoreNLP.model_path =
|
18
16
|
Treat.libraries.stanford.model_path ||
|
19
17
|
Treat.paths.models + 'stanford/'
|
20
|
-
|
21
18
|
StanfordCoreNLP.use(language)
|
22
|
-
if
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
StanfordCoreNLP.bind
|
27
|
-
@@loaded = true
|
19
|
+
StanfordCoreNLP.log_file = '/dev/null' if
|
20
|
+
Treat.core.verbosity.silence
|
21
|
+
StanfordCoreNLP.bind; @@loaded = true
|
28
22
|
end
|
29
23
|
|
30
24
|
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
module Treat
|
2
|
+
|
3
|
+
# Contains common utility/helper functions.
|
4
|
+
module Helpers; include Autoload; end
|
5
|
+
|
6
|
+
# Contains all the configuration options.
|
7
|
+
module Config; include Autoload; end
|
8
|
+
|
9
|
+
# Import all the configuration options.
|
10
|
+
Treat::Config.import!
|
11
|
+
|
12
|
+
# Contains classes to load external libraries.
|
13
|
+
module Loaders; include Autoload; end
|
14
|
+
|
15
|
+
# Contains machine learning core classes.
|
16
|
+
module Learning; include Autoload; end
|
17
|
+
|
18
|
+
# Contains the document object models.
|
19
|
+
module Entities; include Autoload; end
|
20
|
+
|
21
|
+
# Contains all the worker categories.
|
22
|
+
module Workers; include Autoload; end
|
23
|
+
|
24
|
+
# Make all the worker categories.
|
25
|
+
Treat::Workers.categorize!
|
26
|
+
|
27
|
+
# Installs builders on core Ruby objects.
|
28
|
+
module Proxies; include Autoload; end
|
29
|
+
|
30
|
+
# Core classes (installer, server, etc.)
|
31
|
+
module Core; include Autoload; end
|
32
|
+
|
33
|
+
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module Treat::Proxies
|
2
|
+
|
3
|
+
module Array
|
4
|
+
# Include base proxy functionality.
|
5
|
+
include Treat::Proxies::Proxy
|
6
|
+
def method_missing(sym, *args, &block)
|
7
|
+
if [:do, :apply].include?(sym) ||
|
8
|
+
Treat::Workers.lookup(sym)
|
9
|
+
map do |el|
|
10
|
+
if el.is_a?(Treat::Entities::Entity)
|
11
|
+
el.send(sym, *args)
|
12
|
+
else
|
13
|
+
el.to_entity.send(sym, *args)
|
14
|
+
end
|
15
|
+
end
|
16
|
+
else
|
17
|
+
super(sym, *args, &block)
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
# Include Treat methods on arrays.
|
23
|
+
::Array.class_eval do
|
24
|
+
include Treat::Proxies::Array
|
25
|
+
end
|
26
|
+
|
27
|
+
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
module Treat::Proxies
|
2
|
+
|
3
|
+
# This is kind of ugly; need to find a
|
4
|
+
# better solution eventually (?)
|
5
|
+
Treat::Entities::Entity.class_eval do
|
6
|
+
|
7
|
+
# Rename the true language detection
|
8
|
+
# method to :language_proxied, and
|
9
|
+
# only call it if language detection
|
10
|
+
# is turned on in the configuration.
|
11
|
+
alias :language_proxied :language
|
12
|
+
|
13
|
+
# Proxy the #language method, defined on
|
14
|
+
# all textual entities, in order to catch
|
15
|
+
# the method call if language detection is
|
16
|
+
# turned off and return the default language
|
17
|
+
# in that case.
|
18
|
+
def language(extractor = nil, options = {})
|
19
|
+
|
20
|
+
return Treat.core.language.default if
|
21
|
+
!Treat.core.language.detect
|
22
|
+
|
23
|
+
if is_a?(Treat::Entities::Symbol) ||
|
24
|
+
is_a?(Treat::Entities::Number) ||
|
25
|
+
is_a?(Treat::Entities::Punctuation)
|
26
|
+
return Treat.core.language.default
|
27
|
+
end
|
28
|
+
|
29
|
+
dlvl = Treat.core.language.detect_at
|
30
|
+
dklass = Treat::Entities.const_get(dlvl.cc)
|
31
|
+
|
32
|
+
if self.class.compare_with(dklass) < 1
|
33
|
+
anc = ancestor_with_type(dlvl)
|
34
|
+
return anc.language if anc
|
35
|
+
return self.parent.language if has_parent?
|
36
|
+
end
|
37
|
+
|
38
|
+
extractor ||= Treat.workers.
|
39
|
+
extractors.language.default
|
40
|
+
|
41
|
+
language_proxied(extractor, options)
|
42
|
+
|
43
|
+
end
|
44
|
+
|
45
|
+
end
|
46
|
+
|
47
|
+
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module Treat::Proxies
|
2
|
+
|
3
|
+
# Install Treat functions on Numeric objects.
|
4
|
+
module Numeric
|
5
|
+
# Include base proxy functionality.
|
6
|
+
include Treat::Proxies::Proxy
|
7
|
+
# Return the entity corresponding to the number.
|
8
|
+
def to_entity(builder = nil)
|
9
|
+
Treat::Entities::Number.from_numeric(self)
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
# Include Treat methods on numerics.
|
14
|
+
::Numeric.class_eval do
|
15
|
+
include Treat::Proxies::Numeric
|
16
|
+
end
|
17
|
+
|
18
|
+
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# Proxies install builders on core Ruby objects;
|
2
|
+
# when a method defined by Treat is called on these
|
3
|
+
# objects, the Ruby object is cast to a Treat entity
|
4
|
+
# and the method is called on the resultant type.
|
5
|
+
module Treat::Proxies
|
6
|
+
|
7
|
+
# Provides a base functionality for proxies.
|
8
|
+
module Proxy
|
9
|
+
# Build the entity corresponding to the proxied
|
10
|
+
# object and send the method call to the entity.
|
11
|
+
def method_missing(sym, *args, &block)
|
12
|
+
if [:do, :apply].include?(sym) ||
|
13
|
+
Treat::Workers.lookup(sym)
|
14
|
+
to_entity.send(sym, *args)
|
15
|
+
else
|
16
|
+
super(sym, *args, &block)
|
17
|
+
end
|
18
|
+
end
|
19
|
+
# Create an unknown type of entity by default.
|
20
|
+
def to_entity(builder = nil)
|
21
|
+
Treat::Entities::Unknown(self.to_s)
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
end
|
@@ -0,0 +1,18 @@
|
|
1
|
+
module Treat::Proxies
|
2
|
+
|
3
|
+
# Install Treat functions on String objects.
|
4
|
+
module String
|
5
|
+
# Include base proxy functionality.
|
6
|
+
include Treat::Proxies::Proxy
|
7
|
+
# Return the entity corresponding to the string.
|
8
|
+
def to_entity
|
9
|
+
Treat::Entities::Entity.from_string(self)
|
10
|
+
end
|
11
|
+
end
|
12
|
+
|
13
|
+
# Include Treat methods on strings.
|
14
|
+
::String.class_eval do
|
15
|
+
include Treat::Proxies::String
|
16
|
+
end
|
17
|
+
|
18
|
+
end
|
data/lib/treat/version.rb
CHANGED
@@ -1,3 +1,12 @@
|
|
1
1
|
module Treat
|
2
|
-
|
2
|
+
|
3
|
+
# The current version of Treat.
|
4
|
+
VERSION = "2.0.0rc1"
|
5
|
+
|
6
|
+
# Treat requires Ruby >= 1.9.2
|
7
|
+
if RUBY_VERSION < '1.9.2'
|
8
|
+
raise "Treat requires Ruby version 1.9.2 " +
|
9
|
+
"or higher, but current is #{RUBY_VERSION}."
|
10
|
+
end
|
11
|
+
|
3
12
|
end
|
@@ -1,27 +1,27 @@
|
|
1
1
|
# This module creates all the worker categories
|
2
2
|
# and the groups within these categories and adds
|
3
3
|
# the relevant hooks on the appropriate entities.
|
4
|
-
module Treat::Workers
|
4
|
+
module Treat::Workers::Categorizable
|
5
5
|
|
6
|
-
|
6
|
+
require_relative 'groupable'
|
7
7
|
|
8
8
|
# A lookup table for entity types.
|
9
9
|
@@lookup = {}
|
10
10
|
|
11
11
|
# Find a worker group based on method.
|
12
|
-
def
|
12
|
+
def lookup(method)
|
13
13
|
@@lookup[method]
|
14
14
|
end
|
15
15
|
|
16
|
-
def
|
17
|
-
Treat.workers.
|
18
|
-
create_category(cat.
|
16
|
+
def categorize!
|
17
|
+
Treat.workers.members.each do |cat|
|
18
|
+
create_category(cat.
|
19
19
|
capitalize.intern,
|
20
20
|
load_category_conf(cat))
|
21
21
|
end
|
22
22
|
end
|
23
23
|
|
24
|
-
def
|
24
|
+
def load_category_conf(name)
|
25
25
|
config = Treat.workers[name]
|
26
26
|
if config.nil?
|
27
27
|
raise Treat::Exception,
|
@@ -31,10 +31,11 @@ module Treat::Workers
|
|
31
31
|
config
|
32
32
|
end
|
33
33
|
|
34
|
-
def
|
35
|
-
category =
|
34
|
+
def create_category(name, conf)
|
35
|
+
category = Treat::Workers.
|
36
|
+
const_set(name, Module.new)
|
36
37
|
conf.each_pair do |group, worker|
|
37
|
-
name =
|
38
|
+
name = group.to_s.cc.intern
|
38
39
|
category.module_eval do
|
39
40
|
@@methods = []; def methods;
|
40
41
|
@@methods; end; def groups;
|
@@ -44,7 +45,7 @@ module Treat::Workers
|
|
44
45
|
end
|
45
46
|
end
|
46
47
|
|
47
|
-
def
|
48
|
+
def create_group(name, conf, category)
|
48
49
|
group = category.const_set(name, Module.new)
|
49
50
|
self.set_group_options(group, conf)
|
50
51
|
self.bind_group_targets(group)
|
@@ -53,17 +54,17 @@ module Treat::Workers
|
|
53
54
|
@@lookup[group.method] = group
|
54
55
|
end
|
55
56
|
|
56
|
-
def
|
57
|
+
def bind_group_targets(group)
|
57
58
|
group.targets.each do |entity_type|
|
58
59
|
entity = Treat::Entities.
|
59
|
-
const_get(cc
|
60
|
+
const_get(entity_type.cc)
|
60
61
|
entity.class_eval do
|
61
62
|
add_workers group
|
62
63
|
end
|
63
64
|
end
|
64
65
|
end
|
65
66
|
|
66
|
-
def
|
67
|
+
def register_group_presets(group, conf)
|
67
68
|
return unless conf.respond_to? :presets
|
68
69
|
conf.presets.each do |m|
|
69
70
|
@@methods << m
|
@@ -71,9 +72,9 @@ module Treat::Workers
|
|
71
72
|
end
|
72
73
|
end
|
73
74
|
|
74
|
-
def
|
75
|
+
def set_group_options(group, conf)
|
75
76
|
group.module_eval do
|
76
|
-
extend Treat::Workers::
|
77
|
+
extend Treat::Workers::Groupable
|
77
78
|
self.type = conf.type
|
78
79
|
self.targets = conf.targets
|
79
80
|
if conf.respond_to?(:default)
|
@@ -90,7 +91,5 @@ module Treat::Workers
|
|
90
91
|
end
|
91
92
|
end
|
92
93
|
end
|
93
|
-
|
94
|
-
self.create_categories
|
95
|
-
|
94
|
+
|
96
95
|
end
|
@@ -1,6 +1,6 @@
|
|
1
|
-
#
|
2
|
-
# by selecting
|
3
|
-
#
|
1
|
+
# Extracts an arbitrary number of keywords from a
|
2
|
+
# document in a collection by selecting its N words
|
3
|
+
# with the highest TF*IDF score.
|
4
4
|
class Treat::Workers::Extractors::Keywords::TfIdf
|
5
5
|
|
6
6
|
# Default options - retrieve 5 keywords.
|
@@ -8,31 +8,31 @@ class Treat::Workers::Extractors::Keywords::TfIdf
|
|
8
8
|
|
9
9
|
# Annotate a document with an array containing
|
10
10
|
# the N words with the highest TF*IDF in that
|
11
|
-
# document
|
11
|
+
# document.
|
12
12
|
def self.keywords(entity, options = {})
|
13
13
|
|
14
14
|
options = DefaultOptions.merge(options)
|
15
15
|
tf_idfs = {}
|
16
16
|
|
17
17
|
entity.each_word do |word|
|
18
|
-
|
18
|
+
tf_idf = word.tf_idf
|
19
|
+
if tf_idf
|
20
|
+
tf_idfs[word] ||= tf_idf
|
21
|
+
end
|
19
22
|
end
|
20
23
|
|
21
24
|
tf_idfs = tf_idfs.
|
22
25
|
sort_by {|k,v| v}.reverse
|
23
|
-
|
24
|
-
if tf_idfs.size <= options[:number]
|
25
|
-
return tf_idfs
|
26
|
-
end
|
27
|
-
|
26
|
+
|
28
27
|
keywords = []
|
29
28
|
i = 0
|
29
|
+
max_count = tf_idfs.size < options[:number] ? tf_idfs.size : options[:number]
|
30
30
|
|
31
31
|
tf_idfs.each do |word|
|
32
32
|
|
33
33
|
w = word[0].to_s
|
34
34
|
next if keywords.include?(w)
|
35
|
-
break if i >
|
35
|
+
break if i > max_count
|
36
36
|
keywords << w
|
37
37
|
|
38
38
|
i += 1
|
@@ -1,9 +1,11 @@
|
|
1
|
-
#
|
2
|
-
#
|
3
|
-
#
|
4
|
-
#
|
5
|
-
#
|
6
|
-
|
1
|
+
# Language detection using a probabilistic algorithm
|
2
|
+
# that checks for the presence of words with Bloom
|
3
|
+
# filters built from dictionaries for each language.
|
4
|
+
#
|
5
|
+
# Original paper: Grothoff. 2007. A Quick Introduction to
|
6
|
+
# Bloom Filters. Department of Computer Sciences, Purdue
|
7
|
+
# University.
|
8
|
+
class Treat::Workers::Extractors::Language::WhatLanguage
|
7
9
|
|
8
10
|
# Require the 'whatlanguage' gem.
|
9
11
|
silence_warnings { require 'whatlanguage' }
|