treat 1.1.0 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +1 -1
- data/README.md +3 -3
- data/lib/treat/config.rb +10 -0
- data/lib/treat/core/data_set.rb +80 -32
- data/lib/treat/core/feature.rb +35 -0
- data/lib/treat/core/problem.rb +43 -0
- data/lib/treat/core/question.rb +27 -0
- data/lib/treat/entities/abilities/buildable.rb +5 -3
- data/lib/treat/entities/abilities/exportable.rb +4 -4
- data/lib/treat/entities/collection.rb +1 -1
- data/lib/treat/entities/document.rb +1 -1
- data/lib/treat/entities/group.rb +8 -5
- data/lib/treat/entities/section.rb +1 -1
- data/lib/treat/entities/token.rb +20 -8
- data/lib/treat/entities/zone.rb +6 -5
- data/lib/treat/loaders/linguistics.rb +18 -19
- data/lib/treat/loaders/stanford.rb +3 -2
- data/lib/treat/version.rb +1 -1
- data/lib/treat/workers/extractors/language/what_language.rb +53 -57
- data/lib/treat/workers/extractors/name_tag/stanford.rb +8 -5
- data/lib/treat/workers/formatters/serializers/mongo.rb +33 -27
- data/lib/treat/workers/formatters/unserializers/mongo.rb +14 -36
- data/lib/treat/workers/learners/classifiers/id3.rb +4 -5
- data/lib/treat/workers/learners/classifiers/mlp.rb +1 -1
- data/lib/treat/workers.rb +1 -1
- data/spec/entity.rb +7 -5
- data/spec/phrase.rb +2 -2
- data/spec/zone.rb +2 -3
- metadata +37 -15
- data/bin/stanford/bridge.jar +0 -0
- data/bin/stanford/joda-time.jar +0 -0
- data/bin/stanford/stanford-corenlp.jar +0 -0
- data/bin/stanford/stanford-parser.jar +0 -0
- data/bin/stanford/xom.jar +0 -0
- data/files/21552208.html +0 -683
- data/files/3_2_release_notes.html +0 -766
- data/files/nethttp-cheat-sheet-2940.html +0 -395
- data/files/weather-central-canada-heat-wave.html +0 -1370
- data/lib/treat/core/classification.rb +0 -63
- data/lib/treat/core/server.rb +0 -3
- data/spec/sandbox.rb +0 -223
- data/tmp/english.yaml +0 -10340
@@ -6,55 +6,61 @@ class Treat::Workers::Formatters::Serializers::Mongo
|
|
6
6
|
|
7
7
|
DefaultOptions = {
|
8
8
|
:recursive => true,
|
9
|
-
:stop_at =>
|
9
|
+
:stop_at => nil
|
10
10
|
}
|
11
|
-
|
11
|
+
|
12
12
|
def self.serialize(entity, options = {})
|
13
|
-
|
13
|
+
|
14
14
|
options = DefaultOptions.merge(options)
|
15
|
-
stop_at = options[:stop_at] ?
|
15
|
+
options[:stop_at] = options[:stop_at] ?
|
16
16
|
Treat::Entities.const_get(
|
17
|
-
options[:stop_at].to_s.capitalize) :
|
18
|
-
|
19
|
-
|
17
|
+
options[:stop_at].to_s.capitalize) : nil
|
18
|
+
|
20
19
|
if !Treat.databases.mongo.db && !options[:db]
|
21
20
|
raise Treat::Exception,
|
22
21
|
'Must supply the database name in config. ' +
|
23
22
|
'(Treat.databases.mongo.db = ...) or pass ' +
|
24
23
|
'it as a parameter to #serialize.'
|
25
24
|
end
|
26
|
-
|
25
|
+
|
27
26
|
@@database ||= Mongo::Connection.
|
28
27
|
new(Treat.databases.mongo.host).
|
29
28
|
db(Treat.databases.mongo.db || options[:db])
|
30
|
-
|
31
|
-
type = cl(entity.class.superclass).downcase
|
32
|
-
type = entity.type.to_s if type == 'entity'
|
33
|
-
types = type + 's'
|
34
29
|
|
35
|
-
|
36
|
-
|
30
|
+
supertype = cl(Treat::Entities.const_get(
|
31
|
+
entity.type.to_s.capitalize.intern).superclass).downcase
|
32
|
+
supertype = entity.type.to_s if supertype == 'entity'
|
33
|
+
supertypes = supertype + 's'
|
34
|
+
|
35
|
+
coll = @@database.collection(supertypes)
|
36
|
+
entity_token = self.do_serialize(entity, options)
|
37
|
+
coll.update({id: entity.id}, entity_token, {upsert: true})
|
38
|
+
end
|
39
|
+
|
40
|
+
def self.do_serialize(entity, options)
|
41
|
+
|
42
|
+
children = []
|
43
|
+
|
44
|
+
if options[:recursive] && entity.has_children?
|
45
|
+
entity.each do |child|
|
46
|
+
next if options[:stop_at] && child.class.
|
47
|
+
compare_with(options[:stop_at]) < 0
|
48
|
+
children << self.do_serialize(child, options)
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
37
52
|
entity_token = {
|
38
53
|
:id => entity.id,
|
39
54
|
:value => entity.value,
|
40
55
|
:string => entity.to_s,
|
41
|
-
:type => entity.type,
|
42
|
-
:children =>
|
56
|
+
:type => entity.type.to_s,
|
57
|
+
:children => children,
|
43
58
|
:parent => (entity.has_parent? ? entity.parent.id : nil),
|
44
59
|
:features => entity.features
|
45
60
|
}
|
46
|
-
|
47
|
-
coll.insert(entity_token)
|
48
61
|
|
49
|
-
|
50
|
-
|
51
|
-
next if child.class.compare_with(stop_at) < 0
|
52
|
-
self.serialize(child, options)
|
53
|
-
end
|
54
|
-
end
|
55
|
-
|
62
|
+
entity_token
|
63
|
+
|
56
64
|
end
|
57
65
|
|
58
66
|
end
|
59
|
-
|
60
|
-
|
@@ -1,20 +1,9 @@
|
|
1
1
|
module Treat::Workers::Formatters::Unserializers::Mongo
|
2
|
-
|
3
|
-
DefaultOptions = {
|
4
|
-
:recursive => true,
|
5
|
-
:stop_at => nil
|
6
|
-
}
|
7
2
|
|
8
3
|
require 'mongo'
|
9
4
|
|
10
5
|
def self.unserialize(entity, options={})
|
11
6
|
|
12
|
-
options = DefaultOptions.merge(options)
|
13
|
-
options[:stop_at] = options[:stop_at] ?
|
14
|
-
Treat::Entities.const_get(
|
15
|
-
options[:stop_at].to_s.capitalize) :
|
16
|
-
Treat::Entities::Token
|
17
|
-
|
18
7
|
if !Treat.databases.mongo.db && !options[:db]
|
19
8
|
raise Treat::Exception,
|
20
9
|
'Must supply the database name in config. ' +
|
@@ -25,12 +14,6 @@ module Treat::Workers::Formatters::Unserializers::Mongo
|
|
25
14
|
@@database ||= Mongo::Connection.
|
26
15
|
new(Treat.databases.mongo.host).
|
27
16
|
db(Treat.databases.mongo.db || options[:db])
|
28
|
-
|
29
|
-
self.do_unserialize(entity, options)
|
30
|
-
|
31
|
-
end
|
32
|
-
|
33
|
-
def self.do_unserialize(entity, options)
|
34
17
|
|
35
18
|
supertype = cl(Treat::Entities.const_get(
|
36
19
|
entity.type.to_s.capitalize.intern).superclass).downcase
|
@@ -39,38 +22,33 @@ module Treat::Workers::Formatters::Unserializers::Mongo
|
|
39
22
|
|
40
23
|
coll = @@database.collection(supertypes)
|
41
24
|
record = coll.find_one(:id => entity.id)
|
42
|
-
|
25
|
+
|
43
26
|
unless record
|
44
27
|
raise Treat::Exception,
|
45
28
|
"Couldn't find record ID #{entity.id}."
|
46
29
|
end
|
30
|
+
|
31
|
+
self.do_unserialize(record, options)
|
32
|
+
|
33
|
+
end
|
34
|
+
|
35
|
+
def self.do_unserialize(record, options)
|
36
|
+
|
37
|
+
entity = Treat::Entities.
|
38
|
+
const_get(record['type'].
|
39
|
+
capitalize.intern).new(
|
40
|
+
record['value'], record['id'])
|
47
41
|
|
48
|
-
# Convert feature keys to symbols.
|
49
42
|
features = record['features']
|
50
43
|
new_feat = {}
|
51
44
|
features.each do |feature, value|
|
52
45
|
new_feat[feature.intern] = value
|
53
46
|
end
|
54
|
-
entity.features = new_feat
|
55
47
|
|
56
|
-
|
57
|
-
entity.value = record['value']
|
58
|
-
|
59
|
-
if entity.class.compare_with(
|
60
|
-
options[:stop_at]) == 0
|
61
|
-
entity.value = record['string']
|
62
|
-
end
|
48
|
+
entity.features = new_feat
|
63
49
|
|
64
|
-
return entity unless options[:recursive]
|
65
|
-
|
66
50
|
record['children'].each do |c|
|
67
|
-
|
68
|
-
cklass = Treat::Entities.const_get(
|
69
|
-
ctype.capitalize.intern)
|
70
|
-
next if cklass.compare_with(
|
71
|
-
options[:stop_at]) < 0
|
72
|
-
entity << self.do_unserialize(
|
73
|
-
cklass.new('', cid), options)
|
51
|
+
entity << self.do_unserialize(c, options)
|
74
52
|
end
|
75
53
|
|
76
54
|
entity
|
@@ -7,22 +7,21 @@ class Treat::Workers::Learners::Classifiers::ID3
|
|
7
7
|
def self.classify(entity, options = {})
|
8
8
|
|
9
9
|
set = options[:training]
|
10
|
-
cl = set.
|
10
|
+
cl = set.problem
|
11
11
|
|
12
12
|
if !@@classifiers[cl]
|
13
13
|
dec_tree = DecisionTree::ID3Tree.new(
|
14
|
-
|
15
|
-
cl.
|
14
|
+
cl.labels.map { |l| l.to_s }, set.items, cl.question.default,
|
15
|
+
cl.question.type)
|
16
16
|
dec_tree.train
|
17
17
|
@@classifiers[cl] = dec_tree
|
18
18
|
else
|
19
19
|
dec_tree = @@classifiers[cl]
|
20
|
+
dec_tree.graph('testingbitch')
|
20
21
|
end
|
21
|
-
|
22
22
|
dec_tree.predict(
|
23
23
|
cl.export_item(entity, false)
|
24
24
|
)
|
25
|
-
|
26
25
|
end
|
27
26
|
|
28
27
|
end
|
data/lib/treat/workers.rb
CHANGED
@@ -34,7 +34,7 @@ module Treat::Workers
|
|
34
34
|
def self.create_category(name, conf)
|
35
35
|
category = self.const_set(name, Module.new)
|
36
36
|
conf.each_pair do |group, worker|
|
37
|
-
name = group.to_s.
|
37
|
+
name = cc(group.to_s).intern
|
38
38
|
category.module_eval do
|
39
39
|
@@methods = []; def methods;
|
40
40
|
@@methods; end; def groups;
|
data/spec/entity.rb
CHANGED
@@ -113,12 +113,14 @@ describe Treat::Entities::Entity do
|
|
113
113
|
describe "Exportable" do
|
114
114
|
|
115
115
|
context "when supplied with a classification to export" do
|
116
|
-
|
116
|
+
feature = Treat::Core::Feature.new(:tag)
|
117
|
+
question = Treat::Core::Question.new(:is_keyword, :word, :discrete, false)
|
118
|
+
problem = Treat::Core::Problem.new(question, feature)
|
117
119
|
it "returns a data set with the exported features" do
|
118
|
-
ds = @sentence.export(
|
119
|
-
ds.
|
120
|
-
ds.labels.should eql [:tag]
|
121
|
-
ds.
|
120
|
+
ds = @sentence.export(problem)
|
121
|
+
ds.problem.should eql problem
|
122
|
+
ds.problem.labels.should eql [:tag]
|
123
|
+
ds.entities.should eql @sentence.words.map { |w| w.id }
|
122
124
|
ds.items.should eql [
|
123
125
|
["DT", false], ["JJ", false],
|
124
126
|
["NN", false], ["VBZ", false],
|
data/spec/phrase.rb
CHANGED
@@ -42,7 +42,7 @@ describe Treat::Entities::Phrase do
|
|
42
42
|
|
43
43
|
describe "#time" do
|
44
44
|
it "returns a DateTime object representing the time in the phrase" do
|
45
|
-
Treat.languages.english
|
45
|
+
Treat.languages.english.workers.extractors.time.each do |e|
|
46
46
|
t = 'october 2006'.time(e)
|
47
47
|
t.month.should eql 10
|
48
48
|
end
|
@@ -55,7 +55,7 @@ describe Treat::Entities::Phrase do
|
|
55
55
|
describe "#tokenize" do
|
56
56
|
|
57
57
|
it "splits a phrase/sentence into tokens and adds them as children of the phrase" do
|
58
|
-
Treat.languages.english
|
58
|
+
Treat.languages.english.workers.processors.tokenizers.each do |t|
|
59
59
|
@phrase = Treat::Entities::Phrase.new('a phrase to tokenize')
|
60
60
|
@phrase.tokenize(t)
|
61
61
|
@phrase.children.should eql @phrase.tokens
|
data/spec/zone.rb
CHANGED
@@ -5,13 +5,12 @@ describe Treat::Entities::Zone do
|
|
5
5
|
describe "Processable" do
|
6
6
|
|
7
7
|
before do
|
8
|
-
@processors = Treat.languages.
|
9
|
-
english[:workers][:processors]
|
8
|
+
@processors = Treat.languages.english.workers.processors
|
10
9
|
end
|
11
10
|
describe "#segment" do
|
12
11
|
|
13
12
|
it "splits a zone into phrases/sentences and adds them as children of the zone" do
|
14
|
-
@processors
|
13
|
+
@processors.segmenters.each do |s|
|
15
14
|
paragraph = Treat::Entities::Paragraph.new(
|
16
15
|
"This is a first sentence inside the first paragraph. " +
|
17
16
|
"This is the second sentence that is inside the paragraph.")
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: treat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.
|
4
|
+
version: 1.1.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-07-
|
12
|
+
date: 2012-07-19 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: schiphol
|
@@ -27,6 +27,38 @@ dependencies:
|
|
27
27
|
- - ! '>='
|
28
28
|
- !ruby/object:Gem::Version
|
29
29
|
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: birch
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: sourcify
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :runtime
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
30
62
|
- !ruby/object:Gem::Dependency
|
31
63
|
name: rspec
|
32
64
|
requirement: !ruby/object:Gem::Requirement
|
@@ -67,11 +99,6 @@ extensions: []
|
|
67
99
|
extra_rdoc_files: []
|
68
100
|
files:
|
69
101
|
- bin/MANIFEST
|
70
|
-
- bin/stanford/bridge.jar
|
71
|
-
- bin/stanford/joda-time.jar
|
72
|
-
- bin/stanford/stanford-corenlp.jar
|
73
|
-
- bin/stanford/stanford-parser.jar
|
74
|
-
- bin/stanford/xom.jar
|
75
102
|
- lib/treat/config/core/acronyms.rb
|
76
103
|
- lib/treat/config/core/encodings.rb
|
77
104
|
- lib/treat/config/core/entities.rb
|
@@ -110,10 +137,11 @@ files:
|
|
110
137
|
- lib/treat/config/workers/processors.rb
|
111
138
|
- lib/treat/config/workers/retrievers.rb
|
112
139
|
- lib/treat/config.rb
|
113
|
-
- lib/treat/core/classification.rb
|
114
140
|
- lib/treat/core/data_set.rb
|
141
|
+
- lib/treat/core/feature.rb
|
115
142
|
- lib/treat/core/node.rb
|
116
|
-
- lib/treat/core/
|
143
|
+
- lib/treat/core/problem.rb
|
144
|
+
- lib/treat/core/question.rb
|
117
145
|
- lib/treat/core.rb
|
118
146
|
- lib/treat/entities/abilities/buildable.rb
|
119
147
|
- lib/treat/entities/abilities/checkable.rb
|
@@ -224,18 +252,12 @@ files:
|
|
224
252
|
- spec/samples/mathematicians/gauss.pdf
|
225
253
|
- spec/samples/mathematicians/leibniz.txt
|
226
254
|
- spec/samples/mathematicians/newton.doc
|
227
|
-
- spec/sandbox.rb
|
228
255
|
- spec/token.rb
|
229
256
|
- spec/treat.rb
|
230
257
|
- spec/word.rb
|
231
258
|
- spec/zone.rb
|
232
|
-
- tmp/english.yaml
|
233
259
|
- tmp/MANIFEST
|
234
|
-
- files/21552208.html
|
235
|
-
- files/3_2_release_notes.html
|
236
260
|
- files/MANIFEST
|
237
|
-
- files/nethttp-cheat-sheet-2940.html
|
238
|
-
- files/weather-central-canada-heat-wave.html
|
239
261
|
- README.md
|
240
262
|
- LICENSE
|
241
263
|
homepage: https://github.com/louismullie/treat
|
data/bin/stanford/bridge.jar
DELETED
Binary file
|
data/bin/stanford/joda-time.jar
DELETED
Binary file
|
Binary file
|
Binary file
|
data/bin/stanford/xom.jar
DELETED
Binary file
|