treat 1.1.0 → 1.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +1 -1
- data/README.md +3 -3
- data/lib/treat/config.rb +10 -0
- data/lib/treat/core/data_set.rb +80 -32
- data/lib/treat/core/feature.rb +35 -0
- data/lib/treat/core/problem.rb +43 -0
- data/lib/treat/core/question.rb +27 -0
- data/lib/treat/entities/abilities/buildable.rb +5 -3
- data/lib/treat/entities/abilities/exportable.rb +4 -4
- data/lib/treat/entities/collection.rb +1 -1
- data/lib/treat/entities/document.rb +1 -1
- data/lib/treat/entities/group.rb +8 -5
- data/lib/treat/entities/section.rb +1 -1
- data/lib/treat/entities/token.rb +20 -8
- data/lib/treat/entities/zone.rb +6 -5
- data/lib/treat/loaders/linguistics.rb +18 -19
- data/lib/treat/loaders/stanford.rb +3 -2
- data/lib/treat/version.rb +1 -1
- data/lib/treat/workers/extractors/language/what_language.rb +53 -57
- data/lib/treat/workers/extractors/name_tag/stanford.rb +8 -5
- data/lib/treat/workers/formatters/serializers/mongo.rb +33 -27
- data/lib/treat/workers/formatters/unserializers/mongo.rb +14 -36
- data/lib/treat/workers/learners/classifiers/id3.rb +4 -5
- data/lib/treat/workers/learners/classifiers/mlp.rb +1 -1
- data/lib/treat/workers.rb +1 -1
- data/spec/entity.rb +7 -5
- data/spec/phrase.rb +2 -2
- data/spec/zone.rb +2 -3
- metadata +37 -15
- data/bin/stanford/bridge.jar +0 -0
- data/bin/stanford/joda-time.jar +0 -0
- data/bin/stanford/stanford-corenlp.jar +0 -0
- data/bin/stanford/stanford-parser.jar +0 -0
- data/bin/stanford/xom.jar +0 -0
- data/files/21552208.html +0 -683
- data/files/3_2_release_notes.html +0 -766
- data/files/nethttp-cheat-sheet-2940.html +0 -395
- data/files/weather-central-canada-heat-wave.html +0 -1370
- data/lib/treat/core/classification.rb +0 -63
- data/lib/treat/core/server.rb +0 -3
- data/spec/sandbox.rb +0 -223
- data/tmp/english.yaml +0 -10340
@@ -6,55 +6,61 @@ class Treat::Workers::Formatters::Serializers::Mongo
|
|
6
6
|
|
7
7
|
DefaultOptions = {
|
8
8
|
:recursive => true,
|
9
|
-
:stop_at =>
|
9
|
+
:stop_at => nil
|
10
10
|
}
|
11
|
-
|
11
|
+
|
12
12
|
def self.serialize(entity, options = {})
|
13
|
-
|
13
|
+
|
14
14
|
options = DefaultOptions.merge(options)
|
15
|
-
stop_at = options[:stop_at] ?
|
15
|
+
options[:stop_at] = options[:stop_at] ?
|
16
16
|
Treat::Entities.const_get(
|
17
|
-
options[:stop_at].to_s.capitalize) :
|
18
|
-
|
19
|
-
|
17
|
+
options[:stop_at].to_s.capitalize) : nil
|
18
|
+
|
20
19
|
if !Treat.databases.mongo.db && !options[:db]
|
21
20
|
raise Treat::Exception,
|
22
21
|
'Must supply the database name in config. ' +
|
23
22
|
'(Treat.databases.mongo.db = ...) or pass ' +
|
24
23
|
'it as a parameter to #serialize.'
|
25
24
|
end
|
26
|
-
|
25
|
+
|
27
26
|
@@database ||= Mongo::Connection.
|
28
27
|
new(Treat.databases.mongo.host).
|
29
28
|
db(Treat.databases.mongo.db || options[:db])
|
30
|
-
|
31
|
-
type = cl(entity.class.superclass).downcase
|
32
|
-
type = entity.type.to_s if type == 'entity'
|
33
|
-
types = type + 's'
|
34
29
|
|
35
|
-
|
36
|
-
|
30
|
+
supertype = cl(Treat::Entities.const_get(
|
31
|
+
entity.type.to_s.capitalize.intern).superclass).downcase
|
32
|
+
supertype = entity.type.to_s if supertype == 'entity'
|
33
|
+
supertypes = supertype + 's'
|
34
|
+
|
35
|
+
coll = @@database.collection(supertypes)
|
36
|
+
entity_token = self.do_serialize(entity, options)
|
37
|
+
coll.update({id: entity.id}, entity_token, {upsert: true})
|
38
|
+
end
|
39
|
+
|
40
|
+
def self.do_serialize(entity, options)
|
41
|
+
|
42
|
+
children = []
|
43
|
+
|
44
|
+
if options[:recursive] && entity.has_children?
|
45
|
+
entity.each do |child|
|
46
|
+
next if options[:stop_at] && child.class.
|
47
|
+
compare_with(options[:stop_at]) < 0
|
48
|
+
children << self.do_serialize(child, options)
|
49
|
+
end
|
50
|
+
end
|
51
|
+
|
37
52
|
entity_token = {
|
38
53
|
:id => entity.id,
|
39
54
|
:value => entity.value,
|
40
55
|
:string => entity.to_s,
|
41
|
-
:type => entity.type,
|
42
|
-
:children =>
|
56
|
+
:type => entity.type.to_s,
|
57
|
+
:children => children,
|
43
58
|
:parent => (entity.has_parent? ? entity.parent.id : nil),
|
44
59
|
:features => entity.features
|
45
60
|
}
|
46
|
-
|
47
|
-
coll.insert(entity_token)
|
48
61
|
|
49
|
-
|
50
|
-
|
51
|
-
next if child.class.compare_with(stop_at) < 0
|
52
|
-
self.serialize(child, options)
|
53
|
-
end
|
54
|
-
end
|
55
|
-
|
62
|
+
entity_token
|
63
|
+
|
56
64
|
end
|
57
65
|
|
58
66
|
end
|
59
|
-
|
60
|
-
|
@@ -1,20 +1,9 @@
|
|
1
1
|
module Treat::Workers::Formatters::Unserializers::Mongo
|
2
|
-
|
3
|
-
DefaultOptions = {
|
4
|
-
:recursive => true,
|
5
|
-
:stop_at => nil
|
6
|
-
}
|
7
2
|
|
8
3
|
require 'mongo'
|
9
4
|
|
10
5
|
def self.unserialize(entity, options={})
|
11
6
|
|
12
|
-
options = DefaultOptions.merge(options)
|
13
|
-
options[:stop_at] = options[:stop_at] ?
|
14
|
-
Treat::Entities.const_get(
|
15
|
-
options[:stop_at].to_s.capitalize) :
|
16
|
-
Treat::Entities::Token
|
17
|
-
|
18
7
|
if !Treat.databases.mongo.db && !options[:db]
|
19
8
|
raise Treat::Exception,
|
20
9
|
'Must supply the database name in config. ' +
|
@@ -25,12 +14,6 @@ module Treat::Workers::Formatters::Unserializers::Mongo
|
|
25
14
|
@@database ||= Mongo::Connection.
|
26
15
|
new(Treat.databases.mongo.host).
|
27
16
|
db(Treat.databases.mongo.db || options[:db])
|
28
|
-
|
29
|
-
self.do_unserialize(entity, options)
|
30
|
-
|
31
|
-
end
|
32
|
-
|
33
|
-
def self.do_unserialize(entity, options)
|
34
17
|
|
35
18
|
supertype = cl(Treat::Entities.const_get(
|
36
19
|
entity.type.to_s.capitalize.intern).superclass).downcase
|
@@ -39,38 +22,33 @@ module Treat::Workers::Formatters::Unserializers::Mongo
|
|
39
22
|
|
40
23
|
coll = @@database.collection(supertypes)
|
41
24
|
record = coll.find_one(:id => entity.id)
|
42
|
-
|
25
|
+
|
43
26
|
unless record
|
44
27
|
raise Treat::Exception,
|
45
28
|
"Couldn't find record ID #{entity.id}."
|
46
29
|
end
|
30
|
+
|
31
|
+
self.do_unserialize(record, options)
|
32
|
+
|
33
|
+
end
|
34
|
+
|
35
|
+
def self.do_unserialize(record, options)
|
36
|
+
|
37
|
+
entity = Treat::Entities.
|
38
|
+
const_get(record['type'].
|
39
|
+
capitalize.intern).new(
|
40
|
+
record['value'], record['id'])
|
47
41
|
|
48
|
-
# Convert feature keys to symbols.
|
49
42
|
features = record['features']
|
50
43
|
new_feat = {}
|
51
44
|
features.each do |feature, value|
|
52
45
|
new_feat[feature.intern] = value
|
53
46
|
end
|
54
|
-
entity.features = new_feat
|
55
47
|
|
56
|
-
|
57
|
-
entity.value = record['value']
|
58
|
-
|
59
|
-
if entity.class.compare_with(
|
60
|
-
options[:stop_at]) == 0
|
61
|
-
entity.value = record['string']
|
62
|
-
end
|
48
|
+
entity.features = new_feat
|
63
49
|
|
64
|
-
return entity unless options[:recursive]
|
65
|
-
|
66
50
|
record['children'].each do |c|
|
67
|
-
|
68
|
-
cklass = Treat::Entities.const_get(
|
69
|
-
ctype.capitalize.intern)
|
70
|
-
next if cklass.compare_with(
|
71
|
-
options[:stop_at]) < 0
|
72
|
-
entity << self.do_unserialize(
|
73
|
-
cklass.new('', cid), options)
|
51
|
+
entity << self.do_unserialize(c, options)
|
74
52
|
end
|
75
53
|
|
76
54
|
entity
|
@@ -7,22 +7,21 @@ class Treat::Workers::Learners::Classifiers::ID3
|
|
7
7
|
def self.classify(entity, options = {})
|
8
8
|
|
9
9
|
set = options[:training]
|
10
|
-
cl = set.
|
10
|
+
cl = set.problem
|
11
11
|
|
12
12
|
if !@@classifiers[cl]
|
13
13
|
dec_tree = DecisionTree::ID3Tree.new(
|
14
|
-
|
15
|
-
cl.
|
14
|
+
cl.labels.map { |l| l.to_s }, set.items, cl.question.default,
|
15
|
+
cl.question.type)
|
16
16
|
dec_tree.train
|
17
17
|
@@classifiers[cl] = dec_tree
|
18
18
|
else
|
19
19
|
dec_tree = @@classifiers[cl]
|
20
|
+
dec_tree.graph('testingbitch')
|
20
21
|
end
|
21
|
-
|
22
22
|
dec_tree.predict(
|
23
23
|
cl.export_item(entity, false)
|
24
24
|
)
|
25
|
-
|
26
25
|
end
|
27
26
|
|
28
27
|
end
|
data/lib/treat/workers.rb
CHANGED
@@ -34,7 +34,7 @@ module Treat::Workers
|
|
34
34
|
def self.create_category(name, conf)
|
35
35
|
category = self.const_set(name, Module.new)
|
36
36
|
conf.each_pair do |group, worker|
|
37
|
-
name = group.to_s.
|
37
|
+
name = cc(group.to_s).intern
|
38
38
|
category.module_eval do
|
39
39
|
@@methods = []; def methods;
|
40
40
|
@@methods; end; def groups;
|
data/spec/entity.rb
CHANGED
@@ -113,12 +113,14 @@ describe Treat::Entities::Entity do
|
|
113
113
|
describe "Exportable" do
|
114
114
|
|
115
115
|
context "when supplied with a classification to export" do
|
116
|
-
|
116
|
+
feature = Treat::Core::Feature.new(:tag)
|
117
|
+
question = Treat::Core::Question.new(:is_keyword, :word, :discrete, false)
|
118
|
+
problem = Treat::Core::Problem.new(question, feature)
|
117
119
|
it "returns a data set with the exported features" do
|
118
|
-
ds = @sentence.export(
|
119
|
-
ds.
|
120
|
-
ds.labels.should eql [:tag]
|
121
|
-
ds.
|
120
|
+
ds = @sentence.export(problem)
|
121
|
+
ds.problem.should eql problem
|
122
|
+
ds.problem.labels.should eql [:tag]
|
123
|
+
ds.entities.should eql @sentence.words.map { |w| w.id }
|
122
124
|
ds.items.should eql [
|
123
125
|
["DT", false], ["JJ", false],
|
124
126
|
["NN", false], ["VBZ", false],
|
data/spec/phrase.rb
CHANGED
@@ -42,7 +42,7 @@ describe Treat::Entities::Phrase do
|
|
42
42
|
|
43
43
|
describe "#time" do
|
44
44
|
it "returns a DateTime object representing the time in the phrase" do
|
45
|
-
Treat.languages.english
|
45
|
+
Treat.languages.english.workers.extractors.time.each do |e|
|
46
46
|
t = 'october 2006'.time(e)
|
47
47
|
t.month.should eql 10
|
48
48
|
end
|
@@ -55,7 +55,7 @@ describe Treat::Entities::Phrase do
|
|
55
55
|
describe "#tokenize" do
|
56
56
|
|
57
57
|
it "splits a phrase/sentence into tokens and adds them as children of the phrase" do
|
58
|
-
Treat.languages.english
|
58
|
+
Treat.languages.english.workers.processors.tokenizers.each do |t|
|
59
59
|
@phrase = Treat::Entities::Phrase.new('a phrase to tokenize')
|
60
60
|
@phrase.tokenize(t)
|
61
61
|
@phrase.children.should eql @phrase.tokens
|
data/spec/zone.rb
CHANGED
@@ -5,13 +5,12 @@ describe Treat::Entities::Zone do
|
|
5
5
|
describe "Processable" do
|
6
6
|
|
7
7
|
before do
|
8
|
-
@processors = Treat.languages.
|
9
|
-
english[:workers][:processors]
|
8
|
+
@processors = Treat.languages.english.workers.processors
|
10
9
|
end
|
11
10
|
describe "#segment" do
|
12
11
|
|
13
12
|
it "splits a zone into phrases/sentences and adds them as children of the zone" do
|
14
|
-
@processors
|
13
|
+
@processors.segmenters.each do |s|
|
15
14
|
paragraph = Treat::Entities::Paragraph.new(
|
16
15
|
"This is a first sentence inside the first paragraph. " +
|
17
16
|
"This is the second sentence that is inside the paragraph.")
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: treat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.
|
4
|
+
version: 1.1.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-07-
|
12
|
+
date: 2012-07-19 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: schiphol
|
@@ -27,6 +27,38 @@ dependencies:
|
|
27
27
|
- - ! '>='
|
28
28
|
- !ruby/object:Gem::Version
|
29
29
|
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: birch
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: sourcify
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :runtime
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
30
62
|
- !ruby/object:Gem::Dependency
|
31
63
|
name: rspec
|
32
64
|
requirement: !ruby/object:Gem::Requirement
|
@@ -67,11 +99,6 @@ extensions: []
|
|
67
99
|
extra_rdoc_files: []
|
68
100
|
files:
|
69
101
|
- bin/MANIFEST
|
70
|
-
- bin/stanford/bridge.jar
|
71
|
-
- bin/stanford/joda-time.jar
|
72
|
-
- bin/stanford/stanford-corenlp.jar
|
73
|
-
- bin/stanford/stanford-parser.jar
|
74
|
-
- bin/stanford/xom.jar
|
75
102
|
- lib/treat/config/core/acronyms.rb
|
76
103
|
- lib/treat/config/core/encodings.rb
|
77
104
|
- lib/treat/config/core/entities.rb
|
@@ -110,10 +137,11 @@ files:
|
|
110
137
|
- lib/treat/config/workers/processors.rb
|
111
138
|
- lib/treat/config/workers/retrievers.rb
|
112
139
|
- lib/treat/config.rb
|
113
|
-
- lib/treat/core/classification.rb
|
114
140
|
- lib/treat/core/data_set.rb
|
141
|
+
- lib/treat/core/feature.rb
|
115
142
|
- lib/treat/core/node.rb
|
116
|
-
- lib/treat/core/
|
143
|
+
- lib/treat/core/problem.rb
|
144
|
+
- lib/treat/core/question.rb
|
117
145
|
- lib/treat/core.rb
|
118
146
|
- lib/treat/entities/abilities/buildable.rb
|
119
147
|
- lib/treat/entities/abilities/checkable.rb
|
@@ -224,18 +252,12 @@ files:
|
|
224
252
|
- spec/samples/mathematicians/gauss.pdf
|
225
253
|
- spec/samples/mathematicians/leibniz.txt
|
226
254
|
- spec/samples/mathematicians/newton.doc
|
227
|
-
- spec/sandbox.rb
|
228
255
|
- spec/token.rb
|
229
256
|
- spec/treat.rb
|
230
257
|
- spec/word.rb
|
231
258
|
- spec/zone.rb
|
232
|
-
- tmp/english.yaml
|
233
259
|
- tmp/MANIFEST
|
234
|
-
- files/21552208.html
|
235
|
-
- files/3_2_release_notes.html
|
236
260
|
- files/MANIFEST
|
237
|
-
- files/nethttp-cheat-sheet-2940.html
|
238
|
-
- files/weather-central-canada-heat-wave.html
|
239
261
|
- README.md
|
240
262
|
- LICENSE
|
241
263
|
homepage: https://github.com/louismullie/treat
|
data/bin/stanford/bridge.jar
DELETED
Binary file
|
data/bin/stanford/joda-time.jar
DELETED
Binary file
|
Binary file
|
Binary file
|
data/bin/stanford/xom.jar
DELETED
Binary file
|