treat 1.1.0 → 1.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +1 -1
- data/README.md +3 -3
- data/lib/treat/config.rb +10 -0
- data/lib/treat/core/data_set.rb +80 -32
- data/lib/treat/core/feature.rb +35 -0
- data/lib/treat/core/problem.rb +43 -0
- data/lib/treat/core/question.rb +27 -0
- data/lib/treat/entities/abilities/buildable.rb +5 -3
- data/lib/treat/entities/abilities/exportable.rb +4 -4
- data/lib/treat/entities/collection.rb +1 -1
- data/lib/treat/entities/document.rb +1 -1
- data/lib/treat/entities/group.rb +8 -5
- data/lib/treat/entities/section.rb +1 -1
- data/lib/treat/entities/token.rb +20 -8
- data/lib/treat/entities/zone.rb +6 -5
- data/lib/treat/loaders/linguistics.rb +18 -19
- data/lib/treat/loaders/stanford.rb +3 -2
- data/lib/treat/version.rb +1 -1
- data/lib/treat/workers/extractors/language/what_language.rb +53 -57
- data/lib/treat/workers/extractors/name_tag/stanford.rb +8 -5
- data/lib/treat/workers/formatters/serializers/mongo.rb +33 -27
- data/lib/treat/workers/formatters/unserializers/mongo.rb +14 -36
- data/lib/treat/workers/learners/classifiers/id3.rb +4 -5
- data/lib/treat/workers/learners/classifiers/mlp.rb +1 -1
- data/lib/treat/workers.rb +1 -1
- data/spec/entity.rb +7 -5
- data/spec/phrase.rb +2 -2
- data/spec/zone.rb +2 -3
- metadata +37 -15
- data/bin/stanford/bridge.jar +0 -0
- data/bin/stanford/joda-time.jar +0 -0
- data/bin/stanford/stanford-corenlp.jar +0 -0
- data/bin/stanford/stanford-parser.jar +0 -0
- data/bin/stanford/xom.jar +0 -0
- data/files/21552208.html +0 -683
- data/files/3_2_release_notes.html +0 -766
- data/files/nethttp-cheat-sheet-2940.html +0 -395
- data/files/weather-central-canada-heat-wave.html +0 -1370
- data/lib/treat/core/classification.rb +0 -63
- data/lib/treat/core/server.rb +0 -3
- data/spec/sandbox.rb +0 -223
- data/tmp/english.yaml +0 -10340
@@ -1,63 +0,0 @@
|
|
1
|
-
class Treat::Core::Classification
|
2
|
-
|
3
|
-
attr_reader :types
|
4
|
-
attr_reader :features
|
5
|
-
attr_reader :question
|
6
|
-
attr_reader :labels
|
7
|
-
attr_reader :mode
|
8
|
-
attr_reader :default
|
9
|
-
|
10
|
-
def initialize(type_or_types, feature_or_features,
|
11
|
-
question, default = false, mode = :continuous)
|
12
|
-
@types, @features,
|
13
|
-
@question, @default =
|
14
|
-
[*type_or_types],
|
15
|
-
[*feature_or_features],
|
16
|
-
question, default
|
17
|
-
|
18
|
-
@mode = mode
|
19
|
-
@labels = []
|
20
|
-
@features.each do |cmd|
|
21
|
-
if cmd.is_a?(Array)
|
22
|
-
@labels << cmd[0]
|
23
|
-
else
|
24
|
-
@labels << cmd
|
25
|
-
end
|
26
|
-
end
|
27
|
-
end
|
28
|
-
|
29
|
-
def export_item(e, include_question = true)
|
30
|
-
|
31
|
-
line = []
|
32
|
-
|
33
|
-
@features.each do |cmd|
|
34
|
-
dflt = nil
|
35
|
-
begin
|
36
|
-
if cmd.is_a?(Array)
|
37
|
-
if cmd.size == 3
|
38
|
-
r = cmd[1].call(e)
|
39
|
-
dflt = cmd[2]
|
40
|
-
line << (r ? r : dflt)
|
41
|
-
elsif cmd.size == 2
|
42
|
-
r = e.send(cmd[0])
|
43
|
-
dflt = cmd[1]
|
44
|
-
line << (r ? r : dflt)
|
45
|
-
end
|
46
|
-
else
|
47
|
-
line << e.send(cmd)
|
48
|
-
end
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
if include_question
|
53
|
-
if e.has?(@question)
|
54
|
-
line << e.get(@question)
|
55
|
-
else
|
56
|
-
line << @default
|
57
|
-
end
|
58
|
-
end
|
59
|
-
|
60
|
-
line
|
61
|
-
end
|
62
|
-
|
63
|
-
end
|
data/lib/treat/core/server.rb
DELETED
data/spec/sandbox.rb
DELETED
@@ -1,223 +0,0 @@
|
|
1
|
-
#encoding: utf-8
|
2
|
-
require_relative '../lib/treat'
|
3
|
-
require 'ruby-prof'
|
4
|
-
Treat.databases.mongo.db = 'test2_treat'
|
5
|
-
|
6
|
-
d = Document 'merkozy_rides_again.txt'
|
7
|
-
d.do :chunk, :segment, :tokenize, :category, :tag
|
8
|
-
|
9
|
-
d.serialize :mongo
|
10
|
-
|
11
|
-
Treat::Entities::Document.from_db(:mongo, id: d.id, stop_at: :sentence).print_tree
|
12
|
-
|
13
|
-
=begin
|
14
|
-
d = Document 'http://www.cbc.ca/news/canada/story/2012/07/06/weather-central-canada-heat-wave.html'
|
15
|
-
|
16
|
-
d.do :chunk, :segment, :tokenize, :tag, :category
|
17
|
-
d.serialize :mongo, db: "test_treat"
|
18
|
-
d2 = Treat::Entities::Document.from_db(:mongo, id: d.id)
|
19
|
-
puts d2.inspect
|
20
|
-
abort
|
21
|
-
require 'benchmark'
|
22
|
-
|
23
|
-
Benchmark.bm do |x|
|
24
|
-
|
25
|
-
|
26
|
-
x.report "Mongo serialization" do
|
27
|
-
10.times do
|
28
|
-
d.serialize :mongo, db: "test_treat"
|
29
|
-
end
|
30
|
-
end
|
31
|
-
|
32
|
-
x.report "Mongo deserialization" do
|
33
|
-
1.times do
|
34
|
-
Treat::Entities::Document.from_db(:mongo, id: d.id)
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
|
-
end
|
39
|
-
=end
|
40
|
-
=begin
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
f = Treat.paths.spec + 'samples/mathematicians/leibniz.txt'
|
45
|
-
d = Treat::Entities::Document.build(f)
|
46
|
-
|
47
|
-
d.do :chunk, :segment
|
48
|
-
|
49
|
-
d.serialize :mongo, db: 'testing1234'
|
50
|
-
|
51
|
-
d2 = Treat::Entities::Document.from_db(:mongo, db: 'testing1234', id: d.id)
|
52
|
-
puts d2.to_s
|
53
|
-
|
54
|
-
puts d2.print_tree
|
55
|
-
=end
|
56
|
-
=begin
|
57
|
-
Treat.databases.mongo.db = 'treat_testing'
|
58
|
-
|
59
|
-
p = Phrase 'this is'
|
60
|
-
p.set :tag, 'VP'
|
61
|
-
w = Word 'this'
|
62
|
-
w.set :category, :determiner
|
63
|
-
w2 = Word 'is'
|
64
|
-
w2.set :category, 'verb'
|
65
|
-
p << w
|
66
|
-
p << w2
|
67
|
-
|
68
|
-
p.serialize :mongo
|
69
|
-
|
70
|
-
p2 = Phrase "#{p.id}.mongo"
|
71
|
-
|
72
|
-
p2.print_tree
|
73
|
-
=end
|
74
|
-
=begin
|
75
|
-
entity = Treat::Entities::Entity.create(
|
76
|
-
id: 1,
|
77
|
-
value: 'test',
|
78
|
-
children: [1, 2, 3],
|
79
|
-
features: [a: 'a', b: 'b', c: 'c']
|
80
|
-
)
|
81
|
-
|
82
|
-
entity.save
|
83
|
-
|
84
|
-
=end
|
85
|
-
|
86
|
-
w = Word 'hello'
|
87
|
-
|
88
|
-
=begin
|
89
|
-
require_relative '../lib/treat/loaders/stanford'
|
90
|
-
|
91
|
-
Treat::Loaders::Stanford.model_path = '/ruby/stanford/models/'
|
92
|
-
Treat::Loaders::Stanford.jar_path = '/ruby/stanford/bin/'
|
93
|
-
|
94
|
-
class Treat::Entities::Sentence
|
95
|
-
|
96
|
-
def long_word_count
|
97
|
-
i = 0
|
98
|
-
each_word do |word|
|
99
|
-
i += 1 if word.syllable_count > 3
|
100
|
-
end
|
101
|
-
i
|
102
|
-
end
|
103
|
-
|
104
|
-
def flesch_kincaid
|
105
|
-
syllable_count / word_count
|
106
|
-
end
|
107
|
-
|
108
|
-
def syllable_count
|
109
|
-
c = 0
|
110
|
-
each_word do |word|
|
111
|
-
c += word.syllable_count
|
112
|
-
end
|
113
|
-
c
|
114
|
-
end
|
115
|
-
|
116
|
-
end
|
117
|
-
|
118
|
-
class Treat::Entities::Word
|
119
|
-
|
120
|
-
def syllable_count
|
121
|
-
w = to_s.downcase
|
122
|
-
return 1 if w.length <= 3
|
123
|
-
w.sub!(/(?:[^laeiouy]es|ed|[^laeiouy]e)$/, '')
|
124
|
-
w.sub!(/^y/, '')
|
125
|
-
w.scan(/[aeiouy]{1,2}/).size
|
126
|
-
end
|
127
|
-
|
128
|
-
end
|
129
|
-
|
130
|
-
c = Collection Treat.paths.spec + 'samples/kant'
|
131
|
-
|
132
|
-
d = Document Treat.paths.spec + 'samples/kant/kant_enlightnement.txt'
|
133
|
-
|
134
|
-
d.do :chunk, :segment, :tokenize, :tag, :category, :name_tag
|
135
|
-
|
136
|
-
# Position of sentence in containers - clustering??
|
137
|
-
d.each_sentence do |s|
|
138
|
-
s.set :section_p, (s.parent_section.position.to_f / s.parent_document.children.size.to_f).round(2)
|
139
|
-
s.set :zone_p, (s.parent_zone.position.to_f / s.parent_section.children.size.to_f).round(2)
|
140
|
-
s.set :sentence_p, (s.position.to_f / s.parent_zone.children.size.to_f).round(2)
|
141
|
-
end
|
142
|
-
|
143
|
-
# Part of speech partitionning of the sentence
|
144
|
-
d.each_sentence do |s|
|
145
|
-
s.set :noun_density, (s.noun_count.to_f / (s.word_count + 1).to_f).round(2)
|
146
|
-
s.set :verb_density, (s.verb_count.to_f / (s.word_count + 1).to_f).round(2)
|
147
|
-
s.set :adjective_density, (s.adjective_count.to_f / (s.word_count + 1).to_f).round(2)
|
148
|
-
s.set :adverb_density, (s.adverb_count.to_f / (s.word_count + 1).to_f).round(2)
|
149
|
-
end
|
150
|
-
|
151
|
-
# Sentence readability -> length and long words.
|
152
|
-
d.each_sentence do |s|
|
153
|
-
s.set :word_count, s.word_count
|
154
|
-
s.set :long_word_count, s.long_word_count
|
155
|
-
s.set :flesch_kincaid, s.flesch_kincaid
|
156
|
-
end
|
157
|
-
|
158
|
-
# Domain specificity -> named entities according to domain.
|
159
|
-
d.each_sentence do |s|
|
160
|
-
s.set :person_count, s.entities_with_feature(:name_tag, 'person').size
|
161
|
-
s.set :time_count, s.entities_with_feature(:name_tag, 'time').size
|
162
|
-
s.set :location_count, s.entities_with_feature(:name_tag, 'location').size
|
163
|
-
s.set :number_count, s.number_count
|
164
|
-
puts s.inspect
|
165
|
-
end
|
166
|
-
|
167
|
-
d.each_sentence do |s|
|
168
|
-
if Random.rand() >= 0.5
|
169
|
-
s.set :golden, true
|
170
|
-
else
|
171
|
-
s.set :golden, false
|
172
|
-
end
|
173
|
-
end
|
174
|
-
|
175
|
-
golden = []
|
176
|
-
not_golden = []
|
177
|
-
|
178
|
-
d.each_sentence do |s|
|
179
|
-
if s.golden
|
180
|
-
golden << s
|
181
|
-
else
|
182
|
-
not_golden << s
|
183
|
-
end
|
184
|
-
end
|
185
|
-
|
186
|
-
i = 0
|
187
|
-
golden.each do |s|
|
188
|
-
puts s.sentence_p.to_s + ' ' + not_golden[i].sentence_p.to_s
|
189
|
-
i += 1
|
190
|
-
end
|
191
|
-
=end
|
192
|
-
=begin
|
193
|
-
|
194
|
-
d = Document 'http://www.cbc.ca/news/canada/montreal/story/2012/06/04/montreal-magnotta-search.html'
|
195
|
-
|
196
|
-
d.do :chunk, :segment
|
197
|
-
|
198
|
-
d.each_zone do |z|
|
199
|
-
puts '-------' + z.type.to_s
|
200
|
-
z.do tokenize: :ptb
|
201
|
-
z.each_sentence do |s|
|
202
|
-
puts s.to_s
|
203
|
-
end
|
204
|
-
#puts z.to_s
|
205
|
-
puts '-------'
|
206
|
-
end
|
207
|
-
|
208
|
-
|
209
|
-
abort
|
210
|
-
|
211
|
-
Treat::Databases.connect :mongo
|
212
|
-
|
213
|
-
p = Phrase ''
|
214
|
-
w = Word 'test'
|
215
|
-
p << w
|
216
|
-
|
217
|
-
p.print_tree
|
218
|
-
|
219
|
-
p.serialize :mongo, :db => 'treat'
|
220
|
-
p2 = Treat::Workers::Formatters::Unserializers::Mongo.unserialize(Treat::Entities::Phrase.new('', p.id))
|
221
|
-
p2.print_tree
|
222
|
-
|
223
|
-
=end
|