treat 1.1.0 → 1.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. data/LICENSE +1 -1
  2. data/README.md +3 -3
  3. data/lib/treat/config.rb +10 -0
  4. data/lib/treat/core/data_set.rb +80 -32
  5. data/lib/treat/core/feature.rb +35 -0
  6. data/lib/treat/core/problem.rb +43 -0
  7. data/lib/treat/core/question.rb +27 -0
  8. data/lib/treat/entities/abilities/buildable.rb +5 -3
  9. data/lib/treat/entities/abilities/exportable.rb +4 -4
  10. data/lib/treat/entities/collection.rb +1 -1
  11. data/lib/treat/entities/document.rb +1 -1
  12. data/lib/treat/entities/group.rb +8 -5
  13. data/lib/treat/entities/section.rb +1 -1
  14. data/lib/treat/entities/token.rb +20 -8
  15. data/lib/treat/entities/zone.rb +6 -5
  16. data/lib/treat/loaders/linguistics.rb +18 -19
  17. data/lib/treat/loaders/stanford.rb +3 -2
  18. data/lib/treat/version.rb +1 -1
  19. data/lib/treat/workers/extractors/language/what_language.rb +53 -57
  20. data/lib/treat/workers/extractors/name_tag/stanford.rb +8 -5
  21. data/lib/treat/workers/formatters/serializers/mongo.rb +33 -27
  22. data/lib/treat/workers/formatters/unserializers/mongo.rb +14 -36
  23. data/lib/treat/workers/learners/classifiers/id3.rb +4 -5
  24. data/lib/treat/workers/learners/classifiers/mlp.rb +1 -1
  25. data/lib/treat/workers.rb +1 -1
  26. data/spec/entity.rb +7 -5
  27. data/spec/phrase.rb +2 -2
  28. data/spec/zone.rb +2 -3
  29. metadata +37 -15
  30. data/bin/stanford/bridge.jar +0 -0
  31. data/bin/stanford/joda-time.jar +0 -0
  32. data/bin/stanford/stanford-corenlp.jar +0 -0
  33. data/bin/stanford/stanford-parser.jar +0 -0
  34. data/bin/stanford/xom.jar +0 -0
  35. data/files/21552208.html +0 -683
  36. data/files/3_2_release_notes.html +0 -766
  37. data/files/nethttp-cheat-sheet-2940.html +0 -395
  38. data/files/weather-central-canada-heat-wave.html +0 -1370
  39. data/lib/treat/core/classification.rb +0 -63
  40. data/lib/treat/core/server.rb +0 -3
  41. data/spec/sandbox.rb +0 -223
  42. data/tmp/english.yaml +0 -10340
data/lib/treat/core/classification.rb DELETED
@@ -1,63 +0,0 @@
1
- class Treat::Core::Classification
2
-
3
- attr_reader :types
4
- attr_reader :features
5
- attr_reader :question
6
- attr_reader :labels
7
- attr_reader :mode
8
- attr_reader :default
9
-
10
- def initialize(type_or_types, feature_or_features,
11
- question, default = false, mode = :continuous)
12
- @types, @features,
13
- @question, @default =
14
- [*type_or_types],
15
- [*feature_or_features],
16
- question, default
17
-
18
- @mode = mode
19
- @labels = []
20
- @features.each do |cmd|
21
- if cmd.is_a?(Array)
22
- @labels << cmd[0]
23
- else
24
- @labels << cmd
25
- end
26
- end
27
- end
28
-
29
- def export_item(e, include_question = true)
30
-
31
- line = []
32
-
33
- @features.each do |cmd|
34
- dflt = nil
35
- begin
36
- if cmd.is_a?(Array)
37
- if cmd.size == 3
38
- r = cmd[1].call(e)
39
- dflt = cmd[2]
40
- line << (r ? r : dflt)
41
- elsif cmd.size == 2
42
- r = e.send(cmd[0])
43
- dflt = cmd[1]
44
- line << (r ? r : dflt)
45
- end
46
- else
47
- line << e.send(cmd)
48
- end
49
- end
50
- end
51
-
52
- if include_question
53
- if e.has?(@question)
54
- line << e.get(@question)
55
- else
56
- line << @default
57
- end
58
- end
59
-
60
- line
61
- end
62
-
63
- end
data/lib/treat/core/server.rb DELETED
@@ -1,3 +0,0 @@
1
- module Treat::Core::Server
2
- # To implement.
3
- end
data/spec/sandbox.rb DELETED
@@ -1,223 +0,0 @@
1
- #encoding: utf-8
2
- require_relative '../lib/treat'
3
- require 'ruby-prof'
4
- Treat.databases.mongo.db = 'test2_treat'
5
-
6
- d = Document 'merkozy_rides_again.txt'
7
- d.do :chunk, :segment, :tokenize, :category, :tag
8
-
9
- d.serialize :mongo
10
-
11
- Treat::Entities::Document.from_db(:mongo, id: d.id, stop_at: :sentence).print_tree
12
-
13
- =begin
14
- d = Document 'http://www.cbc.ca/news/canada/story/2012/07/06/weather-central-canada-heat-wave.html'
15
-
16
- d.do :chunk, :segment, :tokenize, :tag, :category
17
- d.serialize :mongo, db: "test_treat"
18
- d2 = Treat::Entities::Document.from_db(:mongo, id: d.id)
19
- puts d2.inspect
20
- abort
21
- require 'benchmark'
22
-
23
- Benchmark.bm do |x|
24
-
25
-
26
- x.report "Mongo serialization" do
27
- 10.times do
28
- d.serialize :mongo, db: "test_treat"
29
- end
30
- end
31
-
32
- x.report "Mongo deserialization" do
33
- 1.times do
34
- Treat::Entities::Document.from_db(:mongo, id: d.id)
35
- end
36
- end
37
-
38
- end
39
- =end
40
- =begin
41
-
42
-
43
-
44
- f = Treat.paths.spec + 'samples/mathematicians/leibniz.txt'
45
- d = Treat::Entities::Document.build(f)
46
-
47
- d.do :chunk, :segment
48
-
49
- d.serialize :mongo, db: 'testing1234'
50
-
51
- d2 = Treat::Entities::Document.from_db(:mongo, db: 'testing1234', id: d.id)
52
- puts d2.to_s
53
-
54
- puts d2.print_tree
55
- =end
56
- =begin
57
- Treat.databases.mongo.db = 'treat_testing'
58
-
59
- p = Phrase 'this is'
60
- p.set :tag, 'VP'
61
- w = Word 'this'
62
- w.set :category, :determiner
63
- w2 = Word 'is'
64
- w2.set :category, 'verb'
65
- p << w
66
- p << w2
67
-
68
- p.serialize :mongo
69
-
70
- p2 = Phrase "#{p.id}.mongo"
71
-
72
- p2.print_tree
73
- =end
74
- =begin
75
- entity = Treat::Entities::Entity.create(
76
- id: 1,
77
- value: 'test',
78
- children: [1, 2, 3],
79
- features: [a: 'a', b: 'b', c: 'c']
80
- )
81
-
82
- entity.save
83
-
84
- =end
85
-
86
- w = Word 'hello'
87
-
88
- =begin
89
- require_relative '../lib/treat/loaders/stanford'
90
-
91
- Treat::Loaders::Stanford.model_path = '/ruby/stanford/models/'
92
- Treat::Loaders::Stanford.jar_path = '/ruby/stanford/bin/'
93
-
94
- class Treat::Entities::Sentence
95
-
96
- def long_word_count
97
- i = 0
98
- each_word do |word|
99
- i += 1 if word.syllable_count > 3
100
- end
101
- i
102
- end
103
-
104
- def flesch_kincaid
105
- syllable_count / word_count
106
- end
107
-
108
- def syllable_count
109
- c = 0
110
- each_word do |word|
111
- c += word.syllable_count
112
- end
113
- c
114
- end
115
-
116
- end
117
-
118
- class Treat::Entities::Word
119
-
120
- def syllable_count
121
- w = to_s.downcase
122
- return 1 if w.length <= 3
123
- w.sub!(/(?:[^laeiouy]es|ed|[^laeiouy]e)$/, '')
124
- w.sub!(/^y/, '')
125
- w.scan(/[aeiouy]{1,2}/).size
126
- end
127
-
128
- end
129
-
130
- c = Collection Treat.paths.spec + 'samples/kant'
131
-
132
- d = Document Treat.paths.spec + 'samples/kant/kant_enlightnement.txt'
133
-
134
- d.do :chunk, :segment, :tokenize, :tag, :category, :name_tag
135
-
136
- # Position of sentence in containers - clustering??
137
- d.each_sentence do |s|
138
- s.set :section_p, (s.parent_section.position.to_f / s.parent_document.children.size.to_f).round(2)
139
- s.set :zone_p, (s.parent_zone.position.to_f / s.parent_section.children.size.to_f).round(2)
140
- s.set :sentence_p, (s.position.to_f / s.parent_zone.children.size.to_f).round(2)
141
- end
142
-
143
- # Part of speech partitionning of the sentence
144
- d.each_sentence do |s|
145
- s.set :noun_density, (s.noun_count.to_f / (s.word_count + 1).to_f).round(2)
146
- s.set :verb_density, (s.verb_count.to_f / (s.word_count + 1).to_f).round(2)
147
- s.set :adjective_density, (s.adjective_count.to_f / (s.word_count + 1).to_f).round(2)
148
- s.set :adverb_density, (s.adverb_count.to_f / (s.word_count + 1).to_f).round(2)
149
- end
150
-
151
- # Sentence readability -> length and long words.
152
- d.each_sentence do |s|
153
- s.set :word_count, s.word_count
154
- s.set :long_word_count, s.long_word_count
155
- s.set :flesch_kincaid, s.flesch_kincaid
156
- end
157
-
158
- # Domain specificity -> named entities according to domain.
159
- d.each_sentence do |s|
160
- s.set :person_count, s.entities_with_feature(:name_tag, 'person').size
161
- s.set :time_count, s.entities_with_feature(:name_tag, 'time').size
162
- s.set :location_count, s.entities_with_feature(:name_tag, 'location').size
163
- s.set :number_count, s.number_count
164
- puts s.inspect
165
- end
166
-
167
- d.each_sentence do |s|
168
- if Random.rand() >= 0.5
169
- s.set :golden, true
170
- else
171
- s.set :golden, false
172
- end
173
- end
174
-
175
- golden = []
176
- not_golden = []
177
-
178
- d.each_sentence do |s|
179
- if s.golden
180
- golden << s
181
- else
182
- not_golden << s
183
- end
184
- end
185
-
186
- i = 0
187
- golden.each do |s|
188
- puts s.sentence_p.to_s + ' ' + not_golden[i].sentence_p.to_s
189
- i += 1
190
- end
191
- =end
192
- =begin
193
-
194
- d = Document 'http://www.cbc.ca/news/canada/montreal/story/2012/06/04/montreal-magnotta-search.html'
195
-
196
- d.do :chunk, :segment
197
-
198
- d.each_zone do |z|
199
- puts '-------' + z.type.to_s
200
- z.do tokenize: :ptb
201
- z.each_sentence do |s|
202
- puts s.to_s
203
- end
204
- #puts z.to_s
205
- puts '-------'
206
- end
207
-
208
-
209
- abort
210
-
211
- Treat::Databases.connect :mongo
212
-
213
- p = Phrase ''
214
- w = Word 'test'
215
- p << w
216
-
217
- p.print_tree
218
-
219
- p.serialize :mongo, :db => 'treat'
220
- p2 = Treat::Workers::Formatters::Unserializers::Mongo.unserialize(Treat::Entities::Phrase.new('', p.id))
221
- p2.print_tree
222
-
223
- =end