treat 1.1.0 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. data/LICENSE +1 -1
  2. data/README.md +3 -3
  3. data/lib/treat/config.rb +10 -0
  4. data/lib/treat/core/data_set.rb +80 -32
  5. data/lib/treat/core/feature.rb +35 -0
  6. data/lib/treat/core/problem.rb +43 -0
  7. data/lib/treat/core/question.rb +27 -0
  8. data/lib/treat/entities/abilities/buildable.rb +5 -3
  9. data/lib/treat/entities/abilities/exportable.rb +4 -4
  10. data/lib/treat/entities/collection.rb +1 -1
  11. data/lib/treat/entities/document.rb +1 -1
  12. data/lib/treat/entities/group.rb +8 -5
  13. data/lib/treat/entities/section.rb +1 -1
  14. data/lib/treat/entities/token.rb +20 -8
  15. data/lib/treat/entities/zone.rb +6 -5
  16. data/lib/treat/loaders/linguistics.rb +18 -19
  17. data/lib/treat/loaders/stanford.rb +3 -2
  18. data/lib/treat/version.rb +1 -1
  19. data/lib/treat/workers/extractors/language/what_language.rb +53 -57
  20. data/lib/treat/workers/extractors/name_tag/stanford.rb +8 -5
  21. data/lib/treat/workers/formatters/serializers/mongo.rb +33 -27
  22. data/lib/treat/workers/formatters/unserializers/mongo.rb +14 -36
  23. data/lib/treat/workers/learners/classifiers/id3.rb +4 -5
  24. data/lib/treat/workers/learners/classifiers/mlp.rb +1 -1
  25. data/lib/treat/workers.rb +1 -1
  26. data/spec/entity.rb +7 -5
  27. data/spec/phrase.rb +2 -2
  28. data/spec/zone.rb +2 -3
  29. metadata +37 -15
  30. data/bin/stanford/bridge.jar +0 -0
  31. data/bin/stanford/joda-time.jar +0 -0
  32. data/bin/stanford/stanford-corenlp.jar +0 -0
  33. data/bin/stanford/stanford-parser.jar +0 -0
  34. data/bin/stanford/xom.jar +0 -0
  35. data/files/21552208.html +0 -683
  36. data/files/3_2_release_notes.html +0 -766
  37. data/files/nethttp-cheat-sheet-2940.html +0 -395
  38. data/files/weather-central-canada-heat-wave.html +0 -1370
  39. data/lib/treat/core/classification.rb +0 -63
  40. data/lib/treat/core/server.rb +0 -3
  41. data/spec/sandbox.rb +0 -223
  42. data/tmp/english.yaml +0 -10340
data/lib/treat/core/classification.rb DELETED
@@ -1,63 +0,0 @@
1
- class Treat::Core::Classification
2
-
3
- attr_reader :types
4
- attr_reader :features
5
- attr_reader :question
6
- attr_reader :labels
7
- attr_reader :mode
8
- attr_reader :default
9
-
10
- def initialize(type_or_types, feature_or_features,
11
- question, default = false, mode = :continuous)
12
- @types, @features,
13
- @question, @default =
14
- [*type_or_types],
15
- [*feature_or_features],
16
- question, default
17
-
18
- @mode = mode
19
- @labels = []
20
- @features.each do |cmd|
21
- if cmd.is_a?(Array)
22
- @labels << cmd[0]
23
- else
24
- @labels << cmd
25
- end
26
- end
27
- end
28
-
29
- def export_item(e, include_question = true)
30
-
31
- line = []
32
-
33
- @features.each do |cmd|
34
- dflt = nil
35
- begin
36
- if cmd.is_a?(Array)
37
- if cmd.size == 3
38
- r = cmd[1].call(e)
39
- dflt = cmd[2]
40
- line << (r ? r : dflt)
41
- elsif cmd.size == 2
42
- r = e.send(cmd[0])
43
- dflt = cmd[1]
44
- line << (r ? r : dflt)
45
- end
46
- else
47
- line << e.send(cmd)
48
- end
49
- end
50
- end
51
-
52
- if include_question
53
- if e.has?(@question)
54
- line << e.get(@question)
55
- else
56
- line << @default
57
- end
58
- end
59
-
60
- line
61
- end
62
-
63
- end
data/lib/treat/core/server.rb DELETED
@@ -1,3 +0,0 @@
1
- module Treat::Core::Server
2
- # To implement.
3
- end
data/spec/sandbox.rb DELETED
@@ -1,223 +0,0 @@
1
- #encoding: utf-8
2
- require_relative '../lib/treat'
3
- require 'ruby-prof'
4
- Treat.databases.mongo.db = 'test2_treat'
5
-
6
- d = Document 'merkozy_rides_again.txt'
7
- d.do :chunk, :segment, :tokenize, :category, :tag
8
-
9
- d.serialize :mongo
10
-
11
- Treat::Entities::Document.from_db(:mongo, id: d.id, stop_at: :sentence).print_tree
12
-
13
- =begin
14
- d = Document 'http://www.cbc.ca/news/canada/story/2012/07/06/weather-central-canada-heat-wave.html'
15
-
16
- d.do :chunk, :segment, :tokenize, :tag, :category
17
- d.serialize :mongo, db: "test_treat"
18
- d2 = Treat::Entities::Document.from_db(:mongo, id: d.id)
19
- puts d2.inspect
20
- abort
21
- require 'benchmark'
22
-
23
- Benchmark.bm do |x|
24
-
25
-
26
- x.report "Mongo serialization" do
27
- 10.times do
28
- d.serialize :mongo, db: "test_treat"
29
- end
30
- end
31
-
32
- x.report "Mongo deserialization" do
33
- 1.times do
34
- Treat::Entities::Document.from_db(:mongo, id: d.id)
35
- end
36
- end
37
-
38
- end
39
- =end
40
- =begin
41
-
42
-
43
-
44
- f = Treat.paths.spec + 'samples/mathematicians/leibniz.txt'
45
- d = Treat::Entities::Document.build(f)
46
-
47
- d.do :chunk, :segment
48
-
49
- d.serialize :mongo, db: 'testing1234'
50
-
51
- d2 = Treat::Entities::Document.from_db(:mongo, db: 'testing1234', id: d.id)
52
- puts d2.to_s
53
-
54
- puts d2.print_tree
55
- =end
56
- =begin
57
- Treat.databases.mongo.db = 'treat_testing'
58
-
59
- p = Phrase 'this is'
60
- p.set :tag, 'VP'
61
- w = Word 'this'
62
- w.set :category, :determiner
63
- w2 = Word 'is'
64
- w2.set :category, 'verb'
65
- p << w
66
- p << w2
67
-
68
- p.serialize :mongo
69
-
70
- p2 = Phrase "#{p.id}.mongo"
71
-
72
- p2.print_tree
73
- =end
74
- =begin
75
- entity = Treat::Entities::Entity.create(
76
- id: 1,
77
- value: 'test',
78
- children: [1, 2, 3],
79
- features: [a: 'a', b: 'b', c: 'c']
80
- )
81
-
82
- entity.save
83
-
84
- =end
85
-
86
- w = Word 'hello'
87
-
88
- =begin
89
- require_relative '../lib/treat/loaders/stanford'
90
-
91
- Treat::Loaders::Stanford.model_path = '/ruby/stanford/models/'
92
- Treat::Loaders::Stanford.jar_path = '/ruby/stanford/bin/'
93
-
94
- class Treat::Entities::Sentence
95
-
96
- def long_word_count
97
- i = 0
98
- each_word do |word|
99
- i += 1 if word.syllable_count > 3
100
- end
101
- i
102
- end
103
-
104
- def flesch_kincaid
105
- syllable_count / word_count
106
- end
107
-
108
- def syllable_count
109
- c = 0
110
- each_word do |word|
111
- c += word.syllable_count
112
- end
113
- c
114
- end
115
-
116
- end
117
-
118
- class Treat::Entities::Word
119
-
120
- def syllable_count
121
- w = to_s.downcase
122
- return 1 if w.length <= 3
123
- w.sub!(/(?:[^laeiouy]es|ed|[^laeiouy]e)$/, '')
124
- w.sub!(/^y/, '')
125
- w.scan(/[aeiouy]{1,2}/).size
126
- end
127
-
128
- end
129
-
130
- c = Collection Treat.paths.spec + 'samples/kant'
131
-
132
- d = Document Treat.paths.spec + 'samples/kant/kant_enlightnement.txt'
133
-
134
- d.do :chunk, :segment, :tokenize, :tag, :category, :name_tag
135
-
136
- # Position of sentence in containers - clustering??
137
- d.each_sentence do |s|
138
- s.set :section_p, (s.parent_section.position.to_f / s.parent_document.children.size.to_f).round(2)
139
- s.set :zone_p, (s.parent_zone.position.to_f / s.parent_section.children.size.to_f).round(2)
140
- s.set :sentence_p, (s.position.to_f / s.parent_zone.children.size.to_f).round(2)
141
- end
142
-
143
- # Part of speech partitionning of the sentence
144
- d.each_sentence do |s|
145
- s.set :noun_density, (s.noun_count.to_f / (s.word_count + 1).to_f).round(2)
146
- s.set :verb_density, (s.verb_count.to_f / (s.word_count + 1).to_f).round(2)
147
- s.set :adjective_density, (s.adjective_count.to_f / (s.word_count + 1).to_f).round(2)
148
- s.set :adverb_density, (s.adverb_count.to_f / (s.word_count + 1).to_f).round(2)
149
- end
150
-
151
- # Sentence readability -> length and long words.
152
- d.each_sentence do |s|
153
- s.set :word_count, s.word_count
154
- s.set :long_word_count, s.long_word_count
155
- s.set :flesch_kincaid, s.flesch_kincaid
156
- end
157
-
158
- # Domain specificity -> named entities according to domain.
159
- d.each_sentence do |s|
160
- s.set :person_count, s.entities_with_feature(:name_tag, 'person').size
161
- s.set :time_count, s.entities_with_feature(:name_tag, 'time').size
162
- s.set :location_count, s.entities_with_feature(:name_tag, 'location').size
163
- s.set :number_count, s.number_count
164
- puts s.inspect
165
- end
166
-
167
- d.each_sentence do |s|
168
- if Random.rand() >= 0.5
169
- s.set :golden, true
170
- else
171
- s.set :golden, false
172
- end
173
- end
174
-
175
- golden = []
176
- not_golden = []
177
-
178
- d.each_sentence do |s|
179
- if s.golden
180
- golden << s
181
- else
182
- not_golden << s
183
- end
184
- end
185
-
186
- i = 0
187
- golden.each do |s|
188
- puts s.sentence_p.to_s + ' ' + not_golden[i].sentence_p.to_s
189
- i += 1
190
- end
191
- =end
192
- =begin
193
-
194
- d = Document 'http://www.cbc.ca/news/canada/montreal/story/2012/06/04/montreal-magnotta-search.html'
195
-
196
- d.do :chunk, :segment
197
-
198
- d.each_zone do |z|
199
- puts '-------' + z.type.to_s
200
- z.do tokenize: :ptb
201
- z.each_sentence do |s|
202
- puts s.to_s
203
- end
204
- #puts z.to_s
205
- puts '-------'
206
- end
207
-
208
-
209
- abort
210
-
211
- Treat::Databases.connect :mongo
212
-
213
- p = Phrase ''
214
- w = Word 'test'
215
- p << w
216
-
217
- p.print_tree
218
-
219
- p.serialize :mongo, :db => 'treat'
220
- p2 = Treat::Workers::Formatters::Unserializers::Mongo.unserialize(Treat::Entities::Phrase.new('', p.id))
221
- p2.print_tree
222
-
223
- =end