veritable 0.1.0.0
File-by-file diff listing for the veritable gem, release 0.1.0.0 (all files added in this release).
- data/CHANGELOG.txt +3 -0
- data/LICENSE +22 -0
- data/README.md +29 -0
- data/lib/veritable/api.rb +460 -0
- data/lib/veritable/connection.rb +84 -0
- data/lib/veritable/cursor.rb +66 -0
- data/lib/veritable/datatypes.rb +3 -0
- data/lib/veritable/errors.rb +17 -0
- data/lib/veritable/object.rb +20 -0
- data/lib/veritable/resource.rb +14 -0
- data/lib/veritable/util.rb +429 -0
- data/lib/veritable/version.rb +3 -0
- data/lib/veritable.rb +32 -0
- metadata +154 -0
data/CHANGELOG.txt
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2012 Prior Knowledge
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# Veritable
|
2
|
+
|
3
|
+
TODO: Write a gem description
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'veritable'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install veritable
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
TODO: Write usage instructions here
|
22
|
+
|
23
|
+
## Contributing
|
24
|
+
|
25
|
+
1. Fork it
|
26
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
27
|
+
3. Commit your changes (`git commit -am 'Added some feature'`)
|
28
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
29
|
+
5. Create new Pull Request
|
@@ -0,0 +1,460 @@
|
|
1
|
+
require 'veritable/cursor'
|
2
|
+
require 'veritable/datatypes'
|
3
|
+
require 'veritable/errors'
|
4
|
+
require 'veritable/resource'
|
5
|
+
require 'veritable/util'
|
6
|
+
|
7
|
+
module Veritable
|
8
|
+
# Entry point for the Veritable API: lists, fetches, creates, and
# deletes the tables belonging to the authenticated user.
class API
  include VeritableResource

  # Fetches the API root resource.
  def root; get(""); end

  # Fetches the usage limits for the current user.
  def limits; get("user/limits"); end

  # Returns a Cursor over the user's tables, yielding Table objects.
  def tables(opts={'start' => nil, 'limit' => nil})
    cursor_opts = {'collection' => "tables",
                   'start' => opts['start'],
                   'limit' => opts['limit']}.update(@opts)
    Cursor.new(cursor_opts) {|x| Table.new(@opts, x)}
  end

  # Fetches a single Table by id.
  def table(table_id)
    Table.new(@opts, get("tables/#{table_id}"))
  end

  # Creates a new table. When table_id is nil a fresh id is generated
  # (retrying on the unlikely collision); when force is true an existing
  # table with the same id is deleted first.
  def create_table(table_id=nil, description='', force=false)
    autogen = table_id.nil?
    if autogen
      table_id = Util.make_table_id
    else
      Util.check_id table_id
    end

    if has_table? table_id
      # An autogenerated id collided -- just try again with a new one.
      return create_table(nil, description, false) if autogen
      raise VeritableError.new("Couldn't create table -- table with id #{table_id} already exists.") unless force
      delete_table table_id
    end
    Table.new(@opts, post("tables", {:_id => table_id, :description => description}))
  end

  # Deletes a table by id.
  def delete_table(table_id); delete("tables/#{table_id}"); end

  def inspect; to_s; end
  def to_s; "#<Veritable::API url='#{api_base_url}'>"; end

  # True if a table with the given id can be fetched; any fetch error
  # counts as "no such table".
  def has_table?(table_id)
    table table_id
    true
  rescue
    false
  end
end
|
63
|
+
|
64
|
+
# A Veritable data table: a collection of rows over which analyses can
# be created and run. Wraps the REST endpoints under this table's links.
class Table
  include VeritableResource

  alias :rest_delete :delete
  # Deletes this table on the server.
  def delete
    rest_delete(link('self'))
  end

  # Fetches a single row hash by its '_id'.
  def row(row_id); get("#{link('rows')}/#{row_id}"); end

  # Returns a Cursor over the rows of the table.
  def rows(opts={'start' => nil, 'limit' => nil})
    Cursor.new({'collection' => link('rows'),
      'start' => opts['start'],
      'limit' => opts['limit']}.update(@opts))
  end

  # Uploads a single row hash; the row must contain a valid '_id'.
  def upload_row(row)
    Util.check_row row
    put("#{link('rows')}/#{row['_id']}", row)
  end

  # Uploads many rows in batches of per_page.
  def batch_upload_rows(rows, per_page=100)
    batch_modify_rows('put', rows, per_page)
  end

  # Deletes a single row by id.
  def delete_row(row_id)
    rest_delete("#{link('rows')}/#{row_id}")
  end

  # Deletes many rows in batches of per_page; a 404 (rows already gone)
  # is swallowed, any other error is re-raised.
  def batch_delete_rows(rows, per_page=100)
    begin
      batch_modify_rows('delete', rows, per_page)
    rescue VeritableError => e
      if (not e.respond_to?(:http_code)) or (not (e.http_code == "404 Resource Not Found"))
        raise e
      end
    end
  end

  # Fetches an Analysis of this table by id.
  def analysis(analysis_id)
    Analysis.new(@opts, get("#{link('analyses')}/#{analysis_id}"))
  end

  # Returns a Cursor over the analyses of this table, yielding Analysis objects.
  def analyses(opts={'start' => nil, 'limit' => nil})
    Cursor.new({'collection' => link('analyses'),
      'start' => opts['start'],
      'limit' => opts['limit']}.update(@opts)) {|x| Analysis.new(@opts, x)}
  end

  # Deletes an analysis by id.
  def delete_analysis(analysis_id)
    rest_delete("#{link('analyses')}/#{analysis_id}")
  end

  # Creates a new analysis of this table.
  #   schema        -- column-name => {'type' => ...} hash (or Schema)
  #   analysis_id   -- optional; autogenerated when nil
  #   force         -- when true, an existing analysis with the same id is deleted first
  #   analysis_type -- only "veritable" is accepted
  def create_analysis(schema, analysis_id=nil, description="", force=false, analysis_type="veritable")
    if analysis_type != "veritable"
      if analysis_type.respond_to? :to_s
        raise VeritableError.new("Invalid analysis type #{analysis_type}.")
      else
        raise VeritableError.new("Invalid analysis type.")
      end
    end

    if analysis_id.nil?
      autogen = true
      analysis_id = Util.make_analysis_id
    else
      autogen = false
      Util.check_id analysis_id
    end

    if has_analysis? analysis_id
      if autogen
        # BUG FIX: the original retried as create_analysis(nil, description, false),
        # dropping the schema and shifting every other argument one position left.
        return create_analysis(schema, nil, description, false)
      end
      if ! force
        # BUG FIX: message was copy-pasted from create_table and said
        # "Couldn't create table -- table with id ...".
        raise VeritableError.new("Couldn't create analysis -- analysis with id #{analysis_id} already exists.")
      else
        delete_analysis analysis_id
      end
    end
    doc = post(link('analyses'), {:_id => analysis_id, :description => description, :type => analysis_type, :schema => schema})
    Analysis.new(@opts, doc)
  end

  def inspect; to_s; end
  def to_s; "#<Veritable::Table _id='#{_id}'>"; end

  # Unique id of the table.
  def _id; @doc['_id']; end
  # Human-readable description of the table.
  def description; @doc['description']; end

  # True if an analysis with the given id can be fetched; any fetch
  # error counts as "no such analysis".
  def has_analysis?(analysis_id)
    begin
      analysis analysis_id
    rescue
      false
    else
      true
    end
  end

  private

  # Sends rows to the server in pages of per_page with the given action
  # ('put' to upload, 'delete' to remove).
  def batch_modify_rows(action, rows, per_page=100)
    # BUG FIX: Fixnum was deprecated/removed; Integer covers all integer sizes.
    if not per_page.is_a? Integer or not per_page > 0
      raise VeritableError.new("Batch upload or delete must have integer page size greater than 0.")
    end
    # BUG FIX: validate the collection type BEFORE iterating it -- the
    # original called rows.collect first, so this check could never be
    # reached with an invalid type.
    if (not rows.is_a? Array) and (not rows.is_a? Veritable::Cursor)
      raise VeritableError.new("Must pass an array of row hashes or a cursor of rows to batch upload or delete.")
    end
    rows = rows.collect {|row|
      Util.check_row(row)
      row
    }
    # Post each full page; each_slice also posts the final partial page
    # and posts nothing at all when rows is empty. (Replaces the original
    # hand-rolled cycle/shift loop, which also shadowed its counter.)
    rows.each_slice(per_page) {|batch|
      post(link('rows'), {'action' => action, 'rows' => batch})
    }
  end
end
|
194
|
+
|
195
|
+
# A Veritable analysis resource: tracks server-side model state and
# exposes predictions and column relatedness once it has succeeded.
class Analysis
  include VeritableResource

  # Refetches the analysis document (including its state) from the server.
  def update; @doc = get(link('self')); end

  alias :rest_delete :delete
  # Deletes this analysis on the server.
  def delete; rest_delete(link('self')); end

  # Returns the Schema this analysis was created with.
  def schema; Schema.new(get(link('schema'))); end

  # Blocks until the analysis stops running, polling every poll seconds.
  # Raises if max_time (seconds) is exceeded; waits indefinitely when nil.
  def wait(max_time=nil, poll=2)
    elapsed = 0
    while running?
      sleep poll
      if not max_time.nil?
        elapsed += poll
        if elapsed > max_time
          # FIX: message grammar ("second exceeded" -> "seconds exceeded").
          raise VeritableError.new("Wait for analysis -- Maximum time of #{max_time} seconds exceeded.")
        end
      end
      update
    end
  end

  # Makes predictions from the analysis. row maps column names to fixed
  # values, with nil marking the columns to predict; count is the number
  # of samples to draw. Returns a Prediction.
  def predict(row, count=100)
    update if running?
    if succeeded?
      if not row.is_a? Hash
        raise VeritableError.new("Predict -- Must provide a row hash to make predictions.")
      end
      res = post(link('predict'), {'data' => row, 'count' => count})
      if not res.is_a? Array
        # BUG FIX: the original raised the interpolated message in the
        # rescue branch (where res.to_s had just failed, so "#{res}" would
        # fail again) and the plain message in the else branch -- inverted
        # relative to the pattern used elsewhere (e.g. Util.check_id).
        begin
          res.to_s
        rescue
          raise VeritableError.new("Predict -- Error making predictions.")
        else
          raise VeritableError.new("Predict -- Error making predictions: #{res}")
        end
      end
      Prediction.new(row, res, schema)
    elsif running?
      raise VeritableError.new("Predict -- Analysis with id #{_id} is still running and not yet ready to predict.")
    elsif failed?
      raise VeritableError.new("Predict -- Analysis with id #{_id} has failed and cannot predict.")
    else
      raise VeritableError.new("Predict -- Shouldn't be here -- please let us know at support@priorknowledge.com.")
    end
  end

  # Returns a Cursor over the columns most related to column_id.
  def related_to(column_id, opts={'start' => nil, 'limit' => nil})
    update if running?
    if succeeded?
      Cursor.new(
        {'collection' => "#{link('related')}/#{column_id}",
         'start' => opts['start'],
         'limit' => opts['limit']}.update(@opts))
    elsif running?
      raise VeritableError.new("Related -- Analysis with id #{_id} is still running and not yet ready to calculate related.")
    elsif failed?
      raise VeritableError.new("Related -- Analysis with id #{_id} has failed and cannot calculate related.")
    else
      raise VeritableError.new("Related -- Shouldn't be here -- please let us know at support@priorknowledge.com.")
    end
  end

  def inspect; to_s; end
  def to_s; "#<Veritable::Analysis _id='#{_id}'>"; end

  # Accessors over the underlying analysis document.
  def _id; @doc['_id']; end
  def created_at; @doc['created_at']; end
  def finished_at; @doc['finished_at']; end
  # One of 'running', 'succeeded', or 'failed'.
  def state; @doc['state']; end
  def running?; state == 'running'; end
  def succeeded?; state == 'succeeded'; end
  def failed?; state == 'failed'; end
  # Server-reported error document; non-nil only when the analysis failed.
  def error; state == 'failed' ? @doc['error'] : nil; end
  # Progress document; non-nil only while the analysis is running.
  def progress; state == 'running' ? @doc['progress'] : nil; end
end
|
274
|
+
|
275
|
+
# A table schema: a Hash mapping column names to {'type' => ...}
# specifications, optionally restricted at construction to a subset of
# columns.
class Schema < Hash
  # Builds the schema from data (anything enumerable as [column, spec]
  # pairs). When subset is an Array, only columns listed in it are kept;
  # when subset is a Hash, only columns appearing among its keys are kept.
  def initialize(data, subset=nil)
    data.each do |col, spec|
      case subset
      when Array
        self[col] = spec if subset.include? col
      when Hash
        self[col] = spec if subset.has_key? col
      else
        self[col] = spec
      end
    end
  rescue
    # data could not be enumerated as column/spec pairs
    begin
      data.to_s
    rescue
      raise VeritableError.new("Initialize schema -- invalid schema data.")
    else
      raise VeritableError.new("Initialize schema -- invalid schema data #{data}.")
    end
  end

  # Returns the datatype string declared for column.
  def type(column)
    self[column]['type']
  end

  # Raises VeritableError unless every column id is a valid string id and
  # every spec declares a supported 'type'.
  def validate
    each do |col, spec|
      unless col.is_a? String
        begin
          col.to_s
        rescue
          raise VeritableError.new("Validate schema -- Invalid schema specification: nonstring column id.")
        else
          raise VeritableError.new("Validate schema -- Invalid schema specification: nonstring column id #{col}")
        end
      end
      begin
        Util.check_id col
      rescue
        raise VeritableError.new("Validate schema -- Invalid column name #{col}: must contain only alphanumerics, dashes, and underscores, and may not begin with a dash or underscore.")
      end
      unless spec.include? 'type'
        raise VeritableError.new("Validate schema -- Invalid schema specification. Column #{col} must specify a 'type', one of #{DATATYPES}")
      end
      unless DATATYPES.include? spec['type']
        raise VeritableError.new("Validate schema -- Invalid schema specification. Column #{col}, type #{spec['type']} is not valid. Type must be one of #{DATATYPES}")
      end
    end
  end
end
|
327
|
+
|
328
|
+
# The result of a predictions request. Behaves as a Hash of
# column => predicted (or fixed) value, with per-column uncertainty.
class Prediction < Hash
  # The original request hash (column => fixed value, or nil to predict).
  attr_reader :request
  # The raw list of sampled rows returned by the server.
  attr_reader :distribution
  # Schema for the requested columns.
  attr_reader :schema
  # Hash of column => uncertainty estimate.
  attr_reader :uncertainty

  def initialize(request, distribution, schema)
    @request = request
    @distribution = distribution
    @schema = Schema.new(schema)
    @uncertainty = Hash.new()

    request.each do |col, fixed|
      if fixed.nil?
        # Column was left open: summarize the sampled distribution.
        self[col] = point_estimate col
        @uncertainty[col] = calculate_uncertainty col
      else
        # Column was fixed in the request: echo it back, no uncertainty.
        self[col] = fixed
        @uncertainty[col] = 0.0
      end
    end
  end

  # Estimated probability that column's value lies within range: a
  # collection of categories for boolean/categorical columns, or a
  # [min, max] pair (either end may be nil) for count/real columns.
  def prob_within(column, range)
    col_type = schema.type column
    Veritable::Util.check_datatype(col_type, "Probability within -- ")
    if col_type == 'boolean' or col_type == 'categorical'
      hits = distribution.count {|row| range.include? row[column]}
      hits.to_f / distribution.size
    elsif col_type == 'count' or col_type == 'real'
      mn, mx = range[0], range[1]
      hits = distribution.count {|row|
        v = row[column]
        (mn.nil? or v >= mn) and (mx.nil? or v <= mx)
      }
      hits.to_f / distribution.size
    end
  end

  # For boolean/categorical columns: the categories whose sampled
  # frequency is at least p (default 0.5), as a frequency hash.
  # For count/real columns: the [lo, hi] credible interval at mass p
  # (default 0.9).
  def credible_values(column, p=nil)
    col_type = schema.type column
    Veritable::Util.check_datatype(col_type, "Credible values -- ")
    if col_type == 'boolean' or col_type == 'categorical'
      p = 0.5 if p.nil?
      tf = Hash.new
      freqs(counts(column)).sort_by {|_, f| f}.each {|cat, f| tf[cat] = f unless f < p}
      tf
    elsif col_type == 'count' or col_type == 'real'
      p = 0.9 if p.nil?
      n = distribution.size
      a = (n * (1.0 - p) / 2.0).round.to_i
      sv = sorted_values column
      n = sv.size
      [sv[a], sv[n - 1 - a]]
    end
  end

  def inspect; to_s; end
  def to_s; "<Veritable::Prediction #{super}>"; end

  private

  # Non-nil sampled values for column, ascending.
  def sorted_values(column)
    distribution.collect {|row| row[column]}.compact.sort
  end

  # Occurrence count of each sampled value in column.
  def counts(column)
    cts = Hash.new
    distribution.each do |row|
      next unless row.has_key? column
      cat = row[column]
      cts[cat] = cts.fetch(cat, 0) + 1
    end
    cts
  end

  # Normalizes a counts hash into relative frequencies.
  def freqs(cts)
    total = cts.values.reduce(0) {|sum, c| sum + c}
    cts.each_with_object(Hash.new()) {|(k, v), acc| acc[k] = v.to_f / total}
  end

  # Single-value summary of the sampled distribution for column.
  def point_estimate(column)
    col_type = schema.type column
    Veritable::Util.check_datatype(col_type, "Point estimate -- ")
    if col_type == 'boolean' or col_type == 'categorical'
      # use the mode
      counts(column).max_by {|_, n| n}[0]
    elsif col_type == 'real' or col_type == 'count'
      # use the mean (rounded to an integer for counts)
      vals = distribution.collect {|row| row[column]}
      mean = (vals.reduce(0) {|sum, v| sum + v}) / vals.size.to_f
      col_type == 'real' ? mean : mean.round.to_i
    end
  end

  # Uncertainty of the point estimate: 1 - mode frequency for
  # boolean/categorical, credible-interval width for count/real.
  def calculate_uncertainty(column)
    vals = distribution.collect {|row| row[column]}
    col_type = schema.type column
    Veritable::Util.check_datatype(col_type, "Calculate uncertainty -- ")
    n = vals.size
    if col_type == 'boolean' or col_type == 'categorical'
      mode = (counts column).max_by {|_, c| c}[0]
      c = 1.0 - (vals.count {|v| v == mode} / n.to_f)
      c.to_f
    elsif col_type == 'count' or col_type == 'real'
      lo, hi = credible_values column
      (hi - lo).to_f
    end
  end
end
|
460
|
+
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
require 'veritable/object'
|
2
|
+
require 'multi_json'
|
3
|
+
|
4
|
+
module Veritable
|
5
|
+
# HTTP transport mixin for Veritable resources, implemented on top of
# RestClient with JSON request/response bodies. Hosts must supply
# :api_key and :api_base_url in @opts.
module Connection
  include VeritableObject

  def initialize(opts=nil, doc=nil)
    super(opts, doc)
    require_opts :api_key, :api_base_url
    default_opts(:ssl_verify => true, :enable_gzip => true)
  end

  # GET url, appending params as a query string when present.
  def get(url, params=nil, headers={})
    if params and params.count > 0
      url = "#{url}?#{Util.query_params(params)}"
    end
    request(:get, url, nil, headers)
  end

  # POST a JSON-encoded payload.
  def post(url, payload, headers={})
    request(:post, url, MultiJson.encode(payload),
            headers.merge({:content_type => 'application/json'}))
  end

  # PUT a JSON-encoded payload.
  def put(url, payload, headers={})
    request(:put, url, MultiJson.encode(payload),
            headers.merge({:content_type => 'application/json'}))
  end

  # DELETE url; a 404 is treated as success (the resource is already
  # gone), any other error is re-raised.
  def delete(url, headers={})
    request(:delete, url, nil, headers)
  rescue VeritableError => e
    raise e unless e.respond_to?(:http_code) and e.http_code == "404 Resource Not Found"
  end

  # Executes one HTTP request against the API and returns the decoded
  # JSON body. API-level failures are wrapped in a VeritableError with
  # http_code/api_code/api_message attached; undecodable failures
  # re-raise the original RestClient exception.
  def request(verb, url, payload=nil, headers={})
    url = api_base_url + "/" + url

    headers = {
      :user_agent => USER_AGENT,
      :accept => :json,
      :accept_encoding => enable_gzip ? :gzip : nil
    }.merge(headers)

    opts = {
      :method => verb.to_s,
      :url => url,
      :user => api_key,
      :password => "",
      :headers => headers,
      :payload => payload,
      :verify_ssl => ssl_verify,
    }
    begin
      response = RestClient::Request.execute(opts)
    rescue RestClient::Exception => e
      begin
        r = MultiJson.decode(e.response)
        msg = r['message']
        code = r['code']
      rescue
        raise e
      end
      raise VeritableError.new("HTTP Error #{e.message} -- #{code}: #{msg}", {'http_code' => e.message, 'api_code' => code, 'api_message' => msg})
    end
    return MultiJson.decode(response)
  end

  private

  # Credential and transport settings read from @opts.
  def api_key; @opts[:api_key]; end
  def api_base_url; @opts[:api_base_url]; end
  def ssl_verify; @opts[:ssl_verify]; end
  def enable_gzip; @opts[:enable_gzip]; end

end
|
84
|
+
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
require 'veritable/object'
|
2
|
+
require 'veritable/resource'
|
3
|
+
|
4
|
+
module Veritable
|
5
|
+
# Lazily pages through a collection endpoint, yielding one item at a
# time. Items may optionally be mapped through a block supplied at
# construction ('lazymap').
class Cursor
  include VeritableResource
  include Enumerable
  def initialize(opts=nil, doc=nil, &lazymap)
    super(opts, doc)

    require_opts 'collection'
    default_opts({'per_page' => 100})

    # The server keys the item list either by the collection's last path
    # segment or by 'data'; detect which from the first page.
    collection_key = collection.split("/")[-1]
    @doc = get(collection, {:count => per_page, :start => start})
    @opts['key'] = @doc.has_key?(collection_key) ? collection_key : 'data'
    @opts['lazymap'] = lazymap if lazymap
  end

  # Yields items until the collection (or the configured 'limit') is
  # exhausted, fetching further pages as needed.
  def each
    i = limit if limit
    loop do
      raise StopIteration unless data.length > 0 or refresh > 0
      if limit
        raise StopIteration if i == 0
        i -= 1
      end
      item = data.shift
      yield(lazymap ? lazymap.call(item) : item)
    end
  end
  def inspect; to_s; end
  def to_s; "#<Veritable::Cursor collection='#{collection}'>"; end

  private

  # Refills the local buffer from the next page (or the first page again
  # when no paging links are present); returns the buffer's new length.
  def refresh
    return data.length if data.length > 0
    if next_page
      @doc = get next_page
    elsif last_page?
      return 0
    else
      @doc = get(collection, {:count => per_page, :start => start})
    end
    return data.length
  end

  # Option and document accessors.
  def limit; @opts['limit']; end
  def limit=(x); @opts['limit'] = x; end
  def start; @opts['start']; end
  def per_page; @opts['per_page']; end
  def collection; @opts['collection']; end
  def lazymap; @opts['lazymap']; end
  def key; @opts['key']; end
  def next_page; link 'next'; end
  def last_page?; ! @doc.has_key? 'next'; end
  def data; @doc[key]; end
end
|
66
|
+
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# Error type raised throughout the Veritable client. An optional opts
# hash attaches extra context: each key becomes a reader method on this
# particular instance (e.g. e.http_code).
class VeritableError < StandardError
  # The human-readable error message.
  attr_reader :message

  def initialize(message, opts=nil)
    @message = message
    return unless opts.is_a? Hash
    @opts = opts
    @opts.keys.each do |k|
      # Expose each context key as a reader on this instance only.
      define_singleton_method(k.to_sym) { @opts[k] }
    end
  end

  def to_s; message; end
  def inspect; message; end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'veritable/errors'
|
2
|
+
|
3
|
+
module Veritable
|
4
|
+
# Base mixin for Veritable objects: stores the option hash (@opts) and
# the underlying resource document (@doc), with helpers for validating
# and defaulting options.
module VeritableObject
  def initialize(opts=nil, doc=nil)
    @opts = opts
    @doc = doc
  end

  private

  # Raises VeritableError unless every listed key is present in @opts.
  def require_opts(*keys)
    keys.each do |k|
      unless @opts.has_key?(k)
        raise VeritableError.new("Error initializing object -- must provide #{k}")
      end
    end
  end

  # Fills in @opts entries from hash for any keys not already set.
  def default_opts(hash={})
    hash.each_pair do |key, val|
      @opts[key] = val unless @opts.key?(key)
    end
  end
end
|
20
|
+
end
|
@@ -0,0 +1,429 @@
|
|
1
|
+
require 'veritable/datatypes'
|
2
|
+
require 'veritable/errors'
|
3
|
+
require 'uuid'
|
4
|
+
require 'uri'
|
5
|
+
require 'csv'
|
6
|
+
require 'set'
|
7
|
+
|
8
|
+
module Veritable
|
9
|
+
module Util
|
10
|
+
class << self
|
11
|
+
def make_table_id; UUID.new.generate :compact ; end
|
12
|
+
def make_analysis_id; UUID.new.generate :compact ; end
|
13
|
+
|
14
|
+
def query_params(params, parent=nil)
|
15
|
+
flatten_params(params).collect {|x|
|
16
|
+
"#{x[0]}=#{x[1]}"
|
17
|
+
}.join("&")
|
18
|
+
end
|
19
|
+
|
20
|
+
def check_id(id)
|
21
|
+
if not id.is_a? String
|
22
|
+
begin
|
23
|
+
id.to_s
|
24
|
+
rescue
|
25
|
+
raise VeritableError.new("Invalid id -- strings only.")
|
26
|
+
else
|
27
|
+
raise VeritableError.new("Invalid id '#{id}' -- strings only.")
|
28
|
+
end
|
29
|
+
elsif not id =~ Regexp.new('\A[a-zA-Z0-9][-_a-zA-Z0-9]*\z')
|
30
|
+
raise VeritableError.new("Invalid id '#{id}' -- must contain only alphanumerics, underscores, and dashes.")
|
31
|
+
elsif id[0] == '_' or id[0] == '-'
|
32
|
+
raise VeritableError.new("Invalid id '#{id}' -- may not begin with a dash or underscore.")
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
def check_row(row)
|
37
|
+
if not row.is_a? Hash
|
38
|
+
begin
|
39
|
+
row.to_s
|
40
|
+
rescue
|
41
|
+
raise VeritableError.new("Invalid row -- Must provide a hash of column name-value pairs.")
|
42
|
+
else
|
43
|
+
raise VeritableError.new("Invalid row #{row} -- Must provide a hash of column name-value pairs.")
|
44
|
+
end
|
45
|
+
elsif not row.has_key? '_id'
|
46
|
+
raise VeritableError.new("Invalid row #{row} -- rows must contain unique row ids in the '_id' field.")
|
47
|
+
else
|
48
|
+
begin
|
49
|
+
check_id row['_id']
|
50
|
+
rescue VeritableError => e
|
51
|
+
raise VeritableError.new("Invalid row #{row} -- #{e}")
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
56
|
+
def check_datatype(datatype, msg=nil)
|
57
|
+
if not DATATYPES.include? datatype
|
58
|
+
begin
|
59
|
+
datatype.to_s
|
60
|
+
rescue
|
61
|
+
raise VeritableError.new("#{msg}Invalid data type.")
|
62
|
+
else
|
63
|
+
raise VeritableError.new("#{msg}Invalid data type '#{datatype}'.")
|
64
|
+
end
|
65
|
+
end
|
66
|
+
end
|
67
|
+
|
68
|
+
def split_rows(rows, frac)
|
69
|
+
rows = rows.to_a
|
70
|
+
n = rows.size
|
71
|
+
inds = (0...n).to_a.shuffle
|
72
|
+
border_ind = (n * frac).floor.to_i
|
73
|
+
train_dataset = (0...border_ind).collect {|i| rows[inds[i]] }
|
74
|
+
test_dataset = (border_ind...n).collect {|i| rows[inds[i]] }
|
75
|
+
return [train_dataset, test_dataset]
|
76
|
+
end
|
77
|
+
|
78
|
+
def validate_schema(schema)
|
79
|
+
schema.is_a? Veritable::Schema ? schema.validate : Veritable::Schema.new(schema).validate
|
80
|
+
end
|
81
|
+
|
82
|
+
def make_schema(schema_rule, opts={})
|
83
|
+
if ((not opts.has_key?('headers')) and (not opts.has_key?('rows')))
|
84
|
+
raise VeritableError.new("Either :headers or :rows must be provided!")
|
85
|
+
end
|
86
|
+
headers = opts.has_key?('headers') ? opts['headers'] : nil
|
87
|
+
if headers.nil?
|
88
|
+
headers = Set.new
|
89
|
+
opts['rows'].each {|row| headers.merge(row.keys)}
|
90
|
+
headers = headers.to_a.sort
|
91
|
+
end
|
92
|
+
schema = {}
|
93
|
+
headers.each do |c|
|
94
|
+
schema_rule.each do |r, t|
|
95
|
+
if r === c
|
96
|
+
schema[c] = t
|
97
|
+
break
|
98
|
+
end
|
99
|
+
end
|
100
|
+
end
|
101
|
+
return Veritable::Schema.new(schema)
|
102
|
+
end
|
103
|
+
|
104
|
+
def write_csv(rows, filename)
|
105
|
+
headers = Set.new
|
106
|
+
rows.each {|row| headers.merge(row.keys)}
|
107
|
+
headers = headers.to_a.sort
|
108
|
+
CSV.open(filename, "w") do |csv|
|
109
|
+
csv << headers
|
110
|
+
rows.each do |row|
|
111
|
+
out_row = headers.collect {|h| row.keys.include?(h) ? row[h] : ''}
|
112
|
+
csv << out_row
|
113
|
+
end
|
114
|
+
end
|
115
|
+
end
|
116
|
+
|
117
|
+
def read_csv(filename, id_col=nil, na_vals=[''])
|
118
|
+
rows = CSV.read(filename)
|
119
|
+
header = rows.shift
|
120
|
+
header = header.collect {|h| (h == id_col ? '_id' : h).strip}
|
121
|
+
if header.include?('_id')
|
122
|
+
id_col = '_id'
|
123
|
+
end
|
124
|
+
rid = 0
|
125
|
+
rows = rows.collect do |raw_row|
|
126
|
+
rid = rid + 1
|
127
|
+
row = {}
|
128
|
+
(0...raw_row.length).each do |i|
|
129
|
+
row[header[i]] = ( na_vals.include?(raw_row[i]) ? nil : raw_row[i] )
|
130
|
+
end
|
131
|
+
if id_col.nil?
|
132
|
+
row['_id'] = rid.to_s
|
133
|
+
end
|
134
|
+
row
|
135
|
+
end
|
136
|
+
return rows
|
137
|
+
end
|
138
|
+
|
139
|
+
def clean_data(rows, schema, opts={})
|
140
|
+
validate(rows, schema, {
|
141
|
+
'convert_types' => opts.has_key?('convert_types') ? opts['convert_types'] : true,
|
142
|
+
'allow_nones' => false,
|
143
|
+
'remove_nones' => opts.has_key?('remove_nones') ? opts['remove_nones'] : true,
|
144
|
+
'remove_invalids' => opts.has_key?('remove_invalids') ? opts['remove_invalids'] : true,
|
145
|
+
'reduce_categories' => opts.has_key?('reduce_categories') ? opts['reduce_categories'] : true,
|
146
|
+
'has_ids' => true,
|
147
|
+
'assign_ids' => opts.has_key?('assign_ids') ? opts['assign_ids'] : false,
|
148
|
+
'allow_extra_fields' => true,
|
149
|
+
'remove_extra_fields' => opts.has_key?('remove_extra_fields') ? opts['remove_extra_fields'] : false,
|
150
|
+
'allow_empty_columns' => false})
|
151
|
+
end
|
152
|
+
|
153
|
+
def validate_data(rows, schema)
|
154
|
+
validate(rows, schema, {
|
155
|
+
'convert_types' => false,
|
156
|
+
'allow_nones' => false,
|
157
|
+
'remove_nones' => false,
|
158
|
+
'remove_invalids' => false,
|
159
|
+
'reduce_categories' => false,
|
160
|
+
'has_ids' => true,
|
161
|
+
'assign_ids' => false,
|
162
|
+
'allow_extra_fields' => true,
|
163
|
+
'remove_extra_fields' => false,
|
164
|
+
'allow_empty_columns' => false})
|
165
|
+
end
|
166
|
+
|
167
|
+
def clean_predictions(predictions, schema, opts={})
|
168
|
+
validate(predictions, schema, {
|
169
|
+
'convert_types' => opts.has_key?('convert_types') ? opts['convert_types'] : true,
|
170
|
+
'allow_nones' => true,
|
171
|
+
'remove_nones' => false,
|
172
|
+
'remove_invalids' => opts.has_key?('remove_invalids') ? opts['remove_invalids'] : true,
|
173
|
+
'reduce_categories' => false,
|
174
|
+
'has_ids' => false,
|
175
|
+
'assign_ids' => false,
|
176
|
+
'allow_extra_fields' => false,
|
177
|
+
'remove_extra_fields' => opts.has_key?('remove_extra_fields') ? opts['remove_extra_fields'] : true,
|
178
|
+
'allow_empty_columns' => true})
|
179
|
+
end
|
180
|
+
|
181
|
+
def validate_predictions(predictions, schema)
|
182
|
+
validate(predictions, schema, {
|
183
|
+
'convert_types' => false,
|
184
|
+
'allow_nones' => true,
|
185
|
+
'remove_nones' => false,
|
186
|
+
'remove_invalids' => false,
|
187
|
+
'reduce_categories' => false,
|
188
|
+
'has_ids' => false,
|
189
|
+
'assign_ids' => false,
|
190
|
+
'allow_extra_fields' => false,
|
191
|
+
'remove_extra_fields' => false,
|
192
|
+
'allow_empty_columns' => true})
|
193
|
+
end
|
194
|
+
|
195
|
+
private
|
196
|
+
|
197
|
+
def flatten_params(params, parent=nil)
|
198
|
+
result = []
|
199
|
+
if params.is_a? Hash
|
200
|
+
params.each {|k, v|
|
201
|
+
kk = parent ? "#{parent}[#{urlencode(k)}]" : urlencode(k)
|
202
|
+
if v.is_a?(Hash) or v.is_a?(Array)
|
203
|
+
result += flatten_params(v, kk)
|
204
|
+
else
|
205
|
+
result << [kk, urlencode(v)]
|
206
|
+
end
|
207
|
+
}
|
208
|
+
elsif params.is_a? Array
|
209
|
+
params.each {|v|
|
210
|
+
if v.is_a?(Hash) or v.is_a?(Array)
|
211
|
+
result += flatten_params(v, kk)
|
212
|
+
else
|
213
|
+
result << ["#{parent}[]", urlencode(v)]
|
214
|
+
end
|
215
|
+
}
|
216
|
+
end
|
217
|
+
result
|
218
|
+
end
|
219
|
+
|
220
|
+
def urlencode(k)
|
221
|
+
URI.escape(k.to_s, Regexp.new("[^#{URI::PATTERN::UNRESERVED}]"))
|
222
|
+
end
|
223
|
+
|
224
|
+
def to_integer(v)
|
225
|
+
return v if v.is_a? Fixnum
|
226
|
+
v.gsub!(/\A([+-]?\d+?)\.0*?\Z/, '\1')
|
227
|
+
Integer(v)
|
228
|
+
end
|
229
|
+
|
230
|
+
# Core validation/coercion routine shared by the public validate_* helpers.
# Mutates +rows+ in place according to the string-keyed +opts+:
#   'convert_types'       -- coerce values toward the schema's column types
#   'allow_nones'         -- permit nil values to remain
#   'remove_nones'        -- strip nil values from rows
#   'remove_invalids'     -- strip values that cannot be coerced
#   'reduce_categories'   -- collapse excess categorical values into "Other"
#   'has_ids'/'assign_ids' -- require or generate per-row '_id' keys
#   'allow_extra_fields'/'remove_extra_fields' -- policy for keys not in schema
#   'allow_empty_columns' -- permit columns with no values at all
# Raises VeritableError on the first violation found; returns nil on success.
# Only change from the original: `Fixnum` (removed in Ruby 3.2) replaced by
# `Integer` in the two 'count' checks.
def validate(rows, schema, opts)
  schema = Veritable::Schema.new(schema) unless schema.is_a? Veritable::Schema

  # ensure the schema is well-formed
  schema.validate

  # store the row numbers of each unique id so that we can warn the user
  unique_ids = Hash.new

  # store the density of fields
  field_fill = Hash.new
  schema.keys.each {|c|
    field_fill[c] = 0 if c != '_id'
  }

  # store the number of categories in each categorical column
  category_counts = Hash.new

  # values which will be converted to true and false in boolean cols if convert_types
  true_strings = ['true', 't', 'yes', 'y']
  false_strings = ['false', 'f', 'no', 'n']

  max_cats = 256
  # be careful before changing the order of any of this logic -- the point is to do this all only once
  (0...rows.size).each {|i|
    if opts['assign_ids']
      rows[i]['_id'] = i.to_s # number the rows sequentially
    elsif opts['has_ids']
      raise VeritableError.new("Validate -- row #{i} is missing key '_id'", {'row' => i, 'col' => '_id'}) unless rows[i].include? '_id'

      if opts['convert_types'] # attempt to convert _id to string
        begin
          rows[i]['_id'] = rows[i]['_id'].to_s if not rows[i]['_id'].is_a? String
        rescue
          raise VeritableError.new("Validate -- row #{i}, key '_id' cannot be converted to string.", {'row' => i, 'col' => '_id'})
        end
      end

      if not rows[i]['_id'].is_a? String # invalid type for _id
        begin
          rows[i]['_id'].to_s
        rescue
          raise VeritableError.new("Validate -- row #{i}, key '_id' is not a string.", {'row' => i, 'col' => '_id'})
        else
          raise VeritableError.new("Validate -- row #{i}, key '_id', value #{rows[i]['_id']} is not a string.", {'row' => i, 'col' => '_id'})
        end
      end

      begin
        check_id rows[i]['_id'] # make sure _id is alphanumeric
      rescue
        raise VeritableError.new("Validate -- row #{i}, key '_id', value #{rows[i]['_id']} contains disallowed characters. Ids must contain only alphanumerics, with underscores and hyphens allowed after the beginning of the id.", {'row' => i, 'col' => '_id'})
      end

      if unique_ids.include? rows[i]['_id']
        raise VeritableError.new("Validate -- row #{i}, key '_id', value #{rows[i]['_id']} is non-unique, conflicts with row #{unique_ids[rows[i]['_id']]}", {'row' => i, 'col' => '_id'})
      end

      unique_ids[rows[i]['_id']] = i
    elsif rows[i].include? '_id' # no ids, no autoid, but _id column
      if opts['remove_extra_fields'] # just remove it
        rows[i].delete '_id'
      else
        raise VeritableError.new("Validate -- row #{i}, key '_id' should not be included.", {'row' => i, 'col' => '_id'})
      end
    end
    rows[i].keys.each {|c|
      if c != '_id'
        if not schema.include? c # keys missing from schema
          if opts['remove_extra_fields'] # remove it
            rows[i].delete c
          else
            if not opts['allow_extra_fields'] # or silently allow
              raise VeritableError.new("Row #{i}, key #{c} is not defined in schema", {'row' => i, 'col' => c})
            end
          end
        elsif rows[i][c].nil? # nil values
          if opts['remove_nones'] # remove
            rows[i].delete c
          else
            if not opts['allow_nones'] # or silently allow
              raise VeritableError.new("Row #{i}, key #{c} should be removed because it is nil", {'row' => i, 'col' => c})
            end
          end
        else # keys present in schema
          coltype = schema.type c # check the column type
          if coltype == 'count'
            if opts['convert_types'] # try converting to int
              begin
                rows[i][c] = to_integer(rows[i][c])
              rescue
                rows[i][c] = opts['remove_invalids'] ? nil : rows[i][c] # flag for removal
              end
            end
            if rows[i][c].nil?
              rows[i].delete c # remove flagged values
            elsif opts['remove_invalids'] and (rows[i][c].is_a? Integer) and (rows[i][c] < 0)
              rows[i].delete c
            else
              if not (rows[i][c].is_a? Integer) or not (rows[i][c] >= 0) # catch invalids
                raise VeritableError.new("Validate -- row #{i}, key #{c}, value #{rows[i][c]} is #{rows[i][c].class}, not a non-negative integer.", {'row' => i, 'col' => c})
              end
            end
          elsif coltype == 'real'
            if opts['convert_types'] # try converting to float
              begin
                rows[i][c] = Float(rows[i][c]) unless rows[i][c].is_a? Float
              rescue
                rows[i][c] = opts['remove_invalids'] ? nil : rows[i][c] # flag for removal
              end
            end
            if rows[i][c].nil?
              rows[i].delete c
            else
              if not rows[i][c].is_a? Float
                raise VeritableError.new("Validate -- row #{i}, key #{c}, value #{rows[i][c]} is a #{rows[i][c].class}, not a float.", {'row' => i, 'col' => c})
              end
            end
          elsif coltype == 'boolean'
            if opts['convert_types'] # try converting to bool
              lc = (rows[i][c]).to_s.strip.downcase
              begin
                if true_strings.include? lc
                  rows[i][c] = true
                elsif false_strings.include? lc
                  rows[i][c] = false
                elsif to_integer(rows[i][c]) == 0 # note that this behavior differs from what a rubyist might expect; "0" maps to false
                  rows[i][c] = false
                else
                  rows[i][c] = true
                end
              rescue
                rows[i][c] = opts['remove_invalids'] ? nil : rows[i][c] # flag for removal
              end
            end
            if rows[i][c].nil? # remove flagged values
              rows[i].delete c
            else
              if not [true, false].include? rows[i][c]
                raise VeritableError.new("Validate -- row #{i}, key #{c}, value #{rows[i][c]} is #{rows[i][c].class}, not a boolean", {'row' => i, 'col' => c})
              end
            end
          elsif coltype == 'categorical'
            if opts['convert_types'] # try converting to string
              begin
                rows[i][c] = rows[i][c].to_s unless rows[i][c].is_a? String
              rescue
                rows[i][c] = opts['remove_invalids'] ? nil : rows[i][c] # flag for removal
              end
            end
            if rows[i][c].nil? # remove flagged values
              rows[i].delete c
            else
              if not rows[i][c].is_a? String # catch invalids
                raise VeritableError.new("Validate -- row #{i}, key #{c}, value #{rows[i][c]} is a #{rows[i][c].class}, not a string", {'row' => i, 'col' => c})
              end
              category_counts[c] = Hash.new if not category_counts.include? c # increment count
              category_counts[c][rows[i][c]] = 0 if not category_counts[c].include? rows[i][c]
              category_counts[c][rows[i][c]] += 1
            end
          else
            raise VeritableError.new("Validate -- didn't recognize column type #{coltype}")
          end
        end
        if not field_fill.include? c and not opts['remove_extra_fields']
          field_fill[c] = 0
        end
        if rows[i].include? c and not rows[i][c].nil?
          field_fill[c] += 1
        end
      end
    }
  }
  category_counts.keys.each {|c|
    cats = category_counts[c].keys
    if cats.size > max_cats # too many categories
      if opts['reduce_categories'] # keep the largest max_cats - 1
        cats = cats.sort! {|a,b| category_counts[c][b] <=> category_counts[c][a]}
        category_map = Hash.new
        (0...cats.size).each {|j|
          j < max_cats - 1 ? category_map[cats[j]] = cats[j] : category_map[cats[j]] = "Other"
        }
        (0...rows.size).each {|i|
          rows[i][c] = category_map[rows[i][c]] if rows[i].include? c and not rows[i][c].nil?
        }
      else
        raise VeritableError.new("Validate -- categorical column #{c} has #{category_counts[c].keys.size} unique values which exceeds the limits of #{max_cats}.", {'col' => c})
      end
    end
  }
  if not opts['allow_empty_columns']
    field_fill.each {|c, fill|
      raise VeritableError.new("Validate -- column #{c} does not have any values", {'col' => c}) if fill == 0
    }
  end
  nil
end
|
427
|
+
end
|
428
|
+
end
|
429
|
+
end
|
data/lib/veritable.rb
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'openssl'
|
2
|
+
|
3
|
+
require 'veritable/api'
|
4
|
+
require 'veritable/connection'
|
5
|
+
require 'veritable/errors'
|
6
|
+
require 'veritable/util'
|
7
|
+
require 'veritable/version'
|
8
|
+
|
9
|
+
require 'rest_client'
|
10
|
+
require 'uuid'
|
11
|
+
require 'multi_json'
|
12
|
+
|
13
|
+
# Top-level namespace for the Veritable API client.
module Veritable
  # Identifies this client library in API requests.
  USER_AGENT = "veritable-ruby #{VERSION}"

  # Default API endpoint, used when no :api_base_url override is supplied.
  BASE_URL = "https://api.priorknowledge.com"

  # Opens a connection to a Veritable server and sanity-checks it.
  #
  # opts -- Hash of connection options (mutated in place with defaults):
  #   :api_key      -- API key (falls back to ENV['VERITABLE_KEY'])
  #   :api_base_url -- server URL (falls back to ENV['VERITABLE_URL'], then BASE_URL)
  #   :ssl_verify   -- verify SSL certificates (default true)
  #   :enable_gzip  -- request gzip-compressed responses (default true)
  #
  # Returns the connected API instance; raises VeritableError if the server's
  # root resource does not report a successful status and a Float entropy.
  def self.connect(opts={})
    opts[:api_key] ||= ENV['VERITABLE_KEY']
    opts[:api_base_url] ||= ENV['VERITABLE_URL'] || BASE_URL
    opts[:ssl_verify] = true unless opts.has_key?(:ssl_verify)
    opts[:enable_gzip] = true unless opts.has_key?(:enable_gzip)

    api = API.new(opts)
    root = api.root
    healthy = root["status"] == "SUCCESS" && root["entropy"].is_a?(Float)
    unless healthy
      raise VeritableError.new("No Veritable server responding at #{opts[:api_base_url]}")
    end
    api
  end
end
|
metadata
ADDED
@@ -0,0 +1,154 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: veritable
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Prior Knowledge
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-06-02 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rest-client
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '1.4'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '1.4'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: uuid
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: multi_json
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :runtime
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: test-unit
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
type: :development
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
78
|
+
- !ruby/object:Gem::Dependency
|
79
|
+
name: rake
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
82
|
+
requirements:
|
83
|
+
- - ! '>='
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: '0'
|
86
|
+
type: :development
|
87
|
+
prerelease: false
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ! '>='
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '0'
|
94
|
+
- !ruby/object:Gem::Dependency
|
95
|
+
name: simplecov
|
96
|
+
requirement: !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
98
|
+
requirements:
|
99
|
+
- - ! '>='
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: '0'
|
102
|
+
type: :development
|
103
|
+
prerelease: false
|
104
|
+
version_requirements: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ! '>='
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '0'
|
110
|
+
description: Veritable is the predictive database developed by Prior Knowledge (http://www.priorknowledge.com)
|
111
|
+
email:
|
112
|
+
- support@priorknowledge.com
|
113
|
+
executables: []
|
114
|
+
extensions: []
|
115
|
+
extra_rdoc_files: []
|
116
|
+
files:
|
117
|
+
- CHANGELOG.txt
|
118
|
+
- lib/veritable/api.rb
|
119
|
+
- lib/veritable/connection.rb
|
120
|
+
- lib/veritable/cursor.rb
|
121
|
+
- lib/veritable/datatypes.rb
|
122
|
+
- lib/veritable/errors.rb
|
123
|
+
- lib/veritable/object.rb
|
124
|
+
- lib/veritable/resource.rb
|
125
|
+
- lib/veritable/util.rb
|
126
|
+
- lib/veritable/version.rb
|
127
|
+
- lib/veritable.rb
|
128
|
+
- LICENSE
|
129
|
+
- README.md
|
130
|
+
homepage: https://dev.priorknowledge.com
|
131
|
+
licenses: []
|
132
|
+
post_install_message:
|
133
|
+
rdoc_options: []
|
134
|
+
require_paths:
|
135
|
+
- lib
|
136
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
137
|
+
none: false
|
138
|
+
requirements:
|
139
|
+
- - ! '>='
|
140
|
+
- !ruby/object:Gem::Version
|
141
|
+
version: '0'
|
142
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
143
|
+
none: false
|
144
|
+
requirements:
|
145
|
+
- - ! '>='
|
146
|
+
- !ruby/object:Gem::Version
|
147
|
+
version: '0'
|
148
|
+
requirements: []
|
149
|
+
rubyforge_project:
|
150
|
+
rubygems_version: 1.8.24
|
151
|
+
signing_key:
|
152
|
+
specification_version: 3
|
153
|
+
summary: Ruby client for Veritable API
|
154
|
+
test_files: []
|