veritable 0.1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.txt +3 -0
- data/LICENSE +22 -0
- data/README.md +29 -0
- data/lib/veritable/api.rb +460 -0
- data/lib/veritable/connection.rb +84 -0
- data/lib/veritable/cursor.rb +66 -0
- data/lib/veritable/datatypes.rb +3 -0
- data/lib/veritable/errors.rb +17 -0
- data/lib/veritable/object.rb +20 -0
- data/lib/veritable/resource.rb +14 -0
- data/lib/veritable/util.rb +429 -0
- data/lib/veritable/version.rb +3 -0
- data/lib/veritable.rb +32 -0
- metadata +154 -0
data/CHANGELOG.txt
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2012 Prior Knowledge
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# Veritable
|
2
|
+
|
3
|
+
TODO: Write a gem description
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
gem 'veritable'
|
10
|
+
|
11
|
+
And then execute:
|
12
|
+
|
13
|
+
$ bundle
|
14
|
+
|
15
|
+
Or install it yourself as:
|
16
|
+
|
17
|
+
$ gem install veritable
|
18
|
+
|
19
|
+
## Usage
|
20
|
+
|
21
|
+
TODO: Write usage instructions here
|
22
|
+
|
23
|
+
## Contributing
|
24
|
+
|
25
|
+
1. Fork it
|
26
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
27
|
+
3. Commit your changes (`git commit -am 'Added some feature'`)
|
28
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
29
|
+
5. Create new Pull Request
|
@@ -0,0 +1,460 @@
|
|
1
|
+
require 'veritable/cursor'
|
2
|
+
require 'veritable/datatypes'
|
3
|
+
require 'veritable/errors'
|
4
|
+
require 'veritable/resource'
|
5
|
+
require 'veritable/util'
|
6
|
+
|
7
|
+
module Veritable
|
8
|
+
# Top-level handle to the Veritable API: wraps the connection options and
# exposes listing, creation, and deletion of tables.
class API
  include VeritableResource

  # GET the API root resource.
  def root; get(""); end

  # GET the current user's usage limits.
  def limits; get("user/limits"); end

  # Lazy Cursor over all tables, yielding Table objects.
  def tables(opts={'start' => nil, 'limit' => nil})
    cursor_opts = {
      'collection' => "tables",
      'start' => opts['start'],
      'limit' => opts['limit']
    }.update(@opts)
    Cursor.new(cursor_opts) { |doc| Table.new(@opts, doc) }
  end

  # Fetch a single table by id.
  def table(table_id)
    doc = get("tables/#{table_id}")
    Table.new(@opts, doc)
  end

  # Create a new table. With no id, a fresh one is autogenerated (retrying
  # on the unlikely collision); with force, an existing table is replaced.
  def create_table(table_id=nil, description='', force=false)
    autogen = table_id.nil?
    if autogen
      table_id = Util.make_table_id
    else
      Util.check_id table_id
    end

    if has_table? table_id
      # An autogenerated id collided: just try again with a new one.
      return create_table(nil, description, false) if autogen
      unless force
        raise VeritableError.new("Couldn't create table -- table with id #{table_id} already exists.")
      end
      delete_table table_id
    end
    Table.new(@opts, post("tables", {:_id => table_id, :description => description}))
  end

  # Delete a table by id.
  def delete_table(table_id); delete("tables/#{table_id}"); end

  def inspect; to_s; end
  def to_s; "#<Veritable::API url='#{api_base_url}'>"; end

  # True if a table with the given id exists (probes via #table).
  def has_table?(table_id)
    table table_id
    true
  rescue
    false
  end
end
|
63
|
+
|
64
|
+
# A Veritable table resource: row upload/download/deletion (singly and in
# batches) plus creation and retrieval of analyses over the table.
class Table
  include VeritableResource

  # Keep the raw HTTP DELETE reachable after #delete is redefined below.
  alias :rest_delete :delete

  # Delete this table resource on the server.
  def delete
    rest_delete(link('self'))
  end

  # Fetch a single row hash by id.
  def row(row_id); get("#{link('rows')}/#{row_id}"); end

  # Cursor over the table's rows.
  def rows(opts={'start' => nil, 'limit' => nil})
    Cursor.new({'collection' => link('rows'),
      'start' => opts['start'],
      'limit' => opts['limit']}.update(@opts))
  end

  # PUT a single row; it must carry a valid '_id'.
  def upload_row(row)
    Util.check_row row
    put("#{link('rows')}/#{row['_id']}", row)
  end

  # Upload many rows, per_page at a time.
  def batch_upload_rows(rows, per_page=100)
    batch_modify_rows('put', rows, per_page)
  end

  # Delete one row by id.
  def delete_row(row_id)
    rest_delete("#{link('rows')}/#{row_id}")
  end

  # Delete many rows, per_page at a time; 404s (already gone) are ignored.
  def batch_delete_rows(rows, per_page=100)
    begin
      batch_modify_rows('delete', rows, per_page)
    rescue VeritableError => e
      if (not e.respond_to?(:http_code)) or (not (e.http_code == "404 Resource Not Found"))
        raise e
      end
    end
  end

  # Fetch an analysis of this table by id.
  def analysis(analysis_id)
    Analysis.new(@opts, get("#{link('analyses')}/#{analysis_id}"))
  end

  # Cursor over this table's analyses, yielding Analysis objects.
  def analyses(opts={'start' => nil, 'limit' => nil})
    Cursor.new({'collection' => link('analyses'),
      'start' => opts['start'],
      'limit' => opts['limit']}.update(@opts)) {|x| Analysis.new(@opts, x)}
  end

  # Delete an analysis by id.
  def delete_analysis(analysis_id)
    rest_delete("#{link('analyses')}/#{analysis_id}")
  end

  # Create a new analysis of this table against +schema+. With no id one is
  # autogenerated (retried on collision); with force an existing analysis
  # with the same id is replaced.
  def create_analysis(schema, analysis_id=nil, description="", force=false, analysis_type="veritable")
    if analysis_type != "veritable"
      if analysis_type.respond_to? :to_s
        raise VeritableError.new("Invalid analysis type #{analysis_type}.")
      else
        raise VeritableError.new("Invalid analysis type.")
      end
    end

    if analysis_id.nil?
      autogen = true
      analysis_id = Util.make_analysis_id
    else
      autogen = false
      Util.check_id analysis_id
    end

    if has_analysis? analysis_id
      if autogen
        # BUGFIX: the retry previously called
        # create_analysis(nil, description, false), dropping the schema and
        # shifting the remaining arguments into the wrong positions.
        return create_analysis(schema, nil, description, false, analysis_type)
      end
      if ! force
        # BUGFIX: the message previously said "table" for an analysis clash.
        raise VeritableError.new("Couldn't create analysis -- analysis with id #{analysis_id} already exists.")
      else
        delete_analysis analysis_id
      end
    end
    doc = post(link('analyses'), {:_id => analysis_id, :description => description, :type => analysis_type, :schema => schema})
    Analysis.new(@opts, doc)
  end

  def inspect; to_s; end
  def to_s; "#<Veritable::Table _id='#{_id}'>"; end

  def _id; @doc['_id']; end
  def description; @doc['description']; end

  # True if an analysis with the given id exists (probes via #analysis).
  def has_analysis?(analysis_id)
    begin
      analysis analysis_id
    rescue
      false
    else
      true
    end
  end

  private

  # POST rows to the server in chunks of per_page with the given action
  # ('put' or 'delete'). Consumes the collection destructively via #shift.
  def batch_modify_rows(action, rows, per_page=100)
    # Integer replaces Fixnum (removed in Ruby 3); explicit parentheses
    # avoid `not`/method-argument precedence surprises.
    if (not per_page.is_a?(Integer)) or (not per_page > 0)
      raise VeritableError.new("Batch upload or delete must have integer page size greater than 0.")
    end
    # BUGFIX: this type check originally ran *after* rows.collect, at which
    # point rows was always an Array, so the guard could never fire.
    if (not rows.is_a? Array) and (not rows.is_a? Veritable::Cursor)
      raise VeritableError.new("Must pass an array of row hashes or a cursor of rows to batch upload or delete.")
    end
    rows = rows.collect {|row|
      Util.check_row(row)
      row
    }
    batch = Array.new()
    # Cycling 1..per_page flushes a batch every per_page items; the block
    # parameter no longer shadows the enumerator variable (was `ct`).
    (1..per_page).to_a.cycle { |i|
      if rows.empty?
        if batch.size > 0
          post(link('rows'), {'action' => action, 'rows' => batch})
        end
        break
      end
      batch.push rows.shift
      if i == per_page
        post(link('rows'), {'action' => action, 'rows' => batch})
        batch = Array.new()
      end
    }
  end
end
|
194
|
+
|
195
|
+
# A Veritable analysis resource: lifecycle state, blocking wait, and the
# predict / related-columns endpoints.
class Analysis
  include VeritableResource

  # Re-fetch the analysis document from the server.
  def update; @doc = get(link('self')); end

  # Keep the raw HTTP DELETE reachable after #delete is redefined below.
  alias :rest_delete :delete
  # Delete this analysis resource on the server.
  def delete; rest_delete(link('self')); end

  # The schema the analysis was run against.
  def schema; Schema.new(get(link('schema'))); end

  # Block until the analysis leaves the 'running' state, polling every
  # +poll+ seconds; raises if +max_time+ seconds elapse first.
  def wait(max_time=nil, poll=2)
    elapsed = 0
    while running?
      sleep poll
      if not max_time.nil?
        elapsed += poll
        if elapsed > max_time
          raise VeritableError.new("Wait for analysis -- Maximum time of #{max_time} second exceeded.")
        end
      end
      update
    end
  end

  # Request +count+ sampled predictions for +row+ (a hash whose nil-valued
  # keys are the columns to predict). Returns a Prediction.
  def predict(row, count=100)
    update if running?
    if succeeded?
      if not row.is_a? Hash
        raise VeritableError.new("Predict -- Must provide a row hash to make predictions.")
      end
      res = post(link('predict'), {'data' => row, 'count' => count})
      if not res.is_a? Array
        # BUGFIX: the rescue/else arms were swapped -- the message that
        # interpolates #{res} belongs on the branch where res.to_s
        # succeeded, not on the branch where stringification just raised
        # (matching the rescue/else pattern used elsewhere in this file).
        begin
          res.to_s
        rescue
          raise VeritableError.new("Predict -- Error making predictions.")
        else
          raise VeritableError.new("Predict -- Error making predictions: #{res}")
        end
      end
      Prediction.new(row, res, schema)
    elsif running?
      raise VeritableError.new("Predict -- Analysis with id #{_id} is still running and not yet ready to predict.")
    elsif failed?
      raise VeritableError.new("Predict -- Analysis with id #{_id} has failed and cannot predict.")
    else
      raise VeritableError.new("Predict -- Shouldn't be here -- please let us know at support@priorknowledge.com.")
    end
  end

  # Cursor over the columns related to +column_id+.
  def related_to(column_id, opts={'start' => nil, 'limit' => nil})
    update if running?
    if succeeded?
      Cursor.new(
        {'collection' => "#{link('related')}/#{column_id}",
         'start' => opts['start'],
         'limit' => opts['limit']}.update(@opts))
    elsif running?
      raise VeritableError.new("Related -- Analysis with id #{_id} is still running and not yet ready to calculate related.")
    elsif failed?
      raise VeritableError.new("Related -- Analysis with id #{_id} has failed and cannot calculate related.")
    else
      raise VeritableError.new("Related -- Shouldn't be here -- please let us know at support@priorknowledge.com.")
    end
  end

  def inspect; to_s; end
  def to_s; "#<Veritable::Analysis _id='#{_id}'>"; end

  def _id; @doc['_id']; end
  def created_at; @doc['created_at']; end
  def finished_at; @doc['finished_at']; end
  def state; @doc['state']; end
  def running?; state == 'running'; end
  def succeeded?; state == 'succeeded'; end
  def failed?; state == 'failed'; end
  # Server-reported error document, only meaningful once failed.
  def error; state == 'failed' ? @doc['error'] : nil; end
  # Server-reported progress document, only meaningful while running.
  def progress; state == 'running' ? @doc['progress'] : nil; end
end
|
274
|
+
|
275
|
+
# A table schema: a Hash from column name to a spec hash carrying a 'type'.
class Schema < Hash
  # Build from +data+ (any k/v enumerable). If +subset+ is given (an Array
  # of column names or a Hash keyed by column name), only those columns are
  # retained. Invalid input raises VeritableError.
  def initialize(data, subset=nil)
    data.each {|column, spec|
      case subset
      when Array
        self[column] = spec if subset.include? column
      when Hash
        self[column] = spec if subset.has_key? column
      else
        self[column] = spec
      end
    }
  rescue
    begin
      data.to_s
    rescue
      raise VeritableError.new("Initialize schema -- invalid schema data.")
    else
      raise VeritableError.new("Initialize schema -- invalid schema data #{data}.")
    end
  end

  # Datatype string for the given column.
  def type(column)
    self[column]['type']
  end

  # Raise VeritableError unless every column id and type is well-formed.
  def validate
    each {|name, spec|
      unless name.is_a? String
        begin
          name.to_s
        rescue
          raise VeritableError.new("Validate schema -- Invalid schema specification: nonstring column id.")
        else
          raise VeritableError.new("Validate schema -- Invalid schema specification: nonstring column id #{name}")
        end
      end
      begin
        Util.check_id name
      rescue
        raise VeritableError.new("Validate schema -- Invalid column name #{name}: must contain only alphanumerics, dashes, and underscores, and may not begin with a dash or underscore.")
      end
      unless spec.include? 'type'
        raise VeritableError.new("Validate schema -- Invalid schema specification. Column #{name} must specify a 'type', one of #{DATATYPES}")
      end
      unless DATATYPES.include? spec['type']
        raise VeritableError.new("Validate schema -- Invalid schema specification. Column #{name}, type #{spec['type']} is not valid. Type must be one of #{DATATYPES}")
      end
    }
  end
end
|
327
|
+
|
328
|
+
# A single prediction result. Hash-like: maps each requested column either
# to its point estimate (columns requested with nil) or to the fixed value
# supplied in the request. Per-column uncertainty lives in #uncertainty.
class Prediction < Hash
  attr_reader :request       # the original request hash
  attr_reader :distribution  # array of sampled row hashes from the server
  attr_reader :schema        # Schema restricted to the predicted columns
  attr_reader :uncertainty   # column name => uncertainty measure

  def initialize(request, distribution, schema)
    @request = request
    @distribution = distribution
    @schema = Schema.new(schema)
    @uncertainty = Hash.new()

    request.each do |column, fixed|
      if fixed.nil?
        # Column was left open: summarize the sampled distribution.
        self[column] = point_estimate column
        @uncertainty[column] = calculate_uncertainty column
      else
        # Column was conditioned on: echo the fixed value, zero uncertainty.
        self[column] = fixed
        @uncertainty[column] = 0.0
      end
    end
  end

  # Probability mass of +column+ falling within +range+: a collection of
  # values for boolean/categorical columns, or a [min, max] pair (either
  # end nil for unbounded) for count/real columns.
  def prob_within(column, range)
    col_type = schema.type column
    Veritable::Util.check_datatype(col_type, "Probability within -- ")
    case col_type
    when 'boolean', 'categorical'
      hits = distribution.count {|row| range.include? row[column]}
      hits.to_f / distribution.size
    when 'count', 'real'
      mn = range[0]
      mx = range[1]
      hits = distribution.count {|row|
        v = row[column]
        (mn.nil? or v >= mn) and (mx.nil? or v <= mx)
      }
      hits.to_f / distribution.size
    end
  end

  # Credible values for +column+. Boolean/categorical: a hash of values
  # whose frequency is at least p (default 0.5). Count/real: the [lo, hi]
  # bounds of the central credible interval of mass p (default 0.9).
  def credible_values(column, p=nil)
    col_type = schema.type column
    Veritable::Util.check_datatype(col_type, "Credible values -- ")
    case col_type
    when 'boolean', 'categorical'
      p = 0.5 if p.nil?
      credible = Hash.new
      sorted = freqs(counts(column)).sort_by {|value, freq| freq}
      sorted.reject {|value, freq| freq < p}.each {|value, freq| credible[value] = freq}
      credible
    when 'count', 'real'
      p = 0.9 if p.nil?
      # Trim an equal tail from each end of the sorted samples.
      tail = (distribution.size * (1.0 - p) / 2.0).round.to_i
      sorted = sorted_values column
      [sorted[tail], sorted[sorted.size - 1 - tail]]
    end
  end

  def inspect; to_s; end
  def to_s; "<Veritable::Prediction #{super}>"; end

  private

  # Sampled values for +column+, nils removed, ascending.
  def sorted_values(column)
    distribution.collect {|row| row[column]}.compact.sort
  end

  # value => occurrence count over the sampled distribution.
  def counts(column)
    tallies = Hash.new
    distribution.each {|row|
      next unless row.has_key? column
      value = row[column]
      tallies[value] = 0 unless tallies.has_key? value
      tallies[value] += 1
    }
    tallies
  end

  # Convert a counts hash to relative frequencies.
  def freqs(cts)
    total = cts.values.inject(0) {|sum, c| sum + c}
    result = Hash.new()
    cts.each {|value, c| result[value] = c.to_f / total}
    result
  end

  # Single best guess for an open column: the mode for boolean/categorical,
  # the sample mean (rounded to an int for counts) for real/count.
  def point_estimate(column)
    col_type = schema.type column
    Veritable::Util.check_datatype(col_type, "Point estimate -- ")
    case col_type
    when 'boolean', 'categorical'
      counts(column).max_by {|value, c| c}[0]
    when 'real', 'count'
      values = distribution.collect {|row| row[column]}
      mean = (values.inject(0) {|sum, v| sum + v}) / values.size.to_f
      col_type == 'real' ? mean : mean.round.to_i
    end
  end

  # Uncertainty measure: 1 - mode frequency for boolean/categorical, the
  # width of the default credible interval for count/real.
  def calculate_uncertainty(column)
    values = distribution.collect {|row| row[column]}
    col_type = schema.type column
    Veritable::Util.check_datatype(col_type, "Calculate uncertainty -- ")
    n = values.size
    case col_type
    when 'boolean', 'categorical'
      mode = counts(column).max_by {|value, c| c}[0]
      (1.0 - (values.count {|v| v == mode} / n.to_f)).to_f
    when 'count', 'real'
      interval = credible_values column
      (interval[1] - interval[0]).to_f
    end
  end
end
|
460
|
+
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
require 'veritable/object'
|
2
|
+
require 'multi_json'
|
3
|
+
|
4
|
+
module Veritable
  # HTTP transport mixin: JSON (de)serialization plus authenticated
  # GET/POST/PUT/DELETE against the configured API base URL.
  module Connection
    include VeritableObject

    def initialize(opts=nil, doc=nil)
      super(opts, doc)
      require_opts :api_key, :api_base_url
      default_opts(:ssl_verify => true, :enable_gzip => true)
    end

    # GET +url+, encoding +params+ (if any) into the query string.
    def get(url, params=nil, headers={})
      if params and params.count > 0
        url += "?#{Util.query_params(params)}"
      end
      request(:get, url, nil, headers)
    end

    # POST +payload+ as JSON.
    def post(url, payload, headers={})
      request(:post, url, MultiJson.encode(payload),
              headers.merge({:content_type => 'application/json'}))
    end

    # PUT +payload+ as JSON.
    def put(url, payload, headers={})
      request(:put, url, MultiJson.encode(payload),
              headers.merge({:content_type => 'application/json'}))
    end

    # DELETE +url+; a 404 is treated as success (resource already gone).
    def delete(url, headers={})
      request(:delete, url, nil, headers)
    rescue VeritableError => e
      if not e.respond_to? :http_code or not e.http_code == "404 Resource Not Found"
        raise e
      end
    end

    # Perform an HTTP request and decode the JSON response. Server-reported
    # failures are re-raised as VeritableError with http/api details
    # attached; undecodable failures re-raise the RestClient exception.
    def request(verb, url, payload=nil, headers={})
      url = api_base_url + "/" + url

      headers = {
        :user_agent => USER_AGENT,
        :accept => :json,
        :accept_encoding => enable_gzip ? :gzip : nil
      }.merge(headers)

      opts = {
        :method => verb.to_s,
        :url => url,
        :user => api_key,
        :password => "",
        :headers => headers,
        :payload => payload,
        :verify_ssl => ssl_verify,
      }
      begin
        response = RestClient::Request.execute(opts)
      rescue RestClient::Exception => e
        begin
          decoded = MultiJson.decode(e.response)
          msg = decoded['message']
          code = decoded['code']
        rescue
          raise e
        end
        raise VeritableError.new("HTTP Error #{e.message} -- #{code}: #{msg}", {'http_code' => e.message, 'api_code' => code, 'api_message' => msg})
      end
      return MultiJson.decode(response)
    end

    private

    def api_key; @opts[:api_key]; end
    def api_base_url; @opts[:api_base_url]; end
    def ssl_verify; @opts[:ssl_verify]; end
    def enable_gzip; @opts[:enable_gzip]; end

  end
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
require 'veritable/object'
|
2
|
+
require 'veritable/resource'
|
3
|
+
|
4
|
+
module Veritable
  # Lazily paginated, Enumerable view over a server-side collection. An
  # optional block ('lazymap') transforms each raw document as it is
  # yielded.
  class Cursor
    include VeritableResource
    include Enumerable

    def initialize(opts=nil, doc=nil, &lazymap)
      super(opts, doc)

      require_opts 'collection'
      default_opts({'per_page' => 100})

      # The API nests results under the collection's terminal path segment
      # when present, otherwise under 'data'.
      collection_key = collection.split("/")[-1]
      @doc = get(collection, {:count => per_page, :start => start})
      @opts['key'] = @doc.has_key?(collection_key) ? collection_key : 'data'
      @opts['lazymap'] = lazymap if lazymap
    end

    # Yield each item, fetching further pages on demand; honors 'limit'.
    def each
      remaining = limit if limit
      loop do
        raise StopIteration unless data.length > 0 or refresh > 0
        if limit
          raise StopIteration if remaining == 0
          remaining = remaining - 1
        end
        item = data.shift
        yield(lazymap ? lazymap.call(item) : item)
      end
    end

    def inspect; to_s; end
    def to_s; "#<Veritable::Cursor collection='" + collection + "'>"; end

    private

    # Pull the next page into @doc if the buffer is empty; returns the
    # number of items now buffered (0 means exhausted).
    def refresh
      return data.length if data.length > 0
      if next_page
        @doc = get next_page
      elsif last_page?
        return 0
      else
        @doc = get(collection, {:count => per_page, :start => start})
      end
      data.length
    end

    def limit; @opts['limit']; end
    def limit=(x); @opts['limit'] = x; end
    def start; @opts['start']; end
    def per_page; @opts['per_page']; end
    def collection; @opts['collection'] end
    def lazymap; @opts['lazymap']; end
    def key; @opts['key'] end
    def next_page; link 'next' end
    def last_page?; ! @doc.has_key? 'next' end
    def data; @doc[key] end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# Library-wide error type. An optional opts hash both stores structured
# error details and is surfaced as reader methods defined on the
# instance's singleton class (e.g. e.http_code, e.api_message).
class VeritableError < StandardError
  attr_reader :message

  def initialize(message, opts=nil)
    @message = message
    return unless opts.is_a? Hash
    @opts = opts
    singleton = class << self; self; end
    @opts.keys.each do |key|
      singleton.send(:define_method, key.to_sym) { @opts[key] }
    end
  end

  def to_s; message; end
  def inspect; message; end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'veritable/errors'
|
2
|
+
|
3
|
+
module Veritable
  # Base mixin for API-backed objects: stores the connection options hash
  # (@opts) and the resource document (@doc), with helpers for validating
  # and defaulting options.
  module VeritableObject
    def initialize(opts=nil, doc=nil)
      @opts = opts
      @doc = doc
    end

    private

    # Raise VeritableError unless every named option was supplied.
    def require_opts(*keys)
      keys.each do |key|
        next if @opts.has_key?(key)
        raise VeritableError.new("Error initializing object -- must provide #{key}")
      end
    end

    # Fill in defaults for any options the caller omitted.
    def default_opts(hash={})
      hash.each do |key, value|
        @opts[key] = value unless @opts.has_key?(key)
      end
    end
  end
end
|
@@ -0,0 +1,429 @@
|
|
1
|
+
require 'veritable/datatypes'
|
2
|
+
require 'veritable/errors'
|
3
|
+
require 'uuid'
|
4
|
+
require 'uri'
|
5
|
+
require 'csv'
|
6
|
+
require 'set'
|
7
|
+
|
8
|
+
module Veritable
|
9
|
+
module Util
|
10
|
+
class << self
|
11
|
+
def make_table_id; UUID.new.generate :compact ; end
|
12
|
+
def make_analysis_id; UUID.new.generate :compact ; end
|
13
|
+
|
14
|
+
def query_params(params, parent=nil)
|
15
|
+
flatten_params(params).collect {|x|
|
16
|
+
"#{x[0]}=#{x[1]}"
|
17
|
+
}.join("&")
|
18
|
+
end
|
19
|
+
|
20
|
+
# Ensure +id+ is a valid resource id: a String of alphanumerics, dashes,
# and underscores that does not begin with '-' or '_'. Raises
# VeritableError otherwise; returns nil for valid ids.
def check_id(id)
  unless id.is_a? String
    begin
      id.to_s
    rescue
      raise VeritableError.new("Invalid id -- strings only.")
    else
      raise VeritableError.new("Invalid id '#{id}' -- strings only.")
    end
  end
  unless id =~ Regexp.new('\A[a-zA-Z0-9][-_a-zA-Z0-9]*\z')
    raise VeritableError.new("Invalid id '#{id}' -- must contain only alphanumerics, underscores, and dashes.")
  end
  if id[0] == '_' or id[0] == '-'
    raise VeritableError.new("Invalid id '#{id}' -- may not begin with a dash or underscore.")
  end
end
|
35
|
+
|
36
|
+
def check_row(row)
|
37
|
+
if not row.is_a? Hash
|
38
|
+
begin
|
39
|
+
row.to_s
|
40
|
+
rescue
|
41
|
+
raise VeritableError.new("Invalid row -- Must provide a hash of column name-value pairs.")
|
42
|
+
else
|
43
|
+
raise VeritableError.new("Invalid row #{row} -- Must provide a hash of column name-value pairs.")
|
44
|
+
end
|
45
|
+
elsif not row.has_key? '_id'
|
46
|
+
raise VeritableError.new("Invalid row #{row} -- rows must contain unique row ids in the '_id' field.")
|
47
|
+
else
|
48
|
+
begin
|
49
|
+
check_id row['_id']
|
50
|
+
rescue VeritableError => e
|
51
|
+
raise VeritableError.new("Invalid row #{row} -- #{e}")
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
56
|
+
def check_datatype(datatype, msg=nil)
|
57
|
+
if not DATATYPES.include? datatype
|
58
|
+
begin
|
59
|
+
datatype.to_s
|
60
|
+
rescue
|
61
|
+
raise VeritableError.new("#{msg}Invalid data type.")
|
62
|
+
else
|
63
|
+
raise VeritableError.new("#{msg}Invalid data type '#{datatype}'.")
|
64
|
+
end
|
65
|
+
end
|
66
|
+
end
|
67
|
+
|
68
|
+
# Randomly partition +rows+ into [train, test] datasets, placing roughly
# +frac+ of the rows in the train set. Order within each split follows a
# random shuffle of the original indices.
def split_rows(rows, frac)
  pool = rows.to_a
  shuffled = pool.each_index.to_a.shuffle
  cutoff = (pool.size * frac).floor.to_i
  train_dataset = shuffled[0...cutoff].collect {|j| pool[j]}
  test_dataset = shuffled[cutoff...pool.size].collect {|j| pool[j]}
  [train_dataset, test_dataset]
end
|
77
|
+
|
78
|
+
# Validate +schema+, wrapping plain hashes in a Veritable::Schema first.
# BUGFIX: the original `schema.is_a? Veritable::Schema ? a : b` parsed the
# ternary as the *argument* to is_a? (and the truthy Schema class always
# selected `schema.validate`), so plain hashes were never wrapped. Explicit
# branching restores the intended dispatch.
def validate_schema(schema)
  if schema.is_a?(Veritable::Schema)
    schema.validate
  else
    Veritable::Schema.new(schema).validate
  end
end
|
81
|
+
|
82
|
+
def make_schema(schema_rule, opts={})
|
83
|
+
if ((not opts.has_key?('headers')) and (not opts.has_key?('rows')))
|
84
|
+
raise VeritableError.new("Either :headers or :rows must be provided!")
|
85
|
+
end
|
86
|
+
headers = opts.has_key?('headers') ? opts['headers'] : nil
|
87
|
+
if headers.nil?
|
88
|
+
headers = Set.new
|
89
|
+
opts['rows'].each {|row| headers.merge(row.keys)}
|
90
|
+
headers = headers.to_a.sort
|
91
|
+
end
|
92
|
+
schema = {}
|
93
|
+
headers.each do |c|
|
94
|
+
schema_rule.each do |r, t|
|
95
|
+
if r === c
|
96
|
+
schema[c] = t
|
97
|
+
break
|
98
|
+
end
|
99
|
+
end
|
100
|
+
end
|
101
|
+
return Veritable::Schema.new(schema)
|
102
|
+
end
|
103
|
+
|
104
|
+
def write_csv(rows, filename)
|
105
|
+
headers = Set.new
|
106
|
+
rows.each {|row| headers.merge(row.keys)}
|
107
|
+
headers = headers.to_a.sort
|
108
|
+
CSV.open(filename, "w") do |csv|
|
109
|
+
csv << headers
|
110
|
+
rows.each do |row|
|
111
|
+
out_row = headers.collect {|h| row.keys.include?(h) ? row[h] : ''}
|
112
|
+
csv << out_row
|
113
|
+
end
|
114
|
+
end
|
115
|
+
end
|
116
|
+
|
117
|
+
def read_csv(filename, id_col=nil, na_vals=[''])
|
118
|
+
rows = CSV.read(filename)
|
119
|
+
header = rows.shift
|
120
|
+
header = header.collect {|h| (h == id_col ? '_id' : h).strip}
|
121
|
+
if header.include?('_id')
|
122
|
+
id_col = '_id'
|
123
|
+
end
|
124
|
+
rid = 0
|
125
|
+
rows = rows.collect do |raw_row|
|
126
|
+
rid = rid + 1
|
127
|
+
row = {}
|
128
|
+
(0...raw_row.length).each do |i|
|
129
|
+
row[header[i]] = ( na_vals.include?(raw_row[i]) ? nil : raw_row[i] )
|
130
|
+
end
|
131
|
+
if id_col.nil?
|
132
|
+
row['_id'] = rid.to_s
|
133
|
+
end
|
134
|
+
row
|
135
|
+
end
|
136
|
+
return rows
|
137
|
+
end
|
138
|
+
|
139
|
+
def clean_data(rows, schema, opts={})
|
140
|
+
validate(rows, schema, {
|
141
|
+
'convert_types' => opts.has_key?('convert_types') ? opts['convert_types'] : true,
|
142
|
+
'allow_nones' => false,
|
143
|
+
'remove_nones' => opts.has_key?('remove_nones') ? opts['remove_nones'] : true,
|
144
|
+
'remove_invalids' => opts.has_key?('remove_invalids') ? opts['remove_invalids'] : true,
|
145
|
+
'reduce_categories' => opts.has_key?('reduce_categories') ? opts['reduce_categories'] : true,
|
146
|
+
'has_ids' => true,
|
147
|
+
'assign_ids' => opts.has_key?('assign_ids') ? opts['assign_ids'] : false,
|
148
|
+
'allow_extra_fields' => true,
|
149
|
+
'remove_extra_fields' => opts.has_key?('remove_extra_fields') ? opts['remove_extra_fields'] : false,
|
150
|
+
'allow_empty_columns' => false})
|
151
|
+
end
|
152
|
+
|
153
|
+
def validate_data(rows, schema)
|
154
|
+
validate(rows, schema, {
|
155
|
+
'convert_types' => false,
|
156
|
+
'allow_nones' => false,
|
157
|
+
'remove_nones' => false,
|
158
|
+
'remove_invalids' => false,
|
159
|
+
'reduce_categories' => false,
|
160
|
+
'has_ids' => true,
|
161
|
+
'assign_ids' => false,
|
162
|
+
'allow_extra_fields' => true,
|
163
|
+
'remove_extra_fields' => false,
|
164
|
+
'allow_empty_columns' => false})
|
165
|
+
end
|
166
|
+
|
167
|
+
def clean_predictions(predictions, schema, opts={})
|
168
|
+
validate(predictions, schema, {
|
169
|
+
'convert_types' => opts.has_key?('convert_types') ? opts['convert_types'] : true,
|
170
|
+
'allow_nones' => true,
|
171
|
+
'remove_nones' => false,
|
172
|
+
'remove_invalids' => opts.has_key?('remove_invalids') ? opts['remove_invalids'] : true,
|
173
|
+
'reduce_categories' => false,
|
174
|
+
'has_ids' => false,
|
175
|
+
'assign_ids' => false,
|
176
|
+
'allow_extra_fields' => false,
|
177
|
+
'remove_extra_fields' => opts.has_key?('remove_extra_fields') ? opts['remove_extra_fields'] : true,
|
178
|
+
'allow_empty_columns' => true})
|
179
|
+
end
|
180
|
+
|
181
|
+
def validate_predictions(predictions, schema)
|
182
|
+
validate(predictions, schema, {
|
183
|
+
'convert_types' => false,
|
184
|
+
'allow_nones' => true,
|
185
|
+
'remove_nones' => false,
|
186
|
+
'remove_invalids' => false,
|
187
|
+
'reduce_categories' => false,
|
188
|
+
'has_ids' => false,
|
189
|
+
'assign_ids' => false,
|
190
|
+
'allow_extra_fields' => false,
|
191
|
+
'remove_extra_fields' => false,
|
192
|
+
'allow_empty_columns' => true})
|
193
|
+
end
|
194
|
+
|
195
|
+
private
|
196
|
+
|
197
|
+
# Flatten a nested Hash/Array parameter structure into a list of
# [key, value] pairs using Rails-style bracket notation: nested hashes
# become parent[child], array elements become parent[]. Keys and values
# are URL-encoded via urlencode.
def flatten_params(params, parent=nil)
  result = []
  if params.is_a? Hash
    params.each {|k, v|
      kk = parent ? "#{parent}[#{urlencode(k)}]" : urlencode(k)
      if v.is_a?(Hash) or v.is_a?(Array)
        result += flatten_params(v, kk)
      else
        result << [kk, urlencode(v)]
      end
    }
  elsif params.is_a? Array
    params.each {|v|
      if v.is_a?(Hash) or v.is_a?(Array)
        # BUGFIX: this branch recursed with `kk`, which is not defined in
        # this scope (NameError on any nested collection inside an array);
        # recurse with the array's bracketed key instead, mirroring the
        # scalar case below.
        result += flatten_params(v, "#{parent}[]")
      else
        result << ["#{parent}[]", urlencode(v)]
      end
    }
  end
  result
end
|
219
|
+
|
220
|
+
def urlencode(k)
|
221
|
+
URI.escape(k.to_s, Regexp.new("[^#{URI::PATTERN::UNRESERVED}]"))
|
222
|
+
end
|
223
|
+
|
224
|
+
# Coerce +v+ to an Integer. Integers pass through unchanged; strings like
# "12" or "12.000" are accepted (a trailing all-zero fraction is stripped).
# Anything else raises via Kernel#Integer.
def to_integer(v)
  # Integer covers the pre-2.4 Fixnum/Bignum split (Fixnum is gone in 3.x).
  return v if v.is_a? Integer
  # BUGFIX: the original used gsub!, destructively rewriting the caller's
  # string as a side effect; the non-mutating form gives the same result.
  Integer(v.gsub(/\A([+-]?\d+?)\.0*?\Z/, '\1'))
end
|
229
|
+
|
230
|
+
# Core validator: checks (and, depending on opts, cleans IN PLACE) an
# array of row hashes against a schema.
#
# rows   -- Array of Hashes; mutated in place when opts request
#           conversion/removal/id assignment
# schema -- Veritable::Schema or a Hash convertible to one
# opts   -- Hash of String-keyed flags; every key below is read directly:
#           'assign_ids', 'has_ids', 'convert_types', 'remove_invalids',
#           'remove_nones', 'allow_nones', 'remove_extra_fields',
#           'allow_extra_fields', 'reduce_categories', 'allow_empty_columns'
#
# Raises VeritableError (with 'row'/'col' context where applicable) on the
# first invalid value found; returns nil on success.
def validate(rows, schema, opts)
  schema = Veritable::Schema.new(schema) unless schema.is_a? Veritable::Schema

  # ensure the schema is well-formed
  schema.validate

  # store the row numbers of each unique id so that we can warn the user
  # which earlier row a duplicate '_id' conflicts with
  unique_ids = Hash.new

  # store the density of fields: non-nil value count per schema column
  field_fill = Hash.new
  schema.keys.each {|c|
    field_fill[c] = 0 if c != '_id'
  }

  # store the number of occurrences of each category in each categorical column
  category_counts = Hash.new

  # values which will be converted to true and false in boolean cols if convert_types
  true_strings = ['true', 't', 'yes', 'y']
  false_strings = ['false', 'f', 'no', 'n']

  # hard server-side limit on distinct categories per categorical column
  max_cats = 256
  # be careful before changing the order of any of this logic -- the point is to do this all only once
  (0...rows.size).each {|i|
    if opts['assign_ids']
      rows[i]['_id'] = i.to_s # number the rows sequentially
    elsif opts['has_ids']
      raise VeritableError.new("Validate -- row #{i} is missing key '_id'", {'row' => i, 'col' => '_id'}) unless rows[i].include? '_id'

      if opts['convert_types'] # attempt to convert _id to string
        begin
          rows[i]['_id'] = rows[i]['_id'].to_s if not rows[i]['_id'].is_a? String
        rescue
          raise VeritableError.new("Validate -- row #{i}, key '_id' cannot be converted to string.", {'row' => i, 'col' => '_id'})
        end
      end

      if not rows[i]['_id'].is_a? String # invalid type for _id
        # probe whether the value is even representable as a string, so the
        # error message can distinguish the two failure modes
        begin
          rows[i]['_id'].to_s
        rescue
          raise VeritableError.new("Validate -- row #{i}, key '_id' is not a string.", {'row' => i, 'col' => '_id'})
        else
          raise VeritableError.new("Validate -- row #{i}, key '_id', value #{rows[i]['_id']} is not a string.", {'row' => i, 'col' => '_id'})
        end
      end

      begin
        check_id rows[i]['_id'] # make sure _id is alphanumeric
      rescue
        raise VeritableError.new("Validate -- row #{i}, key '_id', value #{rows[i]['_id']} contains disallowed characters. Ids must contain only alphanumerics, with underscores and hyphens allowed after the beginning of the id.", {'row' => i, 'col' => '_id'})
      end

      if unique_ids.include? rows[i]['_id']
        raise VeritableError.new("Validate -- row #{i}, key '_id', value #{rows[i]['_id']} is non-unique, conflicts with row #{unique_ids[rows[i]['_id']]}", {'row' => i, 'col' => '_id'})
      end

      unique_ids[rows[i]['_id']] = i
    elsif rows[i].include? '_id' # no ids, no autoid, but _id column
      if opts['remove_extra_fields'] # just remove it
        rows[i].delete '_id'
      else
        raise VeritableError.new("Validate -- row #{i}, key '_id' should not be included.", {'row' => i, 'col' => '_id'})
      end
    end
    rows[i].keys.each {|c|
      if c != '_id'
        if not schema.include? c # keys missing from schema
          if opts['remove_extra_fields'] # remove it
            rows[i].delete c
          else
            if not opts['allow_extra_fields'] # or silently allow
              raise VeritableError.new("Row #{i}, key #{c} is not defined in schema", {'row' => i, 'col' => c})
            end
          end
        elsif rows[i][c].nil? # nil values
          if opts['remove_nones'] # remove
            rows[i].delete c
          else
            if not opts['allow_nones'] # or silently allow
              raise VeritableError.new("Row #{i}, key #{c} should be removed because it is nil", {'row' => i, 'col' => c})
            end
          end
        else # keys present in schema
          coltype = schema.type c # check the column type
          if coltype == 'count'
            if opts['convert_types'] # try converting to int
              begin
                rows[i][c] = to_integer(rows[i][c])
              rescue
                # a nil here marks the cell for removal below when
                # remove_invalids is set; otherwise the bad value is kept
                # so the type check can raise a descriptive error
                rows[i][c] = opts['remove_invalids'] ? nil : rows[i][c] # flag for removal
              end
            end
            if rows[i][c].nil?
              rows[i].delete c # remove flagged values
            elsif opts['remove_invalids'] and (rows[i][c].is_a? Fixnum) and (rows[i][c] < 0)
              rows[i].delete c
            else
              if not (rows[i][c].is_a? Fixnum) or not (rows[i][c] >= 0) # catch invalids
                raise VeritableError.new("Validate -- row #{i}, key #{c}, value #{rows[i][c]} is #{rows[i][c].class}, not a non-negative integer.", {'row' => i, 'col' => c})
              end
            end
          elsif coltype == 'real'
            if opts['convert_types'] # try converting to float
              begin
                rows[i][c] = Float(rows[i][c]) unless rows[i][c].is_a? Float
              rescue
                rows[i][c] = opts['remove_invalids'] ? nil : rows[i][c] # flag for removal
              end
            end
            if rows[i][c].nil?
              rows[i].delete c
            else
              if not rows[i][c].is_a? Float
                raise VeritableError.new("Validate -- row #{i}, key #{c}, value #{rows[i][c]} is a #{rows[i][c].class}, not a float.", {'row' => i, 'col' => c})
              end
            end
          elsif coltype == 'boolean'
            if opts['convert_types'] # try converting to bool
              lc = (rows[i][c]).to_s.strip.downcase
              begin
                if true_strings.include? lc
                  rows[i][c] = true
                elsif false_strings.include? lc
                  rows[i][c] = false
                elsif to_integer(rows[i][c]) == 0 # note that this behavior differs from what a rubyist might expect; "0" maps to false
                  rows[i][c] = false
                else
                  rows[i][c] = true
                end
              rescue
                rows[i][c] = opts['remove_invalids'] ? nil : rows[i][c] # flag for removal
              end
            end
            if rows[i][c].nil? # remove flagged values
              rows[i].delete c
            else
              if not [true, false].include? rows[i][c]
                raise VeritableError.new("Validate -- row #{i}, key #{c}, value #{rows[i][c]} is #{rows[i][c].class}, not a boolean", {'row' => i, 'col' => c})
              end
            end
          elsif coltype == 'categorical'
            if opts['convert_types'] # try converting to string
              begin
                rows[i][c] = rows[i][c].to_s unless rows[i][c].is_a? String
              rescue
                rows[i][c] = opts['remove_invalids'] ? nil : rows[i][c] # flag for removal
              end
            end
            if rows[i][c].nil? # remove flagged values
              rows[i].delete c
            else
              if not rows[i][c].is_a? String # catch invalids
                raise VeritableError.new("Validate -- row #{i}, key #{c}, value #{rows[i][c]} is a #{rows[i][c].class}, not a string", {'row' => i, 'col' => c})
              end
              category_counts[c] = Hash.new if not category_counts.include? c # increment count
              category_counts[c][rows[i][c]] = 0 if not category_counts[c].include? rows[i][c]
              category_counts[c][rows[i][c]] += 1
            end
          else
            raise VeritableError.new("Validate -- didn't recognize column type #{coltype}")
          end
        end
        # track fill for extra (non-schema) fields too, unless they were removed
        if not field_fill.include? c and not opts['remove_extra_fields']
          field_fill[c] = 0
        end
        if rows[i].include? c and not rows[i][c].nil?
          field_fill[c] += 1
        end
      end
    }
  }
  # second pass over categoricals: enforce the max_cats limit
  category_counts.keys.each {|c|
    cats = category_counts[c].keys
    if cats.size > max_cats # too many categories
      if opts['reduce_categories'] # keep the largest max_cats - 1
        # sort categories by descending frequency and fold the tail into "Other"
        cats = cats.sort! {|a,b| category_counts[c][b] <=> category_counts[c][a]}
        category_map = Hash.new
        (0...cats.size).each {|j|
          j < max_cats - 1 ? category_map[cats[j]] = cats[j] : category_map[cats[j]] = "Other"
        }
        (0...rows.size).each {|i|
          rows[i][c] = category_map[rows[i][c]] if rows[i].include? c and not rows[i][c].nil?
        }
      else
        raise VeritableError.new("Validate -- categorical column #{c} has #{category_counts[c].keys.size} unique values which exceeds the limits of #{max_cats}.", {'col' => c})
      end
    end
  }
  if not opts['allow_empty_columns']
    field_fill.each {|c, fill|
      raise VeritableError.new("Validate -- column #{c} does not have any values", {'col' => c}) if fill == 0
    }
  end
  nil
end
|
427
|
+
end
|
428
|
+
end
|
429
|
+
end
|
data/lib/veritable.rb
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
require 'openssl'
|
2
|
+
|
3
|
+
require 'veritable/api'
|
4
|
+
require 'veritable/connection'
|
5
|
+
require 'veritable/errors'
|
6
|
+
require 'veritable/util'
|
7
|
+
require 'veritable/version'
|
8
|
+
|
9
|
+
require 'rest_client'
|
10
|
+
require 'uuid'
|
11
|
+
require 'multi_json'
|
12
|
+
|
13
|
+
# Top-level namespace for the Veritable API client.
module Veritable
  # Sent with every request so the server can identify this client/version.
  USER_AGENT = 'veritable-ruby ' + VERSION

  # Default endpoint for the hosted Veritable service.
  BASE_URL = "https://api.priorknowledge.com"

  # Opens and sanity-checks a connection to a Veritable server.
  #
  # opts -- Hash of connection options; :api_key and :api_base_url fall
  #         back to the VERITABLE_KEY / VERITABLE_URL environment
  #         variables (and then BASE_URL), while :ssl_verify and
  #         :enable_gzip default to true when unset.
  #
  # Returns a Veritable::API instance; raises VeritableError when the
  # server's root resource does not respond as expected.
  def self.connect(opts={})
    opts[:api_key] ||= ENV['VERITABLE_KEY']
    opts[:api_base_url] ||= ENV['VERITABLE_URL'] || BASE_URL

    [:ssl_verify, :enable_gzip].each do |flag|
      opts[flag] = true unless opts.has_key?(flag)
    end

    api = API.new(opts)

    # Probe the root resource: a healthy server reports SUCCESS and a
    # Float entropy value.
    root = api.root
    healthy = root["status"] == "SUCCESS" && root["entropy"].is_a?(Float)
    raise VeritableError.new("No Veritable server responding at #{opts[:api_base_url]}") unless healthy

    api
  end
end
|
metadata
ADDED
@@ -0,0 +1,154 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: veritable
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Prior Knowledge
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-06-02 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rest-client
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '1.4'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '1.4'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: uuid
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: multi_json
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ! '>='
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
type: :runtime
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
- !ruby/object:Gem::Dependency
|
63
|
+
name: test-unit
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
type: :development
|
71
|
+
prerelease: false
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
78
|
+
- !ruby/object:Gem::Dependency
|
79
|
+
name: rake
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
82
|
+
requirements:
|
83
|
+
- - ! '>='
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: '0'
|
86
|
+
type: :development
|
87
|
+
prerelease: false
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ! '>='
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '0'
|
94
|
+
- !ruby/object:Gem::Dependency
|
95
|
+
name: simplecov
|
96
|
+
requirement: !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
98
|
+
requirements:
|
99
|
+
- - ! '>='
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: '0'
|
102
|
+
type: :development
|
103
|
+
prerelease: false
|
104
|
+
version_requirements: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ! '>='
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '0'
|
110
|
+
description: Veritable is the predictive database developed by Prior Knowledge (http://www.priorknowledge.com)
|
111
|
+
email:
|
112
|
+
- support@priorknowledge.com
|
113
|
+
executables: []
|
114
|
+
extensions: []
|
115
|
+
extra_rdoc_files: []
|
116
|
+
files:
|
117
|
+
- CHANGELOG.txt
|
118
|
+
- lib/veritable/api.rb
|
119
|
+
- lib/veritable/connection.rb
|
120
|
+
- lib/veritable/cursor.rb
|
121
|
+
- lib/veritable/datatypes.rb
|
122
|
+
- lib/veritable/errors.rb
|
123
|
+
- lib/veritable/object.rb
|
124
|
+
- lib/veritable/resource.rb
|
125
|
+
- lib/veritable/util.rb
|
126
|
+
- lib/veritable/version.rb
|
127
|
+
- lib/veritable.rb
|
128
|
+
- LICENSE
|
129
|
+
- README.md
|
130
|
+
homepage: https://dev.priorknowledge.com
|
131
|
+
licenses: []
|
132
|
+
post_install_message:
|
133
|
+
rdoc_options: []
|
134
|
+
require_paths:
|
135
|
+
- lib
|
136
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
137
|
+
none: false
|
138
|
+
requirements:
|
139
|
+
- - ! '>='
|
140
|
+
- !ruby/object:Gem::Version
|
141
|
+
version: '0'
|
142
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
143
|
+
none: false
|
144
|
+
requirements:
|
145
|
+
- - ! '>='
|
146
|
+
- !ruby/object:Gem::Version
|
147
|
+
version: '0'
|
148
|
+
requirements: []
|
149
|
+
rubyforge_project:
|
150
|
+
rubygems_version: 1.8.24
|
151
|
+
signing_key:
|
152
|
+
specification_version: 3
|
153
|
+
summary: Ruby client for Veritable API
|
154
|
+
test_files: []
|