scraperwiki-api 0.0.3 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +18 -2
- data/lib/scraperwiki-api/matchers.rb +168 -23
- data/lib/scraperwiki-api/version.rb +1 -1
- data/scraperwiki-api.gemspec +3 -2
- metadata +18 -7
data/README.md
CHANGED
@@ -49,9 +49,13 @@ If your project uses a lot of scrapers – for example, [OpenCorporates](http:/
|
|
49
49
|
it {should be_editable_by('frabcus')}
|
50
50
|
it {should run(:daily)}
|
51
51
|
it {should_not be_broken}
|
52
|
+
it {should have_a_row_count_of(42).on('swdata')}
|
53
|
+
|
54
|
+
# Check for missing keys:
|
52
55
|
it {should have_at_least_the_keys(['name', 'email']).on('swdata')}
|
56
|
+
|
57
|
+
# Check for extra keys:
|
53
58
|
it {should have_at_most_the_keys(['name', 'email', 'tel', 'fax']).on('swdata')}
|
54
|
-
it {should have_a_row_count_of(42).on('swdata')}
|
55
59
|
end
|
56
60
|
|
57
61
|
data = api.datastore_sqlite('example-scraper', 'SELECT * from `swdata`')
|
@@ -60,6 +64,9 @@ If your project uses a lot of scrapers – for example, [OpenCorporates](http:/
|
|
60
64
|
include ScraperWiki::API::Matchers
|
61
65
|
subject {data}
|
62
66
|
|
67
|
+
it {should set_any_of(['name', 'first_name', 'last_name'])}
|
68
|
+
|
69
|
+
# Validate the values of individual fields:
|
63
70
|
it {should_not have_blank_values.in('name')}
|
64
71
|
it {should have_unique_values.in('email')}
|
65
72
|
it {should have_values_of(['M', 'F']).in('gender')}
|
@@ -67,7 +74,16 @@ If your project uses a lot of scrapers – for example, [OpenCorporates](http:/
|
|
67
74
|
it {should have_values_starting_with('http://').in('url')}
|
68
75
|
it {should have_values_ending_with('Inc.').in('company_name')}
|
69
76
|
it {should have_integer_values.in('year')}
|
70
|
-
|
77
|
+
|
78
|
+
# If you store a hash or an array of hashes in a field as a JSON string,
|
79
|
+
# you can validate the values of these subfields by chaining on an +at+:
|
80
|
+
it {should have_values_of(['M', 'F']).in('extra').at('gender')}
|
81
|
+
|
82
|
+
# Check for missing keys within subfields:
|
83
|
+
it {should have_values_with_at_least_the_keys(['subfield1', 'subfield2']).in('fieldA')}
|
84
|
+
|
85
|
+
# Check for extra keys within subfields:
|
86
|
+
it {should have_values_with_at_most_the_keys(['subfield1', 'subfield2', 'subfield3', 'subfield4']).in('fieldA')}
|
71
87
|
end
|
72
88
|
|
73
89
|
More documentation at [RubyDoc.info](http://rdoc.info/gems/scraperwiki-api/ScraperWiki/API/Matchers).
|
@@ -16,9 +16,13 @@ module ScraperWiki
|
|
16
16
|
# it {should be_editable_by('frabcus')}
|
17
17
|
# it {should run(:daily)}
|
18
18
|
# it {should_not be_broken}
|
19
|
+
# it {should have_a_row_count_of(42).on('swdata')}
|
20
|
+
#
|
21
|
+
# # Check for missing keys:
|
19
22
|
# it {should have_at_least_the_keys(['name', 'email']).on('swdata')}
|
23
|
+
#
|
24
|
+
# # Check for extra keys:
|
20
25
|
# it {should have_at_most_the_keys(['name', 'email', 'tel', 'fax']).on('swdata')}
|
21
|
-
# it {should have_a_row_count_of(42).on('swdata')}
|
22
26
|
# end
|
23
27
|
#
|
24
28
|
# data = api.datastore_sqlite('example-scraper', 'SELECT * from `swdata`')
|
@@ -27,6 +31,9 @@ module ScraperWiki
|
|
27
31
|
# include ScraperWiki::API::Matchers
|
28
32
|
# subject {data}
|
29
33
|
#
|
34
|
+
# it {should set_any_of(['name', 'first_name', 'last_name'])}
|
35
|
+
#
|
36
|
+
# # Validate the values of individual fields:
|
30
37
|
# it {should_not have_blank_values.in('name')}
|
31
38
|
# it {should have_unique_values.in('email')}
|
32
39
|
# it {should have_values_of(['M', 'F']).in('gender')}
|
@@ -34,7 +41,16 @@ module ScraperWiki
|
|
34
41
|
# it {should have_values_starting_with('http://').in('url')}
|
35
42
|
# it {should have_values_ending_with('Inc.').in('company_name')}
|
36
43
|
# it {should have_integer_values.in('year')}
|
37
|
-
#
|
44
|
+
#
|
45
|
+
# # If you store a hash or an array of hashes in a field as a JSON string,
|
46
|
+
# # you can validate the values of these subfields by chaining on an +at+:
|
47
|
+
# it {should have_values_of(['M', 'F']).in('extra').at('gender')}
|
48
|
+
#
|
49
|
+
# # Check for missing keys within subfields:
|
50
|
+
# it {should have_values_with_at_least_the_keys(['subfield1', 'subfield2']).in('fieldA')}
|
51
|
+
#
|
52
|
+
# # Check for extra keys within subfields:
|
53
|
+
# it {should have_values_with_at_most_the_keys(['subfield1', 'subfield2', 'subfield3', 'subfield4']).in('fieldA')}
|
38
54
|
# end
|
39
55
|
#
|
40
56
|
# RSpec matchers for ScraperWiki scrapers.
|
@@ -54,7 +70,7 @@ module ScraperWiki
|
|
54
70
|
end
|
55
71
|
|
56
72
|
def failure_message
|
57
|
-
NotImplementerError
|
73
|
+
raise NotImplementerError, 'Subclasses must implement this method'
|
58
74
|
end
|
59
75
|
|
60
76
|
def negative_failure_message
|
@@ -149,12 +165,16 @@ module ScraperWiki
|
|
149
165
|
difference.empty?
|
150
166
|
end
|
151
167
|
|
168
|
+
def failure_message
|
169
|
+
"#{@actual['short_name']} #{failure_predicate}: #{difference.join ', '}"
|
170
|
+
end
|
171
|
+
|
152
172
|
def failure_predicate
|
153
|
-
raise NotImplementerError
|
173
|
+
raise NotImplementerError, 'Subclasses must implement this method'
|
154
174
|
end
|
155
175
|
|
156
|
-
def
|
157
|
-
|
176
|
+
def difference
|
177
|
+
raise NotImplementerError, 'Subclasses must implement this method'
|
158
178
|
end
|
159
179
|
end
|
160
180
|
|
@@ -245,7 +265,7 @@ module ScraperWiki
|
|
245
265
|
hash
|
246
266
|
end
|
247
267
|
else
|
248
|
-
raise NotImplementerError
|
268
|
+
raise NotImplementerError, "Can only handle jsondict or jsonlist formats"
|
249
269
|
end
|
250
270
|
end
|
251
271
|
|
@@ -262,11 +282,11 @@ module ScraperWiki
|
|
262
282
|
end
|
263
283
|
|
264
284
|
def matches
|
265
|
-
raise NotImplementerError
|
285
|
+
raise NotImplementerError, 'Subclasses must implement this method'
|
266
286
|
end
|
267
287
|
|
268
288
|
def mismatches
|
269
|
-
raise NotImplementerError
|
289
|
+
raise NotImplementerError, 'Subclasses must implement this method'
|
270
290
|
end
|
271
291
|
|
272
292
|
def failures
|
@@ -285,10 +305,6 @@ module ScraperWiki
|
|
285
305
|
end
|
286
306
|
end
|
287
307
|
|
288
|
-
def failure_description
|
289
|
-
raise NotImplementerError
|
290
|
-
end
|
291
|
-
|
292
308
|
def failure_message
|
293
309
|
"#{failure_size} of #{items.size} #{failure_description}\n#{failures.map(&:inspect).join "\n"}"
|
294
310
|
end
|
@@ -296,6 +312,10 @@ module ScraperWiki
|
|
296
312
|
def negative_failure_message
|
297
313
|
failure_message
|
298
314
|
end
|
315
|
+
|
316
|
+
def failure_description
|
317
|
+
raise NotImplementerError, 'Subclasses must implement this method'
|
318
|
+
end
|
299
319
|
end
|
300
320
|
|
301
321
|
class SetAnyOf < DatastoreMatcher
|
@@ -323,16 +343,47 @@ module ScraperWiki
|
|
323
343
|
self
|
324
344
|
end
|
325
345
|
|
326
|
-
def
|
327
|
-
|
328
|
-
|
346
|
+
def at(subfield)
|
347
|
+
@subfield = subfield
|
348
|
+
self
|
349
|
+
end
|
350
|
+
|
351
|
+
# @note +@subfield+ can be a hash or an array of hashes
|
352
|
+
def matcher(meth)
|
353
|
+
if @subfield
|
354
|
+
items.send(meth) do |item|
|
355
|
+
if blank? item[@field]
|
356
|
+
true
|
357
|
+
else
|
358
|
+
v = Yajl::Parser.parse item[@field]
|
359
|
+
if Hash === v
|
360
|
+
v.has_key?(@subfield) && match?(v[@subfield])
|
361
|
+
elsif Array === v
|
362
|
+
v.all? do |w|
|
363
|
+
if Hash === w
|
364
|
+
w.has_key?(@subfield) && match?(w[@subfield])
|
365
|
+
else
|
366
|
+
raise NotImplementerError, 'Can only handle subfields that are hashes or arrays of hashes'
|
367
|
+
end
|
368
|
+
end
|
369
|
+
else
|
370
|
+
raise NotImplementerError, 'Can only handle subfields that are hashes or arrays of hashes'
|
371
|
+
end
|
372
|
+
end
|
373
|
+
end
|
374
|
+
else
|
375
|
+
items.send(meth) do |item|
|
376
|
+
match? item[@field]
|
377
|
+
end
|
329
378
|
end
|
330
379
|
end
|
331
380
|
|
381
|
+
def matches
|
382
|
+
matcher :select
|
383
|
+
end
|
384
|
+
|
332
385
|
def mismatches
|
333
|
-
|
334
|
-
match? item[@field]
|
335
|
-
end
|
386
|
+
matcher :reject
|
336
387
|
end
|
337
388
|
|
338
389
|
def blank?(v)
|
@@ -340,7 +391,15 @@ module ScraperWiki
|
|
340
391
|
end
|
341
392
|
|
342
393
|
def failure_description
|
343
|
-
|
394
|
+
if @subfield
|
395
|
+
"#{@field}:#{@subfield} values #{failure_predicate}"
|
396
|
+
else
|
397
|
+
"#{@field} values #{failure_predicate}"
|
398
|
+
end
|
399
|
+
end
|
400
|
+
|
401
|
+
def failure_predicate
|
402
|
+
raise NotImplementerError, 'Subclasses must implement this method'
|
344
403
|
end
|
345
404
|
end
|
346
405
|
|
@@ -392,9 +451,34 @@ module ScraperWiki
|
|
392
451
|
class HaveUniqueValues < FieldMatcher
|
393
452
|
def mismatches
|
394
453
|
counts = Hash.new 0
|
395
|
-
|
396
|
-
|
397
|
-
|
454
|
+
if @subfield
|
455
|
+
items.each do |item|
|
456
|
+
unless blank? item[@field]
|
457
|
+
v = Yajl::Parser.parse item[@field]
|
458
|
+
if Hash === v
|
459
|
+
unless blank? v[@subfield]
|
460
|
+
counts[v[@subfield]] += 1
|
461
|
+
end
|
462
|
+
elsif Array === v
|
463
|
+
v.each do |w|
|
464
|
+
if Hash === w
|
465
|
+
unless blank? w[@subfield]
|
466
|
+
counts[w[@subfield]] += 1
|
467
|
+
end
|
468
|
+
else
|
469
|
+
raise NotImplementerError, 'Can only handle subfields that are hashes or arrays of hashes'
|
470
|
+
end
|
471
|
+
end
|
472
|
+
else
|
473
|
+
raise NotImplementerError, 'Can only handle subfields that are hashes or arrays of hashes'
|
474
|
+
end
|
475
|
+
end
|
476
|
+
end
|
477
|
+
else
|
478
|
+
items.each do |item|
|
479
|
+
unless blank? item[@field]
|
480
|
+
counts[item[@field]] += 1
|
481
|
+
end
|
398
482
|
end
|
399
483
|
end
|
400
484
|
counts.select{|_,count| count > 1}.keys
|
@@ -454,6 +538,67 @@ module ScraperWiki
|
|
454
538
|
def have_integer_values
|
455
539
|
HaveIntegerValues.new nil
|
456
540
|
end
|
541
|
+
|
542
|
+
class FieldKeyMatcher < FieldMatcher
|
543
|
+
def match?(v)
|
544
|
+
if blank? v
|
545
|
+
true
|
546
|
+
else
|
547
|
+
w = Yajl::Parser.parse v
|
548
|
+
if Hash === w
|
549
|
+
difference(w).empty?
|
550
|
+
elsif Array === w
|
551
|
+
w.all? do |x|
|
552
|
+
if Hash === x
|
553
|
+
difference(x).empty?
|
554
|
+
else
|
555
|
+
raise NotImplementerError, 'Can only handle subfields that are hashes or arrays of hashes'
|
556
|
+
end
|
557
|
+
end
|
558
|
+
else
|
559
|
+
raise NotImplementerError, 'Can only handle subfields that are hashes or arrays of hashes'
|
560
|
+
end
|
561
|
+
end
|
562
|
+
end
|
563
|
+
|
564
|
+
def difference(v)
|
565
|
+
raise NotImplementerError, 'Subclasses must implement this method'
|
566
|
+
end
|
567
|
+
|
568
|
+
def failure_predicate
|
569
|
+
"#{predicate}: #{difference.join ', '}"
|
570
|
+
end
|
571
|
+
end
|
572
|
+
|
573
|
+
class HaveValuesWithAtLeastTheKeys < FieldKeyMatcher
|
574
|
+
def difference(v)
|
575
|
+
@expected - v.keys
|
576
|
+
end
|
577
|
+
|
578
|
+
def failure_predicate
|
579
|
+
'have missing keys'
|
580
|
+
end
|
581
|
+
end
|
582
|
+
# @example
|
583
|
+
# it {should have_values_with_at_least_the_keys(['subfield1', 'subfield2']).in('fieldA')}
|
584
|
+
def have_values_with_at_least_the_keys(expected)
|
585
|
+
HaveValuesWithAtLeastTheKeys.new expected
|
586
|
+
end
|
587
|
+
|
588
|
+
class HaveValuesWithAtMostTheKeys < FieldKeyMatcher
|
589
|
+
def difference(v)
|
590
|
+
v.keys - @expected
|
591
|
+
end
|
592
|
+
|
593
|
+
def failure_predicate
|
594
|
+
'have extra keys'
|
595
|
+
end
|
596
|
+
end
|
597
|
+
# @example
|
598
|
+
# it {should have_values_with_at_most_the_keys(['subfield1', 'subfield2', 'subfield3', 'subfield4']).in('fieldA')}
|
599
|
+
def have_values_with_at_most_the_keys(expected)
|
600
|
+
HaveValuesWithAtMostTheKeys.new expected
|
601
|
+
end
|
457
602
|
end
|
458
603
|
end
|
459
604
|
end
|
data/scraperwiki-api.gemspec
CHANGED
@@ -17,6 +17,7 @@ Gem::Specification.new do |s|
|
|
17
17
|
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
18
18
|
s.require_paths = ["lib"]
|
19
19
|
|
20
|
-
s.add_runtime_dependency('
|
21
|
-
s.
|
20
|
+
s.add_runtime_dependency('yajl-ruby', '~> 1.0')
|
21
|
+
s.add_runtime_dependency('httparty', '~> 0.8.0')
|
22
|
+
s.add_development_dependency('rspec', '~> 2.10')
|
22
23
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scraperwiki-api
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -11,28 +11,39 @@ bindir: bin
|
|
11
11
|
cert_chain: []
|
12
12
|
date: 2012-05-28 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: yajl-ruby
|
16
|
+
requirement: &70366370555240 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '1.0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *70366370555240
|
14
25
|
- !ruby/object:Gem::Dependency
|
15
26
|
name: httparty
|
16
|
-
requirement: &
|
27
|
+
requirement: &70366370554220 !ruby/object:Gem::Requirement
|
17
28
|
none: false
|
18
29
|
requirements:
|
19
30
|
- - ~>
|
20
31
|
- !ruby/object:Gem::Version
|
21
|
-
version: 0.
|
32
|
+
version: 0.8.0
|
22
33
|
type: :runtime
|
23
34
|
prerelease: false
|
24
|
-
version_requirements: *
|
35
|
+
version_requirements: *70366370554220
|
25
36
|
- !ruby/object:Gem::Dependency
|
26
37
|
name: rspec
|
27
|
-
requirement: &
|
38
|
+
requirement: &70366370553000 !ruby/object:Gem::Requirement
|
28
39
|
none: false
|
29
40
|
requirements:
|
30
41
|
- - ~>
|
31
42
|
- !ruby/object:Gem::Version
|
32
|
-
version: 2.10
|
43
|
+
version: '2.10'
|
33
44
|
type: :development
|
34
45
|
prerelease: false
|
35
|
-
version_requirements: *
|
46
|
+
version_requirements: *70366370553000
|
36
47
|
description: A Ruby wrapper for the ScraperWiki API
|
37
48
|
email:
|
38
49
|
- info@opennorth.ca
|