scraperwiki-api 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +18 -2
- data/lib/scraperwiki-api/matchers.rb +168 -23
- data/lib/scraperwiki-api/version.rb +1 -1
- data/scraperwiki-api.gemspec +3 -2
- metadata +18 -7
data/README.md
CHANGED
@@ -49,9 +49,13 @@ If your project uses a lot of scrapers – for example, [OpenCorporates](http:/
|
|
49
49
|
it {should be_editable_by('frabcus')}
|
50
50
|
it {should run(:daily)}
|
51
51
|
it {should_not be_broken}
|
52
|
+
it {should have_a_row_count_of(42).on('swdata')}
|
53
|
+
|
54
|
+
# Check for missing keys:
|
52
55
|
it {should have_at_least_the_keys(['name', 'email']).on('swdata')}
|
56
|
+
|
57
|
+
# Check for extra keys:
|
53
58
|
it {should have_at_most_the_keys(['name', 'email', 'tel', 'fax']).on('swdata')}
|
54
|
-
it {should have_a_row_count_of(42).on('swdata')}
|
55
59
|
end
|
56
60
|
|
57
61
|
data = api.datastore_sqlite('example-scraper', 'SELECT * from `swdata`')
|
@@ -60,6 +64,9 @@ If your project uses a lot of scrapers – for example, [OpenCorporates](http:/
|
|
60
64
|
include ScraperWiki::API::Matchers
|
61
65
|
subject {data}
|
62
66
|
|
67
|
+
it {should set_any_of(['name', 'first_name', 'last_name'])}
|
68
|
+
|
69
|
+
# Validate the values of individual fields:
|
63
70
|
it {should_not have_blank_values.in('name')}
|
64
71
|
it {should have_unique_values.in('email')}
|
65
72
|
it {should have_values_of(['M', 'F']).in('gender')}
|
@@ -67,7 +74,16 @@ If your project uses a lot of scrapers – for example, [OpenCorporates](http:/
|
|
67
74
|
it {should have_values_starting_with('http://').in('url')}
|
68
75
|
it {should have_values_ending_with('Inc.').in('company_name')}
|
69
76
|
it {should have_integer_values.in('year')}
|
70
|
-
|
77
|
+
|
78
|
+
# If you store a hash or an array of hashes in a field as a JSON string,
|
79
|
+
# you can validate the values of these subfields by chaining on an +at+:
|
80
|
+
it {should have_values_of(['M', 'F']).in('extra').at('gender')}
|
81
|
+
|
82
|
+
# Check for missing keys within subfields:
|
83
|
+
it {should have_values_with_at_least_the_keys(['subfield1', 'subfield2']).in('fieldA')}
|
84
|
+
|
85
|
+
# Check for extra keys within subfields:
|
86
|
+
it {should have_values_with_at_most_the_keys(['subfield1', 'subfield2', 'subfield3', 'subfield4']).in('fieldA')}
|
71
87
|
end
|
72
88
|
|
73
89
|
More documentation at [RubyDoc.info](http://rdoc.info/gems/scraperwiki-api/ScraperWiki/API/Matchers).
|
data/lib/scraperwiki-api/matchers.rb
CHANGED
@@ -16,9 +16,13 @@ module ScraperWiki
|
|
16
16
|
# it {should be_editable_by('frabcus')}
|
17
17
|
# it {should run(:daily)}
|
18
18
|
# it {should_not be_broken}
|
19
|
+
# it {should have_a_row_count_of(42).on('swdata')}
|
20
|
+
#
|
21
|
+
# # Check for missing keys:
|
19
22
|
# it {should have_at_least_the_keys(['name', 'email']).on('swdata')}
|
23
|
+
#
|
24
|
+
# # Check for extra keys:
|
20
25
|
# it {should have_at_most_the_keys(['name', 'email', 'tel', 'fax']).on('swdata')}
|
21
|
-
# it {should have_a_row_count_of(42).on('swdata')}
|
22
26
|
# end
|
23
27
|
#
|
24
28
|
# data = api.datastore_sqlite('example-scraper', 'SELECT * from `swdata`')
|
@@ -27,6 +31,9 @@ module ScraperWiki
|
|
27
31
|
# include ScraperWiki::API::Matchers
|
28
32
|
# subject {data}
|
29
33
|
#
|
34
|
+
# it {should set_any_of(['name', 'first_name', 'last_name'])}
|
35
|
+
#
|
36
|
+
# # Validate the values of individual fields:
|
30
37
|
# it {should_not have_blank_values.in('name')}
|
31
38
|
# it {should have_unique_values.in('email')}
|
32
39
|
# it {should have_values_of(['M', 'F']).in('gender')}
|
@@ -34,7 +41,16 @@ module ScraperWiki
|
|
34
41
|
# it {should have_values_starting_with('http://').in('url')}
|
35
42
|
# it {should have_values_ending_with('Inc.').in('company_name')}
|
36
43
|
# it {should have_integer_values.in('year')}
|
37
|
-
#
|
44
|
+
#
|
45
|
+
# # If you store a hash or an array of hashes in a field as a JSON string,
|
46
|
+
# # you can validate the values of these subfields by chaining on an +at+:
|
47
|
+
# it {should have_values_of(['M', 'F']).in('extra').at('gender')}
|
48
|
+
#
|
49
|
+
# # Check for missing keys within subfields:
|
50
|
+
# it {should have_values_with_at_least_the_keys(['subfield1', 'subfield2']).in('fieldA')}
|
51
|
+
#
|
52
|
+
# # Check for extra keys within subfields:
|
53
|
+
# it {should have_values_with_at_most_the_keys(['subfield1', 'subfield2', 'subfield3', 'subfield4']).in('fieldA')}
|
38
54
|
# end
|
39
55
|
#
|
40
56
|
# RSpec matchers for ScraperWiki scrapers.
|
@@ -54,7 +70,7 @@ module ScraperWiki
|
|
54
70
|
end
|
55
71
|
|
56
72
|
def failure_message
|
57
|
-
NotImplementerError
|
73
|
+
raise NotImplementerError, 'Subclasses must implement this method'
|
58
74
|
end
|
59
75
|
|
60
76
|
def negative_failure_message
|
@@ -149,12 +165,16 @@ module ScraperWiki
|
|
149
165
|
difference.empty?
|
150
166
|
end
|
151
167
|
|
168
|
+
def failure_message
|
169
|
+
"#{@actual['short_name']} #{failure_predicate}: #{difference.join ', '}"
|
170
|
+
end
|
171
|
+
|
152
172
|
def failure_predicate
|
153
|
-
raise NotImplementerError
|
173
|
+
raise NotImplementerError, 'Subclasses must implement this method'
|
154
174
|
end
|
155
175
|
|
156
|
-
def
|
157
|
-
|
176
|
+
def difference
|
177
|
+
raise NotImplementerError, 'Subclasses must implement this method'
|
158
178
|
end
|
159
179
|
end
|
160
180
|
|
@@ -245,7 +265,7 @@ module ScraperWiki
|
|
245
265
|
hash
|
246
266
|
end
|
247
267
|
else
|
248
|
-
raise NotImplementerError
|
268
|
+
raise NotImplementerError, "Can only handle jsondict or jsonlist formats"
|
249
269
|
end
|
250
270
|
end
|
251
271
|
|
@@ -262,11 +282,11 @@ module ScraperWiki
|
|
262
282
|
end
|
263
283
|
|
264
284
|
def matches
|
265
|
-
raise NotImplementerError
|
285
|
+
raise NotImplementerError, 'Subclasses must implement this method'
|
266
286
|
end
|
267
287
|
|
268
288
|
def mismatches
|
269
|
-
raise NotImplementerError
|
289
|
+
raise NotImplementerError, 'Subclasses must implement this method'
|
270
290
|
end
|
271
291
|
|
272
292
|
def failures
|
@@ -285,10 +305,6 @@ module ScraperWiki
|
|
285
305
|
end
|
286
306
|
end
|
287
307
|
|
288
|
-
def failure_description
|
289
|
-
raise NotImplementerError
|
290
|
-
end
|
291
|
-
|
292
308
|
def failure_message
|
293
309
|
"#{failure_size} of #{items.size} #{failure_description}\n#{failures.map(&:inspect).join "\n"}"
|
294
310
|
end
|
@@ -296,6 +312,10 @@ module ScraperWiki
|
|
296
312
|
def negative_failure_message
|
297
313
|
failure_message
|
298
314
|
end
|
315
|
+
|
316
|
+
def failure_description
|
317
|
+
raise NotImplementerError, 'Subclasses must implement this method'
|
318
|
+
end
|
299
319
|
end
|
300
320
|
|
301
321
|
class SetAnyOf < DatastoreMatcher
|
@@ -323,16 +343,47 @@ module ScraperWiki
|
|
323
343
|
self
|
324
344
|
end
|
325
345
|
|
326
|
-
def
|
327
|
-
|
328
|
-
|
346
|
+
def at(subfield)
|
347
|
+
@subfield = subfield
|
348
|
+
self
|
349
|
+
end
|
350
|
+
|
351
|
+
# @note +@subfield+ can be a hash or an array of hashes
|
352
|
+
def matcher(meth)
|
353
|
+
if @subfield
|
354
|
+
items.send(meth) do |item|
|
355
|
+
if blank? item[@field]
|
356
|
+
true
|
357
|
+
else
|
358
|
+
v = Yajl::Parser.parse item[@field]
|
359
|
+
if Hash === v
|
360
|
+
v.has_key?(@subfield) && match?(v[@subfield])
|
361
|
+
elsif Array === v
|
362
|
+
v.all? do |w|
|
363
|
+
if Hash === w
|
364
|
+
w.has_key?(@subfield) && match?(w[@subfield])
|
365
|
+
else
|
366
|
+
raise NotImplementerError, 'Can only handle subfields that are hashes or arrays of hashes'
|
367
|
+
end
|
368
|
+
end
|
369
|
+
else
|
370
|
+
raise NotImplementerError, 'Can only handle subfields that are hashes or arrays of hashes'
|
371
|
+
end
|
372
|
+
end
|
373
|
+
end
|
374
|
+
else
|
375
|
+
items.send(meth) do |item|
|
376
|
+
match? item[@field]
|
377
|
+
end
|
329
378
|
end
|
330
379
|
end
|
331
380
|
|
381
|
+
def matches
|
382
|
+
matcher :select
|
383
|
+
end
|
384
|
+
|
332
385
|
def mismatches
|
333
|
-
|
334
|
-
match? item[@field]
|
335
|
-
end
|
386
|
+
matcher :reject
|
336
387
|
end
|
337
388
|
|
338
389
|
def blank?(v)
|
@@ -340,7 +391,15 @@ module ScraperWiki
|
|
340
391
|
end
|
341
392
|
|
342
393
|
def failure_description
|
343
|
-
|
394
|
+
if @subfield
|
395
|
+
"#{@field}:#{@subfield} values #{failure_predicate}"
|
396
|
+
else
|
397
|
+
"#{@field} values #{failure_predicate}"
|
398
|
+
end
|
399
|
+
end
|
400
|
+
|
401
|
+
def failure_predicate
|
402
|
+
raise NotImplementerError, 'Subclasses must implement this method'
|
344
403
|
end
|
345
404
|
end
|
346
405
|
|
@@ -392,9 +451,34 @@ module ScraperWiki
|
|
392
451
|
class HaveUniqueValues < FieldMatcher
|
393
452
|
def mismatches
|
394
453
|
counts = Hash.new 0
|
395
|
-
|
396
|
-
|
397
|
-
|
454
|
+
if @subfield
|
455
|
+
items.each do |item|
|
456
|
+
unless blank? item[@field]
|
457
|
+
v = Yajl::Parser.parse item[@field]
|
458
|
+
if Hash === v
|
459
|
+
unless blank? v[@subfield]
|
460
|
+
counts[v[@subfield]] += 1
|
461
|
+
end
|
462
|
+
elsif Array === v
|
463
|
+
v.each do |w|
|
464
|
+
if Hash === w
|
465
|
+
unless blank? w[@subfield]
|
466
|
+
counts[w[@subfield]] += 1
|
467
|
+
end
|
468
|
+
else
|
469
|
+
raise NotImplementerError, 'Can only handle subfields that are hashes or arrays of hashes'
|
470
|
+
end
|
471
|
+
end
|
472
|
+
else
|
473
|
+
raise NotImplementerError, 'Can only handle subfields that are hashes or arrays of hashes'
|
474
|
+
end
|
475
|
+
end
|
476
|
+
end
|
477
|
+
else
|
478
|
+
items.each do |item|
|
479
|
+
unless blank? item[@field]
|
480
|
+
counts[item[@field]] += 1
|
481
|
+
end
|
398
482
|
end
|
399
483
|
end
|
400
484
|
counts.select{|_,count| count > 1}.keys
|
@@ -454,6 +538,67 @@ module ScraperWiki
|
|
454
538
|
def have_integer_values
|
455
539
|
HaveIntegerValues.new nil
|
456
540
|
end
|
541
|
+
|
542
|
+
class FieldKeyMatcher < FieldMatcher
|
543
|
+
def match?(v)
|
544
|
+
if blank? v
|
545
|
+
true
|
546
|
+
else
|
547
|
+
w = Yajl::Parser.parse v
|
548
|
+
if Hash === w
|
549
|
+
difference(w).empty?
|
550
|
+
elsif Array === w
|
551
|
+
w.all? do |x|
|
552
|
+
if Hash === x
|
553
|
+
difference(x).empty?
|
554
|
+
else
|
555
|
+
raise NotImplementerError, 'Can only handle subfields that are hashes or arrays of hashes'
|
556
|
+
end
|
557
|
+
end
|
558
|
+
else
|
559
|
+
raise NotImplementerError, 'Can only handle subfields that are hashes or arrays of hashes'
|
560
|
+
end
|
561
|
+
end
|
562
|
+
end
|
563
|
+
|
564
|
+
def difference(v)
|
565
|
+
raise NotImplementerError, 'Subclasses must implement this method'
|
566
|
+
end
|
567
|
+
|
568
|
+
def failure_predicate
|
569
|
+
"#{predicate}: #{difference.join ', '}"
|
570
|
+
end
|
571
|
+
end
|
572
|
+
|
573
|
+
class HaveValuesWithAtLeastTheKeys < FieldKeyMatcher
|
574
|
+
def difference(v)
|
575
|
+
@expected - v.keys
|
576
|
+
end
|
577
|
+
|
578
|
+
def failure_predicate
|
579
|
+
'have missing keys'
|
580
|
+
end
|
581
|
+
end
|
582
|
+
# @example
|
583
|
+
# it {should have_values_with_at_least_the_keys(['subfield1', 'subfield2']).in('fieldA')}
|
584
|
+
def have_values_with_at_least_the_keys(expected)
|
585
|
+
HaveValuesWithAtLeastTheKeys.new expected
|
586
|
+
end
|
587
|
+
|
588
|
+
class HaveValuesWithAtMostTheKeys < FieldKeyMatcher
|
589
|
+
def difference(v)
|
590
|
+
v.keys - @expected
|
591
|
+
end
|
592
|
+
|
593
|
+
def failure_predicate
|
594
|
+
'have extra keys'
|
595
|
+
end
|
596
|
+
end
|
597
|
+
# @example
|
598
|
+
# it {should have_values_with_at_most_the_keys(['subfield1', 'subfield2', 'subfield3', 'subfield4']).in('fieldA')}
|
599
|
+
def have_values_with_at_most_the_keys(expected)
|
600
|
+
HaveValuesWithAtMostTheKeys.new expected
|
601
|
+
end
|
457
602
|
end
|
458
603
|
end
|
459
604
|
end
|
data/scraperwiki-api.gemspec
CHANGED
@@ -17,6 +17,7 @@ Gem::Specification.new do |s|
|
|
17
17
|
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
18
18
|
s.require_paths = ["lib"]
|
19
19
|
|
20
|
-
s.add_runtime_dependency('
|
21
|
-
s.
|
20
|
+
s.add_runtime_dependency('yajl-ruby', '~> 1.0')
|
21
|
+
s.add_runtime_dependency('httparty', '~> 0.8.0')
|
22
|
+
s.add_development_dependency('rspec', '~> 2.10')
|
22
23
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scraperwiki-api
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -11,28 +11,39 @@ bindir: bin
|
|
11
11
|
cert_chain: []
|
12
12
|
date: 2012-05-28 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: yajl-ruby
|
16
|
+
requirement: &70366370555240 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '1.0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *70366370555240
|
14
25
|
- !ruby/object:Gem::Dependency
|
15
26
|
name: httparty
|
16
|
-
requirement: &
|
27
|
+
requirement: &70366370554220 !ruby/object:Gem::Requirement
|
17
28
|
none: false
|
18
29
|
requirements:
|
19
30
|
- - ~>
|
20
31
|
- !ruby/object:Gem::Version
|
21
|
-
version: 0.
|
32
|
+
version: 0.8.0
|
22
33
|
type: :runtime
|
23
34
|
prerelease: false
|
24
|
-
version_requirements: *
|
35
|
+
version_requirements: *70366370554220
|
25
36
|
- !ruby/object:Gem::Dependency
|
26
37
|
name: rspec
|
27
|
-
requirement: &
|
38
|
+
requirement: &70366370553000 !ruby/object:Gem::Requirement
|
28
39
|
none: false
|
29
40
|
requirements:
|
30
41
|
- - ~>
|
31
42
|
- !ruby/object:Gem::Version
|
32
|
-
version: 2.10
|
43
|
+
version: '2.10'
|
33
44
|
type: :development
|
34
45
|
prerelease: false
|
35
|
-
version_requirements: *
|
46
|
+
version_requirements: *70366370553000
|
36
47
|
description: A Ruby wrapper for the ScraperWiki API
|
37
48
|
email:
|
38
49
|
- info@opennorth.ca
|