scraperwiki-api 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +18 -8
- data/lib/scraperwiki-api/matchers.rb +139 -31
- data/lib/scraperwiki-api/version.rb +1 -1
- metadata +8 -8
data/README.md
CHANGED
@@ -34,7 +34,9 @@ More documentation at [RubyDoc.info](http://rdoc.info/gems/scraperwiki-api/Scrap
|
|
34
34
|
|
35
35
|
## Scraper validations
|
36
36
|
|
37
|
-
If your project uses a lot of scrapers – for example, [OpenCorporates](http://opencorporates.com/), which [scrapes company registries around the world](http://blog.opencorporates.com/2011/03/25/building-a-global-database-the-open-distributed-way/), or [Represent](http://represent.opennorth.ca/), which scrapes information on elected officials from government websites in Canada – you'll want to check that your scrapers behave the way you expect them to. This gem defines [RSpec](https://www.relishapp.com/rspec) matchers to do just that.
|
37
|
+
If your project uses a lot of scrapers – for example, [OpenCorporates](http://opencorporates.com/), which [scrapes company registries around the world](http://blog.opencorporates.com/2011/03/25/building-a-global-database-the-open-distributed-way/), or [Represent](http://represent.opennorth.ca/), which scrapes information on elected officials from government websites in Canada – you'll want to check that your scrapers behave the way you expect them to. This gem defines [RSpec](https://www.relishapp.com/rspec) matchers to do just that.
|
38
|
+
|
39
|
+
You can validate a scraper's metadata (how often it runs, what fields it stores, etc.) like so:
|
38
40
|
|
39
41
|
require 'scraperwiki-api'
|
40
42
|
api = ScraperWiki::API.new
|
@@ -49,40 +51,48 @@ If your project uses a lot of scrapers – for example, [OpenCorporates](http:/
|
|
49
51
|
it {should be_editable_by('frabcus')}
|
50
52
|
it {should run(:daily)}
|
51
53
|
it {should_not be_broken}
|
54
|
+
|
55
|
+
# Validate the properties of a SQLite table by chaining on an +on+.
|
52
56
|
it {should have_a_row_count_of(42).on('swdata')}
|
53
57
|
|
54
|
-
#
|
58
|
+
# Ensure that the scraper sets required fields.
|
55
59
|
it {should have_at_least_the_keys(['name', 'email']).on('swdata')}
|
56
60
|
|
57
|
-
#
|
61
|
+
# Ensure that the scraper doesn't set too many fields.
|
58
62
|
it {should have_at_most_the_keys(['name', 'email', 'tel', 'fax']).on('swdata')}
|
59
63
|
end
|
60
64
|
|
65
|
+
And you can validate the scraped data like so:
|
66
|
+
|
67
|
+
require 'scraperwiki-api'
|
68
|
+
api = ScraperWiki::API.new
|
69
|
+
|
61
70
|
data = api.datastore_sqlite('example-scraper', 'SELECT * from `swdata`')
|
62
71
|
|
63
72
|
describe 'example-scraper' do
|
64
73
|
include ScraperWiki::API::Matchers
|
65
74
|
subject {data}
|
66
75
|
|
76
|
+
# If you need at least one of a set of fields to be set:
|
67
77
|
it {should set_any_of(['name', 'first_name', 'last_name'])}
|
68
78
|
|
69
|
-
# Validate the values of individual fields
|
79
|
+
# Validate the values of individual fields by chaining on an +in+.
|
70
80
|
it {should_not have_blank_values.in('name')}
|
71
81
|
it {should have_unique_values.in('email')}
|
72
82
|
it {should have_values_of(['M', 'F']).in('gender')}
|
73
|
-
it {should have_values_matching(/\A[^@\s]+@[
|
83
|
+
it {should have_values_matching(/\A[^@\s]+@[^@\s]+\z/).in('email')}
|
74
84
|
it {should have_values_starting_with('http://').in('url')}
|
75
85
|
it {should have_values_ending_with('Inc.').in('company_name')}
|
76
86
|
it {should have_integer_values.in('year')}
|
77
87
|
|
78
88
|
# If you store a hash or an array of hashes in a field as a JSON string,
|
79
|
-
# you can validate the values of these subfields by chaining on an +at
|
89
|
+
# you can validate the values of these subfields by chaining on an +at+.
|
80
90
|
it {should have_values_of(['M', 'F']).in('extra').at('gender')}
|
81
91
|
|
82
|
-
# Check for missing keys within subfields
|
92
|
+
# Check for missing keys within subfields.
|
83
93
|
it {should have_values_with_at_least_the_keys(['subfield1', 'subfield2']).in('fieldA')}
|
84
94
|
|
85
|
-
# Check for extra keys within subfields
|
95
|
+
# Check for extra keys within subfields.
|
86
96
|
it {should have_values_with_at_most_the_keys(['subfield1', 'subfield2', 'subfield3', 'subfield4']).in('fieldA')}
|
87
97
|
end
|
88
98
|
|
@@ -37,7 +37,7 @@ module ScraperWiki
|
|
37
37
|
# it {should_not have_blank_values.in('name')}
|
38
38
|
# it {should have_unique_values.in('email')}
|
39
39
|
# it {should have_values_of(['M', 'F']).in('gender')}
|
40
|
-
# it {should have_values_matching(/\A[^@\s]+@[
|
40
|
+
# it {should have_values_matching(/\A[^@\s]+@[^@\s]+\z/).in('email')}
|
41
41
|
# it {should have_values_starting_with('http://').in('url')}
|
42
42
|
# it {should have_values_ending_with('Inc.').in('company_name')}
|
43
43
|
# it {should have_integer_values.in('year')}
|
@@ -74,7 +74,7 @@ module ScraperWiki
|
|
74
74
|
end
|
75
75
|
|
76
76
|
def negative_failure_message
|
77
|
-
|
77
|
+
raise NotImplementerError, 'Subclasses must implement this method'
|
78
78
|
end
|
79
79
|
end
|
80
80
|
|
@@ -92,6 +92,10 @@ module ScraperWiki
|
|
92
92
|
def failure_message
|
93
93
|
"expected #{@actual['short_name']} to be #{@expected}"
|
94
94
|
end
|
95
|
+
|
96
|
+
def negative_failure_message
|
97
|
+
"expected #{@actual['short_name']} to not be #{@expected}"
|
98
|
+
end
|
95
99
|
end
|
96
100
|
# @example
|
97
101
|
# it {should be_public}
|
@@ -120,6 +124,10 @@ module ScraperWiki
|
|
120
124
|
def failure_message
|
121
125
|
"expected #{@actual['short_name']} to be editable by #{@expected}"
|
122
126
|
end
|
127
|
+
|
128
|
+
def negative_failure_message
|
129
|
+
"expected #{@actual['short_name']} to not be editable by #{@expected}"
|
130
|
+
end
|
123
131
|
end
|
124
132
|
# @example
|
125
133
|
# it {should be_editable_by 'frabcus'}
|
@@ -140,6 +148,14 @@ module ScraperWiki
|
|
140
148
|
"expected #{@actual['short_name']} to run #{@expected}"
|
141
149
|
end
|
142
150
|
end
|
151
|
+
|
152
|
+
def negative_failure_message
|
153
|
+
if @expected == -1
|
154
|
+
"expected #{@actual['short_name']} to run at some time"
|
155
|
+
else
|
156
|
+
"expected #{@actual['short_name']} to not run #{@expected}"
|
157
|
+
end
|
158
|
+
end
|
143
159
|
end
|
144
160
|
# @example
|
145
161
|
# it {should run(:daily)}
|
@@ -169,10 +185,18 @@ module ScraperWiki
|
|
169
185
|
"#{@actual['short_name']} #{failure_predicate}: #{difference.join ', '}"
|
170
186
|
end
|
171
187
|
|
188
|
+
def negative_failure_message
|
189
|
+
"#{@actual['short_name']} #{negative_failure_predicate}: #{difference.join ', '}"
|
190
|
+
end
|
191
|
+
|
172
192
|
def failure_predicate
|
173
193
|
raise NotImplementerError, 'Subclasses must implement this method'
|
174
194
|
end
|
175
195
|
|
196
|
+
def negative_failure_message
|
197
|
+
raise NotImplementerError, 'Subclasses must implement this method'
|
198
|
+
end
|
199
|
+
|
176
200
|
def difference
|
177
201
|
raise NotImplementerError, 'Subclasses must implement this method'
|
178
202
|
end
|
@@ -186,6 +210,10 @@ module ScraperWiki
|
|
186
210
|
def failure_predicate
|
187
211
|
'is missing keys'
|
188
212
|
end
|
213
|
+
|
214
|
+
def negative_failure_predicate
|
215
|
+
"isn't missing keys"
|
216
|
+
end
|
189
217
|
end
|
190
218
|
# @example
|
191
219
|
# it {should have_at_least_the_keys(['fieldA', 'fieldB']).on('swdata')}
|
@@ -201,6 +229,10 @@ module ScraperWiki
|
|
201
229
|
def failure_predicate
|
202
230
|
'has extra keys'
|
203
231
|
end
|
232
|
+
|
233
|
+
def negative_failure_predicate
|
234
|
+
'has no extra keys'
|
235
|
+
end
|
204
236
|
end
|
205
237
|
# @example
|
206
238
|
# it {should have_at_most_the_keys(['fieldA', 'fieldB', 'fieldC', 'fieldD']).on('swdata')}
|
@@ -217,6 +249,10 @@ module ScraperWiki
|
|
217
249
|
def failure_message
|
218
250
|
"expected #{@actual['short_name']} to have #{@expected} rows, not #{@actual['datasummary']['tables'][@table]['count']}"
|
219
251
|
end
|
252
|
+
|
253
|
+
def negative_failure_message
|
254
|
+
"expected #{@actual['short_name']} to not have #{@expected} rows"
|
255
|
+
end
|
220
256
|
end
|
221
257
|
# @example
|
222
258
|
# it {should have_a_row_count_of(42).on('swdata')}
|
@@ -243,6 +279,10 @@ module ScraperWiki
|
|
243
279
|
def failure_message
|
244
280
|
"#{@actual['short_name']} is broken: #{exception_message}"
|
245
281
|
end
|
282
|
+
|
283
|
+
def negative_failure_message
|
284
|
+
"#{@actual['short_name']} isn't broken: #{exception_message}"
|
285
|
+
end
|
246
286
|
end
|
247
287
|
# @example
|
248
288
|
# it {should_not be_broken}
|
@@ -310,12 +350,16 @@ module ScraperWiki
|
|
310
350
|
end
|
311
351
|
|
312
352
|
def negative_failure_message
|
313
|
-
|
353
|
+
"#{failure_size} of #{items.size} #{negative_failure_description}\n#{failures.map(&:inspect).join "\n"}"
|
314
354
|
end
|
315
355
|
|
316
356
|
def failure_description
|
317
357
|
raise NotImplementerError, 'Subclasses must implement this method'
|
318
358
|
end
|
359
|
+
|
360
|
+
def negative_failure_description
|
361
|
+
raise NotImplementerError, 'Subclasses must implement this method'
|
362
|
+
end
|
319
363
|
end
|
320
364
|
|
321
365
|
class SetAnyOf < DatastoreMatcher
|
@@ -330,6 +374,10 @@ module ScraperWiki
|
|
330
374
|
def failure_description
|
331
375
|
"records didn't set any of #{@expected.join ','}"
|
332
376
|
end
|
377
|
+
|
378
|
+
def negative_failure_description
|
379
|
+
"records set any of #{@expected.join ','}"
|
380
|
+
end
|
333
381
|
end
|
334
382
|
# @example
|
335
383
|
# it {should set_any_of(['name', 'first_name', 'last_name'])}
|
@@ -353,15 +401,23 @@ module ScraperWiki
|
|
353
401
|
if @subfield
|
354
402
|
items.send(meth) do |item|
|
355
403
|
if blank? item[@field]
|
356
|
-
|
404
|
+
meth == :reject
|
357
405
|
else
|
358
406
|
v = Yajl::Parser.parse item[@field]
|
359
407
|
if Hash === v
|
360
|
-
|
408
|
+
if blank? v[@subfield]
|
409
|
+
meth == :reject
|
410
|
+
else
|
411
|
+
match? v[@subfield]
|
412
|
+
end
|
361
413
|
elsif Array === v
|
362
414
|
v.all? do |w|
|
363
415
|
if Hash === w
|
364
|
-
|
416
|
+
if blank? w[@subfield]
|
417
|
+
meth == :reject
|
418
|
+
else
|
419
|
+
match? w[@subfield]
|
420
|
+
end
|
365
421
|
else
|
366
422
|
raise NotImplementerError, 'Can only handle subfields that are hashes or arrays of hashes'
|
367
423
|
end
|
@@ -373,7 +429,11 @@ module ScraperWiki
|
|
373
429
|
end
|
374
430
|
else
|
375
431
|
items.send(meth) do |item|
|
376
|
-
|
432
|
+
if blank? item[@field]
|
433
|
+
meth == :reject
|
434
|
+
else
|
435
|
+
match? item[@field]
|
436
|
+
end
|
377
437
|
end
|
378
438
|
end
|
379
439
|
end
|
@@ -398,9 +458,21 @@ module ScraperWiki
|
|
398
458
|
end
|
399
459
|
end
|
400
460
|
|
461
|
+
def negative_failure_description
|
462
|
+
if @subfield
|
463
|
+
"#{@field}:#{@subfield} values #{negative_failure_predicate}"
|
464
|
+
else
|
465
|
+
"#{@field} values #{negative_failure_predicate}"
|
466
|
+
end
|
467
|
+
end
|
468
|
+
|
401
469
|
def failure_predicate
|
402
470
|
raise NotImplementerError, 'Subclasses must implement this method'
|
403
471
|
end
|
472
|
+
|
473
|
+
def negative_failure_predicate
|
474
|
+
raise NotImplementerError, 'Subclasses must implement this method'
|
475
|
+
end
|
404
476
|
end
|
405
477
|
|
406
478
|
class HaveBlankValues < FieldMatcher
|
@@ -411,6 +483,10 @@ module ScraperWiki
|
|
411
483
|
def failure_predicate
|
412
484
|
'are blank'
|
413
485
|
end
|
486
|
+
|
487
|
+
def negative_failure_predicate
|
488
|
+
'are present'
|
489
|
+
end
|
414
490
|
end
|
415
491
|
# @example
|
416
492
|
# it {should_not have_blank_values.in('name')}
|
@@ -420,12 +496,16 @@ module ScraperWiki
|
|
420
496
|
|
421
497
|
class HaveValuesOf < FieldMatcher
|
422
498
|
def match?(v)
|
423
|
-
|
499
|
+
@expected.include? v
|
424
500
|
end
|
425
501
|
|
426
502
|
def failure_predicate
|
427
503
|
"aren't one of #{@expected.join ', '}"
|
428
504
|
end
|
505
|
+
|
506
|
+
def negative_failure_predicate
|
507
|
+
"are one of #{@expected.join ', '}"
|
508
|
+
end
|
429
509
|
end
|
430
510
|
# @example
|
431
511
|
# it {should have_values_of(['M', 'F']).in('gender')}
|
@@ -435,12 +515,16 @@ module ScraperWiki
|
|
435
515
|
|
436
516
|
class HaveValuesMatching < FieldMatcher
|
437
517
|
def match?(v)
|
438
|
-
|
518
|
+
v[@expected]
|
439
519
|
end
|
440
520
|
|
441
521
|
def failure_predicate
|
442
522
|
"don't match #{@expected.inspect}"
|
443
523
|
end
|
524
|
+
|
525
|
+
def negative_failure_predicate
|
526
|
+
"match #{@expected.inspect}"
|
527
|
+
end
|
444
528
|
end
|
445
529
|
# @example
|
446
530
|
# it {should have_values_matching(/\A[^@\s]+@[^@\s]+\z/).in('email')}
|
@@ -485,7 +569,11 @@ module ScraperWiki
|
|
485
569
|
end
|
486
570
|
|
487
571
|
def failure_predicate
|
488
|
-
'
|
572
|
+
"aren't unique"
|
573
|
+
end
|
574
|
+
|
575
|
+
def negative_failure_predicate
|
576
|
+
'are unique'
|
489
577
|
end
|
490
578
|
end
|
491
579
|
# @example
|
@@ -496,12 +584,16 @@ module ScraperWiki
|
|
496
584
|
|
497
585
|
class HaveValuesStartingWith < FieldMatcher
|
498
586
|
def match?(v)
|
499
|
-
|
587
|
+
v.start_with? @expected
|
500
588
|
end
|
501
589
|
|
502
590
|
def failure_predicate
|
503
591
|
"don't start with #{@expected}"
|
504
592
|
end
|
593
|
+
|
594
|
+
def negative_failure_predicate
|
595
|
+
"start with #{@expected}"
|
596
|
+
end
|
505
597
|
end
|
506
598
|
# @example
|
507
599
|
# it {should have_values_starting_with('http://').in('url')}
|
@@ -511,12 +603,16 @@ module ScraperWiki
|
|
511
603
|
|
512
604
|
class HaveValuesEndingWith < FieldMatcher
|
513
605
|
def match?(v)
|
514
|
-
|
606
|
+
v.end_with? @expected
|
515
607
|
end
|
516
608
|
|
517
609
|
def failure_predicate
|
518
610
|
"don't end with #{@expected}"
|
519
611
|
end
|
612
|
+
|
613
|
+
def negative_failure_predicate
|
614
|
+
"end with #{@expected}"
|
615
|
+
end
|
520
616
|
end
|
521
617
|
# @example
|
522
618
|
# it {should have_values_ending_with('Inc.').in('company_name')}
|
@@ -526,12 +622,16 @@ module ScraperWiki
|
|
526
622
|
|
527
623
|
class HaveIntegerValues < FieldMatcher
|
528
624
|
def match?(v)
|
529
|
-
|
625
|
+
Integer(v) rescue false
|
530
626
|
end
|
531
627
|
|
532
628
|
def failure_predicate
|
533
629
|
"aren't integers"
|
534
630
|
end
|
631
|
+
|
632
|
+
def negative_failure_predicate
|
633
|
+
'are integers'
|
634
|
+
end
|
535
635
|
end
|
536
636
|
# @example
|
537
637
|
# it {should have_integer_values.in('year')}
|
@@ -541,23 +641,19 @@ module ScraperWiki
|
|
541
641
|
|
542
642
|
class FieldKeyMatcher < FieldMatcher
|
543
643
|
def match?(v)
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
|
550
|
-
|
551
|
-
|
552
|
-
|
553
|
-
difference(x).empty?
|
554
|
-
else
|
555
|
-
raise NotImplementerError, 'Can only handle subfields that are hashes or arrays of hashes'
|
556
|
-
end
|
644
|
+
w = Yajl::Parser.parse v
|
645
|
+
if Hash === w
|
646
|
+
difference(w).empty?
|
647
|
+
elsif Array === w
|
648
|
+
w.all? do |x|
|
649
|
+
if Hash === x
|
650
|
+
difference(x).empty?
|
651
|
+
else
|
652
|
+
raise NotImplementerError, 'Can only handle subfields that are hashes or arrays of hashes'
|
557
653
|
end
|
558
|
-
else
|
559
|
-
raise NotImplementerError, 'Can only handle subfields that are hashes or arrays of hashes'
|
560
654
|
end
|
655
|
+
else
|
656
|
+
raise NotImplementerError, 'Can only handle subfields that are hashes or arrays of hashes'
|
561
657
|
end
|
562
658
|
end
|
563
659
|
|
@@ -568,6 +664,10 @@ module ScraperWiki
|
|
568
664
|
def failure_predicate
|
569
665
|
"#{predicate}: #{difference.join ', '}"
|
570
666
|
end
|
667
|
+
|
668
|
+
def negative_failure_predicate
|
669
|
+
"#{negative_predicate}: #{difference.join ', '}"
|
670
|
+
end
|
571
671
|
end
|
572
672
|
|
573
673
|
class HaveValuesWithAtLeastTheKeys < FieldKeyMatcher
|
@@ -575,8 +675,12 @@ module ScraperWiki
|
|
575
675
|
@expected - v.keys
|
576
676
|
end
|
577
677
|
|
578
|
-
def
|
579
|
-
'
|
678
|
+
def predicate
|
679
|
+
'are missing keys'
|
680
|
+
end
|
681
|
+
|
682
|
+
def negative_predicate
|
683
|
+
"aren't missing keys"
|
580
684
|
end
|
581
685
|
end
|
582
686
|
# @example
|
@@ -590,9 +694,13 @@ module ScraperWiki
|
|
590
694
|
v.keys - @expected
|
591
695
|
end
|
592
696
|
|
593
|
-
def
|
697
|
+
def predicate
|
594
698
|
'have extra keys'
|
595
699
|
end
|
700
|
+
|
701
|
+
def negative_predicate
|
702
|
+
'have no extra keys'
|
703
|
+
end
|
596
704
|
end
|
597
705
|
# @example
|
598
706
|
# it {should have_values_with_at_most_the_keys(['subfield1', 'subfield2', 'subfield3', 'subfield4']).in('fieldA')}
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scraperwiki-api
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-06-01 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: yajl-ruby
|
16
|
-
requirement: &
|
16
|
+
requirement: &70265218891100 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '1.0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70265218891100
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: httparty
|
27
|
-
requirement: &
|
27
|
+
requirement: &70265218882020 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 0.8.0
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70265218882020
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: rspec
|
38
|
-
requirement: &
|
38
|
+
requirement: &70265218880500 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,7 +43,7 @@ dependencies:
|
|
43
43
|
version: '2.10'
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70265218880500
|
47
47
|
description: A Ruby wrapper for the ScraperWiki API
|
48
48
|
email:
|
49
49
|
- info@opennorth.ca
|