scraperwiki-api 0.0.4 → 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +18 -8
- data/lib/scraperwiki-api/matchers.rb +139 -31
- data/lib/scraperwiki-api/version.rb +1 -1
- metadata +8 -8
data/README.md
CHANGED
@@ -34,7 +34,9 @@ More documentation at [RubyDoc.info](http://rdoc.info/gems/scraperwiki-api/Scrap
|
|
34
34
|
|
35
35
|
## Scraper validations
|
36
36
|
|
37
|
-
If your project uses a lot of scrapers – for example, [OpenCorporates](http://opencorporates.com/), which [scrapes company registries around the world](http://blog.opencorporates.com/2011/03/25/building-a-global-database-the-open-distributed-way/), or [Represent](http://represent.opennorth.ca/), which scrapes information on elected officials from government websites in Canada – you'll want to check that your scrapers behave the way you expect them to. This gem defines [RSpec](https://www.relishapp.com/rspec) matchers to do just that.
|
37
|
+
If your project uses a lot of scrapers – for example, [OpenCorporates](http://opencorporates.com/), which [scrapes company registries around the world](http://blog.opencorporates.com/2011/03/25/building-a-global-database-the-open-distributed-way/), or [Represent](http://represent.opennorth.ca/), which scrapes information on elected officials from government websites in Canada – you'll want to check that your scrapers behave the way you expect them to. This gem defines [RSpec](https://www.relishapp.com/rspec) matchers to do just that.
|
38
|
+
|
39
|
+
You can validate a scraper's metadata (how often it runs, what fields it stores, etc.) like so:
|
38
40
|
|
39
41
|
require 'scraperwiki-api'
|
40
42
|
api = ScraperWiki::API.new
|
@@ -49,40 +51,48 @@ If your project uses a lot of scrapers – for example, [OpenCorporates](http:/
|
|
49
51
|
it {should be_editable_by('frabcus')}
|
50
52
|
it {should run(:daily)}
|
51
53
|
it {should_not be_broken}
|
54
|
+
|
55
|
+
# Validate the properties of a SQLite table by chaining on an +on+.
|
52
56
|
it {should have_a_row_count_of(42).on('swdata')}
|
53
57
|
|
54
|
-
#
|
58
|
+
# Ensure that the scraper sets required fields.
|
55
59
|
it {should have_at_least_the_keys(['name', 'email']).on('swdata')}
|
56
60
|
|
57
|
-
#
|
61
|
+
# Ensure that the scraper doesn't set too many fields.
|
58
62
|
it {should have_at_most_the_keys(['name', 'email', 'tel', 'fax']).on('swdata')}
|
59
63
|
end
|
60
64
|
|
65
|
+
And you can validate the scraped data like so:
|
66
|
+
|
67
|
+
require 'scraperwiki-api'
|
68
|
+
api = ScraperWiki::API.new
|
69
|
+
|
61
70
|
data = api.datastore_sqlite('example-scraper', 'SELECT * from `swdata`')
|
62
71
|
|
63
72
|
describe 'example-scraper' do
|
64
73
|
include ScraperWiki::API::Matchers
|
65
74
|
subject {data}
|
66
75
|
|
76
|
+
# If you need at least one of a set of fields to be set:
|
67
77
|
it {should set_any_of(['name', 'first_name', 'last_name'])}
|
68
78
|
|
69
|
-
# Validate the values of individual fields
|
79
|
+
# Validate the values of individual fields by chaining on an +in+.
|
70
80
|
it {should_not have_blank_values.in('name')}
|
71
81
|
it {should have_unique_values.in('email')}
|
72
82
|
it {should have_values_of(['M', 'F']).in('gender')}
|
73
|
-
it {should have_values_matching(/\A[^@\s]+@[
|
83
|
+
it {should have_values_matching(/\A[^@\s]+@[^@\s]+\z/).in('email')}
|
74
84
|
it {should have_values_starting_with('http://').in('url')}
|
75
85
|
it {should have_values_ending_with('Inc.').in('company_name')}
|
76
86
|
it {should have_integer_values.in('year')}
|
77
87
|
|
78
88
|
# If you store a hash or an array of hashes in a field as a JSON string,
|
79
|
-
# you can validate the values of these subfields by chaining on an +at
|
89
|
+
# you can validate the values of these subfields by chaining on an +at+.
|
80
90
|
it {should have_values_of(['M', 'F']).in('extra').at('gender')}
|
81
91
|
|
82
|
-
# Check for missing keys within subfields
|
92
|
+
# Check for missing keys within subfields.
|
83
93
|
it {should have_values_with_at_least_the_keys(['subfield1', 'subfield2']).in('fieldA')}
|
84
94
|
|
85
|
-
# Check for extra keys within subfields
|
95
|
+
# Check for extra keys within subfields.
|
86
96
|
it {should have_values_with_at_most_the_keys(['subfield1', 'subfield2', 'subfield3', 'subfield4']).in('fieldA')}
|
87
97
|
end
|
88
98
|
|
@@ -37,7 +37,7 @@ module ScraperWiki
|
|
37
37
|
# it {should_not have_blank_values.in('name')}
|
38
38
|
# it {should have_unique_values.in('email')}
|
39
39
|
# it {should have_values_of(['M', 'F']).in('gender')}
|
40
|
-
# it {should have_values_matching(/\A[^@\s]+@[
|
40
|
+
# it {should have_values_matching(/\A[^@\s]+@[^@\s]+\z/).in('email')}
|
41
41
|
# it {should have_values_starting_with('http://').in('url')}
|
42
42
|
# it {should have_values_ending_with('Inc.').in('company_name')}
|
43
43
|
# it {should have_integer_values.in('year')}
|
@@ -74,7 +74,7 @@ module ScraperWiki
|
|
74
74
|
end
|
75
75
|
|
76
76
|
def negative_failure_message
|
77
|
-
|
77
|
+
raise NotImplementerError, 'Subclasses must implement this method'
|
78
78
|
end
|
79
79
|
end
|
80
80
|
|
@@ -92,6 +92,10 @@ module ScraperWiki
|
|
92
92
|
def failure_message
|
93
93
|
"expected #{@actual['short_name']} to be #{@expected}"
|
94
94
|
end
|
95
|
+
|
96
|
+
def negative_failure_message
|
97
|
+
"expected #{@actual['short_name']} to not be #{@expected}"
|
98
|
+
end
|
95
99
|
end
|
96
100
|
# @example
|
97
101
|
# it {should be_public}
|
@@ -120,6 +124,10 @@ module ScraperWiki
|
|
120
124
|
def failure_message
|
121
125
|
"expected #{@actual['short_name']} to be editable by #{@expected}"
|
122
126
|
end
|
127
|
+
|
128
|
+
def negative_failure_message
|
129
|
+
"expected #{@actual['short_name']} to not be editable by #{@expected}"
|
130
|
+
end
|
123
131
|
end
|
124
132
|
# @example
|
125
133
|
# it {should be_editable_by 'frabcus'}
|
@@ -140,6 +148,14 @@ module ScraperWiki
|
|
140
148
|
"expected #{@actual['short_name']} to run #{@expected}"
|
141
149
|
end
|
142
150
|
end
|
151
|
+
|
152
|
+
def negative_failure_message
|
153
|
+
if @expected == -1
|
154
|
+
"expected #{@actual['short_name']} to run at some time"
|
155
|
+
else
|
156
|
+
"expected #{@actual['short_name']} to not run #{@expected}"
|
157
|
+
end
|
158
|
+
end
|
143
159
|
end
|
144
160
|
# @example
|
145
161
|
# it {should run(:daily)}
|
@@ -169,10 +185,18 @@ module ScraperWiki
|
|
169
185
|
"#{@actual['short_name']} #{failure_predicate}: #{difference.join ', '}"
|
170
186
|
end
|
171
187
|
|
188
|
+
def negative_failure_message
|
189
|
+
"#{@actual['short_name']} #{negative_failure_predicate}: #{difference.join ', '}"
|
190
|
+
end
|
191
|
+
|
172
192
|
def failure_predicate
|
173
193
|
raise NotImplementerError, 'Subclasses must implement this method'
|
174
194
|
end
|
175
195
|
|
196
|
+
def negative_failure_message
|
197
|
+
raise NotImplementerError, 'Subclasses must implement this method'
|
198
|
+
end
|
199
|
+
|
176
200
|
def difference
|
177
201
|
raise NotImplementerError, 'Subclasses must implement this method'
|
178
202
|
end
|
@@ -186,6 +210,10 @@ module ScraperWiki
|
|
186
210
|
def failure_predicate
|
187
211
|
'is missing keys'
|
188
212
|
end
|
213
|
+
|
214
|
+
def negative_failure_predicate
|
215
|
+
"isn't missing keys"
|
216
|
+
end
|
189
217
|
end
|
190
218
|
# @example
|
191
219
|
# it {should have_at_least_the_keys(['fieldA', 'fieldB']).on('swdata')}
|
@@ -201,6 +229,10 @@ module ScraperWiki
|
|
201
229
|
def failure_predicate
|
202
230
|
'has extra keys'
|
203
231
|
end
|
232
|
+
|
233
|
+
def negative_failure_predicate
|
234
|
+
'has no extra keys'
|
235
|
+
end
|
204
236
|
end
|
205
237
|
# @example
|
206
238
|
# it {should have_at_most_the_keys(['fieldA', 'fieldB', 'fieldC', 'fieldD']).on('swdata')}
|
@@ -217,6 +249,10 @@ module ScraperWiki
|
|
217
249
|
def failure_message
|
218
250
|
"expected #{@actual['short_name']} to have #{@expected} rows, not #{@actual['datasummary']['tables'][@table]['count']}"
|
219
251
|
end
|
252
|
+
|
253
|
+
def negative_failure_message
|
254
|
+
"expected #{@actual['short_name']} to not have #{@expected} rows"
|
255
|
+
end
|
220
256
|
end
|
221
257
|
# @example
|
222
258
|
# it {should have_a_row_count_of(42).on('swdata')}
|
@@ -243,6 +279,10 @@ module ScraperWiki
|
|
243
279
|
def failure_message
|
244
280
|
"#{@actual['short_name']} is broken: #{exception_message}"
|
245
281
|
end
|
282
|
+
|
283
|
+
def negative_failure_message
|
284
|
+
"#{@actual['short_name']} isn't broken: #{exception_message}"
|
285
|
+
end
|
246
286
|
end
|
247
287
|
# @example
|
248
288
|
# it {should_not be_broken}
|
@@ -310,12 +350,16 @@ module ScraperWiki
|
|
310
350
|
end
|
311
351
|
|
312
352
|
def negative_failure_message
|
313
|
-
|
353
|
+
"#{failure_size} of #{items.size} #{negative_failure_description}\n#{failures.map(&:inspect).join "\n"}"
|
314
354
|
end
|
315
355
|
|
316
356
|
def failure_description
|
317
357
|
raise NotImplementerError, 'Subclasses must implement this method'
|
318
358
|
end
|
359
|
+
|
360
|
+
def negative_failure_description
|
361
|
+
raise NotImplementerError, 'Subclasses must implement this method'
|
362
|
+
end
|
319
363
|
end
|
320
364
|
|
321
365
|
class SetAnyOf < DatastoreMatcher
|
@@ -330,6 +374,10 @@ module ScraperWiki
|
|
330
374
|
def failure_description
|
331
375
|
"records didn't set any of #{@expected.join ','}"
|
332
376
|
end
|
377
|
+
|
378
|
+
def negative_failure_description
|
379
|
+
"records set any of #{@expected.join ','}"
|
380
|
+
end
|
333
381
|
end
|
334
382
|
# @example
|
335
383
|
# it {should set_any_of(['name', 'first_name', 'last_name'])}
|
@@ -353,15 +401,23 @@ module ScraperWiki
|
|
353
401
|
if @subfield
|
354
402
|
items.send(meth) do |item|
|
355
403
|
if blank? item[@field]
|
356
|
-
|
404
|
+
meth == :reject
|
357
405
|
else
|
358
406
|
v = Yajl::Parser.parse item[@field]
|
359
407
|
if Hash === v
|
360
|
-
|
408
|
+
if blank? v[@subfield]
|
409
|
+
meth == :reject
|
410
|
+
else
|
411
|
+
match? v[@subfield]
|
412
|
+
end
|
361
413
|
elsif Array === v
|
362
414
|
v.all? do |w|
|
363
415
|
if Hash === w
|
364
|
-
|
416
|
+
if blank? w[@subfield]
|
417
|
+
meth == :reject
|
418
|
+
else
|
419
|
+
match? w[@subfield]
|
420
|
+
end
|
365
421
|
else
|
366
422
|
raise NotImplementerError, 'Can only handle subfields that are hashes or arrays of hashes'
|
367
423
|
end
|
@@ -373,7 +429,11 @@ module ScraperWiki
|
|
373
429
|
end
|
374
430
|
else
|
375
431
|
items.send(meth) do |item|
|
376
|
-
|
432
|
+
if blank? item[@field]
|
433
|
+
meth == :reject
|
434
|
+
else
|
435
|
+
match? item[@field]
|
436
|
+
end
|
377
437
|
end
|
378
438
|
end
|
379
439
|
end
|
@@ -398,9 +458,21 @@ module ScraperWiki
|
|
398
458
|
end
|
399
459
|
end
|
400
460
|
|
461
|
+
def negative_failure_description
|
462
|
+
if @subfield
|
463
|
+
"#{@field}:#{@subfield} values #{negative_failure_predicate}"
|
464
|
+
else
|
465
|
+
"#{@field} values #{negative_failure_predicate}"
|
466
|
+
end
|
467
|
+
end
|
468
|
+
|
401
469
|
def failure_predicate
|
402
470
|
raise NotImplementerError, 'Subclasses must implement this method'
|
403
471
|
end
|
472
|
+
|
473
|
+
def negative_failure_predicate
|
474
|
+
raise NotImplementerError, 'Subclasses must implement this method'
|
475
|
+
end
|
404
476
|
end
|
405
477
|
|
406
478
|
class HaveBlankValues < FieldMatcher
|
@@ -411,6 +483,10 @@ module ScraperWiki
|
|
411
483
|
def failure_predicate
|
412
484
|
'are blank'
|
413
485
|
end
|
486
|
+
|
487
|
+
def negative_failure_predicate
|
488
|
+
'are present'
|
489
|
+
end
|
414
490
|
end
|
415
491
|
# @example
|
416
492
|
# it {should_not have_blank_values.in('name')}
|
@@ -420,12 +496,16 @@ module ScraperWiki
|
|
420
496
|
|
421
497
|
class HaveValuesOf < FieldMatcher
|
422
498
|
def match?(v)
|
423
|
-
|
499
|
+
@expected.include? v
|
424
500
|
end
|
425
501
|
|
426
502
|
def failure_predicate
|
427
503
|
"aren't one of #{@expected.join ', '}"
|
428
504
|
end
|
505
|
+
|
506
|
+
def negative_failure_predicate
|
507
|
+
"are one of #{@expected.join ', '}"
|
508
|
+
end
|
429
509
|
end
|
430
510
|
# @example
|
431
511
|
# it {should have_values_of(['M', 'F']).in('gender')}
|
@@ -435,12 +515,16 @@ module ScraperWiki
|
|
435
515
|
|
436
516
|
class HaveValuesMatching < FieldMatcher
|
437
517
|
def match?(v)
|
438
|
-
|
518
|
+
v[@expected]
|
439
519
|
end
|
440
520
|
|
441
521
|
def failure_predicate
|
442
522
|
"don't match #{@expected.inspect}"
|
443
523
|
end
|
524
|
+
|
525
|
+
def negative_failure_predicate
|
526
|
+
"match #{@expected.inspect}"
|
527
|
+
end
|
444
528
|
end
|
445
529
|
# @example
|
446
530
|
# it {should have_values_matching(/\A[^@\s]+@[^@\s]+\z/).in('email')}
|
@@ -485,7 +569,11 @@ module ScraperWiki
|
|
485
569
|
end
|
486
570
|
|
487
571
|
def failure_predicate
|
488
|
-
'
|
572
|
+
"aren't unique"
|
573
|
+
end
|
574
|
+
|
575
|
+
def negative_failure_predicate
|
576
|
+
'are unique'
|
489
577
|
end
|
490
578
|
end
|
491
579
|
# @example
|
@@ -496,12 +584,16 @@ module ScraperWiki
|
|
496
584
|
|
497
585
|
class HaveValuesStartingWith < FieldMatcher
|
498
586
|
def match?(v)
|
499
|
-
|
587
|
+
v.start_with? @expected
|
500
588
|
end
|
501
589
|
|
502
590
|
def failure_predicate
|
503
591
|
"don't start with #{@expected}"
|
504
592
|
end
|
593
|
+
|
594
|
+
def negative_failure_predicate
|
595
|
+
"start with #{@expected}"
|
596
|
+
end
|
505
597
|
end
|
506
598
|
# @example
|
507
599
|
# it {should have_values_starting_with('http://').in('url')}
|
@@ -511,12 +603,16 @@ module ScraperWiki
|
|
511
603
|
|
512
604
|
class HaveValuesEndingWith < FieldMatcher
|
513
605
|
def match?(v)
|
514
|
-
|
606
|
+
v.end_with? @expected
|
515
607
|
end
|
516
608
|
|
517
609
|
def failure_predicate
|
518
610
|
"don't end with #{@expected}"
|
519
611
|
end
|
612
|
+
|
613
|
+
def negative_failure_predicate
|
614
|
+
"end with #{@expected}"
|
615
|
+
end
|
520
616
|
end
|
521
617
|
# @example
|
522
618
|
# it {should have_values_ending_with('Inc.').in('company_name')}
|
@@ -526,12 +622,16 @@ module ScraperWiki
|
|
526
622
|
|
527
623
|
class HaveIntegerValues < FieldMatcher
|
528
624
|
def match?(v)
|
529
|
-
|
625
|
+
Integer(v) rescue false
|
530
626
|
end
|
531
627
|
|
532
628
|
def failure_predicate
|
533
629
|
"aren't integers"
|
534
630
|
end
|
631
|
+
|
632
|
+
def negative_failure_predicate
|
633
|
+
'are integers'
|
634
|
+
end
|
535
635
|
end
|
536
636
|
# @example
|
537
637
|
# it {should have_integer_values.in('year')}
|
@@ -541,23 +641,19 @@ module ScraperWiki
|
|
541
641
|
|
542
642
|
class FieldKeyMatcher < FieldMatcher
|
543
643
|
def match?(v)
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
|
550
|
-
|
551
|
-
|
552
|
-
|
553
|
-
difference(x).empty?
|
554
|
-
else
|
555
|
-
raise NotImplementerError, 'Can only handle subfields that are hashes or arrays of hashes'
|
556
|
-
end
|
644
|
+
w = Yajl::Parser.parse v
|
645
|
+
if Hash === w
|
646
|
+
difference(w).empty?
|
647
|
+
elsif Array === w
|
648
|
+
w.all? do |x|
|
649
|
+
if Hash === x
|
650
|
+
difference(x).empty?
|
651
|
+
else
|
652
|
+
raise NotImplementerError, 'Can only handle subfields that are hashes or arrays of hashes'
|
557
653
|
end
|
558
|
-
else
|
559
|
-
raise NotImplementerError, 'Can only handle subfields that are hashes or arrays of hashes'
|
560
654
|
end
|
655
|
+
else
|
656
|
+
raise NotImplementerError, 'Can only handle subfields that are hashes or arrays of hashes'
|
561
657
|
end
|
562
658
|
end
|
563
659
|
|
@@ -568,6 +664,10 @@ module ScraperWiki
|
|
568
664
|
def failure_predicate
|
569
665
|
"#{predicate}: #{difference.join ', '}"
|
570
666
|
end
|
667
|
+
|
668
|
+
def negative_failure_predicate
|
669
|
+
"#{negative_predicate}: #{difference.join ', '}"
|
670
|
+
end
|
571
671
|
end
|
572
672
|
|
573
673
|
class HaveValuesWithAtLeastTheKeys < FieldKeyMatcher
|
@@ -575,8 +675,12 @@ module ScraperWiki
|
|
575
675
|
@expected - v.keys
|
576
676
|
end
|
577
677
|
|
578
|
-
def
|
579
|
-
'
|
678
|
+
def predicate
|
679
|
+
'are missing keys'
|
680
|
+
end
|
681
|
+
|
682
|
+
def negative_predicate
|
683
|
+
"aren't missing keys"
|
580
684
|
end
|
581
685
|
end
|
582
686
|
# @example
|
@@ -590,9 +694,13 @@ module ScraperWiki
|
|
590
694
|
v.keys - @expected
|
591
695
|
end
|
592
696
|
|
593
|
-
def
|
697
|
+
def predicate
|
594
698
|
'have extra keys'
|
595
699
|
end
|
700
|
+
|
701
|
+
def negative_predicate
|
702
|
+
'have no extra keys'
|
703
|
+
end
|
596
704
|
end
|
597
705
|
# @example
|
598
706
|
# it {should have_values_with_at_most_the_keys(['subfield1', 'subfield2', 'subfield3', 'subfield4']).in('fieldA')}
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scraperwiki-api
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-06-01 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: yajl-ruby
|
16
|
-
requirement: &
|
16
|
+
requirement: &70265218891100 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '1.0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70265218891100
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: httparty
|
27
|
-
requirement: &
|
27
|
+
requirement: &70265218882020 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 0.8.0
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70265218882020
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: rspec
|
38
|
-
requirement: &
|
38
|
+
requirement: &70265218880500 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,7 +43,7 @@ dependencies:
|
|
43
43
|
version: '2.10'
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70265218880500
|
47
47
|
description: A Ruby wrapper for the ScraperWiki API
|
48
48
|
email:
|
49
49
|
- info@opennorth.ca
|