lcbo 0.9.9 → 0.10.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG.md +9 -0
- data/README.md +14 -3
- data/examples/crawlers/inventories_crawler.rb +22 -0
- data/examples/crawlers/product_lists_crawler.rb +17 -0
- data/examples/crawlers/products_crawler.rb +22 -0
- data/examples/crawlers/products_queue_crawler.rb +23 -0
- data/examples/crawlers/stores_crawler.rb +22 -0
- data/lib/lcbo.rb +4 -2
- data/lib/lcbo/crawlkit.rb +6 -0
- data/lib/lcbo/crawlkit/crawler.rb +79 -0
- data/lib/lcbo/crawlkit/page.rb +0 -5
- data/lib/lcbo/crawlkit/request.rb +16 -1
- data/lib/lcbo/pages/product_page.rb +4 -4
- data/lib/lcbo/pages/store_list_page.rb +1 -1
- data/lib/lcbo/pages/store_page.rb +6 -6
- data/lib/lcbo/version.rb +1 -1
- data/spec/crawlkit/crawler_spec.rb +196 -0
- metadata +12 -4
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,12 @@
|
|
1
|
+
Version 0.10.0
|
2
|
+
|
3
|
+
* Moved `CrawlKit` related errors into the `CrawlKit` namespace.
|
4
|
+
* Added `:timeout` and `:max_retries` to configuration options and enabled
|
5
|
+
auto _n_-retries for timed-out requests.
|
6
|
+
* Added `LCBO::CrawlKit::Crawler` mixin as a helper for making crawlers.
|
7
|
+
* Added example crawlers for inventories, products, stores, and product list
|
8
|
+
pages.
|
9
|
+
|
1
10
|
Version 0.9.9
|
2
11
|
|
3
12
|
* Added `ProductPage#is_kosher` to designate Kosher products.
|
data/README.md
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
# LCBO: The Ruby Gem
|
2
2
|
|
3
3
|
This library is used to gather data for [LCBO API](http://lcboapi.com). It
|
4
|
-
allows you to request and parse store, product, inventory,
|
5
|
-
pages directly from the [LCBO](http://lcbo.com) website.
|
4
|
+
allows you to request and parse store, product, inventory, product list, and
|
5
|
+
store list pages directly from the [LCBO](http://lcbo.com) website.
|
6
6
|
|
7
7
|
## Synopsis
|
8
8
|
|
@@ -12,7 +12,7 @@ pages directly from the [LCBO](http://lcbo.com) website.
|
|
12
12
|
# => { :store_no => 511, :name => "King & Spadina", ... }
|
13
13
|
|
14
14
|
LCBO.product(18)
|
15
|
-
# => { :product_no =>
|
15
|
+
# => { :product_no => 18, :name => "Heineken Lager", ... }
|
16
16
|
|
17
17
|
LCBO.inventory(18)
|
18
18
|
# => { :product_no => 18, :inventory_count => 40398, :inventories => [ ... ] }
|
@@ -20,6 +20,17 @@ pages directly from the [LCBO](http://lcbo.com) website.
|
|
20
20
|
LCBO.products_list(1)
|
21
21
|
# => { :page => 1, :final_page => 108, ..., :product_nos => [ ... ] }
|
22
22
|
|
23
|
+
LCBO.store_list
|
24
|
+
# => { :store_nos => [1, 2, 3, 4, 5, 6, 8, 9, 10, 11, ...] }
|
25
|
+
|
26
|
+
## Crawlers
|
27
|
+
|
28
|
+
Some examples of crawlers exist
|
29
|
+
[here](http://github.com/heycarsten/lcbo/blob/master/examples). You can also
|
30
|
+
check out the
|
31
|
+
[crawler spec](http://github.com/heycarsten/lcbo/blob/master/spec/crawlkit/crawler_spec.rb)
|
32
|
+
to see how to interact with them.
|
33
|
+
|
23
34
|
## Installation
|
24
35
|
|
25
36
|
Use RubyGems: `gem install lcbo`
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# Example crawler: requests the inventory of every product number produced
# by ProductListsCrawler (a sibling example crawler).
class InventoriesCrawler
  include LCBO::CrawlKit::Crawler

  # Supplies the product numbers to crawl by running ProductListsCrawler.
  def enum
    ProductListsCrawler.run
  end

  # Requests the inventory for one product number.
  def request(product_no)
    LCBO.inventory(product_no)
  end

  # Called by the crawler when a request raises. A missing inventory is
  # reported and skipped; every other error is re-raised to abort the crawl.
  def failure(error, product_no)
    raise error unless error.is_a?(LCBO::CrawlKit::NotFoundError)

    puts "[missing] Skipped inventory for product ##{product_no}"
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# Example crawler: walks the paginated product list pages, following
# :next_page from each response, and reduces the crawl to a flat list of the
# product numbers found on every page.
class ProductListsCrawler
  # Fully qualified so the constant resolves from the top level, the same way
  # the other example crawlers include the mixin.
  include LCBO::CrawlKit::Crawler

  # Requests a single product list page. +params+ is either the hash given to
  # .run or the previous page's response; when it carries no :next_page the
  # crawl starts at page 1.
  # NOTE(review): the README documents this call as LCBO.products_list --
  # confirm which name lib/lcbo.rb actually defines.
  def request(params)
    LCBO.product_list(params[:next_page] || 1)
  end

  # Keep crawling for as long as the latest response names a next page.
  def continue?(current_params)
    current_params[:next_page] ? true : false
  end

  # Collapses the collected page responses into one array of product numbers.
  # The mixin stores crawled pages in #responses.
  def reduce
    responses.map { |page| page[:product_nos] }.flatten
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# Example crawler: requests the product page of every product number
# produced by ProductListsCrawler (a sibling example crawler).
class ProductsCrawler
  include LCBO::CrawlKit::Crawler

  # Supplies the product numbers to crawl by running ProductListsCrawler.
  def enum
    ProductListsCrawler.run
  end

  # Requests one product by its product number.
  def request(product_no)
    LCBO.product(product_no)
  end

  # Called by the crawler when a request raises. A missing product is
  # reported and skipped; every other error is re-raised to abort the crawl.
  def failure(error, product_no)
    raise error unless error.is_a?(LCBO::CrawlKit::NotFoundError)

    puts "[missing] Skipped product ##{product_no}"
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# Example crawler driven by a queue: because it defines #pop, the crawler
# keeps requesting products until #pop returns a falsy value.
# NOTE(review): $redis is assumed to be a Redis client set up by the host
# application -- it is not defined in this example.
class ProductsQueueCrawler
  include LCBO::CrawlKit::Crawler

  # Takes the next product number off the 'lcbo.products.queue' list.
  def pop
    $redis.rpop('lcbo.products.queue')
  end

  # Requests one product by its product number.
  def request(product_no)
    LCBO.product(product_no)
  end

  # Called by the crawler when a request raises. A missing product is
  # reported and pushed onto the 'lcbo.products.missing' list; every other
  # error is re-raised to abort the crawl.
  def failure(error, product_no)
    raise error unless error.is_a?(LCBO::CrawlKit::NotFoundError)

    puts "[missing] Skipped product ##{product_no}"
    $redis.rpush('lcbo.products.missing', product_no)
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# Example crawler: requests the store page of every store number listed on
# the store list page.
class StoresCrawler
  include LCBO::CrawlKit::Crawler

  # Supplies the store numbers to crawl, read from the store list.
  def enum
    LCBO.store_list[:store_nos]
  end

  # Requests one store by its store number.
  def request(store_no)
    LCBO.store(store_no)
  end

  # Called by the crawler when a request raises. A missing store is reported
  # and skipped; every other error is re-raised to abort the crawl.
  def failure(error, store_no)
    raise error unless error.is_a?(LCBO::CrawlKit::NotFoundError)

    puts "[missing] Skipped store ##{store_no}"
  end
end
|
data/lib/lcbo.rb
CHANGED
@@ -1,8 +1,10 @@
|
|
1
1
|
module LCBO
|
2
2
|
|
3
3
|
DEFAULT_CONFIG = {
|
4
|
-
:user_agent
|
5
|
-
|
4
|
+
:user_agent => nil, # Use the default User-Agent by default
|
5
|
+
:max_retries => 8, # Number of times to retry a request that fails
|
6
|
+
:timeout => 2 # Seconds to wait for a request before timing out
|
7
|
+
}.freeze
|
6
8
|
|
7
9
|
def self.config
|
8
10
|
reset_config! unless @config
|
data/lib/lcbo/crawlkit.rb
CHANGED
@@ -10,6 +10,11 @@ module LCBO
|
|
10
10
|
ENV['LCBO_USER_AGENT'] ||
|
11
11
|
Typhoeus::USER_AGENT
|
12
12
|
end
|
13
|
+
|
14
|
+
class MalformedError < StandardError; end
|
15
|
+
class NotFoundError < StandardError; end
|
16
|
+
class RequestFailedError < StandardError; end
|
17
|
+
class TimeoutError < StandardError; end
|
13
18
|
end
|
14
19
|
end
|
15
20
|
|
@@ -19,5 +24,6 @@ require 'lcbo/crawlkit/page'
|
|
19
24
|
require 'lcbo/crawlkit/request'
|
20
25
|
require 'lcbo/crawlkit/response'
|
21
26
|
require 'lcbo/crawlkit/request_prototype'
|
27
|
+
require 'lcbo/crawlkit/crawler'
|
22
28
|
require 'lcbo/crawlkit/titlecase_helper'
|
23
29
|
require 'lcbo/crawlkit/volume_helper'
|
@@ -0,0 +1,79 @@
|
|
1
|
+
module LCBO
  module CrawlKit
    # Mixin that turns a class into a crawler.
    #
    # A host class must implement #request(params). It may additionally
    # define:
    #
    # * #pop       -- the crawler requests each popped value until #pop
    #                 returns a falsy value;
    # * #enum      -- the crawler requests each element of the returned
    #                 collection;
    # * #continue? -- return true to feed the latest response back into #run
    #                 (e.g. to follow pagination);
    # * #failure   -- decide what to do with an error raised while requesting;
    # * #reduce    -- when defined, every response is collected in #responses
    #                 and the class-level .run returns #reduce's result.
    module Crawler

      # Not referenced anywhere inside this module; kept so existing
      # references to the constant keep working.
      MAX_RETRIES = 8

      # Raised when a host class does not implement #request. Note that inside
      # this namespace the name shadows Ruby's built-in ::NotImplementedError.
      class NotImplementedError < StandardError; end

      # Hook run when the mixin is included: adds .run to the host class and
      # the crawling behaviour to its instances.
      def self.included(host)
        host.extend(ClassMethods)
        host.send(:include, InstanceMethods)
      end

      module ClassMethods
        # Builds a crawler, runs it with +params+ and returns either the
        # crawler's #reduce result (when it defines one) or the raw result of
        # the run. The optional block receives every response as it arrives.
        def run(params = {}, &emitter)
          crawler = new(&emitter)
          result  = crawler.run(params)
          crawler.respond_to?(:reduce) ? crawler.reduce : result
        end
      end

      module InstanceMethods
        # Stores the optional emitter block that is called with each response.
        def initialize(&emitter)
          @emitter   = emitter
          @responses = []
        end

        # Responses collected so far. They are only collected when the host
        # defines #reduce. Initialised lazily so a host class that defines its
        # own #initialize without calling super still gets a usable array.
        def responses
          @responses ||= []
        end

        # Picks the crawl strategy, in this order: a non-empty Array of
        # params is iterated; otherwise #pop is drained when defined;
        # otherwise #enum is iterated when defined; otherwise a single
        # request is made with +params+.
        def run(params = {})
          if params.is_a?(Array) && params.any?
            runeach(params)
          elsif respond_to?(:pop)
            runpop
          elsif respond_to?(:enum)
            runeach(enum)
          else
            _request(params)
          end
        end

        # Default error handler: re-raise. Hosts override this to skip or
        # record particular errors.
        def failure(error, params)
          raise error
        end

        # Default: never continue from a response. Hosts override this to
        # follow pagination.
        def continue?(response)
          false
        end

        # Must be implemented by the host class.
        #
        # Raises Crawler::NotImplementedError when it is not.
        def request(params = {})
          raise NotImplementedError, "#{self.class} must implement #request"
        end

        protected

        # Requests every value produced by #pop until it returns nil/false.
        def runpop
          while (params = pop)
            _request(params)
          end
        end

        # Requests every element of +params+.
        def runeach(params)
          params.each { |p| _request(p) }
        end

        # Performs one request, collects the response when the host defines
        # #reduce, hands it to the emitter, then either continues the crawl
        # from the response or returns it. Any StandardError raised along the
        # way is passed to #failure together with the params that caused it.
        def _request(params = {})
          response = request(params)
          responses << response if respond_to?(:reduce)
          # @emitter may be unset when the host overrides #initialize.
          @emitter.call(response) if defined?(@emitter) && @emitter
          continue?(response) ? run(response) : response
        rescue => error
          failure(error, params)
        end
      end

    end
  end
end
|
data/lib/lcbo/crawlkit/page.rb
CHANGED
@@ -2,11 +2,6 @@ module LCBO
|
|
2
2
|
module CrawlKit
|
3
3
|
module Page
|
4
4
|
|
5
|
-
class Error < StandardError; end
|
6
|
-
class MalformedDocumentError < Error; end
|
7
|
-
class MissingResourceError < Error; end
|
8
|
-
class RequestFailedError < Error; end
|
9
|
-
|
10
5
|
def self.included(mod)
|
11
6
|
mod.module_eval do
|
12
7
|
include Eventable
|
@@ -2,6 +2,8 @@ module LCBO
|
|
2
2
|
module CrawlKit
|
3
3
|
class Request
|
4
4
|
|
5
|
+
MAX_RETRIES = 8
|
6
|
+
|
5
7
|
attr_reader :request_prototype, :query_params, :body_params
|
6
8
|
|
7
9
|
def initialize(request_prototype, query_p = {}, body_p = {})
|
@@ -38,7 +40,15 @@ module LCBO
|
|
38
40
|
end
|
39
41
|
|
40
42
|
def run
|
41
|
-
|
43
|
+
_run
|
44
|
+
end
|
45
|
+
|
46
|
+
protected
|
47
|
+
|
48
|
+
def _run(tries = 0)
|
49
|
+
response = Timeout.timeout(LCBO.config[:timeout]) do
|
50
|
+
Typhoeus::Request.run(uri, config)
|
51
|
+
end
|
42
52
|
Response.new \
|
43
53
|
:code => response.code,
|
44
54
|
:uri => response.request.url,
|
@@ -47,6 +57,11 @@ module LCBO
|
|
47
57
|
:query_params => query_params,
|
48
58
|
:body_params => body_params,
|
49
59
|
:body => response.body
|
60
|
+
rescue Errno::ETIMEDOUT, Timeout::Error
|
61
|
+
if tries > LCBO.config[:max_retries]
|
62
|
+
raise TimeoutError, "Request failed after timing out #{tries} times"
|
63
|
+
end
|
64
|
+
_run(tries + 1)
|
50
65
|
end
|
51
66
|
|
52
67
|
end
|
@@ -286,26 +286,26 @@ module LCBO
|
|
286
286
|
|
287
287
|
def verify_third_info_cell
|
288
288
|
return unless has_package? && info_cell_lines[2][0,1] != '|'
|
289
|
-
raise CrawlKit::
|
289
|
+
raise CrawlKit::MalformedError,
|
290
290
|
"Expected third line in info cell to begin with bar. LCBO No: " \
|
291
291
|
"#{product_no}, Dump: #{info_cell_lines[2].inspect}"
|
292
292
|
end
|
293
293
|
|
294
294
|
def verify_response_not_blank
|
295
295
|
return unless html.strip == ''
|
296
|
-
raise CrawlKit::
|
296
|
+
raise CrawlKit::NotFoundError,
|
297
297
|
"product #{product_no} does not appear to exist"
|
298
298
|
end
|
299
299
|
|
300
300
|
def verify_product_name
|
301
301
|
return unless product_details_form('itemName').strip == ''
|
302
|
-
raise CrawlKit::
|
302
|
+
raise CrawlKit::NotFoundError,
|
303
303
|
"can not locate name for product #{product_no}"
|
304
304
|
end
|
305
305
|
|
306
306
|
def verify_product_details_form
|
307
307
|
return unless doc.css('form[name="productdetails"]').empty?
|
308
|
-
raise CrawlKit::
|
308
|
+
raise CrawlKit::MalformedError,
|
309
309
|
"productdetails form not found in doc for product #{product_no}"
|
310
310
|
end
|
311
311
|
|
@@ -42,7 +42,7 @@ module LCBO
|
|
42
42
|
|
43
43
|
def verify_number_of_stores
|
44
44
|
return if STORE_COUNT_RANGE.include?(store_nos.length)
|
45
|
-
raise CrawlKit::
|
45
|
+
raise CrawlKit::MalformedError,
|
46
46
|
"Store count (#{total_stores}) not in range: #{STORE_COUNT_RANGE}"
|
47
47
|
end
|
48
48
|
|
@@ -53,8 +53,8 @@ module LCBO
|
|
53
53
|
emits :address_line_1 do
|
54
54
|
data = info_nodes[2].content.strip.split(',')[0]
|
55
55
|
unless data
|
56
|
-
raise
|
57
|
-
|
56
|
+
raise CrawlKit::MalformedError,
|
57
|
+
"unable to locate address for store #{store_no}"
|
58
58
|
end
|
59
59
|
CrawlKit::TitleCaseHelper[data.gsub(/[\n\r\t]+/, ' ').strip]
|
60
60
|
end
|
@@ -72,7 +72,7 @@ module LCBO
|
|
72
72
|
emits :postal_code do
|
73
73
|
data = info_nodes[3].content.strip.split(',')[1]
|
74
74
|
unless data
|
75
|
-
raise
|
75
|
+
raise CrawlKit::MalformedError,
|
76
76
|
"unable to locate postal code for store #{store_no}"
|
77
77
|
end
|
78
78
|
data.gsub(/[\n\r\t]+/, ' ').strip.upcase
|
@@ -174,18 +174,18 @@ module LCBO
|
|
174
174
|
|
175
175
|
def verify_store_returned
|
176
176
|
return if !@html.include?('No stores were located using your criteria.')
|
177
|
-
raise
|
177
|
+
raise CrawlKit::NotFoundError, "store #{store_no} does not exist"
|
178
178
|
end
|
179
179
|
|
180
180
|
def verify_telephone_number
|
181
181
|
return if telephone
|
182
|
-
raise
|
182
|
+
raise CrawlKit::MalformedError,
|
183
183
|
"unable to locate telephone number for store #{store_no}"
|
184
184
|
end
|
185
185
|
|
186
186
|
def verify_node_count
|
187
187
|
return if expected_node_count == info_nodes.size
|
188
|
-
raise
|
188
|
+
raise CrawlKit::MalformedError,
|
189
189
|
"Expected #{expected_node_count} nodes for store #{store_no} but found " \
|
190
190
|
"#{info_nodes.size} instead."
|
191
191
|
end
|
data/lib/lcbo/version.rb
CHANGED
@@ -0,0 +1,196 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Fixtures for the crawler spec: a tiny in-memory "internet" plus one crawler
# class per crawl strategy supported by LCBO::CrawlKit::Crawler.
module CrawlerSpec
  # In-memory stand-in for a remote site: three paginated list pages and six
  # book resources, keyed by URI.
  module TheInternets

    RESOURCES = {
      'list?page=1' => { :ids => [1, 2], :page => 1, :next_page => 2 },
      'list?page=2' => { :ids => [3, 4], :page => 2, :next_page => 3 },
      'list?page=3' => { :ids => [5, 6], :page => 3, :next_page => nil },
      'books/1'     => { :id => 1, :title => 'book_1' },
      'books/2'     => { :id => 2, :title => 'book_2' },
      'books/3'     => { :id => 3, :title => 'book_3' },
      'books/4'     => { :id => 4, :title => 'book_4' },
      'books/5'     => { :id => 5, :title => 'book_5' },
      'books/6'     => { :id => 6, :title => 'book_6' }
    }

    # Returns the resource stored under +uri+, or raises
    # LCBO::CrawlKit::NotFoundError when there is none.
    def self.get(uri)
      return RESOURCES[uri] if RESOURCES.key?(uri)

      raise LCBO::CrawlKit::NotFoundError, "#{uri} does not exist"
    end

  end

  # Simplest crawler: one request per book id, default error handling.
  class BookCrawler
    include LCBO::CrawlKit::Crawler

    def request(book_id)
      TheInternets.get("books/#{book_id}")
    end
  end

  # Paginating crawler: follows :next_page and reduces the pages to book ids.
  class BookListsCrawler
    include LCBO::CrawlKit::Crawler

    # Starts at page 1 unless the params (or previous page) name a next page.
    def request(params)
      page_no = params[:next_page] || 1
      TheInternets.get("list?page=#{page_no}")
    end

    def continue?(page)
      page[:next_page] ? true : false
    end

    def reduce
      responses.map { |page| page[:ids] }.flatten
    end
  end

  # Enumerating crawler: crawls every book id found by BookListsCrawler.
  class EnumBooksCrawler
    include LCBO::CrawlKit::Crawler

    def enum
      BookListsCrawler.run
    end

    def request(book_id)
      TheInternets.get("books/#{book_id}")
    end
  end

  # Queue crawler: pops ids off QUEUE (which contains one id, 0, that has no
  # book) and records ids that could not be found in MISSING.
  class QueueBooksCrawler
    QUEUE   = [1, 2, 3, 4, 0, 5, 6]
    MISSING = []

    include LCBO::CrawlKit::Crawler

    def pop
      QUEUE.pop
    end

    def request(book_id)
      TheInternets.get("books/#{book_id}")
    end

    # Records not-found ids; any other error is re-raised.
    def failure(error, book_id)
      raise error unless error.is_a?(LCBO::CrawlKit::NotFoundError)

      MISSING.push(book_id)
    end
  end
end
|
128
|
+
|
129
|
+
# Sanity checks for the in-memory fixture itself.
describe CrawlerSpec::TheInternets do
  it 'should let you get a resource' do
    book = CrawlerSpec::TheInternets.get('books/1')
    book[:title].must_equal 'book_1'
  end

  it 'should throw an error when a resource does not exist' do
    missing_lookup = lambda { CrawlerSpec::TheInternets.get('books/0') }
    missing_lookup.must_raise LCBO::CrawlKit::NotFoundError
  end
end

# Single-request crawling with the default failure handler.
describe CrawlerSpec::BookCrawler do
  it 'should return a book' do
    book = CrawlerSpec::BookCrawler.run(1)
    book[:title].must_equal 'book_1'
  end

  it 'should yield a book' do
    title = nil
    CrawlerSpec::BookCrawler.run(1) do |page|
      title = page[:title]
    end
    title.must_equal 'book_1'
  end

  it 'should raise an error if a book does not exist' do
    missing_crawl = lambda { CrawlerSpec::BookCrawler.run(0) }
    missing_crawl.must_raise LCBO::CrawlKit::NotFoundError
  end
end

# Pagination via #continue? and aggregation via #reduce.
describe CrawlerSpec::BookListsCrawler do
  it 'should return all the book ids' do
    book_ids = CrawlerSpec::BookListsCrawler.run
    book_ids.must_equal [1, 2, 3, 4, 5, 6]
  end

  it 'should emit all the pages' do
    pages = []
    CrawlerSpec::BookListsCrawler.run do |page|
      pages << page
    end
    pages.size.must_equal 3
  end

  it 'should consider provided params' do
    pages = []
    CrawlerSpec::BookListsCrawler.run(:next_page => 2) do |page|
      pages << page
    end
    pages.size.must_equal 2
  end
end

# Crawling over #enum, or over an explicit array of params.
describe CrawlerSpec::EnumBooksCrawler do
  it 'should emit all the books' do
    books = []
    CrawlerSpec::EnumBooksCrawler.run do |book|
      books << book
    end
    books.size.must_equal 6
  end

  it 'should emit books when provided with their params' do
    books = []
    CrawlerSpec::EnumBooksCrawler.run([1, 2, 3]) do |book|
      books << book
    end
    books.size.must_equal 3
  end
end

# Crawling by draining #pop; the run empties the shared QUEUE constant and
# records the one unknown id in MISSING.
describe CrawlerSpec::QueueBooksCrawler do
  it 'should emit all the books that did not fail' do
    queue   = CrawlerSpec::QueueBooksCrawler::QUEUE
    missing = CrawlerSpec::QueueBooksCrawler::MISSING

    queue.size.must_equal 7
    books = []
    CrawlerSpec::QueueBooksCrawler.run do |book|
      books << book
    end
    books.size.must_equal 6
    queue.size.must_equal 0
    missing.size.must_equal 1
  end
end
|
metadata
CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
|
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
6
|
- 0
|
7
|
-
-
|
8
|
-
-
|
9
|
-
version: 0.
|
7
|
+
- 10
|
8
|
+
- 0
|
9
|
+
version: 0.10.0
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Carsten Nielsen
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-12-
|
17
|
+
date: 2010-12-09 00:00:00 -05:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -74,9 +74,15 @@ files:
|
|
74
74
|
- LICENSE
|
75
75
|
- README.md
|
76
76
|
- Rakefile
|
77
|
+
- examples/crawlers/inventories_crawler.rb
|
78
|
+
- examples/crawlers/product_lists_crawler.rb
|
79
|
+
- examples/crawlers/products_crawler.rb
|
80
|
+
- examples/crawlers/products_queue_crawler.rb
|
81
|
+
- examples/crawlers/stores_crawler.rb
|
77
82
|
- lcbo.gemspec
|
78
83
|
- lib/lcbo.rb
|
79
84
|
- lib/lcbo/crawlkit.rb
|
85
|
+
- lib/lcbo/crawlkit/crawler.rb
|
80
86
|
- lib/lcbo/crawlkit/eventable.rb
|
81
87
|
- lib/lcbo/crawlkit/fastdate_helper.rb
|
82
88
|
- lib/lcbo/crawlkit/page.rb
|
@@ -94,6 +100,7 @@ files:
|
|
94
100
|
- lib/lcbo/pages/store_list_page.rb
|
95
101
|
- lib/lcbo/pages/store_page.rb
|
96
102
|
- lib/lcbo/version.rb
|
103
|
+
- spec/crawlkit/crawler_spec.rb
|
97
104
|
- spec/crawlkit/eventable_spec.rb
|
98
105
|
- spec/crawlkit/fastdate_helper_spec.rb
|
99
106
|
- spec/crawlkit/page_spec.rb
|
@@ -160,6 +167,7 @@ signing_key:
|
|
160
167
|
specification_version: 3
|
161
168
|
summary: A library for parsing HTML pages from http://lcbo.com
|
162
169
|
test_files:
|
170
|
+
- spec/crawlkit/crawler_spec.rb
|
163
171
|
- spec/crawlkit/eventable_spec.rb
|
164
172
|
- spec/crawlkit/fastdate_helper_spec.rb
|
165
173
|
- spec/crawlkit/page_spec.rb
|