lcbo 0.9.9 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG.md +9 -0
- data/README.md +14 -3
- data/examples/crawlers/inventories_crawler.rb +22 -0
- data/examples/crawlers/product_lists_crawler.rb +17 -0
- data/examples/crawlers/products_crawler.rb +22 -0
- data/examples/crawlers/products_queue_crawler.rb +23 -0
- data/examples/crawlers/stores_crawler.rb +22 -0
- data/lib/lcbo.rb +4 -2
- data/lib/lcbo/crawlkit.rb +6 -0
- data/lib/lcbo/crawlkit/crawler.rb +79 -0
- data/lib/lcbo/crawlkit/page.rb +0 -5
- data/lib/lcbo/crawlkit/request.rb +16 -1
- data/lib/lcbo/pages/product_page.rb +4 -4
- data/lib/lcbo/pages/store_list_page.rb +1 -1
- data/lib/lcbo/pages/store_page.rb +6 -6
- data/lib/lcbo/version.rb +1 -1
- data/spec/crawlkit/crawler_spec.rb +196 -0
- metadata +12 -4
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,12 @@
|
|
1
|
+
Version 0.10.0
|
2
|
+
|
3
|
+
* Moved `CrawlKit` related errors into the `CrawlKit` namespace.
|
4
|
+
* Added `:timeout` and `:max_retries` to configuration options and enabled
|
5
|
+
auto _n_-retries for timed-out requests.
|
6
|
+
* Added `LCBO::CrawlKit::Crawler` mixin as a helper for making crawlers.
|
7
|
+
* Added example crawlers for inventories, products, stores, and product list
|
8
|
+
pages.
|
9
|
+
|
1
10
|
Version 0.9.9
|
2
11
|
|
3
12
|
* Added `ProductPage#is_kosher` to designate Kosher products.
|
data/README.md
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
# LCBO: The Ruby Gem
|
2
2
|
|
3
3
|
This library is used to gather data for [LCBO API](http://lcboapi.com). It
|
4
|
-
allows you to request and parse store, product, inventory,
|
5
|
-
pages directly from the [LCBO](http://lcbo.com) website.
|
4
|
+
allows you to request and parse store, product, inventory, product list, and
|
5
|
+
store list pages directly from the [LCBO](http://lcbo.com) website.
|
6
6
|
|
7
7
|
## Synopsis
|
8
8
|
|
@@ -12,7 +12,7 @@ pages directly from the [LCBO](http://lcbo.com) website.
|
|
12
12
|
# => { :store_no => 511, :name => "King & Spadina", ... }
|
13
13
|
|
14
14
|
LCBO.product(18)
|
15
|
-
# => { :product_no =>
|
15
|
+
# => { :product_no => 18, :name => "Heineken Lager", ... }
|
16
16
|
|
17
17
|
LCBO.inventory(18)
|
18
18
|
# => { :product_no => 18, :inventory_count => 40398, :inventories => [ ... ] }
|
@@ -20,6 +20,17 @@ pages directly from the [LCBO](http://lcbo.com) website.
|
|
20
20
|
LCBO.products_list(1)
|
21
21
|
# => { :page => 1, :final_page => 108, ..., :product_nos => [ ... ] }
|
22
22
|
|
23
|
+
LCBO.store_list
|
24
|
+
# => { :store_nos => [1, 2, 3, 4, 5, 6, 8, 9, 10, 11, ...] }
|
25
|
+
|
26
|
+
## Crawlers
|
27
|
+
|
28
|
+
Some examples of crawlers exist
|
29
|
+
[here](http://github.com/heycarsten/lcbo/blob/master/examples). You can also
|
30
|
+
check out the
|
31
|
+
[crawler spec](http://github.com/heycarsten/lcbo/blob/master/spec/crawlkit/crawler_spec.rb)
|
32
|
+
to see how to interact with them.
|
33
|
+
|
23
34
|
## Installation
|
24
35
|
|
25
36
|
Use RubyGems: `gem install lcbo`
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# Example crawler: requests the inventory page for each product number
# returned by running ProductListsCrawler.
class InventoriesCrawler
  include LCBO::CrawlKit::Crawler

  # The collection to iterate over: whatever ProductListsCrawler.run returns.
  def enum
    ProductListsCrawler.run
  end

  # Performs the request for a single product number.
  def request(product_no)
    LCBO.inventory(product_no)
  end

  # Logs and skips products that could not be found; any other error is
  # re-raised to the caller.
  def failure(error, product_no)
    raise error unless error.is_a?(LCBO::CrawlKit::NotFoundError)
    puts "[missing] Skipped inventory for product ##{product_no}"
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# Example crawler: walks the paginated product list, following :next_page
# until it is absent, and reduces all pages into one flat array of product
# numbers.
class ProductListsCrawler
  # Fully qualified: this class lives at the top level, so a bare
  # `CrawlKit::Crawler` would not resolve.
  include LCBO::CrawlKit::Crawler

  # Requests one product list page. `params` is either the initial params
  # hash or the previous page's response; page 1 is used when neither
  # carries a :next_page.
  #
  # NOTE(review): the README synopsis calls this helper `LCBO.products_list`;
  # confirm which name the library actually exposes.
  def request(params)
    LCBO.product_list(params[:next_page] || 1)
  end

  # Keep crawling while the page that was just fetched points at another one.
  def continue?(current_params)
    current_params[:next_page] ? true : false
  end

  # Combines the :product_nos of every collected page. The Crawler mixin
  # accumulates pages in `responses` whenever the crawler defines #reduce.
  def reduce
    responses.map { |page| page[:product_nos] }.flatten
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# Example crawler: requests the product page for each product number
# returned by running ProductListsCrawler.
class ProductsCrawler
  include LCBO::CrawlKit::Crawler

  # The collection to iterate over: whatever ProductListsCrawler.run returns.
  def enum
    ProductListsCrawler.run
  end

  # Performs the request for a single product number.
  def request(product_no)
    LCBO.product(product_no)
  end

  # Logs and skips products that could not be found; any other error is
  # re-raised to the caller.
  def failure(error, product_no)
    raise error unless error.is_a?(LCBO::CrawlKit::NotFoundError)
    puts "[missing] Skipped product ##{product_no}"
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# Example crawler: drains product numbers from a Redis list and requests the
# product page for each one.
#
# NOTE(review): relies on a global `$redis` client being set up elsewhere.
class ProductsQueueCrawler
  include LCBO::CrawlKit::Crawler

  # Next item to crawl; the crawl loop stops once this returns a falsy value.
  def pop
    $redis.rpop('lcbo.products.queue')
  end

  # Performs the request for a single product number.
  def request(product_no)
    LCBO.product(product_no)
  end

  # Missing products are logged and pushed onto a separate Redis list; any
  # other error is re-raised to the caller.
  def failure(error, product_no)
    raise error unless error.is_a?(LCBO::CrawlKit::NotFoundError)
    puts "[missing] Skipped product ##{product_no}"
    $redis.rpush('lcbo.products.missing', product_no)
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# Example crawler: requests the store page for every store number found in
# the store list.
class StoresCrawler
  include LCBO::CrawlKit::Crawler

  # The collection to iterate over: the :store_nos entry of the store list.
  def enum
    LCBO.store_list[:store_nos]
  end

  # Performs the request for a single store number.
  def request(store_no)
    LCBO.store(store_no)
  end

  # Logs and skips stores that could not be found; any other error is
  # re-raised to the caller.
  def failure(error, store_no)
    raise error unless error.is_a?(LCBO::CrawlKit::NotFoundError)
    puts "[missing] Skipped store ##{store_no}"
  end
end
|
data/lib/lcbo.rb
CHANGED
@@ -1,8 +1,10 @@
|
|
1
1
|
module LCBO
|
2
2
|
|
3
3
|
DEFAULT_CONFIG = {
|
4
|
-
:user_agent
|
5
|
-
|
4
|
+
:user_agent => nil, # Use the default User-Agent by default
|
5
|
+
:max_retries => 8, # Number of times to retry a request that fails
|
6
|
+
:timeout => 2 # Seconds to wait for a request before timing out
|
7
|
+
}.freeze
|
6
8
|
|
7
9
|
def self.config
|
8
10
|
reset_config! unless @config
|
data/lib/lcbo/crawlkit.rb
CHANGED
@@ -10,6 +10,11 @@ module LCBO
|
|
10
10
|
ENV['LCBO_USER_AGENT'] ||
|
11
11
|
Typhoeus::USER_AGENT
|
12
12
|
end
|
13
|
+
|
14
|
+
class MalformedError < StandardError; end
|
15
|
+
class NotFoundError < StandardError; end
|
16
|
+
class RequestFailedError < StandardError; end
|
17
|
+
class TimeoutError < StandardError; end
|
13
18
|
end
|
14
19
|
end
|
15
20
|
|
@@ -19,5 +24,6 @@ require 'lcbo/crawlkit/page'
|
|
19
24
|
require 'lcbo/crawlkit/request'
|
20
25
|
require 'lcbo/crawlkit/response'
|
21
26
|
require 'lcbo/crawlkit/request_prototype'
|
27
|
+
require 'lcbo/crawlkit/crawler'
|
22
28
|
require 'lcbo/crawlkit/titlecase_helper'
|
23
29
|
require 'lcbo/crawlkit/volume_helper'
|
@@ -0,0 +1,79 @@
|
|
1
|
+
module LCBO
  module CrawlKit
    # Mixin for building crawlers. A host class must implement #request and
    # may additionally define:
    #
    # * #pop       - returns the next params to request, falsy when exhausted
    # * #enum      - returns a collection of params to request one by one
    # * #continue? - returns true when the response should be fed back in as
    #                the params of a follow-up request (e.g. pagination)
    # * #failure   - handles an error raised while performing a request
    # * #reduce    - turns the collected #responses into the final result
    #
    # Including the module adds a class-level .run that instantiates the
    # crawler, runs it, and returns either #reduce (when defined) or the
    # value of the instance-level #run.
    module Crawler

      # Not referenced anywhere inside this module.
      # NOTE(review): presumably kept for callers that read it — confirm.
      MAX_RETRIES = 8

      # Raised by the default #request when the host class does not override it.
      class NotImplementedError < StandardError; end

      def self.included(host)
        host.extend(ClassMethods)
        host.instance_eval { include InstanceMethods }
      end

      module ClassMethods
        # Builds a crawler, runs it with +params+, and returns the reduced
        # result when the crawler responds to #reduce, otherwise the raw
        # result of the run. The optional block is called with each response.
        def run(params = {}, &emitter)
          crawler = new(&emitter)
          result = crawler.run(params)
          crawler.respond_to?(:reduce) ? crawler.reduce : result
        end
      end

      module InstanceMethods
        # Responses collected so far; only populated when #reduce is defined.
        attr_reader :responses

        def initialize(&emitter)
          @emitter = emitter
          @responses = []
        end

        # Chooses how to crawl, in priority order:
        # 1. a non-empty Array of params: request each element
        # 2. the crawler defines #pop: request until #pop returns falsy
        # 3. the crawler defines #enum: request each element of #enum
        # 4. otherwise: perform a single request with +params+
        def run(params = {})
          case
          when params.is_a?(Array) && params.any?
            runeach(params)
          when respond_to?(:pop)
            runpop
          when respond_to?(:enum)
            runeach(enum)
          else
            _request(params)
          end
        end

        # Default error hook: re-raise. Override to skip or log failures.
        def failure(error, params)
          raise error
        end

        # Default continuation hook: never follow up on a response.
        def continue?(response)
          false
        end

        # Must be overridden by the host class.
        def request(params = {})
          raise NotImplementedError, "#{self.class} must implement #request"
        end

        protected

        def runpop
          while (params = pop)
            _request(params)
          end
        end

        def runeach(params)
          params.each { |p| _request(p) }
        end

        # Performs one request, records and emits the response, then follows
        # up when #continue? says so.
        #
        # Only the work for *this* request is rescued. The follow-up run is
        # deliberately outside the rescue: otherwise an error re-raised by
        # #failure for a later request would be handed to #failure again at
        # every enclosing level, each time with that level's unrelated params.
        def _request(params = {})
          begin
            response = request(params)
            @responses << response if respond_to?(:reduce)
            @emitter.(response) if @emitter
            follow_up = continue?(response)
          rescue => error
            return failure(error, params)
          end

          follow_up ? run(response) : response
        end
      end

    end
  end
end
|
data/lib/lcbo/crawlkit/page.rb
CHANGED
@@ -2,11 +2,6 @@ module LCBO
|
|
2
2
|
module CrawlKit
|
3
3
|
module Page
|
4
4
|
|
5
|
-
class Error < StandardError; end
|
6
|
-
class MalformedDocumentError < Error; end
|
7
|
-
class MissingResourceError < Error; end
|
8
|
-
class RequestFailedError < Error; end
|
9
|
-
|
10
5
|
def self.included(mod)
|
11
6
|
mod.module_eval do
|
12
7
|
include Eventable
|
@@ -2,6 +2,8 @@ module LCBO
|
|
2
2
|
module CrawlKit
|
3
3
|
class Request
|
4
4
|
|
5
|
+
MAX_RETRIES = 8
|
6
|
+
|
5
7
|
attr_reader :request_prototype, :query_params, :body_params
|
6
8
|
|
7
9
|
def initialize(request_prototype, query_p = {}, body_p = {})
|
@@ -38,7 +40,15 @@ module LCBO
|
|
38
40
|
end
|
39
41
|
|
40
42
|
def run
|
41
|
-
|
43
|
+
_run
|
44
|
+
end
|
45
|
+
|
46
|
+
protected
|
47
|
+
|
48
|
+
def _run(tries = 0)
|
49
|
+
response = Timeout.timeout(LCBO.config[:timeout]) do
|
50
|
+
Typhoeus::Request.run(uri, config)
|
51
|
+
end
|
42
52
|
Response.new \
|
43
53
|
:code => response.code,
|
44
54
|
:uri => response.request.url,
|
@@ -47,6 +57,11 @@ module LCBO
|
|
47
57
|
:query_params => query_params,
|
48
58
|
:body_params => body_params,
|
49
59
|
:body => response.body
|
60
|
+
rescue Errno::ETIMEDOUT, Timeout::Error
|
61
|
+
if tries > LCBO.config[:max_retries]
|
62
|
+
raise TimeoutError, "Request failed after timing out #{tries} times"
|
63
|
+
end
|
64
|
+
_run(tries + 1)
|
50
65
|
end
|
51
66
|
|
52
67
|
end
|
@@ -286,26 +286,26 @@ module LCBO
|
|
286
286
|
|
287
287
|
def verify_third_info_cell
|
288
288
|
return unless has_package? && info_cell_lines[2][0,1] != '|'
|
289
|
-
raise CrawlKit::
|
289
|
+
raise CrawlKit::MalformedError,
|
290
290
|
"Expected third line in info cell to begin with bar. LCBO No: " \
|
291
291
|
"#{product_no}, Dump: #{info_cell_lines[2].inspect}"
|
292
292
|
end
|
293
293
|
|
294
294
|
def verify_response_not_blank
|
295
295
|
return unless html.strip == ''
|
296
|
-
raise CrawlKit::
|
296
|
+
raise CrawlKit::NotFoundError,
|
297
297
|
"product #{product_no} does not appear to exist"
|
298
298
|
end
|
299
299
|
|
300
300
|
def verify_product_name
|
301
301
|
return unless product_details_form('itemName').strip == ''
|
302
|
-
raise CrawlKit::
|
302
|
+
raise CrawlKit::NotFoundError,
|
303
303
|
"can not locate name for product #{product_no}"
|
304
304
|
end
|
305
305
|
|
306
306
|
def verify_product_details_form
|
307
307
|
return unless doc.css('form[name="productdetails"]').empty?
|
308
|
-
raise CrawlKit::
|
308
|
+
raise CrawlKit::MalformedError,
|
309
309
|
"productdetails form not found in doc for product #{product_no}"
|
310
310
|
end
|
311
311
|
|
@@ -42,7 +42,7 @@ module LCBO
|
|
42
42
|
|
43
43
|
def verify_number_of_stores
|
44
44
|
return if STORE_COUNT_RANGE.include?(store_nos.length)
|
45
|
-
raise CrawlKit::
|
45
|
+
raise CrawlKit::MalformedError,
|
46
46
|
"Store count (#{total_stores}) not in range: #{STORE_COUNT_RANGE}"
|
47
47
|
end
|
48
48
|
|
@@ -53,8 +53,8 @@ module LCBO
|
|
53
53
|
emits :address_line_1 do
|
54
54
|
data = info_nodes[2].content.strip.split(',')[0]
|
55
55
|
unless data
|
56
|
-
raise
|
57
|
-
|
56
|
+
raise CrawlKit::MalformedError,
|
57
|
+
"unable to locate address for store #{store_no}"
|
58
58
|
end
|
59
59
|
CrawlKit::TitleCaseHelper[data.gsub(/[\n\r\t]+/, ' ').strip]
|
60
60
|
end
|
@@ -72,7 +72,7 @@ module LCBO
|
|
72
72
|
emits :postal_code do
|
73
73
|
data = info_nodes[3].content.strip.split(',')[1]
|
74
74
|
unless data
|
75
|
-
raise
|
75
|
+
raise CrawlKit::MalformedError,
|
76
76
|
"unable to locate postal code for store #{store_no}"
|
77
77
|
end
|
78
78
|
data.gsub(/[\n\r\t]+/, ' ').strip.upcase
|
@@ -174,18 +174,18 @@ module LCBO
|
|
174
174
|
|
175
175
|
def verify_store_returned
|
176
176
|
return if !@html.include?('No stores were located using your criteria.')
|
177
|
-
raise
|
177
|
+
raise CrawlKit::NotFoundError, "store #{store_no} does not exist"
|
178
178
|
end
|
179
179
|
|
180
180
|
def verify_telephone_number
|
181
181
|
return if telephone
|
182
|
-
raise
|
182
|
+
raise CrawlKit::MalformedError,
|
183
183
|
"unable to locate telephone number for store #{store_no}"
|
184
184
|
end
|
185
185
|
|
186
186
|
def verify_node_count
|
187
187
|
return if expected_node_count == info_nodes.size
|
188
|
-
raise
|
188
|
+
raise CrawlKit::MalformedError,
|
189
189
|
"Expected #{expected_node_count} nodes for store #{store_no} but found " \
|
190
190
|
"#{info_nodes.size} instead."
|
191
191
|
end
|
data/lib/lcbo/version.rb
CHANGED
@@ -0,0 +1,196 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Fixtures for the crawler specs: an in-memory fake "site" plus one crawler
# class per crawl strategy (single request, pagination, enum, queue).
module CrawlerSpec
  # Fake remote site: a fixed table of URIs and the data they return.
  module TheInternets

    RESOURCES = {
      'list?page=1' => { :ids => [1, 2], :page => 1, :next_page => 2 },
      'list?page=2' => { :ids => [3, 4], :page => 2, :next_page => 3 },
      'list?page=3' => { :ids => [5, 6], :page => 3, :next_page => nil },
      'books/1' => { :id => 1, :title => 'book_1' },
      'books/2' => { :id => 2, :title => 'book_2' },
      'books/3' => { :id => 3, :title => 'book_3' },
      'books/4' => { :id => 4, :title => 'book_4' },
      'books/5' => { :id => 5, :title => 'book_5' },
      'books/6' => { :id => 6, :title => 'book_6' }
    }

    # Returns the resource stored under +uri+, raising NotFoundError for
    # URIs that are not in the table.
    def self.get(uri)
      return RESOURCES[uri] if RESOURCES.key?(uri)
      raise LCBO::CrawlKit::NotFoundError, "#{uri} does not exist"
    end

  end

  # Single-request crawler: fetches one book by id.
  class BookCrawler
    include LCBO::CrawlKit::Crawler

    def request(book_id)
      TheInternets.get("books/#{book_id}")
    end
  end

  # Paginating crawler: follows :next_page and reduces to a flat id list.
  class BookListsCrawler
    include LCBO::CrawlKit::Crawler

    def request(params)
      page_no = params[:next_page] || 1
      TheInternets.get("list?page=#{page_no}")
    end

    def continue?(page)
      !!page[:next_page]
    end

    def reduce
      id_lists = responses.map { |list_page| list_page[:ids] }
      id_lists.flatten
    end
  end

  # Enum crawler: iterates over the ids gathered by BookListsCrawler.
  class EnumBooksCrawler
    include LCBO::CrawlKit::Crawler

    def enum
      BookListsCrawler.run
    end

    def request(book_id)
      TheInternets.get("books/#{book_id}")
    end
  end

  # Queue crawler: pops ids off QUEUE (which includes the unknown id 0) and
  # records ids that could not be found in MISSING.
  class QueueBooksCrawler
    QUEUE   = [1, 2, 3, 4, 0, 5, 6]
    MISSING = []

    include LCBO::CrawlKit::Crawler

    def pop
      QUEUE.pop
    end

    def request(book_id)
      TheInternets.get("books/#{book_id}")
    end

    def failure(error, book_id)
      raise error unless error.is_a?(LCBO::CrawlKit::NotFoundError)
      MISSING.push(book_id)
    end
  end
end
|
128
|
+
|
129
|
+
# Sanity checks for the fake site the crawler specs rely on.
describe CrawlerSpec::TheInternets do
  it 'should let you get a resource' do
    book = CrawlerSpec::TheInternets.get('books/1')
    book[:title].must_equal 'book_1'
  end

  it 'should throw an error when a resource does not exist' do
    missing_lookup = lambda { CrawlerSpec::TheInternets.get('books/0') }
    missing_lookup.must_raise LCBO::CrawlKit::NotFoundError
  end
end
|
138
|
+
|
139
|
+
# Single-request crawling: the result is returned, yielded, or the error
# propagates through the default failure hook.
describe CrawlerSpec::BookCrawler do
  it 'should return a book' do
    book = CrawlerSpec::BookCrawler.run(1)
    book[:title].must_equal 'book_1'
  end

  it 'should yield a book' do
    yielded_title = nil
    CrawlerSpec::BookCrawler.run(1) do |page|
      yielded_title = page[:title]
    end
    yielded_title.must_equal 'book_1'
  end

  it 'should raise an error if a book does not exist' do
    missing_run = lambda { CrawlerSpec::BookCrawler.run(0) }
    missing_run.must_raise LCBO::CrawlKit::NotFoundError
  end
end
|
154
|
+
|
155
|
+
# Paginated crawling: pages are followed via :next_page and reduced to ids.
describe CrawlerSpec::BookListsCrawler do
  it 'should return all the book ids' do
    all_ids = CrawlerSpec::BookListsCrawler.run
    all_ids.must_equal [1, 2, 3, 4, 5, 6]
  end

  it 'should emit all the pages' do
    emitted = []
    CrawlerSpec::BookListsCrawler.run do |page|
      emitted << page
    end
    emitted.size.must_equal 3
  end

  it 'should consider provided params' do
    emitted = []
    CrawlerSpec::BookListsCrawler.run(:next_page => 2) do |page|
      emitted << page
    end
    emitted.size.must_equal 2
  end
end
|
172
|
+
|
173
|
+
# Enum crawling: iterates over #enum, or over an explicitly passed array.
describe CrawlerSpec::EnumBooksCrawler do
  it 'should emit all the books' do
    emitted = []
    CrawlerSpec::EnumBooksCrawler.run do |book|
      emitted << book
    end
    emitted.size.must_equal 6
  end

  it 'should emit books when provided with their params' do
    emitted = []
    CrawlerSpec::EnumBooksCrawler.run([1, 2, 3]) do |book|
      emitted << book
    end
    emitted.size.must_equal 3
  end
end
|
186
|
+
|
187
|
+
# Queue crawling: drains QUEUE, emitting found books and recording the one
# unknown id in MISSING.
describe CrawlerSpec::QueueBooksCrawler do
  it 'should emit all the books that did not fail' do
    crawler_class = CrawlerSpec::QueueBooksCrawler
    crawler_class::QUEUE.size.must_equal 7

    emitted = []
    crawler_class.run do |book|
      emitted << book
    end

    emitted.size.must_equal 6
    crawler_class::QUEUE.size.must_equal 0
    crawler_class::MISSING.size.must_equal 1
  end
end
|
metadata
CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
|
|
4
4
|
prerelease: false
|
5
5
|
segments:
|
6
6
|
- 0
|
7
|
-
-
|
8
|
-
-
|
9
|
-
version: 0.
|
7
|
+
- 10
|
8
|
+
- 0
|
9
|
+
version: 0.10.0
|
10
10
|
platform: ruby
|
11
11
|
authors:
|
12
12
|
- Carsten Nielsen
|
@@ -14,7 +14,7 @@ autorequire:
|
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
16
|
|
17
|
-
date: 2010-12-
|
17
|
+
date: 2010-12-09 00:00:00 -05:00
|
18
18
|
default_executable:
|
19
19
|
dependencies:
|
20
20
|
- !ruby/object:Gem::Dependency
|
@@ -74,9 +74,15 @@ files:
|
|
74
74
|
- LICENSE
|
75
75
|
- README.md
|
76
76
|
- Rakefile
|
77
|
+
- examples/crawlers/inventories_crawler.rb
|
78
|
+
- examples/crawlers/product_lists_crawler.rb
|
79
|
+
- examples/crawlers/products_crawler.rb
|
80
|
+
- examples/crawlers/products_queue_crawler.rb
|
81
|
+
- examples/crawlers/stores_crawler.rb
|
77
82
|
- lcbo.gemspec
|
78
83
|
- lib/lcbo.rb
|
79
84
|
- lib/lcbo/crawlkit.rb
|
85
|
+
- lib/lcbo/crawlkit/crawler.rb
|
80
86
|
- lib/lcbo/crawlkit/eventable.rb
|
81
87
|
- lib/lcbo/crawlkit/fastdate_helper.rb
|
82
88
|
- lib/lcbo/crawlkit/page.rb
|
@@ -94,6 +100,7 @@ files:
|
|
94
100
|
- lib/lcbo/pages/store_list_page.rb
|
95
101
|
- lib/lcbo/pages/store_page.rb
|
96
102
|
- lib/lcbo/version.rb
|
103
|
+
- spec/crawlkit/crawler_spec.rb
|
97
104
|
- spec/crawlkit/eventable_spec.rb
|
98
105
|
- spec/crawlkit/fastdate_helper_spec.rb
|
99
106
|
- spec/crawlkit/page_spec.rb
|
@@ -160,6 +167,7 @@ signing_key:
|
|
160
167
|
specification_version: 3
|
161
168
|
summary: A library for parsing HTML pages from http://lcbo.com
|
162
169
|
test_files:
|
170
|
+
- spec/crawlkit/crawler_spec.rb
|
163
171
|
- spec/crawlkit/eventable_spec.rb
|
164
172
|
- spec/crawlkit/fastdate_helper_spec.rb
|
165
173
|
- spec/crawlkit/page_spec.rb
|