lcbo 0.9.9 → 0.10.0

@@ -1,3 +1,12 @@
+ Version 0.10.0
+
+ * Moved `CrawlKit`-related errors into the `CrawlKit` namespace.
+ * Added `:timeout` and `:max_retries` to the configuration options and enabled
+   automatic retries for timed-out requests.
+ * Added the `LCBO::CrawlKit::Crawler` mixin as a helper for building crawlers.
+ * Added example crawlers for inventories, products, stores, and product list
+   pages.
+
  Version 0.9.9

  * Added `ProductPage#is_kosher` to designate Kosher products.
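For reference, a minimal sketch of how the new options surface, based on the `DEFAULT_CONFIG` and `Request#_run` changes further down in this diff (the exact way to override the defaults is not shown here):

    LCBO.config[:timeout]      # => 2  (seconds each request attempt may take)
    LCBO.config[:max_retries]  # => 8  (timed-out attempts to retry)

    # Timed-out requests are retried automatically; once the retry budget is
    # exhausted, LCBO::CrawlKit::TimeoutError propagates to the caller.
    begin
      LCBO.product(18)
    rescue LCBO::CrawlKit::TimeoutError => e
      warn e.message
    end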
data/README.md CHANGED
@@ -1,8 +1,8 @@
  # LCBO: The Ruby Gem

  This library is used to gather data for [LCBO API](http://lcboapi.com). It
- allows you to request and parse store, product, inventory, and product list
- pages directly from the [LCBO](http://lcbo.com) website.
+ allows you to request and parse store, product, inventory, product list, and
+ store list pages directly from the [LCBO](http://lcbo.com) website.

  ## Synopsis

@@ -12,7 +12,7 @@ pages directly from the [LCBO](http://lcbo.com) website.
    # => { :store_no => 511, :name => "King & Spadina", ... }

    LCBO.product(18)
-   # => { :product_no => 11, :name => "Heineken Lager", ... }
+   # => { :product_no => 18, :name => "Heineken Lager", ... }

    LCBO.inventory(18)
    # => { :product_no => 18, :inventory_count => 40398, :inventories => [ ... ] }
@@ -20,6 +20,17 @@ pages directly from the [LCBO](http://lcbo.com) website.
    LCBO.products_list(1)
    # => { :page => 1, :final_page => 108, ..., :product_nos => [ ... ] }

+   LCBO.store_list
+   # => { :store_nos => [1, 2, 3, 4, 5, 6, 8, 9, 10, 11, ...] }
+
+ ## Crawlers
+
+ Some example crawlers are available
+ [here](http://github.com/heycarsten/lcbo/blob/master/examples). You can also
+ check out the
+ [crawler spec](http://github.com/heycarsten/lcbo/blob/master/spec/crawlkit/crawler_spec.rb)
+ to see how to interact with them.
+
  ## Installation

  Use RubyGems: `gem install lcbo`
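To make the README's new Crawlers section concrete, here is a minimal usage sketch, assuming the example crawler classes under `examples/crawlers` are loaded. The block form mirrors the `&emitter` argument accepted by `LCBO::CrawlKit::Crawler::ClassMethods#run`:

    # Crawl every product; each parsed product hash is yielded as it arrives.
    ProductsCrawler.run do |product|
      puts "#{product[:product_no]}: #{product[:name]}"
    end

    # Passing an array restricts the crawl to those product numbers.
    ProductsCrawler.run([18]) { |product| puts product[:name] }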
data/examples/crawlers/inventories_crawler.rb ADDED
@@ -0,0 +1,22 @@
+ class InventoriesCrawler
+
+   include LCBO::CrawlKit::Crawler
+
+   def enum
+     ProductListsCrawler.run
+   end
+
+   def request(product_no)
+     LCBO.inventory(product_no)
+   end
+
+   def failure(error, product_no)
+     case error
+     when LCBO::CrawlKit::NotFoundError
+       puts "[missing] Skipped inventory for product ##{product_no}"
+     else
+       raise error
+     end
+   end
+
+ end
data/examples/crawlers/product_lists_crawler.rb ADDED
@@ -0,0 +1,17 @@
+ class ProductListsCrawler
+
+   include LCBO::CrawlKit::Crawler
+
+   def request(params)
+     LCBO.product_list(params[:next_page] || 1)
+   end
+
+   def continue?(current_params)
+     current_params[:next_page] ? true : false
+   end
+
+   def reduce
+     responses.map { |page| page[:product_nos] }.flatten
+   end
+
+ end
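Because `ProductListsCrawler` defines `reduce`, `ClassMethods#run` returns the reduced value rather than the last response, so the crawl collapses to a flat list of product numbers (the numbers below are illustrative):

    ProductListsCrawler.run
    # => [18, 112, 303, ...]  # every :product_nos entry from every list page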
data/examples/crawlers/products_crawler.rb ADDED
@@ -0,0 +1,22 @@
+ class ProductsCrawler
+
+   include LCBO::CrawlKit::Crawler
+
+   def enum
+     ProductListsCrawler.run
+   end
+
+   def request(product_no)
+     LCBO.product(product_no)
+   end
+
+   def failure(error, product_no)
+     case error
+     when LCBO::CrawlKit::NotFoundError
+       puts "[missing] Skipped product ##{product_no}"
+     else
+       raise error
+     end
+   end
+
+ end
data/examples/crawlers/products_queue_crawler.rb ADDED
@@ -0,0 +1,23 @@
+ class ProductsQueueCrawler
+
+   include LCBO::CrawlKit::Crawler
+
+   def pop
+     $redis.rpop('lcbo.products.queue')
+   end
+
+   def request(product_no)
+     LCBO.product(product_no)
+   end
+
+   def failure(error, product_no)
+     case error
+     when LCBO::CrawlKit::NotFoundError
+       puts "[missing] Skipped product ##{product_no}"
+       $redis.rpush('lcbo.products.missing', product_no)
+     else
+       raise error
+     end
+   end
+
+ end
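The queue-driven example assumes a global redis-rb client in `$redis`. A hypothetical way to seed the queue it drains (pushing onto the left so `rpop` consumes in FIFO order):

    require 'redis'

    $redis = Redis.new

    # Enqueue the product numbers gathered by ProductListsCrawler.
    ProductListsCrawler.run.each do |product_no|
      $redis.lpush('lcbo.products.queue', product_no)
    end

    # Drain the queue; missing products accumulate in 'lcbo.products.missing'.
    ProductsQueueCrawler.run { |product| puts product[:name] }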
data/examples/crawlers/stores_crawler.rb ADDED
@@ -0,0 +1,22 @@
+ class StoresCrawler
+
+   include LCBO::CrawlKit::Crawler
+
+   def enum
+     LCBO.store_list[:store_nos]
+   end
+
+   def request(store_no)
+     LCBO.store(store_no)
+   end
+
+   def failure(error, store_no)
+     case error
+     when LCBO::CrawlKit::NotFoundError
+       puts "[missing] Skipped store ##{store_no}"
+     else
+       raise error
+     end
+   end
+
+ end
@@ -1,8 +1,10 @@
  module LCBO

    DEFAULT_CONFIG = {
-     :user_agent => nil,
-   }
+     :user_agent => nil,  # Use the default User-Agent by default
+     :max_retries => 8,   # Number of times to retry a request that fails
+     :timeout => 2        # Seconds to wait for a request before timing out
+   }.freeze

    def self.config
      reset_config! unless @config
@@ -10,6 +10,11 @@ module LCBO
        ENV['LCBO_USER_AGENT'] ||
        Typhoeus::USER_AGENT
      end
+
+     class MalformedError < StandardError; end
+     class NotFoundError < StandardError; end
+     class RequestFailedError < StandardError; end
+     class TimeoutError < StandardError; end
    end
  end

@@ -19,5 +24,6 @@ require 'lcbo/crawlkit/page'
  require 'lcbo/crawlkit/request'
  require 'lcbo/crawlkit/response'
  require 'lcbo/crawlkit/request_prototype'
+ require 'lcbo/crawlkit/crawler'
  require 'lcbo/crawlkit/titlecase_helper'
  require 'lcbo/crawlkit/volume_helper'
data/lib/lcbo/crawlkit/crawler.rb ADDED
@@ -0,0 +1,79 @@
+ module LCBO
+   module CrawlKit
+     module Crawler
+
+       MAX_RETRIES = 8
+
+       class NotImplementedError < StandardError; end
+
+       def self.included(host)
+         host.extend(ClassMethods)
+         host.instance_eval { include InstanceMethods }
+       end
+
+       module ClassMethods
+         def run(params = {}, &emitter)
+           crawler = new(&emitter)
+           result = crawler.run(params)
+           crawler.respond_to?(:reduce) ? crawler.reduce : result
+         end
+       end
+
+       module InstanceMethods
+         attr_reader :responses
+
+         def initialize(&emitter)
+           @emitter = emitter
+           @responses = []
+         end
+
+         def run(params = {})
+           case
+           when params.is_a?(Array) && params.any?
+             runeach(params)
+           when respond_to?(:pop)
+             runpop
+           when respond_to?(:enum)
+             runeach(enum)
+           else
+             _request(params)
+           end
+         end
+
+         def failure(error, params)
+           raise error
+         end
+
+         def continue?(response)
+           false
+         end
+
+         def request(params = {})
+           raise NotImplementedError, "#{self.class} must implement #request"
+         end
+
+         protected
+
+         def runpop
+           while (params = pop)
+             _request(params)
+           end
+         end
+
+         def runeach(params)
+           params.each { |p| _request(p) }
+         end
+
+         def _request(params = {})
+           response = request(params)
+           @responses << response if respond_to?(:reduce)
+           @emitter.(response) if @emitter
+           continue?(response) ? run(response) : response
+         rescue => error
+           failure(error, params)
+         end
+       end
+
+     end
+   end
+ end
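A minimal sketch of how a class that includes the mixin is driven, modelled on the crawler spec later in this diff; `FetchesThings` and its stub `request` body are hypothetical:

    class FetchesThings
      include LCBO::CrawlKit::Crawler

      # With neither #enum nor #pop defined, run(params) performs one request;
      # defining #enum iterates its values, and defining #pop drains a queue.
      def request(id)
        { :id => id }  # stand-in for a real fetch-and-parse step
      end
    end

    FetchesThings.run(1)                   # => { :id => 1 }
    FetchesThings.run([1, 2]) { |r| p r }  # yields each response to the block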
data/lib/lcbo/crawlkit/page.rb CHANGED
@@ -2,11 +2,6 @@ module LCBO
    module CrawlKit
      module Page

-       class Error < StandardError; end
-       class MalformedDocumentError < Error; end
-       class MissingResourceError < Error; end
-       class RequestFailedError < Error; end
-
        def self.included(mod)
          mod.module_eval do
            include Eventable
data/lib/lcbo/crawlkit/request.rb CHANGED
@@ -2,6 +2,8 @@ module LCBO
    module CrawlKit
      class Request

+       MAX_RETRIES = 8
+
        attr_reader :request_prototype, :query_params, :body_params

        def initialize(request_prototype, query_p = {}, body_p = {})
@@ -38,7 +40,15 @@ module LCBO
        end

        def run
-         response = Typhoeus::Request.run(uri, config)
+         _run
+       end
+
+       protected
+
+       def _run(tries = 0)
+         response = Timeout.timeout(LCBO.config[:timeout]) do
+           Typhoeus::Request.run(uri, config)
+         end
          Response.new \
            :code => response.code,
            :uri => response.request.url,
@@ -47,6 +57,11 @@ module LCBO
            :query_params => query_params,
            :body_params => body_params,
            :body => response.body
+       rescue Errno::ETIMEDOUT, Timeout::Error
+         if tries > LCBO.config[:max_retries]
+           raise TimeoutError, "Request failed after timing out #{tries} times"
+         end
+         _run(tries + 1)
        end

      end
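The retry logic above, isolated as a standalone sketch (the method name and block are stand-ins; only `Timeout.timeout` and the two rescued exception classes come from the diff):

    require 'timeout'

    # Retry a block that may hang or time out, allowing max_retries extra attempts.
    def with_retries(timeout, max_retries, tries = 0, &block)
      Timeout.timeout(timeout, &block)
    rescue Errno::ETIMEDOUT, Timeout::Error
      raise "gave up after timing out #{tries} times" if tries > max_retries
      with_retries(timeout, max_retries, tries + 1, &block)
    end

Note that because the counter starts at 0 and the guard is `tries > max_retries`, a request can be attempted up to `max_retries + 2` times before `TimeoutError` is raised.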
data/lib/lcbo/pages/product_page.rb CHANGED
@@ -286,26 +286,26 @@ module LCBO

    def verify_third_info_cell
      return unless has_package? && info_cell_lines[2][0,1] != '|'
-     raise CrawlKit::MalformedDocumentError,
+     raise CrawlKit::MalformedError,
        "Expected third line in info cell to begin with bar. LCBO No: " \
        "#{product_no}, Dump: #{info_cell_lines[2].inspect}"
    end

    def verify_response_not_blank
      return unless html.strip == ''
-     raise CrawlKit::MissingResourceError,
+     raise CrawlKit::NotFoundError,
        "product #{product_no} does not appear to exist"
    end

    def verify_product_name
      return unless product_details_form('itemName').strip == ''
-     raise CrawlKit::MissingResourceError,
+     raise CrawlKit::NotFoundError,
        "can not locate name for product #{product_no}"
    end

    def verify_product_details_form
      return unless doc.css('form[name="productdetails"]').empty?
-     raise CrawlKit::MalformedDocumentError,
+     raise CrawlKit::MalformedError,
        "productdetails form not found in doc for product #{product_no}"
    end

data/lib/lcbo/pages/store_list_page.rb CHANGED
@@ -42,7 +42,7 @@ module LCBO

    def verify_number_of_stores
      return if STORE_COUNT_RANGE.include?(store_nos.length)
-     raise CrawlKit::MalformedDocumentError,
+     raise CrawlKit::MalformedError,
        "Store count (#{total_stores}) not in range: #{STORE_COUNT_RANGE}"
    end

data/lib/lcbo/pages/store_page.rb CHANGED
@@ -53,8 +53,8 @@ module LCBO
    emits :address_line_1 do
      data = info_nodes[2].content.strip.split(',')[0]
      unless data
-       raise MalformedDocumentError,
-         "unable to locate address for store #{store_no}"
+       raise CrawlKit::MalformedError,
+         "unable to locate address for store #{store_no}"
      end
      CrawlKit::TitleCaseHelper[data.gsub(/[\n\r\t]+/, ' ').strip]
    end
@@ -72,7 +72,7 @@ module LCBO
    emits :postal_code do
      data = info_nodes[3].content.strip.split(',')[1]
      unless data
-       raise MalformedDocumentError,
+       raise CrawlKit::MalformedError,
          "unable to locate postal code for store #{store_no}"
      end
      data.gsub(/[\n\r\t]+/, ' ').strip.upcase
@@ -174,18 +174,18 @@ module LCBO

    def verify_store_returned
      return if !@html.include?('No stores were located using your criteria.')
-     raise MissingResourceError, "store #{store_no} does not exist"
+     raise CrawlKit::NotFoundError, "store #{store_no} does not exist"
    end

    def verify_telephone_number
      return if telephone
-     raise MalformedDocumentError,
+     raise CrawlKit::MalformedError,
        "unable to locate telephone number for store #{store_no}"
    end

    def verify_node_count
      return if expected_node_count == info_nodes.size
-     raise MalformedDocumentError,
+     raise CrawlKit::MalformedError,
        "Expected #{expected_node_count} nodes for store #{store_no} but found " \
        "#{info_nodes.size} instead."
    end
data/lib/lcbo/version.rb CHANGED
@@ -1,3 +1,3 @@
  module LCBO
-   VERSION = '0.9.9'
+   VERSION = '0.10.0'
  end
data/spec/crawlkit/crawler_spec.rb ADDED
@@ -0,0 +1,196 @@
+ require 'spec_helper'
+
+ module CrawlerSpec
+   module TheInternets
+
+     RESOURCES = {
+       'list?page=1' => {
+         :ids => [1, 2],
+         :page => 1,
+         :next_page => 2
+       },
+       'list?page=2' => {
+         :ids => [3, 4],
+         :page => 2,
+         :next_page => 3
+       },
+       'list?page=3' => {
+         :ids => [5, 6],
+         :page => 3,
+         :next_page => nil
+       },
+       'books/1' => {
+         :id => 1,
+         :title => 'book_1'
+       },
+       'books/2' => {
+         :id => 2,
+         :title => 'book_2'
+       },
+       'books/3' => {
+         :id => 3,
+         :title => 'book_3'
+       },
+       'books/4' => {
+         :id => 4,
+         :title => 'book_4'
+       },
+       'books/5' => {
+         :id => 5,
+         :title => 'book_5'
+       },
+       'books/6' => {
+         :id => 6,
+         :title => 'book_6'
+       }
+     }
+
+     def self.get(uri)
+       RESOURCES.fetch(uri) do
+         raise LCBO::CrawlKit::NotFoundError, "#{uri} does not exist"
+       end
+     end
+
+   end
+
+
+   class BookCrawler
+
+     include LCBO::CrawlKit::Crawler
+
+     def request(book_id)
+       TheInternets.get("books/#{book_id}")
+     end
+
+   end
+
+
+   class BookListsCrawler
+
+     include LCBO::CrawlKit::Crawler
+
+     def request(params)
+       TheInternets.get("list?page=#{params[:next_page] || 1}")
+     end
+
+     def continue?(page)
+       page[:next_page] ? true : false
+     end
+
+     def reduce
+       responses.map { |page| page[:ids] }.flatten
+     end
+
+   end
+
+
+   class EnumBooksCrawler
+
+     include LCBO::CrawlKit::Crawler
+
+     def enum
+       BookListsCrawler.run
+     end
+
+     def request(book_id)
+       TheInternets.get("books/#{book_id}")
+     end
+
+   end
+
+
+   class QueueBooksCrawler
+
+     QUEUE = [1, 2, 3, 4, 0, 5, 6]
+     MISSING = []
+
+     include LCBO::CrawlKit::Crawler
+
+     def pop
+       QUEUE.pop
+     end
+
+     def request(book_id)
+       TheInternets.get("books/#{book_id}")
+     end
+
+     def failure(error, book_id)
+       case error
+       when LCBO::CrawlKit::NotFoundError
+         MISSING.push(book_id)
+       else
+         raise error
+       end
+     end
+
+   end
+ end
+
+ describe CrawlerSpec::TheInternets do
+   it 'should let you get a resource' do
+     CrawlerSpec::TheInternets.get('books/1')[:title].must_equal 'book_1'
+   end
+
+   it 'should throw an error when a resource does not exist' do
+     -> { CrawlerSpec::TheInternets.get('books/0') }.must_raise LCBO::CrawlKit::NotFoundError
+   end
+ end
+
+ describe CrawlerSpec::BookCrawler do
+   it 'should return a book' do
+     CrawlerSpec::BookCrawler.run(1)[:title].must_equal 'book_1'
+   end
+
+   it 'should yield a book' do
+     title = nil
+     CrawlerSpec::BookCrawler.run(1) { |page| title = page[:title] }
+     title.must_equal 'book_1'
+   end
+
+   it 'should raise an error if a book does not exist' do
+     -> { CrawlerSpec::BookCrawler.run(0) }.must_raise LCBO::CrawlKit::NotFoundError
+   end
+ end
+
+ describe CrawlerSpec::BookListsCrawler do
+   it 'should return all the book ids' do
+     CrawlerSpec::BookListsCrawler.run.must_equal [1, 2, 3, 4, 5, 6]
+   end
+
+   it 'should emit all the pages' do
+     pages = []
+     CrawlerSpec::BookListsCrawler.run { |page| pages << page }
+     pages.size.must_equal 3
+   end
+
+   it 'should consider provided params' do
+     pages = []
+     CrawlerSpec::BookListsCrawler.run(:next_page => 2) { |page| pages << page }
+     pages.size.must_equal 2
+   end
+ end
+
+ describe CrawlerSpec::EnumBooksCrawler do
+   it 'should emit all the books' do
+     books = []
+     CrawlerSpec::EnumBooksCrawler.run { |book| books << book }
+     books.size.must_equal 6
+   end
+
+   it 'should emit books when provided with their params' do
+     books = []
+     CrawlerSpec::EnumBooksCrawler.run([1, 2, 3]) { |book| books << book }
+     books.size.must_equal 3
+   end
+ end
+
+ describe CrawlerSpec::QueueBooksCrawler do
+   it 'should emit all the books that did not fail' do
+     CrawlerSpec::QueueBooksCrawler::QUEUE.size.must_equal 7
+     books = []
+     CrawlerSpec::QueueBooksCrawler.run { |book| books << book }
+     books.size.must_equal 6
+     CrawlerSpec::QueueBooksCrawler::QUEUE.size.must_equal 0
+     CrawlerSpec::QueueBooksCrawler::MISSING.size.must_equal 1
+   end
+ end
metadata CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
  prerelease: false
  segments:
  - 0
- - 9
- - 9
- version: 0.9.9
+ - 10
+ - 0
+ version: 0.10.0
  platform: ruby
  authors:
  - Carsten Nielsen
@@ -14,7 +14,7 @@ autorequire:
  bindir: bin
  cert_chain: []

- date: 2010-12-06 00:00:00 -05:00
+ date: 2010-12-09 00:00:00 -05:00
  default_executable:
  dependencies:
  - !ruby/object:Gem::Dependency
@@ -74,9 +74,15 @@ files:
  - LICENSE
  - README.md
  - Rakefile
+ - examples/crawlers/inventories_crawler.rb
+ - examples/crawlers/product_lists_crawler.rb
+ - examples/crawlers/products_crawler.rb
+ - examples/crawlers/products_queue_crawler.rb
+ - examples/crawlers/stores_crawler.rb
  - lcbo.gemspec
  - lib/lcbo.rb
  - lib/lcbo/crawlkit.rb
+ - lib/lcbo/crawlkit/crawler.rb
  - lib/lcbo/crawlkit/eventable.rb
  - lib/lcbo/crawlkit/fastdate_helper.rb
  - lib/lcbo/crawlkit/page.rb
@@ -94,6 +100,7 @@ files:
  - lib/lcbo/pages/store_list_page.rb
  - lib/lcbo/pages/store_page.rb
  - lib/lcbo/version.rb
+ - spec/crawlkit/crawler_spec.rb
  - spec/crawlkit/eventable_spec.rb
  - spec/crawlkit/fastdate_helper_spec.rb
  - spec/crawlkit/page_spec.rb
@@ -160,6 +167,7 @@ signing_key:
  specification_version: 3
  summary: A library for parsing HTML pages from http://lcbo.com
  test_files:
+ - spec/crawlkit/crawler_spec.rb
  - spec/crawlkit/eventable_spec.rb
  - spec/crawlkit/fastdate_helper_spec.rb
  - spec/crawlkit/page_spec.rb