lcbo 0.9.9 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,12 @@
1
+ Version 0.10.0
2
+
3
+ * Moved `CrawlKit` related errors into the `CrawlKit` namespace.
4
+ * Added `:timeout` and `:max_retries` to configuration options and enabled
5
+ auto _n_-retries for timed-out requests.
6
+ * Added `LCBO::CrawlKit::Crawler` mixin as a helper for making crawlers.
7
+ * Added example crawlers for inventories, products, stores, and product list
8
+ pages.
9
+
1
10
  Version 0.9.9
2
11
 
3
12
  * Added `ProductPage#is_kosher` to designate Kosher products.
data/README.md CHANGED
@@ -1,8 +1,8 @@
1
1
  # LCBO: The Ruby Gem
2
2
 
3
3
  This library is used to gather data for [LCBO API](http://lcboapi.com). It
4
- allows you to request and parse store, product, inventory, and product list
5
- pages directly from the [LCBO](http://lcbo.com) website.
4
+ allows you to request and parse store, product, inventory, product list, and
5
+ store list pages directly from the [LCBO](http://lcbo.com) website.
6
6
 
7
7
  ## Synopsis
8
8
 
@@ -12,7 +12,7 @@ pages directly from the [LCBO](http://lcbo.com) website.
12
12
  # => { :store_no => 511, :name => "King & Spadina", ... }
13
13
 
14
14
  LCBO.product(18)
15
- # => { :product_no => 11, :name => "Heineken Lager", ... }
15
+ # => { :product_no => 18, :name => "Heineken Lager", ... }
16
16
 
17
17
  LCBO.inventory(18)
18
18
  # => { :product_no => 18, :inventory_count => 40398, :inventories => [ ... ] }
@@ -20,6 +20,17 @@ pages directly from the [LCBO](http://lcbo.com) website.
20
20
  LCBO.products_list(1)
21
21
  # => { :page => 1, :final_page => 108, ..., :product_nos => [ ... ] }
22
22
 
23
+ LCBO.store_list
24
+ # => { :store_nos => [1, 2, 3, 4, 5, 6, 8, 9, 10, 11, ...] }
25
+
26
+ ## Crawlers
27
+
28
+ Some examples of crawlers exist
29
+ [here](http://github.com/heycarsten/lcbo/blob/master/examples). You can also
30
+ check out the
31
+ [crawler spec](http://github.com/heycarsten/lcbo/blob/master/spec/crawlkit/crawler_spec.rb)
32
+ to see how to interact with them.
33
+
23
34
  ## Installation
24
35
 
25
36
  Use RubyGems: `gem install lcbo`
@@ -0,0 +1,22 @@
1
# Example crawler: requests the inventory of every product number produced
# by ProductListsCrawler.
class InventoriesCrawler
  include LCBO::CrawlKit::Crawler

  # Items to crawl: whatever ProductListsCrawler.run returns.
  def enum
    ProductListsCrawler.run
  end

  # Requests the inventory for a single product number via LCBO.inventory.
  def request(product_no)
    LCBO.inventory(product_no)
  end

  # Reports products that could not be found and carries on; any other
  # error is re-raised untouched.
  def failure(error, product_no)
    raise error unless error.is_a?(LCBO::CrawlKit::NotFoundError)

    puts "[missing] Skipped inventory for product ##{product_no}"
  end
end
@@ -0,0 +1,17 @@
1
# Example crawler: walks the paginated product list and reduces every page
# into one flat array of product numbers.
class ProductListsCrawler
  # Fully qualified: this class lives at the top level, so a bare
  # `CrawlKit::Crawler` would raise NameError.
  include LCBO::CrawlKit::Crawler

  # Requests one product list page. `params` is either the initial params
  # hash or the previous page's response; a missing :next_page means page 1.
  #
  # NOTE(review): the README synopsis spells this helper
  # `LCBO.products_list` -- confirm which name the library actually defines.
  def request(params)
    LCBO.product_list(params[:next_page] || 1)
  end

  # Keep following pages for as long as the current page names a next one.
  def continue?(current_params)
    current_params[:next_page] ? true : false
  end

  # Collapses every collected page into a single list of product numbers.
  # The Crawler mixin stores collected pages in `responses`.
  def reduce
    responses.map { |page| page[:product_nos] }.flatten
  end
end
@@ -0,0 +1,22 @@
1
# Example crawler: requests every product whose number is produced by
# ProductListsCrawler.
class ProductsCrawler
  include LCBO::CrawlKit::Crawler

  # Items to crawl: whatever ProductListsCrawler.run returns.
  def enum
    ProductListsCrawler.run
  end

  # Requests a single product via LCBO.product.
  def request(product_no)
    LCBO.product(product_no)
  end

  # Missing products are reported and skipped; everything else is re-raised.
  def failure(error, product_no)
    if error.is_a?(LCBO::CrawlKit::NotFoundError)
      puts "[missing] Skipped product ##{product_no}"
    else
      raise error
    end
  end
end
@@ -0,0 +1,23 @@
1
# Example crawler: works through product numbers popped from the
# 'lcbo.products.queue' list held by the global $redis client.
class ProductsQueueCrawler
  include LCBO::CrawlKit::Crawler

  # Next item to crawl, taken from the tail of the queue list.
  def pop
    $redis.rpop('lcbo.products.queue')
  end

  # Requests a single product via LCBO.product.
  def request(product_no)
    LCBO.product(product_no)
  end

  # Missing products are reported and pushed onto 'lcbo.products.missing';
  # any other error is re-raised.
  def failure(error, product_no)
    raise error unless error.is_a?(LCBO::CrawlKit::NotFoundError)

    puts "[missing] Skipped product ##{product_no}"
    $redis.rpush('lcbo.products.missing', product_no)
  end
end
@@ -0,0 +1,22 @@
1
# Example crawler: requests every store listed under :store_nos in the
# result of LCBO.store_list.
class StoresCrawler
  include LCBO::CrawlKit::Crawler

  # Items to crawl: the :store_nos entry of the store list.
  def enum
    store_list = LCBO.store_list
    store_list[:store_nos]
  end

  # Requests a single store via LCBO.store.
  def request(store_no)
    LCBO.store(store_no)
  end

  # Missing stores are reported and skipped; everything else is re-raised.
  def failure(error, store_no)
    raise error unless error.is_a?(LCBO::CrawlKit::NotFoundError)

    puts "[missing] Skipped store ##{store_no}"
  end
end
@@ -1,8 +1,10 @@
1
1
  module LCBO
2
2
 
3
3
  DEFAULT_CONFIG = {
4
- :user_agent => nil,
5
- }
4
+ :user_agent => nil, # Use the default User-Agent by default
5
+ :max_retries => 8, # Number of times to retry a request that times out
6
+ :timeout => 2 # Seconds to wait for a request before timing out
7
+ }.freeze
6
8
 
7
9
  def self.config
8
10
  reset_config! unless @config
@@ -10,6 +10,11 @@ module LCBO
10
10
  ENV['LCBO_USER_AGENT'] ||
11
11
  Typhoeus::USER_AGENT
12
12
  end
13
+
14
+ class MalformedError < StandardError; end
15
+ class NotFoundError < StandardError; end
16
+ class RequestFailedError < StandardError; end
17
+ class TimeoutError < StandardError; end
13
18
  end
14
19
  end
15
20
 
@@ -19,5 +24,6 @@ require 'lcbo/crawlkit/page'
19
24
  require 'lcbo/crawlkit/request'
20
25
  require 'lcbo/crawlkit/response'
21
26
  require 'lcbo/crawlkit/request_prototype'
27
+ require 'lcbo/crawlkit/crawler'
22
28
  require 'lcbo/crawlkit/titlecase_helper'
23
29
  require 'lcbo/crawlkit/volume_helper'
@@ -0,0 +1,79 @@
1
module LCBO
  module CrawlKit
    # Mixin for building crawlers.
    #
    # A host class includes this module and implements #request. It may
    # also define:
    #
    # * #pop       - returns the next item to request, or nil/false when done
    # * #enum      - returns a collection of items to request
    # * #continue? - true when the response itself should be followed
    # * #failure   - decides what happens when a request raises
    # * #reduce    - turns the collected #responses into the final result
    module Crawler

      # Not referenced anywhere inside this module.
      MAX_RETRIES = 8

      # Raised by the default #request when the host has not overridden it.
      class NotImplementedError < StandardError; end

      # Hook run on `include`: gives the host the class-level `run` entry
      # point and the instance-level crawling behaviour.
      def self.included(host)
        host.extend(ClassMethods)
        host.instance_eval { include InstanceMethods }
      end

      module ClassMethods
        # Builds a crawler, runs it with +params+ and returns the outcome.
        #
        # params  - passed straight through to the instance-level #run.
        # emitter - optional block called with every successful response.
        #
        # Returns the crawler's #reduce value when the host defines #reduce,
        # otherwise whatever the instance-level #run returned.
        def run(params = {}, &emitter)
          crawler = new(&emitter)
          result = crawler.run(params)
          crawler.respond_to?(:reduce) ? crawler.reduce : result
        end
      end

      module InstanceMethods
        # Responses collected so far (only filled when the host defines
        # #reduce).
        attr_reader :responses

        # emitter - optional block called with every successful response.
        def initialize(&emitter)
          @emitter = emitter
          @responses = []
        end

        # Chooses a crawl strategy, in this order:
        #
        # 1. +params+ is an Array with at least one truthy element: request
        #    each element.
        # 2. the host defines #pop: request items until #pop returns
        #    nil/false.
        # 3. the host defines #enum: request each item of #enum.
        # 4. otherwise: issue a single request with +params+.
        def run(params = {})
          case
          when params.is_a?(Array) && params.any?
            runeach(params)
          when respond_to?(:pop)
            runpop
          when respond_to?(:enum)
            runeach(enum)
          else
            _request(params)
          end
        end

        # Default error policy: re-raise. Hosts override this to skip or
        # record particular errors; its return value becomes the result of
        # the failed request.
        def failure(error, params)
          raise error
        end

        # Default: a response is never followed.
        def continue?(response)
          false
        end

        # Must be overridden by the host.
        #
        # Raises Crawler::NotImplementedError.
        def request(params = {})
          raise NotImplementedError, "#{self.class} must implement #request"
        end

        protected

        # Requests items for as long as #pop keeps returning a truthy value.
        def runpop
          while (params = pop)
            _request(params)
          end
        end

        # Requests every element of +params+ in order.
        def runeach(params)
          params.each { |p| _request(p) }
        end

        # Performs one request and the bookkeeping around it: records the
        # response when the host defines #reduce, hands it to the emitter,
        # then follows it through #run when #continue? says so.
        #
        # Errors raised while handling THIS request are routed to #failure
        # with this request's params. The follow-up #run sits outside the
        # rescue on purpose: an error from a followed request has already
        # been through #failure with its own params, and must not be
        # reported again by every enclosing request with unrelated params.
        def _request(params = {})
          begin
            response = request(params)
            @responses << response if respond_to?(:reduce)
            @emitter.call(response) if @emitter
            follow = continue?(response)
          rescue => error
            return failure(error, params)
          end

          follow ? run(response) : response
        end
      end

    end
  end
end
@@ -2,11 +2,6 @@ module LCBO
2
2
  module CrawlKit
3
3
  module Page
4
4
 
5
- class Error < StandardError; end
6
- class MalformedDocumentError < Error; end
7
- class MissingResourceError < Error; end
8
- class RequestFailedError < Error; end
9
-
10
5
  def self.included(mod)
11
6
  mod.module_eval do
12
7
  include Eventable
@@ -2,6 +2,8 @@ module LCBO
2
2
  module CrawlKit
3
3
  class Request
4
4
 
5
+ MAX_RETRIES = 8
6
+
5
7
  attr_reader :request_prototype, :query_params, :body_params
6
8
 
7
9
  def initialize(request_prototype, query_p = {}, body_p = {})
@@ -38,7 +40,15 @@ module LCBO
38
40
  end
39
41
 
40
42
  def run
41
- response = Typhoeus::Request.run(uri, config)
43
+ _run
44
+ end
45
+
46
+ protected
47
+
48
+ def _run(tries = 0)
49
+ response = Timeout.timeout(LCBO.config[:timeout]) do
50
+ Typhoeus::Request.run(uri, config)
51
+ end
42
52
  Response.new \
43
53
  :code => response.code,
44
54
  :uri => response.request.url,
@@ -47,6 +57,11 @@ module LCBO
47
57
  :query_params => query_params,
48
58
  :body_params => body_params,
49
59
  :body => response.body
60
+ rescue Errno::ETIMEDOUT, Timeout::Error
61
+ if tries > LCBO.config[:max_retries]
62
+ raise TimeoutError, "Request failed after timing out #{tries} times"
63
+ end
64
+ _run(tries + 1)
50
65
  end
51
66
 
52
67
  end
@@ -286,26 +286,26 @@ module LCBO
286
286
 
287
287
  def verify_third_info_cell
288
288
  return unless has_package? && info_cell_lines[2][0,1] != '|'
289
- raise CrawlKit::MalformedDocumentError,
289
+ raise CrawlKit::MalformedError,
290
290
  "Expected third line in info cell to begin with bar. LCBO No: " \
291
291
  "#{product_no}, Dump: #{info_cell_lines[2].inspect}"
292
292
  end
293
293
 
294
294
  def verify_response_not_blank
295
295
  return unless html.strip == ''
296
- raise CrawlKit::MissingResourceError,
296
+ raise CrawlKit::NotFoundError,
297
297
  "product #{product_no} does not appear to exist"
298
298
  end
299
299
 
300
300
  def verify_product_name
301
301
  return unless product_details_form('itemName').strip == ''
302
- raise CrawlKit::MissingResourceError,
302
+ raise CrawlKit::NotFoundError,
303
303
  "can not locate name for product #{product_no}"
304
304
  end
305
305
 
306
306
  def verify_product_details_form
307
307
  return unless doc.css('form[name="productdetails"]').empty?
308
- raise CrawlKit::MalformedDocumentError,
308
+ raise CrawlKit::MalformedError,
309
309
  "productdetails form not found in doc for product #{product_no}"
310
310
  end
311
311
 
@@ -42,7 +42,7 @@ module LCBO
42
42
 
43
43
  def verify_number_of_stores
44
44
  return if STORE_COUNT_RANGE.include?(store_nos.length)
45
- raise CrawlKit::MalformedDocumentError,
45
+ raise CrawlKit::MalformedError,
46
46
  "Store count (#{total_stores}) not in range: #{STORE_COUNT_RANGE}"
47
47
  end
48
48
 
@@ -53,8 +53,8 @@ module LCBO
53
53
  emits :address_line_1 do
54
54
  data = info_nodes[2].content.strip.split(',')[0]
55
55
  unless data
56
- raise MalformedDocumentError,
57
- "unable to locate address for store #{store_no}"
56
+ raise CrawlKit::MalformedError,
57
+ "unable to locate address for store #{store_no}"
58
58
  end
59
59
  CrawlKit::TitleCaseHelper[data.gsub(/[\n\r\t]+/, ' ').strip]
60
60
  end
@@ -72,7 +72,7 @@ module LCBO
72
72
  emits :postal_code do
73
73
  data = info_nodes[3].content.strip.split(',')[1]
74
74
  unless data
75
- raise MalformedDocumentError,
75
+ raise CrawlKit::MalformedError,
76
76
  "unable to locate postal code for store #{store_no}"
77
77
  end
78
78
  data.gsub(/[\n\r\t]+/, ' ').strip.upcase
@@ -174,18 +174,18 @@ module LCBO
174
174
 
175
175
  def verify_store_returned
176
176
  return if !@html.include?('No stores were located using your criteria.')
177
- raise MissingResourceError, "store #{store_no} does not exist"
177
+ raise CrawlKit::NotFoundError, "store #{store_no} does not exist"
178
178
  end
179
179
 
180
180
  def verify_telephone_number
181
181
  return if telephone
182
- raise MalformedDocumentError,
182
+ raise CrawlKit::MalformedError,
183
183
  "unable to locate telephone number for store #{store_no}"
184
184
  end
185
185
 
186
186
  def verify_node_count
187
187
  return if expected_node_count == info_nodes.size
188
- raise MalformedDocumentError,
188
+ raise CrawlKit::MalformedError,
189
189
  "Expected #{expected_node_count} nodes for store #{store_no} but found " \
190
190
  "#{info_nodes.size} instead."
191
191
  end
@@ -1,3 +1,3 @@
1
1
  module LCBO
2
- VERSION = '0.9.9'
2
+ VERSION = '0.10.0'
3
3
  end
@@ -0,0 +1,196 @@
1
+ require 'spec_helper'
2
+
3
# Fixtures for the crawler specs: a fake in-memory "site" plus one crawler
# per crawl strategy.
module CrawlerSpec
  # In-memory stand-in for a remote site, keyed by URI.
  module TheInternets

    RESOURCES = {
      'list?page=1' => { :ids => [1, 2], :page => 1, :next_page => 2 },
      'list?page=2' => { :ids => [3, 4], :page => 2, :next_page => 3 },
      'list?page=3' => { :ids => [5, 6], :page => 3, :next_page => nil },
      'books/1'     => { :id => 1, :title => 'book_1' },
      'books/2'     => { :id => 2, :title => 'book_2' },
      'books/3'     => { :id => 3, :title => 'book_3' },
      'books/4'     => { :id => 4, :title => 'book_4' },
      'books/5'     => { :id => 5, :title => 'book_5' },
      'books/6'     => { :id => 6, :title => 'book_6' }
    }

    # Returns the resource stored under +uri+.
    #
    # Raises LCBO::CrawlKit::NotFoundError for an unknown +uri+.
    def self.get(uri)
      return RESOURCES[uri] if RESOURCES.key?(uri)

      raise LCBO::CrawlKit::NotFoundError, "#{uri} does not exist"
    end

  end

  # Single-request crawler: fetches one book by id.
  class BookCrawler
    include LCBO::CrawlKit::Crawler

    def request(book_id)
      TheInternets.get("books/#{book_id}")
    end
  end

  # Follows list pages through :next_page and reduces them to book ids.
  class BookListsCrawler
    include LCBO::CrawlKit::Crawler

    def request(params)
      page_number = params[:next_page] || 1
      TheInternets.get("list?page=#{page_number}")
    end

    def continue?(page)
      !!page[:next_page]
    end

    def reduce
      responses.map { |page| page[:ids] }.flatten
    end
  end

  # Enumerating crawler: book ids come from BookListsCrawler.
  class EnumBooksCrawler
    include LCBO::CrawlKit::Crawler

    def enum
      BookListsCrawler.run
    end

    def request(book_id)
      TheInternets.get("books/#{book_id}")
    end
  end

  # Queue-driven crawler: pops ids off QUEUE and records misses in MISSING.
  class QueueBooksCrawler

    QUEUE = [1, 2, 3, 4, 0, 5, 6]
    MISSING = []

    include LCBO::CrawlKit::Crawler

    def pop
      QUEUE.pop
    end

    def request(book_id)
      TheInternets.get("books/#{book_id}")
    end

    # Unknown books are remembered in MISSING; other errors are re-raised.
    def failure(error, book_id)
      raise error unless error.is_a?(LCBO::CrawlKit::NotFoundError)

      MISSING.push(book_id)
    end
  end
end
128
+
129
# Sanity checks for the fake site used by the crawler specs.
describe CrawlerSpec::TheInternets do
  it 'should let you get a resource' do
    book = CrawlerSpec::TheInternets.get('books/1')
    book[:title].must_equal 'book_1'
  end

  it 'should throw an error when a resource does not exist' do
    missing_lookup = lambda { CrawlerSpec::TheInternets.get('books/0') }
    missing_lookup.must_raise LCBO::CrawlKit::NotFoundError
  end
end
138
+
139
# Single-request strategy: one id in, one response out.
describe CrawlerSpec::BookCrawler do
  it 'should return a book' do
    book = CrawlerSpec::BookCrawler.run(1)
    book[:title].must_equal 'book_1'
  end

  it 'should yield a book' do
    yielded_titles = []
    CrawlerSpec::BookCrawler.run(1) { |page| yielded_titles << page[:title] }
    yielded_titles.last.must_equal 'book_1'
  end

  it 'should raise an error if a book does not exist' do
    missing_run = lambda { CrawlerSpec::BookCrawler.run(0) }
    missing_run.must_raise LCBO::CrawlKit::NotFoundError
  end
end
154
+
155
# Follow-the-response strategy: pages are chained through :next_page.
describe CrawlerSpec::BookListsCrawler do
  it 'should return all the book ids' do
    book_ids = CrawlerSpec::BookListsCrawler.run
    book_ids.must_equal [1, 2, 3, 4, 5, 6]
  end

  it 'should emit all the pages' do
    emitted = 0
    CrawlerSpec::BookListsCrawler.run { |_page| emitted += 1 }
    emitted.must_equal 3
  end

  it 'should consider provided params' do
    emitted = 0
    CrawlerSpec::BookListsCrawler.run(:next_page => 2) { |_page| emitted += 1 }
    emitted.must_equal 2
  end
end
172
+
173
# Enumerating strategy: items come from #enum or from an explicit Array.
describe CrawlerSpec::EnumBooksCrawler do
  it 'should emit all the books' do
    emitted = 0
    CrawlerSpec::EnumBooksCrawler.run { |_book| emitted += 1 }
    emitted.must_equal 6
  end

  it 'should emit books when provided with their params' do
    emitted = 0
    CrawlerSpec::EnumBooksCrawler.run([1, 2, 3]) { |_book| emitted += 1 }
    emitted.must_equal 3
  end
end
186
+
187
# Queue strategy: items are popped until the queue is drained; the one
# unknown id (0) must end up in MISSING rather than aborting the crawl.
describe CrawlerSpec::QueueBooksCrawler do
  it 'should emit all the books that did not fail' do
    crawler_class = CrawlerSpec::QueueBooksCrawler

    crawler_class::QUEUE.size.must_equal 7
    emitted = 0
    crawler_class.run { |_book| emitted += 1 }
    emitted.must_equal 6
    crawler_class::QUEUE.size.must_equal 0
    crawler_class::MISSING.size.must_equal 1
  end
end
metadata CHANGED
@@ -4,9 +4,9 @@ version: !ruby/object:Gem::Version
4
4
  prerelease: false
5
5
  segments:
6
6
  - 0
7
- - 9
8
- - 9
9
- version: 0.9.9
7
+ - 10
8
+ - 0
9
+ version: 0.10.0
10
10
  platform: ruby
11
11
  authors:
12
12
  - Carsten Nielsen
@@ -14,7 +14,7 @@ autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
16
 
17
- date: 2010-12-06 00:00:00 -05:00
17
+ date: 2010-12-09 00:00:00 -05:00
18
18
  default_executable:
19
19
  dependencies:
20
20
  - !ruby/object:Gem::Dependency
@@ -74,9 +74,15 @@ files:
74
74
  - LICENSE
75
75
  - README.md
76
76
  - Rakefile
77
+ - examples/crawlers/inventories_crawler.rb
78
+ - examples/crawlers/product_lists_crawler.rb
79
+ - examples/crawlers/products_crawler.rb
80
+ - examples/crawlers/products_queue_crawler.rb
81
+ - examples/crawlers/stores_crawler.rb
77
82
  - lcbo.gemspec
78
83
  - lib/lcbo.rb
79
84
  - lib/lcbo/crawlkit.rb
85
+ - lib/lcbo/crawlkit/crawler.rb
80
86
  - lib/lcbo/crawlkit/eventable.rb
81
87
  - lib/lcbo/crawlkit/fastdate_helper.rb
82
88
  - lib/lcbo/crawlkit/page.rb
@@ -94,6 +100,7 @@ files:
94
100
  - lib/lcbo/pages/store_list_page.rb
95
101
  - lib/lcbo/pages/store_page.rb
96
102
  - lib/lcbo/version.rb
103
+ - spec/crawlkit/crawler_spec.rb
97
104
  - spec/crawlkit/eventable_spec.rb
98
105
  - spec/crawlkit/fastdate_helper_spec.rb
99
106
  - spec/crawlkit/page_spec.rb
@@ -160,6 +167,7 @@ signing_key:
160
167
  specification_version: 3
161
168
  summary: A library for parsing HTML pages from http://lcbo.com
162
169
  test_files:
170
+ - spec/crawlkit/crawler_spec.rb
163
171
  - spec/crawlkit/eventable_spec.rb
164
172
  - spec/crawlkit/fastdate_helper_spec.rb
165
173
  - spec/crawlkit/page_spec.rb