lcbo 0.9.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +1 -0
- data/CHANGELOG.md +3 -0
- data/Gemfile +6 -0
- data/Gemfile.lock +18 -0
- data/LICENSE +18 -0
- data/README.md +29 -0
- data/Rakefile +62 -0
- data/lcbo.gemspec +29 -0
- data/lib/lcbo.rb +23 -0
- data/lib/lcbo/crawlers.rb +4 -0
- data/lib/lcbo/crawlers/inventories_crawler.rb +15 -0
- data/lib/lcbo/crawlers/product_lists_crawler.rb +23 -0
- data/lib/lcbo/crawlers/products_crawler.rb +16 -0
- data/lib/lcbo/crawlers/stores_crawler.rb +16 -0
- data/lib/lcbo/crawlkit.rb +24 -0
- data/lib/lcbo/crawlkit/eventable.rb +56 -0
- data/lib/lcbo/crawlkit/fastdate_helper.rb +40 -0
- data/lib/lcbo/crawlkit/page.rb +141 -0
- data/lib/lcbo/crawlkit/request.rb +51 -0
- data/lib/lcbo/crawlkit/request_prototype.rb +31 -0
- data/lib/lcbo/crawlkit/response.rb +48 -0
- data/lib/lcbo/crawlkit/titlecase_helper.rb +97 -0
- data/lib/lcbo/crawlkit/volume_helper.rb +46 -0
- data/lib/lcbo/ext.rb +13 -0
- data/lib/lcbo/helpers.rb +34 -0
- data/lib/lcbo/pages.rb +4 -0
- data/lib/lcbo/pages/inventory_page.rb +60 -0
- data/lib/lcbo/pages/product_list_page.rb +85 -0
- data/lib/lcbo/pages/product_page.rb +296 -0
- data/lib/lcbo/pages/store_page.rb +196 -0
- data/lib/lcbo/version.rb +3 -0
- data/spec/crawlkit/eventable_spec.rb +23 -0
- data/spec/crawlkit/fastdate_helper_spec.rb +18 -0
- data/spec/crawlkit/page_spec.rb +114 -0
- data/spec/crawlkit/request_prototype_spec.rb +5 -0
- data/spec/crawlkit/request_spec.rb +41 -0
- data/spec/crawlkit/response_spec.rb +5 -0
- data/spec/crawlkit/titlecase_helper_spec.rb +30 -0
- data/spec/crawlkit/volume_helper_spec.rb +21 -0
- data/spec/crawlkit_spec.rb +5 -0
- data/spec/lcbo_spec.rb +38 -0
- data/spec/pages/inventory_pages.yml +1685 -0
- data/spec/pages/inventory_pages/1.html +11649 -0
- data/spec/pages/inventory_pages/2.html +495 -0
- data/spec/pages/product_list_pages.yml +108 -0
- data/spec/pages/product_list_pages/1.html +4866 -0
- data/spec/pages/product_pages.yml +258 -0
- data/spec/pages/product_pages/1.html +1319 -0
- data/spec/pages/product_pages/2.html +1343 -0
- data/spec/pages/product_pages/3.html +1336 -0
- data/spec/pages/product_pages/4.html +1319 -0
- data/spec/pages/product_pages/5.html +1324 -0
- data/spec/pages/product_pages/6.html +1319 -0
- data/spec/pages/product_pages/7.html +1314 -0
- data/spec/pages/store_pages.yml +80 -0
- data/spec/pages/store_pages/1.html +592 -0
- data/spec/pages/store_pages/2.html +592 -0
- data/spec/pages_spec.rb +34 -0
- data/spec/spec_helper.rb +77 -0
- metadata +205 -0
@@ -0,0 +1,296 @@
|
|
1
|
+
module LCBO
  # Page definition for a single LCBO product details page.
  #
  # The page is addressed by +product_no+ (see the +uri+ template) and each
  # +emits+ declaration below names one field extracted from the response.
  # Values come from two places in the document: the hidden inputs of the
  # "productdetails" form, and the text lines of the main info cell.
  class ProductPage

    include CrawlKit::Page

    uri 'http://lcbo.com/lcbo-ear/lcbo/product/details.do?' \
        'language=EN&itemNumber={product_no}'

    on :before_parse, :verify_response_not_blank
    on :after_parse,  :verify_product_details_form
    on :after_parse,  :verify_product_name
    on :after_parse,  :verify_third_info_cell

    # Product number taken from the request's query parameters.
    emits :product_no do
      query_params[:product_no].to_i
    end

    # Title-cased value of the form's "itemName" input.
    emits :name do
      CrawlKit::TitleCaseHelper[product_details_form('itemName')]
    end

    # Current price as an Integer number of cents.
    emits :price_in_cents do
      # Round instead of truncating: in binary floating point 19.99 * 100 is
      # 1998.9999999999998, which #to_i would turn into 1998.
      (product_details_form('price').to_f * 100).round
    end

    # Price before any limited time offer, as an Integer number of cents.
    # Falls back to the current price when no offer is advertised.
    emits :regular_price_in_cents do
      if has_limited_time_offer
        # The line following "Was:" has the form "$ 12.95". Rounded so that
        # both branches return an Integer.
        (info_cell_line_after('Was:').sub('$ ', '').to_f * 100).round
      else
        price_in_cents
      end
    end

    # Difference between the regular and the current price (0 without offer).
    emits :limited_time_offer_savings_in_cents do
      regular_price_in_cents - price_in_cents
    end

    # End date of the limited time offer, or nil when there is none.
    # NOTE(review): this and :bonus_reward_miles_ends_on both read the line
    # after the first "Until" line; confirm what a page carrying both offers
    # looks like.
    emits :limited_time_offer_ends_on do
      if has_limited_time_offer
        CrawlKit::FastDateHelper[info_cell_line_after('Until')]
      else
        nil
      end
    end

    # Number of bonus reward miles (line after "Earn"), 0 when not offered.
    emits :bonus_reward_miles do
      if has_bonus_reward_miles
        info_cell_line_after('Earn').to_i
      else
        0
      end
    end

    # End date of the bonus reward miles offer, or nil when there is none.
    emits :bonus_reward_miles_ends_on do
      if has_bonus_reward_miles
        CrawlKit::FastDateHelper[info_cell_line_after('Until')]
      else
        nil
      end
    end

    # Raw value of the form's "stock type" input.
    emits :stock_type do
      product_details_form('stock type')
    end

    # First comma-separated segment of the stock category line, if any.
    emits :primary_category do
      if stock_category
        cat = stock_category.split(',')[0]
        cat ? cat.strip : cat
      end
    end

    # Second comma-separated segment of the stock category line, if any.
    emits :secondary_category do
      if stock_category
        cat = stock_category.split(',')[1]
        cat ? cat.strip : cat
      end
    end

    # Place of origin taken from the "Made in: " line, with a handful of
    # known spellings normalised and duplicate segments removed.
    emits :origin do
      match = find_info_line(/\AMade in: /)
      if match
        place = match.
          gsub('Made in: ', '').
          gsub('/Californie', '').
          gsub('Bosnia\'Hercegovina', 'Bosnia and Herzegovina').
          gsub('Is. Of', 'Island of').
          gsub('Italy Quality', 'Italy').
          gsub('Usa-', '').
          gsub(', Rep. Of', '').
          gsub('&', 'and')
        place.split(',').map { |s| s.strip }.uniq.join(', ')
      end
    end

    # Package description from the third info line (leading "|" removed),
    # or nil when that line is a price line instead.
    emits :package do
      @package ||= begin
        string = info_cell_lines[2]
        string.include?('Price: ') ? nil : string.sub('|','').strip
      end
    end

    emits :package_unit_type do
      volume_helper.unit_type
    end

    emits :package_unit_volume_in_milliliters do
      volume_helper.unit_volume
    end

    emits :total_package_units do
      volume_helper.total_units
    end

    emits :total_package_volume_in_milliliters do
      volume_helper.package_volume
    end

    emits :volume_in_milliliters do
      CrawlKit::VolumeHelper[package]
    end

    # Alcohol percentage multiplied by 100 as an Integer (e.g. "4.6%" => 460),
    # or nil when the line is missing or the value is zero.
    emits :alcohol_content do
      match = find_info_line(/ Alcohol\/Vol\.\Z/)
      if match
        ac = match.gsub(/%| Alcohol\/Vol\./, '').to_f
        # Rounded for the same reason as :price_in_cents (4.6 * 100 is
        # 459.99999999999994 as a float).
        ac.zero? ? nil : (ac * 100).round
      end
    end

    # Text following "Sugar Content : ", or nil when the line is absent.
    emits :sugar_content do
      match = find_info_line(/\ASugar Content : /)
      if match
        match.gsub('Sugar Content : ', '')
      end
    end

    # Title-cased producer from the "By: " line. Section headings that can
    # end up on the same line are stripped out.
    emits :producer_name do
      match = find_info_line(/\ABy: /)
      if match
        CrawlKit::TitleCaseHelper[
          match.gsub(/By: |Tasting Note|Serving Suggestion|NOTE:/, '')
        ]
      end
    end

    # Release date (line after "Release Date:"), nil when absent or "N/A".
    emits :released_on do
      if html.include?('Release Date:')
        date = info_cell_line_after('Release Date:')
        date == 'N/A' ? nil : CrawlKit::FastDateHelper[date]
      else
        nil
      end
    end

    emits :is_discontinued do
      html.include?('PRODUCT DISCONTINUED')
    end

    emits :has_limited_time_offer do
      html.include?('<B>Limited Time Offer</B>')
    end

    emits :has_bonus_reward_miles do
      html.include?('<B>Bonus Reward Miles Offer</B>')
    end

    emits :is_seasonal do
      html.include?('<font color="#ff0000">SEASONAL/LIMITED QUANTITIES</font>')
    end

    emits :is_vqa do
      html.include?('This is a <B>VQA</B> wine')
    end

    # Raw HTML of the "Description" section, or nil when there is none.
    # The capture is non-greedy so it stops at the first terminator instead
    # of running on to the last one in the document (the regexp is
    # multi-line, so a greedy capture could swallow later sections).
    emits :description do
      if html.include?('<B>Description</B>')
        match = html.match(/<B>Description<\/B><\/font><BR>\n\t\t\t(.*?)<BR>\n\t\t\t<BR>/m)
        match ? match.captures[0] : nil
      else
        nil
      end
    end

    # Raw HTML of the "Serving Suggestion" section, or nil (non-greedy, see
    # :description).
    emits :serving_suggestion do
      if html.include?('<B>Serving Suggestion</B>')
        match = html.match(/<B>Serving Suggestion<\/B><\/font><BR>\n\t\t\t(.*?)<BR><BR>/m)
        match ? match.captures[0] : nil
      else
        nil
      end
    end

    # Raw HTML of the "Tasting Note" section, or nil (non-greedy, see
    # :description).
    emits :tasting_note do
      if html.include?('<B>Tasting Note</B>')
        match = html.match(/<B>Tasting Note<\/B><\/font><BR>\n\t\t\t(.*?)<BR>\n\t\t\t<BR>/m)
        match ? match.captures[0] : nil
      else
        nil
      end
    end

    private

    # Memoized volume helper built from the package description.
    def volume_helper
      @volume_helper ||= CrawlKit::VolumeHelper.new(package)
    end

    # True when the third info line is not a price line.
    def has_package?
      !info_cell_lines[2].include?('Price:')
    end

    # First raw info line indented by exactly 12 whitespace characters that
    # is neither blank nor one of the known non-category lines; stripped.
    # Returns nil when no such line exists.
    def stock_category
      cat = get_info_lines_at_offset(12).reject do |line|
        l = line.strip
        l == '' ||
        l.include?('Price:') ||
        l.include?('Bonus Reward Miles Offer') ||
        l.include?('Value Added Promotion') ||
        l.include?('Limited Time Offer') ||
        l.include?('NOTE:')
      end.first
      cat ? cat.strip : nil
    end

    # Value attribute (as a String) of the named input inside the
    # "productdetails" form.
    def product_details_form(name)
      doc.css("form[name=\"productdetails\"] input[name=\"#{name}\"]")[0].
        attributes['value'].to_s
    end

    # Raw (unstripped) info lines whose leading whitespace run is exactly
    # +offset+ characters long.
    def get_info_lines_at_offset(offset)
      raw_info_cell_lines.select do |line|
        match = line.scan(/\A[\s]+/)[0]
        match ? offset == match.size : false
      end
    end

    # All non-blank info lines joined with newlines.
    def info_cell_text
      @info_cell_text ||= info_cell_lines.join("\n")
    end

    # First stripped info line matching +regexp+, or nil.
    def find_info_line(regexp)
      info_cell_lines.find { |l| l =~ regexp }
    end

    # Text content of the info cell split on newlines, untouched otherwise.
    def raw_info_cell_lines
      @raw_info_cell_lines ||= info_cell_element.content.split(/\n/)
    end

    # Info lines with surrounding whitespace removed and blank lines dropped.
    def info_cell_lines
      @info_cell_lines ||= begin
        raw_info_cell_lines.map { |l| l.strip }.reject { |l| l == '' }
      end
    end

    # The info line immediately following the line equal to +item+, or nil
    # when +item+ is not present.
    def info_cell_line_after(item)
      i = info_cell_lines.index(item)
      return unless i
      info_cell_lines[i + 1]
    end

    # Inner HTML of the info cell.
    def info_cell_html
      @info_cell_html ||= info_cell_element.inner_html
    end

    # First node matching the info cell selector.
    def info_cell_element
      doc.css('table[width="478"] td[height="271"] td[colspan="2"].main_font')[0]
    end

    # Raises when the third info line is a package line that does not start
    # with the expected "|" marker.
    def verify_third_info_cell
      return unless has_package? && info_cell_lines[2][0,1] != '|'
      raise CrawlKit::MalformedDocumentError,
        "Expected third line in info cell to begin with bar. LCBO No: " \
        "#{product_no}, Dump: #{info_cell_lines[2].inspect}"
    end

    # Raises when the response body is blank.
    def verify_response_not_blank
      return unless html.strip == ''
      raise CrawlKit::MissingResourceError,
        "product #{product_no} does not appear to exist"
    end

    # Raises when the form's "itemName" value is blank.
    def verify_product_name
      return unless product_details_form('itemName').strip == ''
      raise CrawlKit::MissingResourceError,
        "can not locate name for product #{product_no}"
    end

    # Raises when the document has no "productdetails" form.
    def verify_product_details_form
      return unless doc.css('form[name="productdetails"]').empty?
      raise CrawlKit::MalformedDocumentError,
        "productdetails form not found in doc for product #{product_no}"
    end

  end
end
|
@@ -0,0 +1,196 @@
|
|
1
|
+
require 'cgi'
|
2
|
+
|
3
|
+
module LCBO
  # Page definition for a single LCBO store information page.
  #
  # The page is addressed by +store_no+ (see the +uri+ template). Each
  # +emits+ declaration names one field extracted from the document: opening
  # hours per weekday, address parts, phone/fax, map coordinates and a set
  # of boolean store features read from checkboxes.
  class StorePage

    include CrawlKit::Page

    uri 'http://www.lcbo.com/lcbo-ear/jsp/storeinfo.jsp?' \
        'STORE={store_no}&language=EN'

    # Weekday names used both as emitted field prefixes and as lookup keys
    # into the parsed opening hours.
    DAY_NAMES = %w[
      monday
      tuesday
      wednesday
      thursday
      friday
      saturday
      sunday ].freeze

    # Emitted boolean field => term searched for in the checkbox row's HTML.
    DETAIL_FIELDS = {
      :has_wheelchair_accessability => 'wheelchair',
      :has_bilingual_services       => 'bilingual',
      :has_product_consultant       => 'consultant',
      :has_tasting_bar              => 'tasting',
      :has_beer_cold_room           => 'cold',
      :has_special_occasion_permits => 'permits',
      :has_vintages_corner          => 'vintages',
      :has_parking                  => 'parking',
      :has_transit_access           => 'transit' }.freeze

    on :before_parse, :verify_store_returned
    on :after_parse,  :verify_node_count
    on :after_parse,  :verify_telephone_number

    # Store number taken from the request's query parameters.
    emits :store_no do
      query_params[:store_no].to_i
    end

    # <day>_open / <day>_close: opening and closing time in minutes
    # (hour * 60 + minute), or nil for both when no distinct pair was found.
    DAY_NAMES.each do |day|
      emits :"#{day}_open" do
        time_open_close(day)[0]
      end

      emits :"#{day}_close" do
        time_open_close(day)[1]
      end
    end

    # Title-cased store name from the second info node.
    emits :name do
      CrawlKit::TitleCaseHelper[info_nodes[1].content.strip]
    end

    # Part of the third info node before the first comma.
    emits :address_line_1 do
      data = info_nodes[2].content.strip.split(',')[0]
      unless data
        raise CrawlKit::MalformedDocumentError,
          "unable to locate address for store #{store_no}"
      end
      CrawlKit::TitleCaseHelper[data.gsub(/[\n\r\t]+/, ' ').strip]
    end

    # Part of the third info node after the first comma, nil when absent.
    emits :address_line_2 do
      data = info_nodes[2].content.strip.split(',')[1]
      CrawlKit::TitleCaseHelper[data.gsub(/[\n\r\t]+/, ' ').strip] if data
    end

    # Part of the fourth info node before the first comma.
    emits :city do
      data = info_nodes[3].content.strip.split(',')[0]
      CrawlKit::TitleCaseHelper[data.gsub(/[\n\r\t]+/, ' ').strip] if data
    end

    # Upper-cased part of the fourth info node after the first comma.
    emits :postal_code do
      data = info_nodes[3].content.strip.split(',')[1]
      unless data
        raise CrawlKit::MalformedDocumentError,
          "unable to locate postal code for store #{store_no}"
      end
      data.gsub(/[\n\r\t]+/, ' ').strip.upcase
    end

    # Fifth info node with the "Telephone:" label removed.
    emits :telephone do
      info_nodes[4].content.
        gsub(/[\n\r\t]+/, ' ').
        gsub('Telephone:', '').
        strip
    end

    # Sixth info node with the "Fax:" label removed; nil without a fax line.
    emits :fax do
      if has_fax?
        info_nodes[5].content.gsub(/[\n\r\t]+/, ' ').gsub('Fax:', '').strip
      end
    end

    # "latitude" query parameter of the map link, as a Float.
    emits :latitude do
      location['latitude'][0].to_f
    end

    # "longitude" query parameter of the map link, as a Float.
    emits :longitude do
      location['longitude'][0].to_f
    end

    DETAIL_FIELDS.keys.each do |field|
      emits(field) { details[field] }
    end

    protected

    # Inner HTML of the grandparent element of every checkbox input.
    def detail_rows
      @detail_rows ||= begin
        doc.css('input[type="checkbox"]').map { |e| e.parent.parent.inner_html }
      end
    end

    # Hash of DETAIL_FIELDS keys to booleans: true when the row mentioning
    # the field's term also contains "checked".
    #
    # Raises CrawlKit::MalformedDocumentError when no row mentions the term,
    # rather than failing with a NoMethodError on nil.
    def details
      @details ||= begin
        DETAIL_FIELDS.reduce({}) do |hsh, (field, term)|
          row = detail_rows.detect { |row_html| row_html.include?(term) }
          unless row
            raise CrawlKit::MalformedDocumentError,
              "unable to locate #{term} detail row for store #{store_no}"
          end
          hsh.merge(field => row.include?('checked'))
        end
      end
    end

    # href of the first anchor in the info node that follows the phone/fax
    # nodes (its index shifts by one when a fax line is present).
    def map_anchor_href
      info_nodes[has_fax? ? 6 : 5].css('a').first.attributes['href'].to_s
    end

    # Query parameters of the map link, parsed once and memoized.
    # NOTE(review): URI is used here while this file only requires 'cgi';
    # confirm 'uri' is loaded elsewhere.
    def location
      @location ||= CGI.parse(URI.parse(map_anchor_href).query)
    end

    # True when the serialized info nodes contain "Fax: ".
    def has_fax?
      info_nodes.to_s.include?('Fax: ')
    end

    # [open, close] pair for the given day name.
    def time_open_close(day)
      open_close_times[day.to_s.downcase]
    end

    # Hash of lower-cased day name => [open, close] in minutes, built from
    # the hours table rows. A row whose times are equal (including a row
    # with no times at all) maps to [nil, nil].
    # NOTE(review): only the digits of "H:MM" are read, so any AM/PM marker
    # in the cell is ignored; confirm the page uses 24-hour times.
    def open_close_times
      @open_close_times ||= begin
        time_cells.inject({}) do |hsh, td|
          text = td.text.gsub(/\s+/, ' ')
          day = text.match(/[MTWTFS]{1}[a-z]+/).to_s.downcase
          times = text.scan(/[0-9]{1,2}:[0-9]{2}/)
          open, close = *times.map { |time|
            hour, min = *time.split(':').map { |t| t.to_i }
            (hour * 60) + min
          }
          hsh.merge(day => (open == close ? [nil, nil] : [open, close]))
        end
      end
    end

    def container_table
      @doc.css('table.border[width="478"]')
    end

    def hours_table
      container_table.css('table[width="100%"]')
    end

    def info_nodes
      container_table.css('td[width="48%"]')
    end

    # Rows of the hours table whose markup contains a weekday name.
    def time_cells
      hours_table.
        css('td[width="50%"] tr').
        select { |tr| tr.to_s =~ /[MTWTFS]{1}[onuesdhriat]{2,5}day/ }
    end

    # Number of info nodes expected on a well-formed page.
    def expected_node_count
      has_fax? ? 8 : 7
    end

    # Raises when the response is the "no stores" page.
    def verify_store_returned
      return if !@html.include?('No stores were located using your criteria.')
      raise CrawlKit::MissingResourceError, "store #{store_no} does not exist"
    end

    # Raises when the telephone field is blank. The emitted value is always
    # a String, so it has to be tested for emptiness rather than truthiness.
    def verify_telephone_number
      return unless telephone.to_s.strip.empty?
      raise CrawlKit::MalformedDocumentError,
        "unable to locate telephone number for store #{store_no}"
    end

    # Raises when the number of info nodes differs from what is expected.
    def verify_node_count
      return if expected_node_count == info_nodes.size
      raise CrawlKit::MalformedDocumentError,
        "Expected #{expected_node_count} nodes for store #{store_no} but found " \
        "#{info_nodes.size} instead."
    end

  end
end
|