lcbo 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (60)
  1. data/.gitignore +1 -0
  2. data/CHANGELOG.md +3 -0
  3. data/Gemfile +6 -0
  4. data/Gemfile.lock +18 -0
  5. data/LICENSE +18 -0
  6. data/README.md +29 -0
  7. data/Rakefile +62 -0
  8. data/lcbo.gemspec +29 -0
  9. data/lib/lcbo.rb +23 -0
  10. data/lib/lcbo/crawlers.rb +4 -0
  11. data/lib/lcbo/crawlers/inventories_crawler.rb +15 -0
  12. data/lib/lcbo/crawlers/product_lists_crawler.rb +23 -0
  13. data/lib/lcbo/crawlers/products_crawler.rb +16 -0
  14. data/lib/lcbo/crawlers/stores_crawler.rb +16 -0
  15. data/lib/lcbo/crawlkit.rb +24 -0
  16. data/lib/lcbo/crawlkit/eventable.rb +56 -0
  17. data/lib/lcbo/crawlkit/fastdate_helper.rb +40 -0
  18. data/lib/lcbo/crawlkit/page.rb +141 -0
  19. data/lib/lcbo/crawlkit/request.rb +51 -0
  20. data/lib/lcbo/crawlkit/request_prototype.rb +31 -0
  21. data/lib/lcbo/crawlkit/response.rb +48 -0
  22. data/lib/lcbo/crawlkit/titlecase_helper.rb +97 -0
  23. data/lib/lcbo/crawlkit/volume_helper.rb +46 -0
  24. data/lib/lcbo/ext.rb +13 -0
  25. data/lib/lcbo/helpers.rb +34 -0
  26. data/lib/lcbo/pages.rb +4 -0
  27. data/lib/lcbo/pages/inventory_page.rb +60 -0
  28. data/lib/lcbo/pages/product_list_page.rb +85 -0
  29. data/lib/lcbo/pages/product_page.rb +296 -0
  30. data/lib/lcbo/pages/store_page.rb +196 -0
  31. data/lib/lcbo/version.rb +3 -0
  32. data/spec/crawlkit/eventable_spec.rb +23 -0
  33. data/spec/crawlkit/fastdate_helper_spec.rb +18 -0
  34. data/spec/crawlkit/page_spec.rb +114 -0
  35. data/spec/crawlkit/request_prototype_spec.rb +5 -0
  36. data/spec/crawlkit/request_spec.rb +41 -0
  37. data/spec/crawlkit/response_spec.rb +5 -0
  38. data/spec/crawlkit/titlecase_helper_spec.rb +30 -0
  39. data/spec/crawlkit/volume_helper_spec.rb +21 -0
  40. data/spec/crawlkit_spec.rb +5 -0
  41. data/spec/lcbo_spec.rb +38 -0
  42. data/spec/pages/inventory_pages.yml +1685 -0
  43. data/spec/pages/inventory_pages/1.html +11649 -0
  44. data/spec/pages/inventory_pages/2.html +495 -0
  45. data/spec/pages/product_list_pages.yml +108 -0
  46. data/spec/pages/product_list_pages/1.html +4866 -0
  47. data/spec/pages/product_pages.yml +258 -0
  48. data/spec/pages/product_pages/1.html +1319 -0
  49. data/spec/pages/product_pages/2.html +1343 -0
  50. data/spec/pages/product_pages/3.html +1336 -0
  51. data/spec/pages/product_pages/4.html +1319 -0
  52. data/spec/pages/product_pages/5.html +1324 -0
  53. data/spec/pages/product_pages/6.html +1319 -0
  54. data/spec/pages/product_pages/7.html +1314 -0
  55. data/spec/pages/store_pages.yml +80 -0
  56. data/spec/pages/store_pages/1.html +592 -0
  57. data/spec/pages/store_pages/2.html +592 -0
  58. data/spec/pages_spec.rb +34 -0
  59. data/spec/spec_helper.rb +77 -0
  60. metadata +205 -0
@@ -0,0 +1,296 @@
1
module LCBO
  # Page definition for a single LCBO product details page.
  #
  # The class is declared with the CrawlKit::Page DSL (+uri+, +on+, +emits+),
  # which is defined outside this file. Every +emits+ block computes one field
  # of the product, reading either the inputs of the "productdetails" form or
  # the lines of text found in the page's main info cell.
  class ProductPage

    include CrawlKit::Page

    uri 'http://lcbo.com/lcbo-ear/lcbo/product/details.do?' \
        'language=EN&itemNumber={product_no}'

    # Sanity checks registered for the :before_parse / :after_parse events.
    on :before_parse, :verify_response_not_blank
    on :after_parse,  :verify_product_details_form
    on :after_parse,  :verify_product_name
    on :after_parse,  :verify_third_info_cell

    # Product number taken from the request's query parameters.
    emits :product_no do
      query_params[:product_no].to_i
    end

    # Title-cased product name from the form's "itemName" input.
    emits :name do
      CrawlKit::TitleCaseHelper[product_details_form('itemName')]
    end

    # Current price as an Integer number of cents.
    emits :price_in_cents do
      # Round instead of truncating: "19.99".to_f * 100 is 1998.9999999999998,
      # which #to_i would turn into 1998.
      (product_details_form('price').to_f * 100).round
    end

    # Price before any limited time offer, as an Integer number of cents.
    # Falls back to the current price when no offer is advertised.
    emits :regular_price_in_cents do
      if has_limited_time_offer
        # The line following "Was:" looks like "$ 12.95"; rounded for the same
        # floating point reason as price_in_cents, and so that both branches
        # return an Integer.
        (info_cell_line_after('Was:').sub('$ ', '').to_f * 100).round
      else
        price_in_cents
      end
    end

    # Difference between the regular and the current price (0 without offer).
    emits :limited_time_offer_savings_in_cents do
      regular_price_in_cents - price_in_cents
    end

    # End date of the limited time offer, or nil when there is none.
    # NOTE(review): this and bonus_reward_miles_ends_on both read the line
    # after the *first* "Until" in the info cell; if a page can carry both
    # offers they would report the same date -- confirm against real pages.
    emits :limited_time_offer_ends_on do
      if has_limited_time_offer
        CrawlKit::FastDateHelper[info_cell_line_after('Until')]
      end
    end

    # Number of bonus reward miles (0 when the page advertises none).
    emits :bonus_reward_miles do
      has_bonus_reward_miles ? info_cell_line_after('Earn').to_i : 0
    end

    # End date of the bonus reward miles offer, or nil when there is none.
    emits :bonus_reward_miles_ends_on do
      if has_bonus_reward_miles
        CrawlKit::FastDateHelper[info_cell_line_after('Until')]
      end
    end

    # Raw value of the form's "stock type" input.
    emits :stock_type do
      product_details_form('stock type')
    end

    # First comma separated part of the stock category line, or nil.
    emits :primary_category do
      stock_category_part(0)
    end

    # Second comma separated part of the stock category line, or nil.
    emits :secondary_category do
      stock_category_part(1)
    end

    # Normalised place of origin built from the "Made in: " line, or nil when
    # the page has no such line.
    emits :origin do
      match = find_info_line(/\AMade in: /)
      if match
        place = match.
          gsub('Made in: ', '').
          gsub('/Californie', '').
          gsub('Bosnia\'Hercegovina', 'Bosnia and Herzegovina').
          gsub('Is. Of', 'Island of').
          gsub('Italy Quality', 'Italy').
          gsub('Usa-', '').
          gsub(', Rep. Of', '').
          gsub('&', 'and')
        # Drop duplicated parts while keeping their original order.
        place.split(',').map { |s| s.strip }.uniq.join(', ')
      end
    end

    # Package description from the third info cell line (leading bar
    # removed), or nil when that line is already the price line.
    emits :package do
      @package ||= begin
        string = info_cell_lines[2]
        string.include?('Price: ') ? nil : string.sub('|', '').strip
      end
    end

    emits :package_unit_type do
      volume_helper.unit_type
    end

    emits :package_unit_volume_in_milliliters do
      volume_helper.unit_volume
    end

    emits :total_package_units do
      volume_helper.total_units
    end

    emits :total_package_volume_in_milliliters do
      volume_helper.package_volume
    end

    emits :volume_in_milliliters do
      CrawlKit::VolumeHelper[package]
    end

    # Alcohol content in hundredths of a percent (e.g. 12.5% => 1250), or nil
    # when the page has no alcohol line or the parsed value is zero.
    emits :alcohol_content do
      match = find_info_line(/ Alcohol\/Vol\.\Z/)
      if match
        ac = match.gsub(/%| Alcohol\/Vol\./, '').to_f
        # Round instead of truncating: 4.6 * 100 is 459.99999999999994.
        ac.zero? ? nil : (ac * 100).round
      end
    end

    # Text following "Sugar Content : ", or nil when the line is absent.
    emits :sugar_content do
      match = find_info_line(/\ASugar Content : /)
      match.gsub('Sugar Content : ', '') if match
    end

    # Title-cased producer taken from the "By: " line, or nil when absent.
    emits :producer_name do
      match = find_info_line(/\ABy: /)
      if match
        # Section headings that appear on the same line are removed as well.
        CrawlKit::TitleCaseHelper[
          match.gsub(/By: |Tasting Note|Serving Suggestion|NOTE:/, '')
        ]
      end
    end

    # Release date, or nil when the page has none or lists it as "N/A".
    emits :released_on do
      if html.include?('Release Date:')
        date = info_cell_line_after('Release Date:')
        date == 'N/A' ? nil : CrawlKit::FastDateHelper[date]
      end
    end

    emits :is_discontinued do
      html.include?('PRODUCT DISCONTINUED')
    end

    emits :has_limited_time_offer do
      html.include?('<B>Limited Time Offer</B>')
    end

    emits :has_bonus_reward_miles do
      html.include?('<B>Bonus Reward Miles Offer</B>')
    end

    emits :is_seasonal do
      html.include?('<font color="#ff0000">SEASONAL/LIMITED QUANTITIES</font>')
    end

    emits :is_vqa do
      html.include?('This is a <B>VQA</B> wine')
    end

    # NOTE(review): the three section patterns below use a greedy, multi-line
    # (.*); when several sections are present the capture may run up to the
    # last matching terminator on the page. Left unchanged because the real
    # page layout is not visible here -- confirm against the page fixtures.

    # HTML found under the "Description" heading, or nil.
    emits :description do
      if html.include?('<B>Description</B>')
        match = html.match(/<B>Description<\/B><\/font><BR>\n\t\t\t(.*)<BR>\n\t\t\t<BR>/m)
        match ? match.captures[0] : nil
      end
    end

    # HTML found under the "Serving Suggestion" heading, or nil.
    emits :serving_suggestion do
      if html.include?('<B>Serving Suggestion</B>')
        match = html.match(/<B>Serving Suggestion<\/B><\/font><BR>\n\t\t\t(.*)<BR><BR>/m)
        match ? match.captures[0] : nil
      end
    end

    # HTML found under the "Tasting Note" heading, or nil.
    emits :tasting_note do
      if html.include?('<B>Tasting Note</B>')
        match = html.match(/<B>Tasting Note<\/B><\/font><BR>\n\t\t\t(.*)<BR>\n\t\t\t<BR>/m)
        match ? match.captures[0] : nil
      end
    end

    private

    # Memoized CrawlKit::VolumeHelper built from the package description.
    def volume_helper
      @volume_helper ||= CrawlKit::VolumeHelper.new(package)
    end

    # True when the third info cell line is not the price line, i.e. the page
    # lists a package description.
    def has_package?
      !info_cell_lines[2].include?('Price:')
    end

    # First non-blank info line indented by exactly 12 whitespace characters
    # that is not one of the known non-category labels; nil when none is left.
    def stock_category
      ignored = ['Price:', 'Bonus Reward Miles Offer', 'Value Added Promotion',
                 'Limited Time Offer', 'NOTE:']
      cat = get_info_lines_at_offset(12).detect do |line|
        text = line.strip
        text != '' && ignored.none? { |label| text.include?(label) }
      end
      cat ? cat.strip : nil
    end

    # Stripped comma separated part of the stock category at +index+, or nil
    # when there is no category line or no such part.
    def stock_category_part(index)
      category = stock_category
      return unless category
      part = category.split(',')[index]
      part ? part.strip : part
    end

    # Value attribute (as a String) of the named input in the productdetails
    # form.
    def product_details_form(name)
      doc.css("form[name=\"productdetails\"] input[name=\"#{name}\"]")[0].
        attributes['value'].to_s
    end

    # Raw info cell lines whose leading whitespace is exactly +offset+
    # characters long.
    def get_info_lines_at_offset(offset)
      raw_info_cell_lines.select do |line|
        indent = line[/\A[\s]+/]
        indent ? offset == indent.size : false
      end
    end

    # Cleaned info cell lines joined back into a single string.
    def info_cell_text
      @info_cell_text ||= info_cell_lines.join("\n")
    end

    # First cleaned info cell line matching +regexp+, or nil.
    def find_info_line(regexp)
      info_cell_lines.detect { |l| l =~ regexp }
    end

    # Text content of the info cell split on newlines, whitespace preserved.
    def raw_info_cell_lines
      @raw_info_cell_lines ||= info_cell_element.content.split(/\n/)
    end

    # Info cell lines with surrounding whitespace stripped and blanks removed.
    def info_cell_lines
      @info_cell_lines ||= begin
        raw_info_cell_lines.map { |l| l.strip }.reject { |l| l == '' }
      end
    end

    # Line that follows the first cleaned line equal to +item+, or nil when
    # +item+ is not present.
    def info_cell_line_after(item)
      i = info_cell_lines.index(item)
      return unless i
      info_cell_lines[i + 1]
    end

    # Inner HTML of the info cell.
    def info_cell_html
      @info_cell_html ||= info_cell_element.inner_html
    end

    # First element matched by the info cell selector.
    def info_cell_element
      doc.css('table[width="478"] td[height="271"] td[colspan="2"].main_font')[0]
    end

    # Raises CrawlKit::MalformedDocumentError when a package line is expected
    # but the third info cell line does not start with a bar.
    def verify_third_info_cell
      return unless has_package? && info_cell_lines[2][0,1] != '|'
      raise CrawlKit::MalformedDocumentError,
        "Expected third line in info cell to begin with bar. LCBO No: " \
        "#{product_no}, Dump: #{info_cell_lines[2].inspect}"
    end

    # Raises CrawlKit::MissingResourceError when the response body is blank.
    def verify_response_not_blank
      return unless html.strip == ''
      raise CrawlKit::MissingResourceError,
        "product #{product_no} does not appear to exist"
    end

    # Raises CrawlKit::MissingResourceError when the product name is blank.
    def verify_product_name
      return unless product_details_form('itemName').strip == ''
      raise CrawlKit::MissingResourceError,
        "can not locate name for product #{product_no}"
    end

    # Raises CrawlKit::MalformedDocumentError when the productdetails form is
    # missing from the document.
    def verify_product_details_form
      return unless doc.css('form[name="productdetails"]').empty?
      raise CrawlKit::MalformedDocumentError,
        "productdetails form not found in doc for product #{product_no}"
    end

  end
end
@@ -0,0 +1,196 @@
1
+ require 'cgi'
2
+
3
module LCBO
  # Page definition for a single LCBO store information page.
  #
  # The class is declared with the CrawlKit::Page DSL (+uri+, +on+, +emits+),
  # which is defined outside this file. Fields are read from the info cells,
  # the opening hours table and the feature check boxes of the page.
  class StorePage

    include CrawlKit::Page

    uri 'http://www.lcbo.com/lcbo-ear/jsp/storeinfo.jsp?' \
        'STORE={store_no}&language=EN'

    # Day names used to generate the <day>_open / <day>_close fields.
    DAY_NAMES = %w[
      monday
      tuesday
      wednesday
      thursday
      friday
      saturday
      sunday ].freeze

    # Emitted boolean field => term identifying its check box row on the page.
    DETAIL_FIELDS = {
      :has_wheelchair_accessability => 'wheelchair',
      :has_bilingual_services       => 'bilingual',
      :has_product_consultant       => 'consultant',
      :has_tasting_bar              => 'tasting',
      :has_beer_cold_room           => 'cold',
      :has_special_occasion_permits => 'permits',
      :has_vintages_corner          => 'vintages',
      :has_parking                  => 'parking',
      :has_transit_access           => 'transit' }.freeze

    # Sanity checks registered for the :before_parse / :after_parse events.
    on :before_parse, :verify_store_returned
    on :after_parse,  :verify_node_count
    on :after_parse,  :verify_telephone_number

    # Store number taken from the request's query parameters.
    emits :store_no do
      query_params[:store_no].to_i
    end

    # Opening and closing time of every week day, in minutes since midnight
    # (nil when both times are equal or missing).
    DAY_NAMES.each do |day|
      emits :"#{day}_open" do
        time_open_close(day)[0]
      end

      emits :"#{day}_close" do
        time_open_close(day)[1]
      end
    end

    # Title-cased store name from the second info node.
    emits :name do
      CrawlKit::TitleCaseHelper[info_nodes[1].content.strip]
    end

    # First comma separated part of the address node; raises
    # CrawlKit::MalformedDocumentError when it is missing.
    emits :address_line_1 do
      data = info_node_part(2, 0)
      unless data
        raise CrawlKit::MalformedDocumentError,
          "unable to locate address for store #{store_no}"
      end
      CrawlKit::TitleCaseHelper[data]
    end

    # Second comma separated part of the address node, or nil.
    emits :address_line_2 do
      data = info_node_part(2, 1)
      CrawlKit::TitleCaseHelper[data] if data
    end

    # First comma separated part of the city / postal code node, or nil.
    emits :city do
      data = info_node_part(3, 0)
      CrawlKit::TitleCaseHelper[data] if data
    end

    # Upper-cased second part of the city / postal code node; raises
    # CrawlKit::MalformedDocumentError when it is missing.
    emits :postal_code do
      data = info_node_part(3, 1)
      unless data
        raise CrawlKit::MalformedDocumentError,
          "unable to locate postal code for store #{store_no}"
      end
      data.upcase
    end

    # Telephone number with the "Telephone:" label removed.
    emits :telephone do
      info_nodes[4].content.
        gsub(/[\n\r\t]+/, ' ').
        gsub('Telephone:', '').
        strip
    end

    # Fax number with the "Fax:" label removed, or nil when the page lists no
    # fax number.
    emits :fax do
      if has_fax?
        info_nodes[5].content.gsub(/[\n\r\t]+/, ' ').gsub('Fax:', '').strip
      end
    end

    # Coordinates read from the query string of the map link.
    emits :latitude do
      location['latitude'][0].to_f
    end

    emits :longitude do
      location['longitude'][0].to_f
    end

    # One boolean field per entry of DETAIL_FIELDS.
    DETAIL_FIELDS.keys.each do |field|
      emits(field) { details[field] }
    end

    protected

    # Inner HTML of the grandparent element of every check box on the page.
    def detail_rows
      @detail_rows ||= begin
        doc.css('input[type="checkbox"]').map { |e| e.parent.parent.inner_html }
      end
    end

    # Hash of DETAIL_FIELDS keys to booleans: true when the row mentioning the
    # field's term contains "checked".
    #
    # Raises CrawlKit::MalformedDocumentError when no row mentions the term
    # (previously this surfaced as a NoMethodError on nil).
    def details
      @details ||= DETAIL_FIELDS.inject({}) do |hsh, (field, term)|
        row = detail_rows.detect { |row_html| row_html.include?(term) }
        unless row
          raise CrawlKit::MalformedDocumentError,
            "unable to locate #{term} detail row for store #{store_no}"
        end
        hsh[field] = row.include?('checked')
        hsh
      end
    end

    # href of the first link in the map node; the node's position depends on
    # whether a fax node precedes it.
    def map_anchor_href
      info_nodes[has_fax? ? 6 : 5].css('a').first.attributes['href'].to_s
    end

    # Parsed query string of the map link (memoized, since both latitude and
    # longitude read from it).
    def location
      @location ||= CGI.parse(URI.parse(map_anchor_href).query)
    end

    # True when the serialized info nodes contain a "Fax: " label.
    def has_fax?
      info_nodes.to_s.include?('Fax: ')
    end

    # [open, close] pair for +day+ (String or Symbol, any case).
    def time_open_close(day)
      open_close_times[day.to_s.downcase]
    end

    # Hash of lower-cased day name => [open, close] in minutes since midnight,
    # or [nil, nil] when both times are equal (including both missing).
    def open_close_times
      @open_close_times ||= time_cells.inject({}) do |hsh, row|
        text = row.text.gsub(/\s+/, ' ')
        day  = text.match(/[MTWTFS]{1}[a-z]+/).to_s.downcase
        opens, closes = text.scan(/[0-9]{1,2}:[0-9]{2}/).map do |time|
          hour, min = time.split(':').map { |t| t.to_i }
          (hour * 60) + min
        end
        hsh[day] = (opens == closes ? [nil, nil] : [opens, closes])
        hsh
      end
    end

    # Table that wraps all store information.
    def container_table
      doc.css('table.border[width="478"]')
    end

    # Full-width table(s) nested in the container, holding the opening hours.
    def hours_table
      container_table.css('table[width="100%"]')
    end

    # Cells holding name, address, city, telephone, fax and map information
    # (memoized; nearly every field reads from them).
    def info_nodes
      @info_nodes ||= container_table.css('td[width="48%"]')
    end

    # Rows of the hours table whose markup mentions a week day name.
    def time_cells
      hours_table.
        css('td[width="50%"] tr').
        select { |row| row.to_s =~ /[MTWTFS]{1}[onuesdhriat]{2,5}day/ }
    end

    # Number of info nodes a well-formed page has: one more with a fax number.
    def expected_node_count
      has_fax? ? 8 : 7
    end

    # Cleaned comma separated part +part_index+ of info node +node_index+:
    # runs of newlines/tabs collapsed to a space and the result stripped.
    # Returns nil when the node has no such part.
    def info_node_part(node_index, part_index)
      part = info_nodes[node_index].content.strip.split(',')[part_index]
      part ? part.gsub(/[\n\r\t]+/, ' ').strip : nil
    end

    # Raises CrawlKit::MissingResourceError when the site reports that no
    # store matched the request.
    def verify_store_returned
      return unless html.include?('No stores were located using your criteria.')
      raise CrawlKit::MissingResourceError, "store #{store_no} does not exist"
    end

    # Raises CrawlKit::MalformedDocumentError when the telephone number is
    # blank. The field always returns a String, so emptiness (not truthiness)
    # is what has to be checked.
    def verify_telephone_number
      return unless telephone.to_s.empty?
      raise CrawlKit::MalformedDocumentError,
        "unable to locate telephone number for store #{store_no}"
    end

    # Raises CrawlKit::MalformedDocumentError when the number of info nodes
    # differs from expected_node_count.
    def verify_node_count
      return if expected_node_count == info_nodes.size
      raise CrawlKit::MalformedDocumentError,
        "Expected #{expected_node_count} nodes for store #{store_no} but found " \
        "#{info_nodes.size} instead."
    end

  end
end