lcbo 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60):
  1. data/.gitignore +1 -0
  2. data/CHANGELOG.md +3 -0
  3. data/Gemfile +6 -0
  4. data/Gemfile.lock +18 -0
  5. data/LICENSE +18 -0
  6. data/README.md +29 -0
  7. data/Rakefile +62 -0
  8. data/lcbo.gemspec +29 -0
  9. data/lib/lcbo.rb +23 -0
  10. data/lib/lcbo/crawlers.rb +4 -0
  11. data/lib/lcbo/crawlers/inventories_crawler.rb +15 -0
  12. data/lib/lcbo/crawlers/product_lists_crawler.rb +23 -0
  13. data/lib/lcbo/crawlers/products_crawler.rb +16 -0
  14. data/lib/lcbo/crawlers/stores_crawler.rb +16 -0
  15. data/lib/lcbo/crawlkit.rb +24 -0
  16. data/lib/lcbo/crawlkit/eventable.rb +56 -0
  17. data/lib/lcbo/crawlkit/fastdate_helper.rb +40 -0
  18. data/lib/lcbo/crawlkit/page.rb +141 -0
  19. data/lib/lcbo/crawlkit/request.rb +51 -0
  20. data/lib/lcbo/crawlkit/request_prototype.rb +31 -0
  21. data/lib/lcbo/crawlkit/response.rb +48 -0
  22. data/lib/lcbo/crawlkit/titlecase_helper.rb +97 -0
  23. data/lib/lcbo/crawlkit/volume_helper.rb +46 -0
  24. data/lib/lcbo/ext.rb +13 -0
  25. data/lib/lcbo/helpers.rb +34 -0
  26. data/lib/lcbo/pages.rb +4 -0
  27. data/lib/lcbo/pages/inventory_page.rb +60 -0
  28. data/lib/lcbo/pages/product_list_page.rb +85 -0
  29. data/lib/lcbo/pages/product_page.rb +296 -0
  30. data/lib/lcbo/pages/store_page.rb +196 -0
  31. data/lib/lcbo/version.rb +3 -0
  32. data/spec/crawlkit/eventable_spec.rb +23 -0
  33. data/spec/crawlkit/fastdate_helper_spec.rb +18 -0
  34. data/spec/crawlkit/page_spec.rb +114 -0
  35. data/spec/crawlkit/request_prototype_spec.rb +5 -0
  36. data/spec/crawlkit/request_spec.rb +41 -0
  37. data/spec/crawlkit/response_spec.rb +5 -0
  38. data/spec/crawlkit/titlecase_helper_spec.rb +30 -0
  39. data/spec/crawlkit/volume_helper_spec.rb +21 -0
  40. data/spec/crawlkit_spec.rb +5 -0
  41. data/spec/lcbo_spec.rb +38 -0
  42. data/spec/pages/inventory_pages.yml +1685 -0
  43. data/spec/pages/inventory_pages/1.html +11649 -0
  44. data/spec/pages/inventory_pages/2.html +495 -0
  45. data/spec/pages/product_list_pages.yml +108 -0
  46. data/spec/pages/product_list_pages/1.html +4866 -0
  47. data/spec/pages/product_pages.yml +258 -0
  48. data/spec/pages/product_pages/1.html +1319 -0
  49. data/spec/pages/product_pages/2.html +1343 -0
  50. data/spec/pages/product_pages/3.html +1336 -0
  51. data/spec/pages/product_pages/4.html +1319 -0
  52. data/spec/pages/product_pages/5.html +1324 -0
  53. data/spec/pages/product_pages/6.html +1319 -0
  54. data/spec/pages/product_pages/7.html +1314 -0
  55. data/spec/pages/store_pages.yml +80 -0
  56. data/spec/pages/store_pages/1.html +592 -0
  57. data/spec/pages/store_pages/2.html +592 -0
  58. data/spec/pages_spec.rb +34 -0
  59. data/spec/spec_helper.rb +77 -0
  60. metadata +205 -0
@@ -0,0 +1,296 @@
1
module LCBO
  # Parser for a single LCBO product detail page.
  #
  # The class mixes in CrawlKit::Page and uses its class-level DSL (`uri`,
  # `on`, `emits`). Every `emits` block below computes one named field of the
  # product from either the hidden inputs of the "productdetails" form or the
  # lines of text in the page's info cell.
  class ProductPage

    include CrawlKit::Page

    # Info-cell lines containing any of these markers are never treated as
    # the stock category line (see #stock_category).
    NON_CATEGORY_MARKERS = [
      'Price:',
      'Bonus Reward Miles Offer',
      'Value Added Promotion',
      'Limited Time Offer',
      'NOTE:'
    ].freeze

    uri 'http://lcbo.com/lcbo-ear/lcbo/product/details.do?' \
        'language=EN&itemNumber={product_no}'

    on :before_parse, :verify_response_not_blank
    on :after_parse,  :verify_product_details_form
    on :after_parse,  :verify_product_name
    on :after_parse,  :verify_third_info_cell

    emits :product_no do
      query_params[:product_no].to_i
    end

    emits :name do
      CrawlKit::TitleCaseHelper[product_details_form('itemName')]
    end

    # Current shelf price as an Integer number of cents.
    emits :price_in_cents do
      dollars_to_cents(product_details_form('price'))
    end

    # Price before any limited time offer, as an Integer number of cents.
    # Falls back to the current price when there is no offer or when the
    # "Was:" line cannot be found in the info cell.
    emits :regular_price_in_cents do
      was = has_limited_time_offer ? info_cell_line_after('Was:') : nil
      was ? dollars_to_cents(was.sub('$ ', '')) : price_in_cents
    end

    emits :limited_time_offer_savings_in_cents do
      regular_price_in_cents - price_in_cents
    end

    # NOTE(review): this and :bonus_reward_miles_ends_on both read the line
    # after the first "Until" entry; confirm the page never shows both offers
    # with different end dates.
    emits :limited_time_offer_ends_on do
      if has_limited_time_offer
        CrawlKit::FastDateHelper[info_cell_line_after('Until')]
      end
    end

    emits :bonus_reward_miles do
      has_bonus_reward_miles ? info_cell_line_after('Earn').to_i : 0
    end

    emits :bonus_reward_miles_ends_on do
      if has_bonus_reward_miles
        CrawlKit::FastDateHelper[info_cell_line_after('Until')]
      end
    end

    emits :stock_type do
      product_details_form('stock type')
    end

    # First comma-separated part of the stock category line, or nil.
    emits :primary_category do
      stock_category_part(0)
    end

    # Second comma-separated part of the stock category line, or nil.
    emits :secondary_category do
      stock_category_part(1)
    end

    # Place of origin taken from the "Made in: " line, with a handful of
    # known spellings normalised and duplicate parts removed. Returns nil
    # when the page has no such line.
    emits :origin do
      match = find_info_line(/\AMade in: /)
      if match
        place = match.
          gsub('Made in: ', '').
          gsub('/Californie', '').
          gsub('Bosnia\'Hercegovina', 'Bosnia and Herzegovina').
          gsub('Is. Of', 'Island of').
          gsub('Italy Quality', 'Italy').
          gsub('Usa-', '').
          gsub(', Rep. Of', '').
          gsub('&', 'and')
        place.split(',').map { |s| s.strip }.uniq.join(', ')
      end
    end

    # Package description from the third info-cell line with its leading bar
    # removed, or nil when that line is missing or is the price line.
    emits :package do
      @package ||= begin
        line = info_cell_lines[2]
        line.sub('|', '').strip if line && !line.include?('Price: ')
      end
    end

    emits :package_unit_type do
      volume_helper.unit_type
    end

    emits :package_unit_volume_in_milliliters do
      volume_helper.unit_volume
    end

    emits :total_package_units do
      volume_helper.total_units
    end

    emits :total_package_volume_in_milliliters do
      volume_helper.package_volume
    end

    emits :volume_in_milliliters do
      CrawlKit::VolumeHelper[package]
    end

    # Alcohol by volume in hundredths of a percent (12.5% => 1250), or nil
    # when the line is missing or the parsed value is zero.
    emits :alcohol_content do
      match = find_info_line(/ Alcohol\/Vol\.\Z/)
      if match
        ac = match.gsub(/%| Alcohol\/Vol\./, '').to_f
        # round, not to_i: e.g. 4.35 * 100 is 434.99999999999994 as a Float.
        ac.zero? ? nil : (ac * 100).round
      end
    end

    emits :sugar_content do
      match = find_info_line(/\ASugar Content : /)
      match.gsub('Sugar Content : ', '') if match
    end

    emits :producer_name do
      match = find_info_line(/\ABy: /)
      if match
        CrawlKit::TitleCaseHelper[
          match.gsub(/By: |Tasting Note|Serving Suggestion|NOTE:/, '')
        ]
      end
    end

    emits :released_on do
      if html.include?('Release Date:')
        date = info_cell_line_after('Release Date:')
        date == 'N/A' ? nil : CrawlKit::FastDateHelper[date]
      end
    end

    emits :is_discontinued do
      html.include?('PRODUCT DISCONTINUED')
    end

    emits :has_limited_time_offer do
      html.include?('<B>Limited Time Offer</B>')
    end

    emits :has_bonus_reward_miles do
      html.include?('<B>Bonus Reward Miles Offer</B>')
    end

    emits :is_seasonal do
      html.include?('<font color="#ff0000">SEASONAL/LIMITED QUANTITIES</font>')
    end

    emits :is_vqa do
      html.include?('This is a <B>VQA</B> wine')
    end

    emits :description do
      first_capture(/<B>Description<\/B><\/font><BR>\n\t\t\t(.*)<BR>\n\t\t\t<BR>/m)
    end

    emits :serving_suggestion do
      first_capture(/<B>Serving Suggestion<\/B><\/font><BR>\n\t\t\t(.*)<BR><BR>/m)
    end

    emits :tasting_note do
      first_capture(/<B>Tasting Note<\/B><\/font><BR>\n\t\t\t(.*)<BR>\n\t\t\t<BR>/m)
    end

    private

    # Converts a dollar amount (String or Numeric) to Integer cents.
    # Rounds instead of truncating because Float multiplication is inexact:
    # 19.99 * 100 evaluates to 1998.9999999999998.
    def dollars_to_cents(amount)
      (amount.to_f * 100).round
    end

    # Returns the first capture group of +regexp+ matched against the raw
    # page HTML, or nil when the pattern does not match.
    def first_capture(regexp)
      match = html.match(regexp)
      match ? match.captures[0] : nil
    end

    def volume_helper
      @volume_helper ||= CrawlKit::VolumeHelper.new(package)
    end

    # True when the third info-cell line exists and is not the price line.
    def has_package?
      line = info_cell_lines[2]
      !line.nil? && !line.include?('Price:')
    end

    # Returns the +index+-th comma-separated part of the stock category,
    # stripped, or nil when there is no category or no such part.
    def stock_category_part(index)
      category = stock_category
      return nil unless category
      part = category.split(',')[index]
      part ? part.strip : nil
    end

    # First non-blank raw info line indented by exactly 12 whitespace
    # characters that contains none of NON_CATEGORY_MARKERS, stripped.
    # Returns nil when no line qualifies.
    def stock_category
      get_info_lines_at_offset(12).map { |line| line.strip }.find do |line|
        line != '' && NON_CATEGORY_MARKERS.none? { |marker| line.include?(marker) }
      end
    end

    # Value attribute (as a String) of the named input inside the
    # "productdetails" form.
    #
    # Raises CrawlKit::MalformedDocumentError when the input is not present.
    def product_details_form(name)
      input = doc.css("form[name=\"productdetails\"] input[name=\"#{name}\"]")[0]
      unless input
        raise CrawlKit::MalformedDocumentError,
          "unable to locate productdetails input #{name.inspect} " \
          "for product #{product_no}"
      end
      input.attributes['value'].to_s
    end

    # Raw info-cell lines whose leading whitespace is exactly +offset+
    # characters long. Lines without leading whitespace never match.
    def get_info_lines_at_offset(offset)
      raw_info_cell_lines.select do |line|
        indent = line[/\A\s+/]
        !indent.nil? && indent.size == offset
      end
    end

    def info_cell_text
      @info_cell_text ||= info_cell_lines.join("\n")
    end

    # First stripped info-cell line matching +regexp+, or nil.
    def find_info_line(regexp)
      info_cell_lines.find { |line| line =~ regexp }
    end

    def raw_info_cell_lines
      @raw_info_cell_lines ||= info_cell_element.content.split(/\n/)
    end

    # Info-cell lines with surrounding whitespace removed and blanks dropped.
    def info_cell_lines
      @info_cell_lines ||= begin
        raw_info_cell_lines.map { |l| l.strip }.reject { |l| l == '' }
      end
    end

    # The info-cell line that follows the line exactly equal to +item+,
    # or nil when +item+ is absent (or is the last line).
    def info_cell_line_after(item)
      i = info_cell_lines.index(item)
      return unless i
      info_cell_lines[i + 1]
    end

    def info_cell_html
      @info_cell_html ||= info_cell_element.inner_html
    end

    def info_cell_element
      doc.css('table[width="478"] td[height="271"] td[colspan="2"].main_font')[0]
    end

    # The third info line must either be the price line (no package given)
    # or start with a bar. A missing third line is reported as malformed too.
    def verify_third_info_cell
      line = info_cell_lines[2]
      return if line && (line.include?('Price:') || line[0, 1] == '|')
      raise CrawlKit::MalformedDocumentError,
        "Expected third line in info cell to begin with bar. LCBO No: " \
        "#{product_no}, Dump: #{info_cell_lines[2].inspect}"
    end

    def verify_response_not_blank
      return unless html.strip == ''
      raise CrawlKit::MissingResourceError,
        "product #{product_no} does not appear to exist"
    end

    def verify_product_name
      return unless product_details_form('itemName').strip == ''
      raise CrawlKit::MissingResourceError,
        "can not locate name for product #{product_no}"
    end

    def verify_product_details_form
      return unless doc.css('form[name="productdetails"]').empty?
      raise CrawlKit::MalformedDocumentError,
        "productdetails form not found in doc for product #{product_no}"
    end

  end
end
@@ -0,0 +1,196 @@
1
require 'cgi'
require 'uri'
2
+
3
module LCBO
  # Parser for a single LCBO store information page.
  #
  # The class mixes in CrawlKit::Page and uses its class-level DSL (`uri`,
  # `on`, `emits`). Fields are read from the cells of the page's main
  # bordered table: address/contact cells, the hours table, and the
  # checkbox rows describing store features.
  class StorePage

    include CrawlKit::Page

    uri 'http://www.lcbo.com/lcbo-ear/jsp/storeinfo.jsp?' \
        'STORE={store_no}&language=EN'

    DAY_NAMES = %w[
      monday
      tuesday
      wednesday
      thursday
      friday
      saturday
      sunday ].freeze

    # Emitted boolean field => term searched for in the checkbox row HTML.
    DETAIL_FIELDS = {
      :has_wheelchair_accessability => 'wheelchair',
      :has_bilingual_services       => 'bilingual',
      :has_product_consultant       => 'consultant',
      :has_tasting_bar              => 'tasting',
      :has_beer_cold_room           => 'cold',
      :has_special_occasion_permits => 'permits',
      :has_vintages_corner          => 'vintages',
      :has_parking                  => 'parking',
      :has_transit_access           => 'transit' }.freeze

    on :before_parse, :verify_store_returned
    on :after_parse,  :verify_node_count
    on :after_parse,  :verify_telephone_number

    emits :store_no do
      query_params[:store_no].to_i
    end

    # <day>_open / <day>_close: minutes computed as hour * 60 + minute from
    # the times listed for that day, or nil when none are available.
    DAY_NAMES.each do |day|
      emits :"#{day}_open" do
        time_open_close(day)[0]
      end

      emits :"#{day}_close" do
        time_open_close(day)[1]
      end
    end

    emits :name do
      CrawlKit::TitleCaseHelper[info_nodes[1].content.strip]
    end

    emits :address_line_1 do
      data = info_node_part(2, 0)
      unless data
        raise CrawlKit::MalformedDocumentError,
          "unable to locate address for store #{store_no}"
      end
      CrawlKit::TitleCaseHelper[squish(data)]
    end

    emits :address_line_2 do
      data = info_node_part(2, 1)
      CrawlKit::TitleCaseHelper[squish(data)] if data
    end

    emits :city do
      data = info_node_part(3, 0)
      CrawlKit::TitleCaseHelper[squish(data)] if data
    end

    emits :postal_code do
      data = info_node_part(3, 1)
      unless data
        raise CrawlKit::MalformedDocumentError,
          "unable to locate postal code for store #{store_no}"
      end
      squish(data).upcase
    end

    emits :telephone do
      info_nodes[4].content.
        gsub(/[\n\r\t]+/, ' ').
        gsub('Telephone:', '').
        strip
    end

    # Fax number, or nil when the page lists none.
    emits :fax do
      if has_fax?
        info_nodes[5].content.gsub(/[\n\r\t]+/, ' ').gsub('Fax:', '').strip
      end
    end

    emits :latitude do
      location['latitude'][0].to_f
    end

    emits :longitude do
      location['longitude'][0].to_f
    end

    DETAIL_FIELDS.keys.each do |field|
      emits(field) { details[field] }
    end

    protected

    # Replaces runs of newlines, carriage returns and tabs with a single
    # space and strips the result.
    def squish(text)
      text.gsub(/[\n\r\t]+/, ' ').strip
    end

    # The +part+-th comma-separated piece of the text in info node +node+,
    # or nil when there is no such piece.
    def info_node_part(node, part)
      info_nodes[node].content.strip.split(',')[part]
    end

    # Inner HTML of the grandparent element of every checkbox input.
    def detail_rows
      @detail_rows ||= begin
        doc.css('input[type="checkbox"]').map { |e| e.parent.parent.inner_html }
      end
    end

    # Hash of DETAIL_FIELDS key => true/false. A feature is true when the
    # row mentioning its term contains "checked"; a feature whose row cannot
    # be found is reported as false.
    def details
      @details ||= begin
        DETAIL_FIELDS.inject({}) do |hsh, (field, term)|
          matching = detail_rows.detect { |html_row| html_row.include?(term) }
          hsh.merge(field => (matching ? matching.include?('checked') : false))
        end
      end
    end

    # href of the first anchor in the info node that follows the contact
    # nodes (index 6 when a fax node is present, otherwise 5).
    def map_anchor_href
      info_nodes[has_fax? ? 6 : 5].css('a').first.attributes['href'].to_s
    end

    # Query parameters of the map link, as returned by CGI.parse.
    def location
      @location ||= CGI.parse(URI.parse(map_anchor_href).query)
    end

    def has_fax?
      info_nodes.to_s.include?('Fax: ')
    end

    # [open, close] for +day+; [nil, nil] when the day is not listed.
    def time_open_close(day)
      open_close_times[day.to_s.downcase] || [nil, nil]
    end

    # Hash of lowercase day name => [open, close], each computed as
    # hour * 60 + minute. A day whose two values are equal (including a day
    # with no times at all) maps to [nil, nil].
    def open_close_times
      @open_close_times ||= begin
        time_cells.inject({}) do |hsh, row|
          text = row.text.gsub(/\s+/, ' ')
          day = text.match(/[MTWTFS]{1}[a-z]+/).to_s.downcase
          opens_at, closes_at = text.scan(/[0-9]{1,2}:[0-9]{2}/).map do |time|
            hour, min = time.split(':').map { |t| t.to_i }
            (hour * 60) + min
          end
          hours = opens_at == closes_at ? [nil, nil] : [opens_at, closes_at]
          hsh.merge(day => hours)
        end
      end
    end

    def container_table
      @doc.css('table.border[width="478"]')
    end

    def hours_table
      container_table.css('table[width="100%"]')
    end

    def info_nodes
      container_table.css('td[width="48%"]')
    end

    # Rows of the hours table whose markup mentions a weekday name.
    def time_cells
      hours_table.
        css('td[width="50%"] tr').
        select { |row| row.to_s =~ /[MTWTFS]{1}[onuesdhriat]{2,5}day/ }
    end

    def expected_node_count
      has_fax? ? 8 : 7
    end

    def verify_store_returned
      return unless @html.include?('No stores were located using your criteria.')
      raise CrawlKit::MissingResourceError, "store #{store_no} does not exist"
    end

    # The telephone field is always a String, so "missing" means blank.
    def verify_telephone_number
      return unless telephone.to_s.strip == ''
      raise CrawlKit::MalformedDocumentError,
        "unable to locate telephone number for store #{store_no}"
    end

    def verify_node_count
      return if expected_node_count == info_nodes.size
      raise CrawlKit::MalformedDocumentError,
        "Expected #{expected_node_count} nodes for store #{store_no} but found " \
        "#{info_nodes.size} instead."
    end

  end
end