lcbo 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (60) hide show
  1. data/.gitignore +1 -0
  2. data/CHANGELOG.md +3 -0
  3. data/Gemfile +6 -0
  4. data/Gemfile.lock +18 -0
  5. data/LICENSE +18 -0
  6. data/README.md +29 -0
  7. data/Rakefile +62 -0
  8. data/lcbo.gemspec +29 -0
  9. data/lib/lcbo.rb +23 -0
  10. data/lib/lcbo/crawlers.rb +4 -0
  11. data/lib/lcbo/crawlers/inventories_crawler.rb +15 -0
  12. data/lib/lcbo/crawlers/product_lists_crawler.rb +23 -0
  13. data/lib/lcbo/crawlers/products_crawler.rb +16 -0
  14. data/lib/lcbo/crawlers/stores_crawler.rb +16 -0
  15. data/lib/lcbo/crawlkit.rb +24 -0
  16. data/lib/lcbo/crawlkit/eventable.rb +56 -0
  17. data/lib/lcbo/crawlkit/fastdate_helper.rb +40 -0
  18. data/lib/lcbo/crawlkit/page.rb +141 -0
  19. data/lib/lcbo/crawlkit/request.rb +51 -0
  20. data/lib/lcbo/crawlkit/request_prototype.rb +31 -0
  21. data/lib/lcbo/crawlkit/response.rb +48 -0
  22. data/lib/lcbo/crawlkit/titlecase_helper.rb +97 -0
  23. data/lib/lcbo/crawlkit/volume_helper.rb +46 -0
  24. data/lib/lcbo/ext.rb +13 -0
  25. data/lib/lcbo/helpers.rb +34 -0
  26. data/lib/lcbo/pages.rb +4 -0
  27. data/lib/lcbo/pages/inventory_page.rb +60 -0
  28. data/lib/lcbo/pages/product_list_page.rb +85 -0
  29. data/lib/lcbo/pages/product_page.rb +296 -0
  30. data/lib/lcbo/pages/store_page.rb +196 -0
  31. data/lib/lcbo/version.rb +3 -0
  32. data/spec/crawlkit/eventable_spec.rb +23 -0
  33. data/spec/crawlkit/fastdate_helper_spec.rb +18 -0
  34. data/spec/crawlkit/page_spec.rb +114 -0
  35. data/spec/crawlkit/request_prototype_spec.rb +5 -0
  36. data/spec/crawlkit/request_spec.rb +41 -0
  37. data/spec/crawlkit/response_spec.rb +5 -0
  38. data/spec/crawlkit/titlecase_helper_spec.rb +30 -0
  39. data/spec/crawlkit/volume_helper_spec.rb +21 -0
  40. data/spec/crawlkit_spec.rb +5 -0
  41. data/spec/lcbo_spec.rb +38 -0
  42. data/spec/pages/inventory_pages.yml +1685 -0
  43. data/spec/pages/inventory_pages/1.html +11649 -0
  44. data/spec/pages/inventory_pages/2.html +495 -0
  45. data/spec/pages/product_list_pages.yml +108 -0
  46. data/spec/pages/product_list_pages/1.html +4866 -0
  47. data/spec/pages/product_pages.yml +258 -0
  48. data/spec/pages/product_pages/1.html +1319 -0
  49. data/spec/pages/product_pages/2.html +1343 -0
  50. data/spec/pages/product_pages/3.html +1336 -0
  51. data/spec/pages/product_pages/4.html +1319 -0
  52. data/spec/pages/product_pages/5.html +1324 -0
  53. data/spec/pages/product_pages/6.html +1319 -0
  54. data/spec/pages/product_pages/7.html +1314 -0
  55. data/spec/pages/store_pages.yml +80 -0
  56. data/spec/pages/store_pages/1.html +592 -0
  57. data/spec/pages/store_pages/2.html +592 -0
  58. data/spec/pages_spec.rb +34 -0
  59. data/spec/spec_helper.rb +77 -0
  60. metadata +205 -0
@@ -0,0 +1,51 @@
1
module LCBO
  module CrawlKit
    # One concrete HTTP request: a RequestPrototype combined with the query
    # and body parameters supplied for this particular call.
    class Request

      attr_reader :request_prototype, :query_params, :body_params

      # request_prototype - object exposing http_method, uri_template and
      #                     body_params.
      # query_p           - values expanded into the URI template (nil => {}).
      # body_p            - values merged over the prototype's body params
      #                     (nil => {}).
      def initialize(request_prototype, query_p = {}, body_p = {})
        @request_prototype = request_prototype
        self.query_params  = query_p
        self.body_params   = body_p
      end

      # Stores the query params, substituting an empty hash for nil/false.
      def query_params=(value)
        @query_params = value ? value : {}
      end

      # Stores the prototype's body params overlaid with the given overrides.
      def body_params=(value)
        overrides = value || {}
        @body_params = request_prototype.body_params.merge(overrides)
      end

      # True when the HTTP method is :head or :get.
      def gettable?
        case request_prototype.http_method
        when :head, :get then true
        else false
        end
      end

      # Options hash handed to Typhoeus; body params are only included for
      # methods other than :head and :get.
      def config
        settings = {
          :method     => request_prototype.http_method,
          :user_agent => USER_AGENT
        }
        settings[:params] = body_params unless gettable?
        settings
      end

      # The URI template expanded with the query params, as a String.
      def uri
        expanded = request_prototype.uri_template.expand(query_params)
        expanded.to_s
      end

      # Performs the request and wraps the outcome in a Response.
      def run
        raw = Typhoeus::Request.run(uri, config)
        Response.new(
          :code         => raw.code,
          :uri          => raw.request.url,
          :http_method  => raw.request.method,
          :time         => raw.time,
          :query_params => query_params,
          :body_params  => body_params,
          :body         => raw.body
        )
      end

    end
  end
end
@@ -0,0 +1,31 @@
1
module LCBO
  module CrawlKit
    # Reusable description of a request (URI template, HTTP method, default
    # body params) from which individual Request objects are created.
    class RequestPrototype

      attr_reader :http_method, :uri_template, :body_params

      def initialize(uri_template = nil, http_method = :get, body_params = {})
        self.uri_template = uri_template
        self.http_method  = http_method
        self.body_params  = body_params
      end

      # Normalizes the method to a lowercase symbol; nil/false means :get.
      def http_method=(value)
        @http_method = (value || :get).to_s.downcase.to_sym
      end

      # Wraps the value in an Addressable::Template. A nil/false value leaves
      # the current template untouched.
      def uri_template=(value)
        return unless value
        @uri_template = Addressable::Template.new(value)
      end

      # Stores the default body params with symbolized keys; nil/false
      # becomes an empty hash.
      def body_params=(value)
        @body_params =
          if value
            HashExt.symbolize_keys(value)
          else
            {}
          end
      end

      # Builds a Request from this prototype, runs it, and returns the result
      # of Request#run.
      def request(query_params = {}, body_params = {})
        pending = Request.new(self, query_params, body_params)
        pending.run
      end

    end
  end
end
@@ -0,0 +1,48 @@
1
module LCBO
  module CrawlKit
    # Value object describing the outcome of a request. Construction raises
    # RequestFailedError unless the status code is 200.
    class Response

      attr_reader :response, :body, :query_params, :body_params, :uri,
                  :code, :time, :http_method

      # response - Hash (string or symbol keys) with :code, :uri,
      #            :http_method, :time, :query_params, :body_params, :body.
      #
      # Raises RequestFailedError when :code is not 200.
      def initialize(response)
        params = HashExt.symbolize_keys(response)
        @code         = params[:code]
        @uri          = params[:uri]
        @http_method  = params[:http_method]
        @time         = params[:time]
        @query_params = params[:query_params]
        @body_params  = params[:body_params]
        @body         = self.class.normalize_encoding(params[:body])
        ensure_success!
      end

      # Returns the markup with a valid encoding and "\n" line endings.
      #
      # Text whose bytes are not valid in its current encoding is
      # reinterpreted as ISO-8859-1 and transcoded to UTF-8. The argument is
      # never mutated (so frozen strings are accepted), and nil is treated as
      # an empty body so that a failed request surfaces as RequestFailedError
      # rather than a NoMethodError raised from here.
      def self.normalize_encoding(html)
        text = html.to_s
        unless text.valid_encoding?
          # dup first: force_encoding would otherwise retag the caller's string.
          text = text.dup.force_encoding('ISO-8859-1').encode('UTF-8')
        end
        text.gsub("\r\n", "\n")
      end

      # Hash with the same keys accepted by #initialize.
      def as_hash
        { :code         => code,
          :uri          => uri,
          :http_method  => http_method,
          :time         => time,
          :query_params => query_params,
          :body_params  => body_params,
          :body         => body }
      end

      protected

      # Raises RequestFailedError for any status code other than 200.
      def ensure_success!
        return if @code == 200
        raise RequestFailedError, "<#{@uri}> failed with status: #{@code}"
      end

    end
  end
end
@@ -0,0 +1,97 @@
1
+ # coding: utf-8
2
+ # TODO: This helper is hard to follow and should be rewritten.
3
module LCBO
  module CrawlKit
    # Converts product names (typically all-caps) into title case, with
    # special handling for acronyms, small words and compound words.
    #
    # Case conversion is done with String#tr over an explicit character table
    # so that the accented Latin letters listed below are handled as well.
    class TitleCaseHelper

      UPPER_CHARS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝ'
      LOWER_CHARS = 'abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïðñòóôõöøùúûüý'
      ALPHA_RANGE = "[#{UPPER_CHARS}#{LOWER_CHARS}]"
      UPPER_RANGE = "[#{UPPER_CHARS}]"
      LOWER_RANGE = "[#{LOWER_CHARS}]"
      FIRST_CHAR_RE = /#{ALPHA_RANGE}{1}/u
      ALPHA_RE      = /#{ALPHA_RANGE}.*/u
      SMALL_WORDS   = %w[ a an and as at but by en for if in of del de on or the to v v. via vs vs. ]
      ACRONYMS      = %w[ vqa vsop xo nq5 vs xxx igt xiii xi xoxo srl bdb cvbg ocb lcbo i ii iii ]

      # Word-shape patterns, built once instead of once per word.
      AMPERSAND_WORD_RE = /#{ALPHA_RANGE}\&#{ALPHA_RANGE}/u   # E&J
      HYPHEN_WORD_RE    = /#{ALPHA_RANGE}\-#{ALPHA_RANGE}/u   # Smith-Weston
      SLASH_WORD_RE     = /#{ALPHA_RANGE}\/#{ALPHA_RANGE}/u   # red/white
      DOTTED_WORD_RE    = /#{ALPHA_RANGE}\.#{ALPHA_RANGE}/u   # example.com
      INNER_UPPER_RE    = /^#{ALPHA_RANGE}.*#{UPPER_RANGE}/u  # already has a capital

      attr_reader :input

      # Shorthand for TitleCaseHelper.titleize(string).
      def self.[](string)
        titleize(string)
      end

      # Uppercases every character listed in LOWER_CHARS.
      def self.upcase(string)
        string.tr(LOWER_CHARS, UPPER_CHARS)
      end

      # Lowercases every character listed in UPPER_CHARS.
      def self.downcase(string)
        string.tr(UPPER_CHARS, LOWER_CHARS)
      end

      # Removes everything from the first "(" that is followed by at least
      # one character, then any trailing run of "*", e.g.
      # "Some Product Name (Some Redundant Stuff)**".
      def self.preclean(string)
        string.gsub(/\(.+\Z/, '').gsub(/\*+\Z/, '')
      end

      # Uppercases the first alphabetic character of the string, leaving
      # everything else untouched.
      def self.capitalize(string)
        string.sub(FIRST_CHAR_RE) { |letter| upcase(letter) }
      end

      # Returns the title-cased form of the string.
      #
      # The input is lowercased and pre-cleaned, split into phrases, and each
      # word is then cased by the first matching rule in titleize_word.
      def self.titleize(string)
        # Built once per call rather than once per word.
        acronyms    = ACRONYMS + ACRONYMS.map { |ac| capitalize(ac) }
        small_words = SMALL_WORDS + SMALL_WORDS.map { |small| capitalize(small) }

        titled = phrases(preclean(downcase(string))).map do |phrase|
          words = phrase.split
          words.map { |word| titleize_word(word, words, acronyms, small_words) }.join(' ')
        end.join(' ')

        # Special case for Word'S
        titled.gsub(/(['’])S\b/, '\1s')
      end

      # Splits a title into phrases at ":", ".", ";", "?" or "!" followed by
      # a space (or at end of line), then re-joins any phrase that was only
      # split because a small word such as "vs." ends in a period.
      #
      # The merge is done while building a fresh array; removing elements
      # from the array being walked misaligned the indexes and failed on
      # titles like "v. x vs. y".
      def self.phrases(title)
        pieces = title.scan(/.+?(?:[:.;?!] |$)/u).map { |phrase| phrase.strip }
        pieces.each_with_object([]) do |piece, merged|
          previous = merged.last
          if previous && ends_with_small_word?(previous)
            merged[-1] = "#{previous} #{piece}"
          else
            merged << piece
          end
        end
      end

      # Applies the casing rules to one word; the first matching rule wins.
      # `words` is the full phrase, used to detect the first and last word.
      def self.titleize_word(word, words, acronyms, small_words)
        case word
        when *acronyms
          upcase(word)
        when AMPERSAND_WORD_RE
          word.split(/\&/).map { |part| capitalize(part) }.join('&')
        when HYPHEN_WORD_RE
          word.split(/\-/).map { |part| capitalize(part) }.join('-')
        when SLASH_WORD_RE
          word.split(/\//).map { |part| capitalize(part) }.join(' / ')
        when DOTTED_WORD_RE
          capitalized = word.split(/\./u).map { |part| capitalize(part) }.join('.')
          # split drops a trailing dot; put it back.
          word.end_with?('.') ? capitalized + '.' : capitalized
        when INNER_UPPER_RE
          word
        when words.first, words.last
          capitalize(word)
        when *small_words
          word.downcase
        else
          capitalize(word)
        end
      end
      private_class_method :titleize_word

      # True when the last whitespace-separated word of the phrase is one of
      # SMALL_WORDS. An empty phrase has no last word and yields false.
      def self.ends_with_small_word?(phrase)
        last_word = phrase.split.last
        !last_word.nil? && SMALL_WORDS.include?(last_word.downcase)
      end
      private_class_method :ends_with_small_word?

    end
  end
end
@@ -0,0 +1,46 @@
1
module LCBO
  module CrawlKit
    # Parses package descriptions such as "750 mL bottle" or
    # "6x341 mL bottle" into unit count, unit volume and total volume.
    class VolumeHelper

      attr_reader :package_volume, :unit_volume, :total_units, :unit_type

      # input - package description String, or nil. Anything that does not
      #         match the expected pattern leaves all volumes at 0.
      def initialize(input)
        @input = input
        @package_volume = @unit_volume = @total_units = 0
        calculate
      end

      # Shorthand returning the total package volume for the description.
      def self.[](input_string)
        new(input_string).as_milliliters
      end

      # Total package volume (units multiplied by unit volume).
      def as_milliliters
        @package_volume
      end

      private

      # Fills in the volume fields from @input when it matches
      # "<n> mL <type>" or "<units>x<n> mL <type>".
      def calculate
        parsed = @input && @input.match(/([0-9]+|[0-9]+x[0-9]+) (mL) ([a-z]+)/)
        return unless parsed

        quantity, _unit, type = parsed.captures

        if quantity.include?('x')
          @total_units, @unit_volume = quantity.split('x').map(&:to_i)
        else
          @total_units, @unit_volume = 1, quantity.to_i
        end

        @unit_type = type
        @package_volume = @total_units * @unit_volume
      end

    end
  end
end
data/lib/lcbo/ext.rb ADDED
@@ -0,0 +1,13 @@
1
module LCBO
  # Key-conversion helpers for hashes. Both methods return a new Hash and
  # leave the input untouched; when two keys convert to the same value the
  # later entry wins.
  module HashExt

    # Returns a copy of the input whose keys have been converted with to_sym.
    def self.symbolize_keys(input)
      # Fill a single result hash; merging per entry copied the whole
      # accumulator on every key.
      input.each_with_object({}) { |(key, value), result| result[key.to_sym] = value }
    end

    # Returns a copy of the input whose keys have been converted with to_s.
    def self.stringify_keys(input)
      input.each_with_object({}) { |(key, value), result| result[key.to_s] = value }
    end

  end
end
@@ -0,0 +1,34 @@
1
module LCBO

  # Maps a page type to the name of the page class that handles it.
  PAGE_TYPES = {
    :product      => 'ProductPage',
    :product_list => 'ProductListPage',
    :store        => 'StorePage',
    :inventory    => 'InventoryPage'
  }.freeze

  # Returns the page class for the given type (Symbol or String).
  #
  # The constant is resolved through LCBO, where the page classes are
  # defined, rather than through Object.
  #
  # Raises ArgumentError for a type that is not listed in PAGE_TYPES.
  def self.page(type)
    class_name = PAGE_TYPES.fetch(type.to_sym) do
      raise ArgumentError, "unknown page type: #{type.inspect}"
    end
    const_get(class_name)
  end

  # Hands the response to the `parse` class method of the page class
  # registered for page_type and returns its result.
  def self.parse(page_type, response)
    page(page_type).parse(response)
  end

  # Processes the product page for the given product number and returns the
  # result as a hash.
  def self.product(product_no)
    ProductPage.process(:product_no => product_no).as_hash
  end

  # Processes the store page for the given store number and returns the
  # result as a hash.
  def self.store(store_no)
    StorePage.process(:store_no => store_no).as_hash
  end

  # Processes the inventory page for the given product number and returns
  # the result as a hash.
  def self.inventory(product_no)
    InventoryPage.process(:product_no => product_no).as_hash
  end

  # Processes the given page of the product listing and returns the result
  # as a hash. The page number is sent as a body param.
  def self.product_list(page_number)
    ProductListPage.process({}, { :page => page_number }).as_hash
  end

end
data/lib/lcbo/pages.rb ADDED
@@ -0,0 +1,4 @@
1
+ require 'lcbo/pages/inventory_page'
2
+ require 'lcbo/pages/product_page'
3
+ require 'lcbo/pages/product_list_page'
4
+ require 'lcbo/pages/store_page'
@@ -0,0 +1,60 @@
1
module LCBO
  # Page definition for the per-product inventory search results.
  # NOTE(review): `uri` and `emits` come from CrawlKit::Page, which is not
  # visible here; the emitted blocks appear to run in instance context since
  # they call this class's private helpers — confirm.
  class InventoryPage

    include CrawlKit::Page

    uri 'http://www.lcbo.com/lcbo-ear/lcbo/product/inventory/searchResults.do' \
        '?language=EN&itemNumber={product_no}'

    emits(:product_no) { query_params[:product_no].to_i }

    # Sum of the quantities over all inventory rows.
    emits :inventory_count do
      inventories.map { |inv| inv[:quantity] }.inject(0) { |total, qty| total + qty }
    end

    # One hash per table row: { :updated_on, :store_no, :quantity }.
    emits :inventories do
      inventory_table_rows.map do |row|
        { :updated_on => row_updated_on(row),
          :store_no   => row_store_no(row),
          :quantity   => row_quantity(row) }
      end
    end

    private

    # Date parsed from the text of the last 17%-wide cell of the row.
    def row_updated_on(row)
      cell = row.css('td[width="17%"]')[-1]
      CrawlKit::FastDateHelper[cell.text.strip]
    end

    # Store number taken from the STORE query parameter of the row's link.
    def row_store_no(row)
      href = row.css('td[width="38%"] a.item-details-col2').attribute('href').value
      href.match(/\?STORE=([0-9]{1,3})\&/).captures[0].to_s.to_i
    end

    # Quantity read from the first 13%-wide cell of the row.
    def row_quantity(row)
      cell = row.css('td[width="13%"]')[0]
      cell.content.strip.to_i
    end

    def inventory_table
      doc.css('table[cellpadding="3"]')
    end

    # Rows of the inventory table that carry a bgcolor attribute.
    def inventory_table_rows
      inventory_table.css('tr[bgcolor]')
    end

  end
end
@@ -0,0 +1,85 @@
1
module LCBO
  # Page definition for the paginated product search results, requested
  # with POST and PER_PAGE results per page.
  # NOTE(review): `http_method`, `uri`, `default_body_params` and `emits`
  # come from CrawlKit::Page, which is not visible here.
  class ProductListPage

    include CrawlKit::Page

    PER_PAGE = 100
    http_method :post
    uri 'http://www.lcbo.com/lcbo-ear/lcbo/product/searchResults.do'

    default_body_params(
      :STOCK_TYPE_NAME    => 'All',
      :ITEM_NAME          => '',
      :KEYWORDS           => '',
      :ITEM_NUMBER        => '',
      :productListingType => '',
      :LIQUOR_TYPE_SHORT_ => '*',
      :CATEGORY_NAME      => '*',
      :SUB_CATEGORY_NAME  => '*',
      :PRODUCING_CNAME    => '*',
      :PRODUCING_REGION_N => '*',
      :UNIT_VOLUME        => '*',
      :SELLING_PRICE      => '*',
      :LTO_SALES_CODE     => 'N',
      :VQA_CODE           => 'N',
      :KOSHER_CODE        => 'N',
      :VINTAGES_CODE      => 'N',
      :VALUE_ADD_SALES_CO => 'N',
      :AIR_MILES_SALES_CO => 'N',
      :language           => 'EN',
      :style              => 'LCBO.css',
      :sort               => 'sortedProduct',
      :order              => '1',
      :resultsPerPage     => PER_PAGE.to_s,
      :page               => '1',
      :action             => 'result',
      :sortby             => 'sortedProduct',
      :orderby            => '',
      :numPerPage         => PER_PAGE.to_s
    )

    # Page number that was requested, read back from the body params.
    emits(:page) { body_params[:page].to_i }

    # Number of pages needed for total_products at PER_PAGE per page
    # (rounded up).
    emits :final_page do
      @final_page ||= begin
        full_pages, leftover = total_products.divmod(PER_PAGE)
        leftover.zero? ? full_pages : full_pages + 1
      end
    end

    # The following page number, or nil when this page is not before
    # final_page.
    emits :next_page do
      @next_page ||= (page + 1 if page < final_page)
    end

    # Total product count, read from the first bold element inside the
    # 42%-wide cell.
    emits :total_products do
      @total_products ||= begin
        label = doc.css('td[width="42%"] font.main_font b')[0].text
        label.gsub(/\s+/, ' ').strip.to_i
      end
    end

    # Product numbers taken from the itemNumber parameter of each product
    # link; links without that parameter are skipped.
    emits :product_nos do
      item_numbers = product_anchors.map do |anchor|
        anchor.attribute('href').value[/\&itemNumber=([0-9]+)/, 1]
      end
      item_numbers.compact.map(&:to_i)
    end
    alias_method :as_array, :product_nos

    protected

    def product_anchors
      doc.css('td[style="padding: 5 5 5 0;"] a.item-details-col2')
    end

  end
end