lcbo 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. data/.gitignore +1 -0
  2. data/CHANGELOG.md +3 -0
  3. data/Gemfile +6 -0
  4. data/Gemfile.lock +18 -0
  5. data/LICENSE +18 -0
  6. data/README.md +29 -0
  7. data/Rakefile +62 -0
  8. data/lcbo.gemspec +29 -0
  9. data/lib/lcbo.rb +23 -0
  10. data/lib/lcbo/crawlers.rb +4 -0
  11. data/lib/lcbo/crawlers/inventories_crawler.rb +15 -0
  12. data/lib/lcbo/crawlers/product_lists_crawler.rb +23 -0
  13. data/lib/lcbo/crawlers/products_crawler.rb +16 -0
  14. data/lib/lcbo/crawlers/stores_crawler.rb +16 -0
  15. data/lib/lcbo/crawlkit.rb +24 -0
  16. data/lib/lcbo/crawlkit/eventable.rb +56 -0
  17. data/lib/lcbo/crawlkit/fastdate_helper.rb +40 -0
  18. data/lib/lcbo/crawlkit/page.rb +141 -0
  19. data/lib/lcbo/crawlkit/request.rb +51 -0
  20. data/lib/lcbo/crawlkit/request_prototype.rb +31 -0
  21. data/lib/lcbo/crawlkit/response.rb +48 -0
  22. data/lib/lcbo/crawlkit/titlecase_helper.rb +97 -0
  23. data/lib/lcbo/crawlkit/volume_helper.rb +46 -0
  24. data/lib/lcbo/ext.rb +13 -0
  25. data/lib/lcbo/helpers.rb +34 -0
  26. data/lib/lcbo/pages.rb +4 -0
  27. data/lib/lcbo/pages/inventory_page.rb +60 -0
  28. data/lib/lcbo/pages/product_list_page.rb +85 -0
  29. data/lib/lcbo/pages/product_page.rb +296 -0
  30. data/lib/lcbo/pages/store_page.rb +196 -0
  31. data/lib/lcbo/version.rb +3 -0
  32. data/spec/crawlkit/eventable_spec.rb +23 -0
  33. data/spec/crawlkit/fastdate_helper_spec.rb +18 -0
  34. data/spec/crawlkit/page_spec.rb +114 -0
  35. data/spec/crawlkit/request_prototype_spec.rb +5 -0
  36. data/spec/crawlkit/request_spec.rb +41 -0
  37. data/spec/crawlkit/response_spec.rb +5 -0
  38. data/spec/crawlkit/titlecase_helper_spec.rb +30 -0
  39. data/spec/crawlkit/volume_helper_spec.rb +21 -0
  40. data/spec/crawlkit_spec.rb +5 -0
  41. data/spec/lcbo_spec.rb +38 -0
  42. data/spec/pages/inventory_pages.yml +1685 -0
  43. data/spec/pages/inventory_pages/1.html +11649 -0
  44. data/spec/pages/inventory_pages/2.html +495 -0
  45. data/spec/pages/product_list_pages.yml +108 -0
  46. data/spec/pages/product_list_pages/1.html +4866 -0
  47. data/spec/pages/product_pages.yml +258 -0
  48. data/spec/pages/product_pages/1.html +1319 -0
  49. data/spec/pages/product_pages/2.html +1343 -0
  50. data/spec/pages/product_pages/3.html +1336 -0
  51. data/spec/pages/product_pages/4.html +1319 -0
  52. data/spec/pages/product_pages/5.html +1324 -0
  53. data/spec/pages/product_pages/6.html +1319 -0
  54. data/spec/pages/product_pages/7.html +1314 -0
  55. data/spec/pages/store_pages.yml +80 -0
  56. data/spec/pages/store_pages/1.html +592 -0
  57. data/spec/pages/store_pages/2.html +592 -0
  58. data/spec/pages_spec.rb +34 -0
  59. data/spec/spec_helper.rb +77 -0
  60. metadata +205 -0
@@ -0,0 +1,51 @@
1
module LCBO
  module CrawlKit
    # A single HTTP request: a RequestPrototype (URI template, HTTP method and
    # default body params) combined with per-request query and body params.
    class Request

      attr_reader :request_prototype, :query_params, :body_params

      # request_prototype - object responding to #http_method, #uri_template
      #                     and #body_params.
      # query_p           - Hash of values expanded into the URI template
      #                     (nil is treated as an empty Hash).
      # body_p            - Hash of body params layered over the prototype's
      #                     defaults (nil is treated as an empty Hash).
      def initialize(request_prototype, query_p = {}, body_p = {})
        @request_prototype = request_prototype
        self.query_params  = query_p
        self.body_params   = body_p
      end

      # Stores the query params, substituting an empty Hash for nil.
      def query_params=(value)
        @query_params = (value || {})
      end

      # Layers +value+ over the prototype's default body params.
      #
      # Keys are converted to symbols before merging so that an override
      # given as 'page' replaces the default stored under :page instead of
      # being sent alongside it. Keys that cannot be converted are kept as-is.
      # The prototype's own Hash is never modified (Hash#merge copies).
      def body_params=(value)
        overrides = {}
        (value || {}).each do |key, val|
          overrides[key.respond_to?(:to_sym) ? key.to_sym : key] = val
        end
        @body_params = request_prototype.body_params.merge(overrides)
      end

      # True when the HTTP method is :head or :get, i.e. no body is sent.
      def gettable?
        [:head, :get].include?(request_prototype.http_method)
      end

      # Options Hash handed to Typhoeus. Body params are only included for
      # methods that carry a body. USER_AGENT is a constant defined outside
      # this class.
      def config
        opts = {}
        opts[:method]     = request_prototype.http_method
        opts[:user_agent] = USER_AGENT
        opts[:params]     = body_params unless gettable?
        opts
      end

      # The URI template expanded with the query params, as a String.
      def uri
        request_prototype.uri_template.expand(query_params).to_s
      end

      # Performs the request through Typhoeus and wraps the outcome in a
      # Response built from the status code, final URL, method, timing,
      # params and body.
      def run
        response = Typhoeus::Request.run(uri, config)
        Response.new \
          :code         => response.code,
          :uri          => response.request.url,
          :http_method  => response.request.method,
          :time         => response.time,
          :query_params => query_params,
          :body_params  => body_params,
          :body         => response.body
      end

    end
  end
end
@@ -0,0 +1,31 @@
1
module LCBO
  module CrawlKit
    # Reusable description of a request: URI template, HTTP method and
    # default body params. Individual requests are spawned with #request.
    class RequestPrototype

      attr_reader :http_method, :uri_template, :body_params

      # uri_template - template String handed to Addressable::Template, or nil.
      # http_method  - String or Symbol; normalized to a lowercase Symbol.
      # body_params  - Hash of default body params, or nil.
      def initialize(uri_template = nil, http_method = :get, body_params = {})
        self.uri_template = uri_template
        self.http_method  = http_method
        self.body_params  = body_params
      end

      # Normalizes the method to a lowercase Symbol; nil/false fall back to :get.
      def http_method=(value)
        @http_method = value ? value.to_s.downcase.to_sym : :get
      end

      # Wraps +value+ in an Addressable::Template. Assigning nil clears the
      # template (previously nil was ignored and the old template was kept,
      # unlike the other setters which reset on nil).
      def uri_template=(value)
        @uri_template = value ? Addressable::Template.new(value) : nil
      end

      # Stores the default body params with symbolized keys; nil becomes {}.
      def body_params=(value)
        @body_params = value ? HashExt.symbolize_keys(value) : {}
      end

      # Builds a Request from this prototype and the given params, runs it
      # immediately and returns whatever Request#run returns.
      def request(query_params = {}, body_params = {})
        Request.new(self, query_params, body_params).run
      end

    end
  end
end
@@ -0,0 +1,48 @@
1
module LCBO
  module CrawlKit
    # Value object describing a completed HTTP request. Construction fails
    # with RequestFailedError unless the status code is 200.
    class Response

      # NOTE: :response is exposed for compatibility but is never assigned by
      # this class, so it reads as nil.
      attr_reader :response, :body, :query_params, :body_params, :uri,
                  :code, :time, :http_method

      # response - Hash (String or Symbol keys) with :code, :uri,
      #            :http_method, :time, :query_params, :body_params, :body.
      #
      # Raises RequestFailedError when :code is not 200.
      def initialize(response)
        params        = HashExt.symbolize_keys(response)
        @code         = params[:code]
        @uri          = params[:uri]
        @http_method  = params[:http_method]
        @time         = params[:time]
        @query_params = params[:query_params]
        @body_params  = params[:body_params]
        # Check the status before touching the body: a failed request may
        # carry no body at all, and the caller should see RequestFailedError
        # rather than an error from body normalization.
        ensure_success!
        @body = self.class.normalize_encoding(params[:body])
      end

      # Returns +html+ as a String with "\r\n" line endings converted to "\n".
      # When the bytes are not valid in the string's current encoding they
      # are reinterpreted as ISO-8859-1 and transcoded to UTF-8.
      #
      # nil yields an empty String. The argument itself is never modified, so
      # frozen strings are accepted.
      def self.normalize_encoding(html)
        return '' if html.nil?
        text =
          if html.valid_encoding?
            html
          else
            # dup first: force_encoding mutates its receiver.
            html.dup.force_encoding('ISO-8859-1').encode('UTF-8')
          end
        text.gsub("\r\n", "\n")
      end

      # Plain Hash form of the response (the same keys #initialize accepts).
      def as_hash
        { :code         => code,
          :uri          => uri,
          :http_method  => http_method,
          :time         => time,
          :query_params => query_params,
          :body_params  => body_params,
          :body         => body }
      end

      protected

      # Raises RequestFailedError unless the status code is exactly 200.
      def ensure_success!
        return if @code == 200
        raise RequestFailedError, "<#{@uri}> failed with status: #{@code}"
      end

    end
  end
end
@@ -0,0 +1,97 @@
1
+ # coding: utf-8
2
+ # TODO: This is an ugly piece of ass that should burn and die!
3
module LCBO
  module CrawlKit
    # Converts names to title case with awareness of accented Latin letters,
    # small words ("of", "the", ...), known acronyms ("VQA", "XO", ...) and
    # compound words joined by "&", "-", "/" or ".".
    #
    # All work happens in class methods; TitleCaseHelper[string] is shorthand
    # for TitleCaseHelper.titleize(string).
    class TitleCaseHelper

      UPPER_CHARS   = 'ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝ'
      LOWER_CHARS   = 'abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïðñòóôõöøùúûüý'
      ALPHA_RANGE   = "[#{UPPER_CHARS}#{LOWER_CHARS}]"
      UPPER_RANGE   = "[#{UPPER_CHARS}]"
      LOWER_RANGE   = "[#{LOWER_CHARS}]"
      FIRST_CHAR_RE = /#{ALPHA_RANGE}{1}/u
      ALPHA_RE      = /#{ALPHA_RANGE}.*/u
      SMALL_WORDS   = %w[ a an and as at but by en for if in of del de on or the to v v. via vs vs. ]
      ACRONYMS      = %w[ vqa vsop xo nq5 vs xxx igt xiii xi xoxo srl bdb cvbg ocb lcbo i ii iii ]

      # Compound-word detectors, compiled once instead of once per word.
      AMPERSAND_RE  = /#{ALPHA_RANGE}\&#{ALPHA_RANGE}/u  # like E&J
      DASH_RE       = /#{ALPHA_RANGE}\-#{ALPHA_RANGE}/u  # like Smith-Weston
      SLASH_RE      = /#{ALPHA_RANGE}\/#{ALPHA_RANGE}/u  # like red/white
      DOT_RE        = /#{ALPHA_RANGE}\.#{ALPHA_RANGE}/u  # like example.com
      # A phrase ends at ":", ".", ";", "?" or "!" followed by a space, or at
      # the end of the line.
      PHRASE_RE     = /.+?(?:[:.;?!] |$)/u

      attr_reader :input

      # Shorthand for .titleize.
      def self.[](string)
        titleize(string)
      end

      # Uppercases the letters listed in LOWER_CHARS; other characters are
      # left untouched.
      def self.upcase(string)
        string.tr(LOWER_CHARS, UPPER_CHARS)
      end

      # Lowercases the letters listed in UPPER_CHARS; other characters are
      # left untouched.
      def self.downcase(string)
        string.tr(UPPER_CHARS, LOWER_CHARS)
      end

      # Drops everything from the first "(" to the end of the string, then
      # any trailing run of "*":
      #   "Some Product Name (Some Redundant Stuff)**" => "Some Product Name "
      def self.preclean(string)
        string.gsub(/\(.+\Z/, '').gsub(/\*+\Z/, '')
      end

      # Uppercases the first alphabetic character of +string+ (which need not
      # be its first character). Strings without one are returned unchanged.
      def self.capitalize(string)
        string.sub(FIRST_CHAR_RE) { |letter| upcase(letter) }
      end

      # Title-cases +string+: the input is lowercased and pre-cleaned, split
      # into phrases, and every word of every phrase is cased by the rules in
      # .titleize_word. A trailing "'S" left by those rules becomes "'s".
      def self.titleize(string)
        phrases(preclean(downcase(string))).map do |phrase|
          words = phrase.split
          words.map { |word| titleize_word(word, words) }.join(' ')
        end.join(' ').
          # Special case for Word'S
          gsub(/(['’])S\b/, '\1s')
      end

      # Splits +title+ into stripped phrases (see PHRASE_RE). A phrase whose
      # last word is a small word (in practice "v." or "vs.") was split on
      # that word's dot by mistake, so the following phrase is joined back
      # onto it.
      #
      # The result is built in a fresh array; the previous implementation
      # removed elements from the array it was indexing into, which raised
      # TypeError (or joined the wrong phrases) once a second join was needed.
      def self.phrases(title)
        parts = title.scan(PHRASE_RE).map { |phrase| phrase.strip }
        parts.each_with_object([]) do |part, merged|
          previous  = merged.last
          last_word = previous ? previous.split.last.to_s.downcase : nil
          if previous && SMALL_WORDS.include?(last_word)
            merged[-1] = "#{previous} #{part}"
          else
            merged << part
          end
        end
      end

      # Cases one already-lowercased +word+ taken from the phrase +words+.
      # Rules are tried in this order:
      #   1. acronyms are fully uppercased
      #   2. compounds joined by "&", "-", "/" or "." get each part capitalized
      #      ("/" is re-joined as " / "; one trailing "." is restored)
      #   3. words equal to the first or last word of the phrase are capitalized
      #   4. small words stay lowercase
      #   5. anything else is capitalized
      # The input is always lowercased before it gets here, so no rule needs
      # to look for words that already contain capitals.
      def self.titleize_word(word, words)
        case word
        when *ACRONYMS
          upcase(word)
        when AMPERSAND_RE
          word.split('&').map { |part| capitalize(part) }.join('&')
        when DASH_RE
          word.split('-').map { |part| capitalize(part) }.join('-')
        when SLASH_RE
          word.split('/').map { |part| capitalize(part) }.join(' / ')
        when DOT_RE
          capitalized = word.split('.').map { |part| capitalize(part) }.join('.')
          # split drops the empty trailing field, so put the final dot back.
          word.end_with?('.') ? capitalized + '.' : capitalized
        when words.first, words.last
          capitalize(word)
        when *SMALL_WORDS
          word
        else
          capitalize(word)
        end
      end
      private_class_method :titleize_word

    end
  end
end
@@ -0,0 +1,46 @@
1
module LCBO
  module CrawlKit
    # Parses package descriptions such as "750 mL bottle" or "6x341 mL bottle"
    # into unit count, per-unit volume, total volume and container type.
    # Anything that does not match leaves every volume at 0 and the unit
    # type unset.
    class VolumeHelper

      attr_reader :package_volume, :unit_volume, :total_units, :unit_type

      # input - package description String, or nil.
      def initialize(input)
        @input = input
        @package_volume = @unit_volume = @total_units = 0
        calculate
      end

      # Total package volume in millilitres for +input_string+ (0 when it
      # cannot be parsed).
      def self.[](input_string)
        new(input_string).as_milliliters
      end

      # Total package volume in millilitres.
      def as_milliliters
        @package_volume
      end

      private

      # Fills in the readers from @input. Expected shape:
      # "<N> mL <type>" or "<units>x<N> mL <type>".
      def calculate
        parsed = @input && @input.match(/([0-9]+|[0-9]+x[0-9]+) (mL) ([a-z]+)/)
        return unless parsed

        amount, _unit, container = parsed.captures

        @total_units, @unit_volume =
          if amount.include?('x')
            amount.split('x').map(&:to_i)
          else
            [1, amount.to_i]
          end

        @unit_type      = container
        @package_volume = @total_units * @unit_volume
      end

    end
  end
end
data/lib/lcbo/ext.rb ADDED
@@ -0,0 +1,13 @@
1
module LCBO
  # Key-conversion helpers for Hashes. Both return a new Hash and leave the
  # input untouched. When two keys convert to the same value, the later
  # entry's value wins.
  module HashExt

    # Returns a copy of +input+ with every key converted via #to_sym.
    # Built in one pass (the previous reduce/merge allocated a new Hash for
    # every key).
    def self.symbolize_keys(input)
      input.each_with_object({}) { |(key, value), hsh| hsh[key.to_sym] = value }
    end

    # Returns a copy of +input+ with every key converted via #to_s.
    def self.stringify_keys(input)
      input.each_with_object({}) { |(key, value), hsh| hsh[key.to_s] = value }
    end

  end
end
@@ -0,0 +1,34 @@
1
module LCBO

  # Maps a page type to the name of the page class that handles it.
  PAGE_TYPES = {
    :product      => 'ProductPage',
    :product_list => 'ProductListPage',
    :store        => 'StorePage',
    :inventory    => 'InventoryPage'
  }

  # Returns the page class registered for +type+ (String or Symbol).
  #
  # The constant is resolved starting from LCBO, where the page classes are
  # declared; the previous lookup on Object could not see them.
  #
  # Raises ArgumentError when +type+ is not a key of PAGE_TYPES.
  def self.page(type)
    class_name = PAGE_TYPES.fetch(type.to_sym) do
      raise ArgumentError, "unknown page type: #{type.inspect}"
    end
    const_get(class_name)
  end

  # Hands +response+ to the page class for +page_type+ and returns the
  # result of its .parse. (Previously this invoked .page without an
  # argument and therefore always raised.)
  def self.parse(page_type, response)
    page(page_type).parse(response)
  end

  # Processes the product page for +product_no+ and returns it as a Hash.
  def self.product(product_no)
    ProductPage.process(:product_no => product_no).as_hash
  end

  # Processes the store page for +store_no+ and returns it as a Hash.
  def self.store(store_no)
    StorePage.process(:store_no => store_no).as_hash
  end

  # Processes the inventory page for +product_no+ and returns it as a Hash.
  def self.inventory(product_no)
    InventoryPage.process(:product_no => product_no).as_hash
  end

  # Processes page +page_number+ of the product listing and returns it as a
  # Hash. The page number is sent as a body param, not a query param.
  def self.product_list(page_number)
    ProductListPage.process({}, { :page => page_number }).as_hash
  end

end
data/lib/lcbo/pages.rb ADDED
@@ -0,0 +1,4 @@
1
+ require 'lcbo/pages/inventory_page'
2
+ require 'lcbo/pages/product_page'
3
+ require 'lcbo/pages/product_list_page'
4
+ require 'lcbo/pages/store_page'
@@ -0,0 +1,60 @@
1
module LCBO
  # Page definition for the inventory search results of a single product.
  # Fields are declared through the CrawlKit::Page DSL (`uri`, `emits`),
  # which is defined outside this file.
  class InventoryPage

    include CrawlKit::Page

    uri 'http://www.lcbo.com/lcbo-ear/lcbo/product/inventory/searchResults.do' \
        '?language=EN&itemNumber={product_no}'

    # The product number the page was requested for, as an Integer.
    emits :product_no do
      query_params[:product_no].to_i
    end

    # Sum of the quantities across every inventory row.
    emits :inventory_count do
      total = 0
      inventories.each { |inventory| total += inventory[:quantity] }
      total
    end

    # One Hash per inventory table row, with the keys
    # :updated_on, :store_no and :quantity.
    emits :inventories do
      inventory_table_rows.map do |row|
        # Date comes from the last 17%-wide cell of the row.
        updated_on = CrawlKit::FastDateHelper[row.css('td[width="17%"]')[-1].text.strip]

        # Store number is read from the STORE parameter of the store link.
        store_href = row.css('td[width="38%"] a.item-details-col2').attribute('href').value
        store_no   = store_href.match(/\?STORE=([0-9]{1,3})\&/).captures[0].to_s.to_i

        # Quantity is the first 13%-wide cell of the row.
        quantity = row.css('td[width="13%"]')[0].content.strip.to_i

        { :updated_on => updated_on,
          :store_no   => store_no,
          :quantity   => quantity }
      end
    end

    private

    # The table(s) in the document declared with cellpadding="3".
    def inventory_table
      doc.css('table[cellpadding="3"]')
    end

    # Rows of the inventory table that carry a bgcolor attribute.
    def inventory_table_rows
      inventory_table.css('tr[bgcolor]')
    end

  end
end
@@ -0,0 +1,85 @@
1
module LCBO
  # Page definition for one page of the product search results, requested by
  # POST with PER_PAGE results per page. Fields are declared through the
  # CrawlKit::Page DSL (`http_method`, `uri`, `default_body_params`, `emits`),
  # which is defined outside this file.
  class ProductListPage

    include CrawlKit::Page

    PER_PAGE = 100
    http_method :post
    uri 'http://www.lcbo.com/lcbo-ear/lcbo/product/searchResults.do'

    default_body_params \
      :STOCK_TYPE_NAME    => 'All',
      :ITEM_NAME          => '',
      :KEYWORDS           => '',
      :ITEM_NUMBER        => '',
      :productListingType => '',
      :LIQUOR_TYPE_SHORT_ => '*',
      :CATEGORY_NAME      => '*',
      :SUB_CATEGORY_NAME  => '*',
      :PRODUCING_CNAME    => '*',
      :PRODUCING_REGION_N => '*',
      :UNIT_VOLUME        => '*',
      :SELLING_PRICE      => '*',
      :LTO_SALES_CODE     => 'N',
      :VQA_CODE           => 'N',
      :KOSHER_CODE        => 'N',
      :VINTAGES_CODE      => 'N',
      :VALUE_ADD_SALES_CO => 'N',
      :AIR_MILES_SALES_CO => 'N',
      :language           => 'EN',
      :style              => 'LCBO.css',
      :sort               => 'sortedProduct',
      :order              => '1',
      :resultsPerPage     => PER_PAGE.to_s,
      :page               => '1',
      :action             => 'result',
      :sortby             => 'sortedProduct',
      :orderby            => '',
      :numPerPage         => PER_PAGE.to_s

    # The page number that was requested, read back from the body params.
    emits :page do
      body_params[:page].to_i
    end

    # Number of the last page: total_products / PER_PAGE, rounded up.
    emits :final_page do
      @final_page ||= begin
        full_pages, leftover = total_products.divmod(PER_PAGE)
        leftover.zero? ? full_pages : full_pages + 1
      end
    end

    # The following page number, or nil when this is the final page.
    emits :next_page do
      @next_page ||= (page + 1 if page < final_page)
    end

    # Total number of products reported by the page, parsed from the first
    # bold element inside the 42%-wide cell.
    emits :total_products do
      @total_products ||= begin
        count_node = doc.css('td[width="42%"] font.main_font b')[0]
        count_node.text.gsub(/\s+/, ' ').strip.to_i
      end
    end

    # Product numbers (Integers) taken from the itemNumber parameter of each
    # product link; links without that parameter are skipped.
    emits :product_nos do
      product_anchors.each_with_object([]) do |anchor, numbers|
        found = anchor.attribute('href').value.match(/\&itemNumber=([0-9]+)/)
        numbers << found.captures[0].to_i if found
      end
    end
    alias_method :as_array, :product_nos

    protected

    # Anchors linking to individual products in the result listing.
    def product_anchors
      doc.css('td[style="padding: 5 5 5 0;"] a.item-details-col2')
    end

  end
end