lcbo 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -0
- data/CHANGELOG.md +3 -0
- data/Gemfile +6 -0
- data/Gemfile.lock +18 -0
- data/LICENSE +18 -0
- data/README.md +29 -0
- data/Rakefile +62 -0
- data/lcbo.gemspec +29 -0
- data/lib/lcbo.rb +23 -0
- data/lib/lcbo/crawlers.rb +4 -0
- data/lib/lcbo/crawlers/inventories_crawler.rb +15 -0
- data/lib/lcbo/crawlers/product_lists_crawler.rb +23 -0
- data/lib/lcbo/crawlers/products_crawler.rb +16 -0
- data/lib/lcbo/crawlers/stores_crawler.rb +16 -0
- data/lib/lcbo/crawlkit.rb +24 -0
- data/lib/lcbo/crawlkit/eventable.rb +56 -0
- data/lib/lcbo/crawlkit/fastdate_helper.rb +40 -0
- data/lib/lcbo/crawlkit/page.rb +141 -0
- data/lib/lcbo/crawlkit/request.rb +51 -0
- data/lib/lcbo/crawlkit/request_prototype.rb +31 -0
- data/lib/lcbo/crawlkit/response.rb +48 -0
- data/lib/lcbo/crawlkit/titlecase_helper.rb +97 -0
- data/lib/lcbo/crawlkit/volume_helper.rb +46 -0
- data/lib/lcbo/ext.rb +13 -0
- data/lib/lcbo/helpers.rb +34 -0
- data/lib/lcbo/pages.rb +4 -0
- data/lib/lcbo/pages/inventory_page.rb +60 -0
- data/lib/lcbo/pages/product_list_page.rb +85 -0
- data/lib/lcbo/pages/product_page.rb +296 -0
- data/lib/lcbo/pages/store_page.rb +196 -0
- data/lib/lcbo/version.rb +3 -0
- data/spec/crawlkit/eventable_spec.rb +23 -0
- data/spec/crawlkit/fastdate_helper_spec.rb +18 -0
- data/spec/crawlkit/page_spec.rb +114 -0
- data/spec/crawlkit/request_prototype_spec.rb +5 -0
- data/spec/crawlkit/request_spec.rb +41 -0
- data/spec/crawlkit/response_spec.rb +5 -0
- data/spec/crawlkit/titlecase_helper_spec.rb +30 -0
- data/spec/crawlkit/volume_helper_spec.rb +21 -0
- data/spec/crawlkit_spec.rb +5 -0
- data/spec/lcbo_spec.rb +38 -0
- data/spec/pages/inventory_pages.yml +1685 -0
- data/spec/pages/inventory_pages/1.html +11649 -0
- data/spec/pages/inventory_pages/2.html +495 -0
- data/spec/pages/product_list_pages.yml +108 -0
- data/spec/pages/product_list_pages/1.html +4866 -0
- data/spec/pages/product_pages.yml +258 -0
- data/spec/pages/product_pages/1.html +1319 -0
- data/spec/pages/product_pages/2.html +1343 -0
- data/spec/pages/product_pages/3.html +1336 -0
- data/spec/pages/product_pages/4.html +1319 -0
- data/spec/pages/product_pages/5.html +1324 -0
- data/spec/pages/product_pages/6.html +1319 -0
- data/spec/pages/product_pages/7.html +1314 -0
- data/spec/pages/store_pages.yml +80 -0
- data/spec/pages/store_pages/1.html +592 -0
- data/spec/pages/store_pages/2.html +592 -0
- data/spec/pages_spec.rb +34 -0
- data/spec/spec_helper.rb +77 -0
- metadata +205 -0
@@ -0,0 +1,51 @@
|
|
1
|
+
module LCBO
  module CrawlKit
    # One concrete HTTP request: a request prototype combined with the
    # query and body parameters supplied for this particular call.
    class Request
      attr_reader :request_prototype, :query_params, :body_params

      # request_prototype - object providing http_method, uri_template and
      #                     body_params.
      # query_p           - values used to expand the URI template.
      # body_p            - values merged over the prototype's body params.
      def initialize(request_prototype, query_p = {}, body_p = {})
        @request_prototype = request_prototype
        self.query_params  = query_p
        self.body_params   = body_p
      end

      # Stores the query params; a nil/false value becomes an empty hash.
      def query_params=(value)
        @query_params = value ? value : {}
      end

      # Stores the prototype's body params with the given values merged on
      # top (given values win on key collisions).
      def body_params=(value)
        overrides = value || {}
        @body_params = request_prototype.body_params.merge(overrides)
      end

      # True when the prototype's HTTP method is :head or :get.
      def gettable?
        verb = request_prototype.http_method
        :head == verb || :get == verb
      end

      # Options hash handed to Typhoeus in #run. Body params are only
      # included for requests that are not HEAD/GET.
      def config
        settings = {
          :method     => request_prototype.http_method,
          :user_agent => USER_AGENT
        }
        return settings if gettable?
        settings[:params] = body_params
        settings
      end

      # The prototype's URI template expanded with the query params, as a
      # String.
      def uri
        template = request_prototype.uri_template
        template.expand(query_params).to_s
      end

      # Performs the request through Typhoeus and wraps the outcome in a
      # Response built from the raw response's code, URL, method, time and
      # body plus this request's params.
      def run
        raw = Typhoeus::Request.run(uri, config)
        Response.new(
          :code         => raw.code,
          :uri          => raw.request.url,
          :http_method  => raw.request.method,
          :time         => raw.time,
          :query_params => query_params,
          :body_params  => body_params,
          :body         => raw.body
        )
      end
    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module LCBO
  module CrawlKit
    # Template for requests: holds an HTTP method, an optional URI template
    # and default body params, and can spawn and run Request objects.
    class RequestPrototype
      attr_reader :http_method, :uri_template, :body_params

      # uri_template - template source passed to Addressable::Template
      #                (ignored when nil).
      # http_method  - HTTP verb; normalized to a lowercase symbol.
      # body_params  - default body params; keys are symbolized.
      def initialize(uri_template = nil, http_method = :get, body_params = {})
        self.uri_template = uri_template
        self.http_method  = http_method
        self.body_params  = body_params
      end

      # Normalizes the verb to a lowercase symbol; nil/false means :get.
      def http_method=(value)
        verb = value || :get
        @http_method = verb.to_s.downcase.to_sym
      end

      # Builds an Addressable::Template from the value. A nil/false value
      # leaves the current template untouched.
      def uri_template=(value)
        return unless value
        @uri_template = Addressable::Template.new(value)
      end

      # Stores the params with symbolized keys; nil/false becomes {}.
      def body_params=(value)
        @body_params =
          if value
            HashExt.symbolize_keys(value)
          else
            {}
          end
      end

      # Creates a Request from this prototype with the given params and
      # runs it immediately, returning whatever Request#run returns.
      def request(query_params = {}, body_params = {})
        pending = Request.new(self, query_params, body_params)
        pending.run
      end
    end
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
module LCBO
  module CrawlKit
    # Value object describing the outcome of a request. Construction raises
    # RequestFailedError unless the status code is 200.
    class Response
      attr_reader :response, :body, :query_params, :body_params, :uri,
                  :code, :time, :http_method

      # response - Hash (string or symbol keys) with :code, :uri,
      #            :http_method, :time, :query_params, :body_params, :body.
      #
      # Raises RequestFailedError when :code is not 200.
      def initialize(response)
        params = HashExt.symbolize_keys(response)
        @code         = params[:code]
        @uri          = params[:uri]
        @http_method  = params[:http_method]
        @time         = params[:time]
        @query_params = params[:query_params]
        @body_params  = params[:body_params]
        @body         = self.class.normalize_encoding(params[:body])
        ensure_success!
      end

      # Returns the markup with valid encoding and "\r\n" line endings
      # replaced by "\n".
      #
      # html - String body, or nil (treated as an empty body so a failed
      #        request without a body still reaches ensure_success! and
      #        reports RequestFailedError instead of NoMethodError).
      #
      # Strings whose bytes are invalid in their current encoding are
      # reinterpreted as ISO-8859-1 and transcoded to UTF-8. The work is
      # done on a copy so the caller's string is never re-tagged in place.
      def self.normalize_encoding(html)
        text = html.to_s
        unless text.valid_encoding?
          text = text.dup.force_encoding('ISO-8859-1').encode('UTF-8')
        end
        text.gsub("\r\n", "\n")
      end

      # Hash form of the response, using the same keys accepted by #initialize.
      def as_hash
        { :code         => code,
          :uri          => uri,
          :http_method  => http_method,
          :time         => time,
          :query_params => query_params,
          :body_params  => body_params,
          :body         => body }
      end

      protected

      # Raises RequestFailedError unless the status code is exactly 200.
      def ensure_success!
        return if @code == 200
        raise RequestFailedError, "<#{@uri}> failed with status: #{@code}"
      end
    end
  end
end
|
@@ -0,0 +1,97 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# TODO: This helper is hard to maintain and should be rewritten or replaced with a cleaner title-casing implementation.
|
3
|
+
module LCBO
  module CrawlKit
    # Converts raw product names into title case.
    #
    # The input is lowercased, trailing bracketed text and stars are removed,
    # and every word is then cased according to its shape: known acronyms are
    # upcased, compound words (&, -, /, .) are capitalized piecewise, small
    # words stay lowercase unless they open or close a phrase, and everything
    # else gets its first letter capitalized.
    class TitleCaseHelper
      UPPER_CHARS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝ'
      LOWER_CHARS = 'abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïðñòóôõöøùúûüý'
      ALPHA_RANGE = "[#{UPPER_CHARS}#{LOWER_CHARS}]"
      UPPER_RANGE = "[#{UPPER_CHARS}]"
      LOWER_RANGE = "[#{LOWER_CHARS}]"
      FIRST_CHAR_RE = /#{ALPHA_RANGE}{1}/u
      ALPHA_RE      = /#{ALPHA_RANGE}.*/u
      SMALL_WORDS   = %w[ a an and as at but by en for if in of del de on or the to v v. via vs vs. ]
      ACRONYMS      = %w[ vqa vsop xo nq5 vs xxx igt xiii xi xoxo srl bdb cvbg ocb lcbo i ii iii ]

      # Word-shape patterns, compiled once instead of once per word.
      AMPERSAND_RE      = /#{ALPHA_RANGE}\&#{ALPHA_RANGE}/u # words with &, like E&J
      DASH_RE           = /#{ALPHA_RANGE}\-#{ALPHA_RANGE}/u # words with dashes, like "Smith-Weston"
      SLASH_RE          = /#{ALPHA_RANGE}\/#{ALPHA_RANGE}/u # words with slashes
      DOT_RE            = /#{ALPHA_RANGE}\.#{ALPHA_RANGE}/u # words with dots, like "example.com"
      PRECAPITALIZED_RE = /^#{ALPHA_RANGE}.*#{UPPER_RANGE}/u # non-first letter capitalized already

      attr_reader :input

      # Shorthand for .titleize.
      def self.[](string)
        titleize(string)
      end

      # Upcases using the UPPER_CHARS/LOWER_CHARS tables only.
      def self.upcase(string)
        string.tr(LOWER_CHARS, UPPER_CHARS)
      end

      # Downcases using the UPPER_CHARS/LOWER_CHARS tables only.
      def self.downcase(string)
        string.tr(UPPER_CHARS, LOWER_CHARS)
      end

      # Removes everything from the first "(" to the end of the string, then
      # any trailing stars: Some Product Name (Some Redundant Stuff)**
      def self.preclean(string)
        without_brackets = string.gsub(/\(.+\Z/, '')
        without_brackets.gsub(/\*+\Z/, '')
      end

      # Upcases the first alphabetic character found in the string; returns
      # the string unchanged when it contains none.
      def self.capitalize(string)
        first_letter = string.scan(FIRST_CHAR_RE)[0]
        return string unless first_letter
        string.sub(/#{first_letter}/u, upcase(first_letter))
      end

      # Lookup tables that also contain the capitalized spelling of each
      # entry. Defined here because they are built with .capitalize above.
      ACRONYM_FORMS    = (ACRONYMS + ACRONYMS.map { |ac| capitalize(ac) }).freeze
      SMALL_WORD_FORMS = (SMALL_WORDS + SMALL_WORDS.map { |small| capitalize(small) }).freeze

      # Returns the title-cased form of the string.
      #
      # Whether a word opens or closes its phrase is decided by position, so
      # a small word in the middle of a phrase stays lowercase even when the
      # same word also appears first or last ("To Be or Not to Be").
      def self.titleize(string)
        titled = phrases(preclean(downcase(string))).map do |phrase|
          words      = phrase.split
          last_index = words.size - 1
          words.each_with_index.map do |word, index|
            titleize_word(word, index.zero? || index == last_index)
          end.join(' ')
        end.join(' ')
        # Special case for Word'S
        titled.gsub(/(['’])S\b/, '\1s')
      end

      # Splits the title into phrases at ":", ".", ";", "?" or "!" followed
      # by a space, then rejoins phrases that were only split because a small
      # word ends in "." (e.g. "vs.").
      #
      # The result is accumulated into a fresh array, so any number of
      # rejoins works; merging in place while walking stale indexes used to
      # raise TypeError or join the wrong neighbours after the first merge.
      def self.phrases(title)
        parts = title.scan(/.+?(?:[:.;?!] |$)/u).map { |phrase| phrase.strip }
        parts.each_with_object([]) do |phrase, merged|
          if merged.any? && ends_with_small_word?(merged.last)
            merged.last << ' ' << phrase
          else
            merged << phrase
          end
        end
      end

      # Cases a single word. `edge` is true for the first and last word of a
      # phrase, which are capitalized even when they are small words.
      def self.titleize_word(word, edge)
        case word
        when *ACRONYM_FORMS
          upcase(word)
        when AMPERSAND_RE
          word.split(/\&/).map { |w| capitalize(w) }.join('&')
        when DASH_RE
          word.split(/\-/).map { |w| capitalize(w) }.join('-')
        when SLASH_RE
          word.split(/\//).map { |w| capitalize(w) }.join(' / ')
        when DOT_RE
          capitalized = word.split(/\./u).map { |w| capitalize(w) }.join('.')
          # split drops a trailing dot; put it back.
          '.' == word[-1, 1] ? capitalized + '.' : capitalized
        when PRECAPITALIZED_RE
          word
        else
          if !edge && SMALL_WORD_FORMS.include?(word)
            word.downcase
          else
            capitalize_first_alpha(word)
          end
        end
      end

      # Capitalizes starting at the first alphabetic character, leaving any
      # leading non-alphabetic characters in place.
      def self.capitalize_first_alpha(word)
        word.sub(ALPHA_RE) { |subword| capitalize(subword) }
      end

      # True when the last whitespace-separated word of the phrase is listed
      # in SMALL_WORDS. An empty phrase has no last word and yields false.
      def self.ends_with_small_word?(phrase)
        last_word = phrase.split.last
        !last_word.nil? && SMALL_WORDS.include?(last_word.downcase)
      end

      private_class_method :titleize_word, :capitalize_first_alpha,
                           :ends_with_small_word?
    end
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
module LCBO
  module CrawlKit
    # Parses package descriptions such as "750 mL bottle" or
    # "6x341 mL bottle" into unit count, unit volume, container type and
    # total package volume. All numeric values stay 0 when the input is nil
    # or does not match the expected pattern.
    class VolumeHelper
      attr_reader :package_volume, :unit_volume, :total_units, :unit_type

      # input - package description String, or nil.
      def initialize(input)
        @input = input
        @package_volume = @unit_volume = @total_units = 0
        calculate
      end

      # Returns the total package volume in mL for the given description.
      def self.[](input_string)
        new(input_string).as_milliliters
      end

      # Total package volume (units * unit volume).
      def as_milliliters
        @package_volume
      end

      private

      # Extracts "<n> mL <type>" or "<units>x<n> mL <type>" from the input
      # and fills in the readers. Leaves the defaults in place on no match.
      def calculate
        return unless @input
        match = @input.match(/([0-9]+|[0-9]+x[0-9]+) (mL) ([a-z]+)/)
        return unless match

        quantity, _unit, container = match.captures
        units, volume =
          if quantity.include?('x')
            quantity.split('x').map(&:to_i)
          else
            [1, quantity.to_i]
          end

        @total_units    = units
        @unit_volume    = volume
        @unit_type      = container
        @package_volume = units * volume
      end
    end
  end
end
|
data/lib/lcbo/ext.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
module LCBO
  # Key-conversion helpers for hashes. Both methods return a new Hash and
  # leave the input untouched. When two keys convert to the same value, the
  # later entry wins.
  module HashExt
    # Returns a copy of input with every key converted via to_sym.
    #
    # Fills a single result hash rather than merging into a fresh hash for
    # every pair, so the cost is linear in the number of keys.
    def self.symbolize_keys(input)
      input.each_with_object({}) { |(key, value), hsh| hsh[key.to_sym] = value }
    end

    # Returns a copy of input with every key converted via to_s.
    def self.stringify_keys(input)
      input.each_with_object({}) { |(key, value), hsh| hsh[key.to_s] = value }
    end
  end
end
|
data/lib/lcbo/helpers.rb
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
module LCBO
  # Maps a short page type name to the name of the page class under LCBO.
  PAGE_TYPES = {
    :product      => 'ProductPage',
    :product_list => 'ProductListPage',
    :store        => 'StorePage',
    :inventory    => 'InventoryPage'
  }

  # Returns the page class registered for the given type.
  #
  # type - Symbol or String key of PAGE_TYPES.
  #
  # The constant is resolved from LCBO itself, because the page classes are
  # defined inside this module; looking them up on Object only worked when
  # LCBO had been included at the top level.
  #
  # Raises ArgumentError for a type that is not in PAGE_TYPES.
  def self.page(type)
    class_name = PAGE_TYPES[type.to_sym]
    raise ArgumentError, "unknown page type: #{type.inspect}" unless class_name
    const_get(class_name)
  end

  # Parses a response with the page class registered for page_type and
  # returns whatever that class's .parse returns.
  # NOTE(review): assumes every page class responds to .parse(response);
  # confirm against CrawlKit::Page.
  def self.parse(page_type, response)
    page(page_type).parse(response)
  end

  # Processes the product page for product_no and returns its hash form.
  def self.product(product_no)
    ProductPage.process(:product_no => product_no).as_hash
  end

  # Processes the store page for store_no and returns its hash form.
  def self.store(store_no)
    StorePage.process(:store_no => store_no).as_hash
  end

  # Processes the inventory page for product_no and returns its hash form.
  def self.inventory(product_no)
    InventoryPage.process(:product_no => product_no).as_hash
  end

  # Processes the given page of the product list (page number is passed as
  # a body param) and returns its hash form.
  def self.product_list(page_number)
    ProductListPage.process({}, { :page => page_number }).as_hash
  end
end
|
data/lib/lcbo/pages.rb
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
module LCBO
  # Page definition for the per-product inventory search results. Emits the
  # product number, one entry per inventory table row, and the summed
  # quantity across those rows.
  class InventoryPage
    include CrawlKit::Page

    uri 'http://www.lcbo.com/lcbo-ear/lcbo/product/inventory/searchResults.do' \
        '?language=EN&itemNumber={product_no}'

    # Product number taken from the query params, as an Integer.
    emits :product_no do
      query_params[:product_no].to_i
    end

    # Sum of :quantity over all emitted inventories.
    emits :inventory_count do
      inventories.inject(0) { |total, entry| total + entry[:quantity] }
    end

    # Array of hashes, one per table row, with the keys
    # :updated_on, :store_no and :quantity.
    emits :inventories do
      inventory_table_rows.map do |row|
        # Last 17%-wide cell holds the date text.
        updated_on = CrawlKit::FastDateHelper[row.css('td[width="17%"]')[-1].text.strip]

        # Store number is read from the STORE parameter of the store link.
        store_link = row.css('td[width="38%"] a.item-details-col2')
        store_href = store_link.attribute('href').value
        store_no   = store_href.match(/\?STORE=([0-9]{1,3})\&/).captures[0].to_s.to_i

        # First 13%-wide cell holds the quantity text.
        quantity = row.css('td[width="13%"]')[0].content.strip.to_i

        { :updated_on => updated_on, :store_no => store_no, :quantity => quantity }
      end
    end

    private

    # Nodes matching the table selected by its cellpadding attribute.
    def inventory_table
      doc.css('table[cellpadding="3"]')
    end

    # Rows of the inventory table that carry a bgcolor attribute.
    def inventory_table_rows
      inventory_table.css('tr[bgcolor]')
    end
  end
end
|
@@ -0,0 +1,85 @@
|
|
1
|
+
module LCBO
  # Page definition for the product search results listing. Requests are
  # POSTed with a fixed set of search params and PER_PAGE results per page.
  class ProductListPage
    include CrawlKit::Page

    PER_PAGE = 100

    http_method :post
    uri 'http://www.lcbo.com/lcbo-ear/lcbo/product/searchResults.do'

    default_body_params(
      :STOCK_TYPE_NAME    => 'All',
      :ITEM_NAME          => '',
      :KEYWORDS           => '',
      :ITEM_NUMBER        => '',
      :productListingType => '',
      :LIQUOR_TYPE_SHORT_ => '*',
      :CATEGORY_NAME      => '*',
      :SUB_CATEGORY_NAME  => '*',
      :PRODUCING_CNAME    => '*',
      :PRODUCING_REGION_N => '*',
      :UNIT_VOLUME        => '*',
      :SELLING_PRICE      => '*',
      :LTO_SALES_CODE     => 'N',
      :VQA_CODE           => 'N',
      :KOSHER_CODE        => 'N',
      :VINTAGES_CODE      => 'N',
      :VALUE_ADD_SALES_CO => 'N',
      :AIR_MILES_SALES_CO => 'N',
      :language           => 'EN',
      :style              => 'LCBO.css',
      :sort               => 'sortedProduct',
      :order              => '1',
      :resultsPerPage     => PER_PAGE.to_s,
      :page               => '1',
      :action             => 'result',
      :sortby             => 'sortedProduct',
      :orderby            => '',
      :numPerPage         => PER_PAGE.to_s
    )

    # Current page number, read from the body params.
    emits :page do
      body_params[:page].to_i
    end

    # Number of pages needed to list total_products at PER_PAGE per page.
    emits :final_page do
      @final_page ||= begin
        full_pages, leftover = total_products.divmod(PER_PAGE)
        leftover.zero? ? full_pages : full_pages + 1
      end
    end

    # Number of the following page, or nil when this is the final page.
    emits :next_page do
      @next_page ||= (page + 1 if page < final_page)
    end

    # Total product count, read from the first matching bold node.
    emits :total_products do
      @total_products ||= begin
        count_node = doc.css('td[width="42%"] font.main_font b')[0]
        count_node.text.gsub(/\s+/, ' ').strip.to_i
      end
    end

    # Product numbers extracted from the itemNumber parameter of each
    # product link; links without that parameter are skipped.
    emits :product_nos do
      product_anchors.each_with_object([]) do |anchor, numbers|
        found = anchor.attribute('href').value.match(/\&itemNumber=([0-9]+)/)
        numbers << found.captures[0].to_i if found
      end
    end
    alias_method :as_array, :product_nos

    protected

    # Anchor nodes for the products listed on the page.
    def product_anchors
      doc.css('td[style="padding: 5 5 5 0;"] a.item-details-col2')
    end
  end
end
|