lcbo 0.9.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +1 -0
- data/CHANGELOG.md +3 -0
- data/Gemfile +6 -0
- data/Gemfile.lock +18 -0
- data/LICENSE +18 -0
- data/README.md +29 -0
- data/Rakefile +62 -0
- data/lcbo.gemspec +29 -0
- data/lib/lcbo.rb +23 -0
- data/lib/lcbo/crawlers.rb +4 -0
- data/lib/lcbo/crawlers/inventories_crawler.rb +15 -0
- data/lib/lcbo/crawlers/product_lists_crawler.rb +23 -0
- data/lib/lcbo/crawlers/products_crawler.rb +16 -0
- data/lib/lcbo/crawlers/stores_crawler.rb +16 -0
- data/lib/lcbo/crawlkit.rb +24 -0
- data/lib/lcbo/crawlkit/eventable.rb +56 -0
- data/lib/lcbo/crawlkit/fastdate_helper.rb +40 -0
- data/lib/lcbo/crawlkit/page.rb +141 -0
- data/lib/lcbo/crawlkit/request.rb +51 -0
- data/lib/lcbo/crawlkit/request_prototype.rb +31 -0
- data/lib/lcbo/crawlkit/response.rb +48 -0
- data/lib/lcbo/crawlkit/titlecase_helper.rb +97 -0
- data/lib/lcbo/crawlkit/volume_helper.rb +46 -0
- data/lib/lcbo/ext.rb +13 -0
- data/lib/lcbo/helpers.rb +34 -0
- data/lib/lcbo/pages.rb +4 -0
- data/lib/lcbo/pages/inventory_page.rb +60 -0
- data/lib/lcbo/pages/product_list_page.rb +85 -0
- data/lib/lcbo/pages/product_page.rb +296 -0
- data/lib/lcbo/pages/store_page.rb +196 -0
- data/lib/lcbo/version.rb +3 -0
- data/spec/crawlkit/eventable_spec.rb +23 -0
- data/spec/crawlkit/fastdate_helper_spec.rb +18 -0
- data/spec/crawlkit/page_spec.rb +114 -0
- data/spec/crawlkit/request_prototype_spec.rb +5 -0
- data/spec/crawlkit/request_spec.rb +41 -0
- data/spec/crawlkit/response_spec.rb +5 -0
- data/spec/crawlkit/titlecase_helper_spec.rb +30 -0
- data/spec/crawlkit/volume_helper_spec.rb +21 -0
- data/spec/crawlkit_spec.rb +5 -0
- data/spec/lcbo_spec.rb +38 -0
- data/spec/pages/inventory_pages.yml +1685 -0
- data/spec/pages/inventory_pages/1.html +11649 -0
- data/spec/pages/inventory_pages/2.html +495 -0
- data/spec/pages/product_list_pages.yml +108 -0
- data/spec/pages/product_list_pages/1.html +4866 -0
- data/spec/pages/product_pages.yml +258 -0
- data/spec/pages/product_pages/1.html +1319 -0
- data/spec/pages/product_pages/2.html +1343 -0
- data/spec/pages/product_pages/3.html +1336 -0
- data/spec/pages/product_pages/4.html +1319 -0
- data/spec/pages/product_pages/5.html +1324 -0
- data/spec/pages/product_pages/6.html +1319 -0
- data/spec/pages/product_pages/7.html +1314 -0
- data/spec/pages/store_pages.yml +80 -0
- data/spec/pages/store_pages/1.html +592 -0
- data/spec/pages/store_pages/2.html +592 -0
- data/spec/pages_spec.rb +34 -0
- data/spec/spec_helper.rb +77 -0
- metadata +205 -0
@@ -0,0 +1,51 @@
|
|
1
|
+
module LCBO
  module CrawlKit
    # A single concrete HTTP request built from a RequestPrototype plus the
    # per-call query and body parameters.
    class Request

      attr_reader :request_prototype, :query_params, :body_params

      # request_prototype - object responding to #http_method, #uri_template
      #                     and #body_params.
      # query_p           - values expanded into the URI template (nil => {}).
      # body_p            - values merged over the prototype's body params
      #                     (nil => no overrides).
      def initialize(request_prototype, query_p = {}, body_p = {})
        @request_prototype = request_prototype
        self.query_params  = query_p
        self.body_params   = body_p
      end

      # Stores the query params, substituting an empty Hash for nil/false.
      def query_params=(value)
        @query_params = value || {}
      end

      # Stores the prototype's default body params overlaid with +value+.
      def body_params=(value)
        overrides = value || {}
        @body_params = request_prototype.body_params.merge(overrides)
      end

      # True when the prototype's HTTP method is :head or :get.
      def gettable?
        verb = request_prototype.http_method
        :head == verb || :get == verb
      end

      # Options Hash handed to Typhoeus. Body params are only included for
      # requests that are not HEAD/GET.
      def config
        settings = {
          :method     => request_prototype.http_method,
          :user_agent => USER_AGENT
        }
        settings[:params] = body_params unless gettable?
        settings
      end

      # The URI template expanded with the query params, as a String.
      def uri
        expanded = request_prototype.uri_template.expand(query_params)
        expanded.to_s
      end

      # Performs the request through Typhoeus and wraps the outcome in a
      # Response.
      def run
        response = Typhoeus::Request.run(uri, config)
        attributes = {
          :code         => response.code,
          :uri          => response.request.url,
          :http_method  => response.request.method,
          :time         => response.time,
          :query_params => query_params,
          :body_params  => body_params,
          :body         => response.body
        }
        Response.new(attributes)
      end

    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module LCBO
  module CrawlKit
    # Reusable description of a request: HTTP method, URI template and the
    # default body params. Concrete requests are spawned with #request.
    class RequestPrototype

      attr_reader :http_method, :uri_template, :body_params

      def initialize(uri_template = nil, http_method = :get, body_params = {})
        self.uri_template = uri_template
        self.http_method  = http_method
        self.body_params  = body_params
      end

      # Normalizes the method to a lowercase Symbol; nil/false becomes :get.
      def http_method=(value)
        @http_method = (value || :get).to_s.downcase.to_sym
      end

      # Wraps +value+ in an Addressable::Template. A nil/false value leaves
      # the current template untouched.
      def uri_template=(value)
        return unless value
        @uri_template = Addressable::Template.new(value)
      end

      # Stores the default body params with symbolized keys; nil/false
      # becomes an empty Hash.
      def body_params=(value)
        if value
          @body_params = HashExt.symbolize_keys(value)
        else
          @body_params = {}
        end
      end

      # Builds a Request from this prototype and runs it immediately.
      def request(query_params = {}, body_params = {})
        concrete = Request.new(self, query_params, body_params)
        concrete.run
      end

    end
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
module LCBO
  module CrawlKit
    # Value object describing the outcome of one HTTP request. Construction
    # raises RequestFailedError unless the status code is 200.
    class Response

      attr_reader :response, :body, :query_params, :body_params, :uri,
                  :code, :time, :http_method

      # response - Hash (String or Symbol keys) with :code, :uri,
      #            :http_method, :time, :query_params, :body_params, :body.
      #
      # Raises RequestFailedError when :code is not 200.
      def initialize(response)
        params = HashExt.symbolize_keys(response)
        @code         = params[:code]
        @uri          = params[:uri]
        @http_method  = params[:http_method]
        @time         = params[:time]
        @query_params = params[:query_params]
        @body_params  = params[:body_params]
        @body         = self.class.normalize_encoding(params[:body])
        ensure_success!
      end

      # Returns a copy of +html+ with CRLF line endings converted to LF.
      # When the bytes are not valid in the string's current encoding they
      # are interpreted as ISO-8859-1 and transcoded to UTF-8.
      #
      # The argument itself is never modified: re-tagging is done on a
      # duplicate, so frozen strings and strings shared with the caller are
      # safe to pass in.
      def self.normalize_encoding(html)
        text =
          if html.valid_encoding?
            html
          else
            html.dup.force_encoding('ISO-8859-1').encode('UTF-8')
          end
        text.gsub("\r\n", "\n")
      end

      # All response attributes as a Symbol-keyed Hash.
      def as_hash
        { :code         => code,
          :uri          => uri,
          :http_method  => http_method,
          :time         => time,
          :query_params => query_params,
          :body_params  => body_params,
          :body         => body }
      end

      protected

      # Raises RequestFailedError for any status code other than 200.
      def ensure_success!
        return if @code == 200
        raise RequestFailedError, "<#{@uri}> failed with status: #{@code}"
      end

    end
  end
end
|
@@ -0,0 +1,97 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# TODO: This is an ugly piece of ass that should burn and die!
|
3
|
+
module LCBO
  module CrawlKit
    # Converts product names to title case using a fixed Latin alphabet
    # (A-Z plus the accented characters listed in UPPER_CHARS/LOWER_CHARS),
    # a list of small words that stay lowercase and a list of acronyms that
    # are fully upcased.
    class TitleCaseHelper

      UPPER_CHARS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝ'
      LOWER_CHARS = 'abcdefghijklmnopqrstuvwxyzàáâãäåæçèéêëìíîïðñòóôõöøùúûüý'
      ALPHA_RANGE = "[#{UPPER_CHARS}#{LOWER_CHARS}]"
      UPPER_RANGE = "[#{UPPER_CHARS}]"
      LOWER_RANGE = "[#{LOWER_CHARS}]"
      FIRST_CHAR_RE = /#{ALPHA_RANGE}{1}/u
      ALPHA_RE      = /#{ALPHA_RANGE}.*/u
      SMALL_WORDS   = %w[ a an and as at but by en for if in of del de on or the to v v. via vs vs. ]
      ACRONYMS      = %w[ vqa vsop xo nq5 vs xxx igt xiii xi xoxo srl bdb cvbg ocb lcbo i ii iii ]

      # Word-shape patterns used by .titleize, compiled once instead of on
      # every word.
      AMPERSAND_RE  = /#{ALPHA_RANGE}\&#{ALPHA_RANGE}/u   # words with &, like E&J
      DASH_RE       = /#{ALPHA_RANGE}\-#{ALPHA_RANGE}/u   # words with dashes, like "Smith-Weston"
      SLASH_RE      = /#{ALPHA_RANGE}\/#{ALPHA_RANGE}/u   # words with slashes
      DOT_RE        = /#{ALPHA_RANGE}\.#{ALPHA_RANGE}/u   # words with dots, like "example.com"
      MIXED_CASE_RE = /^#{ALPHA_RANGE}.*#{UPPER_RANGE}/u  # non-first letter capitalized already

      attr_reader :input

      # Shorthand for .titleize.
      def self.[](string)
        titleize(string)
      end

      # Upcases only the characters listed in LOWER_CHARS.
      def self.upcase(string)
        string.tr(LOWER_CHARS, UPPER_CHARS)
      end

      # Downcases only the characters listed in UPPER_CHARS.
      def self.downcase(string)
        string.tr(UPPER_CHARS, LOWER_CHARS)
      end

      # Removes everything from the first "(" that is followed by at least
      # one character, then any trailing run of "*".
      def self.preclean(string)
        string.gsub(/\(.+\Z/, '').gsub(/\*+\Z/, '')
      end

      # Upcases the first alphabetic character (per ALPHA_RANGE) of
      # +string+; returns +string+ unchanged when it has none.
      def self.capitalize(string)
        first_letter = string.scan(FIRST_CHAR_RE)[0]
        return string unless first_letter
        string.sub(/#{first_letter}/u, upcase(first_letter))
      end

      # Returns +string+ downcased, pre-cleaned and title-cased phrase by
      # phrase. Rules are tried in order for every word: acronym, "&", "-",
      # "/", ".", already mixed case, equal to the first/last word of its
      # phrase, small word, everything else.
      def self.titleize(string)
        # Loop-invariant lookup lists, built once per call.
        acronyms    = ACRONYMS + ACRONYMS.map { |ac| capitalize(ac) }
        small_words = SMALL_WORDS + SMALL_WORDS.map { |small| capitalize(small) }

        titled = phrases(preclean(downcase(string))).map do |phrase|
          words = phrase.split
          words.map do |word|
            case word
            when *acronyms
              upcase(word)
            when AMPERSAND_RE
              word.split(/\&/).map { |w| capitalize(w) }.join('&')
            when DASH_RE
              word.split(/\-/).map { |w| capitalize(w) }.join('-')
            when SLASH_RE
              word.split(/\//).map { |w| capitalize(w) }.join(' / ')
            when DOT_RE
              capitalized = word.split(/\./u).map { |w| capitalize(w) }.join('.')
              # String#split drops a trailing empty field, so restore the dot.
              '.' == word[-1, 1] ? capitalized + '.' : capitalized
            when MIXED_CASE_RE
              word
            when words.first, words.last
              capitalize(word)
            when *small_words
              word.downcase
            else
              capitalize(word)
            end
          end.join(' ')
        end.join(' ')

        # Special case for Word'S
        titled.gsub(/(['’])S\b/, '\1s')
      end

      # Splits +title+ into phrases after ":", ".", ";", "?" or "!" followed
      # by a space, then rejoins any phrase whose last word is a small word
      # (e.g. "vs.") with the phrase that follows it.
      #
      # The result is built in a fresh array, so consecutive merges work and
      # a blank phrase cannot raise.
      def self.phrases(title)
        pieces = title.scan(/.+?(?:[:.;?!] |$)/u).map { |phrase| phrase.strip }
        pieces.each_with_object([]) do |phrase, merged|
          last_word = merged.empty? ? nil : merged.last.split.last
          if last_word && SMALL_WORDS.include?(last_word.downcase)
            merged[-1] = "#{merged.last} #{phrase}"
          else
            merged << phrase
          end
        end
      end

    end
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
module LCBO
  module CrawlKit
    # Parses package descriptions such as "750 mL bottle" or
    # "6x341 mL bottle" into unit count, per-unit volume, container type and
    # total package volume. All numeric values stay 0 when the input is nil
    # or does not match.
    class VolumeHelper

      attr_reader :package_volume, :unit_volume, :total_units, :unit_type

      # Shorthand: total package volume for +input_string+.
      def self.[](input_string)
        new(input_string).as_milliliters
      end

      def initialize(input)
        @input = input
        @package_volume = @unit_volume = @total_units = 0
        calculate
      end

      # Total package volume (units * volume per unit).
      def as_milliliters
        @package_volume
      end

      private

      # Fills in the readers from @input when it matches
      # "<n> mL <type>" or "<count>x<n> mL <type>".
      def calculate
        return unless @input
        match = @input.match(/([0-9]+|[0-9]+x[0-9]+) (mL) ([a-z]+)/)
        return unless match

        amount, _unit, container = match.captures
        figures = amount.split('x').map(&:to_i)
        # A bare volume ("750") describes a single unit.
        figures.unshift(1) if figures.size == 1

        @total_units, @unit_volume = figures
        @unit_type      = container
        @package_volume = @total_units * @unit_volume
      end

    end
  end
end
|
data/lib/lcbo/ext.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
module LCBO
  # Key-conversion helpers for Hash-like inputs. Both methods return a new
  # Hash and leave +input+ untouched.
  module HashExt

    # Returns a copy of +input+ with every key converted via #to_sym.
    # Fills a single result Hash rather than merging into a fresh copy for
    # every key.
    def self.symbolize_keys(input)
      input.each_with_object({}) { |(key, value), result| result[key.to_sym] = value }
    end

    # Returns a copy of +input+ with every key converted via #to_s.
    def self.stringify_keys(input)
      input.each_with_object({}) { |(key, value), result| result[key.to_s] = value }
    end

  end
end
|
data/lib/lcbo/helpers.rb
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
module LCBO

  # Maps a short page type name to the name of the page class that handles
  # it.
  PAGE_TYPES = {
    :product      => 'ProductPage',
    :product_list => 'ProductListPage',
    :store        => 'StorePage',
    :inventory    => 'InventoryPage'
  }

  # Returns the page class registered for +type+ (Symbol or String).
  #
  # The constant is resolved starting from LCBO, where the page classes are
  # defined; Module#const_get still falls back to top-level constants.
  #
  # Raises ArgumentError for a type that is not listed in PAGE_TYPES.
  def self.page(type)
    class_name = PAGE_TYPES[type.to_sym]
    unless class_name
      raise ArgumentError, "unknown page type: #{type.inspect}"
    end
    const_get(class_name)
  end

  # Parses +response+ with the page class registered for +page_type+.
  def self.parse(page_type, response)
    page(page_type).parse(response)
  end

  # Processes the product page for +product_no+ and returns it as a Hash.
  def self.product(product_no)
    ProductPage.process(:product_no => product_no).as_hash
  end

  # Processes the store page for +store_no+ and returns it as a Hash.
  def self.store(store_no)
    StorePage.process(:store_no => store_no).as_hash
  end

  # Processes the inventory page for +product_no+ and returns it as a Hash.
  def self.inventory(product_no)
    InventoryPage.process(:product_no => product_no).as_hash
  end

  # Processes page +page_number+ of the product list and returns it as a
  # Hash.
  def self.product_list(page_number)
    ProductListPage.process({}, { :page => page_number }).as_hash
  end

end
|
data/lib/lcbo/pages.rb
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
module LCBO
  # Page definition for the inventory search results of one product,
  # declared with the CrawlKit::Page DSL.
  class InventoryPage

    include CrawlKit::Page

    uri 'http://www.lcbo.com/lcbo-ear/lcbo/product/inventory/searchResults.do' \
        '?language=EN&itemNumber={product_no}'

    # The product number the page was requested for, as an Integer.
    emits :product_no do
      query_params[:product_no].to_i
    end

    # Sum of the :quantity of every inventory row.
    emits :inventory_count do
      quantities = inventories.map { |inv| inv[:quantity] }
      quantities.inject(0) { |total, quantity| total + quantity }
    end

    # One Hash per table row with the keys :updated_on, :store_no and
    # :quantity.
    emits :inventories do
      inventory_table_rows.map do |row|
        date_text  = row.css('td[width="17%"]')[-1].text.strip
        updated_on = CrawlKit::FastDateHelper[date_text]

        # The store number is the STORE= parameter of the row's store link.
        store_href = row.css('td[width="38%"] a.item-details-col2').attribute('href').value
        store_no   = store_href.match(/\?STORE=([0-9]{1,3})\&/).captures[0].to_s.to_i

        quantity = row.css('td[width="13%"]')[0].content.strip.to_i

        { :updated_on => updated_on, :store_no => store_no, :quantity => quantity }
      end
    end

    private

    # The table(s) selected by cellpadding="3".
    def inventory_table
      doc.css('table[cellpadding="3"]')
    end

    # Rows of the inventory table that carry a bgcolor attribute.
    def inventory_table_rows
      inventory_table.css('tr[bgcolor]')
    end

  end
end
|
@@ -0,0 +1,85 @@
|
|
1
|
+
module LCBO
  # Page definition for one page of the product search results, declared
  # with the CrawlKit::Page DSL. The search is submitted as a POST whose
  # form fields default to the values below.
  class ProductListPage

    include CrawlKit::Page

    PER_PAGE = 100
    http_method :post
    uri 'http://www.lcbo.com/lcbo-ear/lcbo/product/searchResults.do'

    default_body_params \
      :STOCK_TYPE_NAME    => 'All',
      :ITEM_NAME          => '',
      :KEYWORDS           => '',
      :ITEM_NUMBER        => '',
      :productListingType => '',
      :LIQUOR_TYPE_SHORT_ => '*',
      :CATEGORY_NAME      => '*',
      :SUB_CATEGORY_NAME  => '*',
      :PRODUCING_CNAME    => '*',
      :PRODUCING_REGION_N => '*',
      :UNIT_VOLUME        => '*',
      :SELLING_PRICE      => '*',
      :LTO_SALES_CODE     => 'N',
      :VQA_CODE           => 'N',
      :KOSHER_CODE        => 'N',
      :VINTAGES_CODE      => 'N',
      :VALUE_ADD_SALES_CO => 'N',
      :AIR_MILES_SALES_CO => 'N',
      :language           => 'EN',
      :style              => 'LCBO.css',
      :sort               => 'sortedProduct',
      :order              => '1',
      :resultsPerPage     => PER_PAGE.to_s,
      :page               => '1',
      :action             => 'result',
      :sortby             => 'sortedProduct',
      :orderby            => '',
      :numPerPage         => PER_PAGE.to_s

    # The requested page number, as an Integer.
    emits :page do
      body_params[:page].to_i
    end

    # Number of pages needed to list total_products at PER_PAGE per page
    # (rounded up).
    emits :final_page do
      @final_page ||= begin
        full_pages, leftover = total_products.divmod(PER_PAGE)
        leftover.zero? ? full_pages : full_pages + 1
      end
    end

    # The following page number, or nil on the final page.
    emits :next_page do
      @next_page ||= (page + 1 if page < final_page)
    end

    # Total number of products reported by the page.
    emits :total_products do
      @total_products ||= begin
        counter = doc.css('td[width="42%"] font.main_font b')[0]
        counter.text.gsub(/\s+/, ' ').strip.to_i
      end
    end

    # Product numbers taken from the itemNumber parameter of each product
    # link; links without one are skipped.
    emits :product_nos do
      product_anchors.each_with_object([]) do |anchor, numbers|
        match = anchor.attribute('href').value.match(/\&itemNumber=([0-9]+)/)
        numbers << match.captures[0].to_i if match
      end
    end
    alias_method :as_array, :product_nos

    protected

    # Anchor nodes that link to individual products.
    def product_anchors
      doc.css('td[style="padding: 5 5 5 0;"] a.item-details-col2')
    end

  end
end
|