lcbo 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. data/.gitignore +1 -0
  2. data/CHANGELOG.md +3 -0
  3. data/Gemfile +6 -0
  4. data/Gemfile.lock +18 -0
  5. data/LICENSE +18 -0
  6. data/README.md +29 -0
  7. data/Rakefile +62 -0
  8. data/lcbo.gemspec +29 -0
  9. data/lib/lcbo.rb +23 -0
  10. data/lib/lcbo/crawlers.rb +4 -0
  11. data/lib/lcbo/crawlers/inventories_crawler.rb +15 -0
  12. data/lib/lcbo/crawlers/product_lists_crawler.rb +23 -0
  13. data/lib/lcbo/crawlers/products_crawler.rb +16 -0
  14. data/lib/lcbo/crawlers/stores_crawler.rb +16 -0
  15. data/lib/lcbo/crawlkit.rb +24 -0
  16. data/lib/lcbo/crawlkit/eventable.rb +56 -0
  17. data/lib/lcbo/crawlkit/fastdate_helper.rb +40 -0
  18. data/lib/lcbo/crawlkit/page.rb +141 -0
  19. data/lib/lcbo/crawlkit/request.rb +51 -0
  20. data/lib/lcbo/crawlkit/request_prototype.rb +31 -0
  21. data/lib/lcbo/crawlkit/response.rb +48 -0
  22. data/lib/lcbo/crawlkit/titlecase_helper.rb +97 -0
  23. data/lib/lcbo/crawlkit/volume_helper.rb +46 -0
  24. data/lib/lcbo/ext.rb +13 -0
  25. data/lib/lcbo/helpers.rb +34 -0
  26. data/lib/lcbo/pages.rb +4 -0
  27. data/lib/lcbo/pages/inventory_page.rb +60 -0
  28. data/lib/lcbo/pages/product_list_page.rb +85 -0
  29. data/lib/lcbo/pages/product_page.rb +296 -0
  30. data/lib/lcbo/pages/store_page.rb +196 -0
  31. data/lib/lcbo/version.rb +3 -0
  32. data/spec/crawlkit/eventable_spec.rb +23 -0
  33. data/spec/crawlkit/fastdate_helper_spec.rb +18 -0
  34. data/spec/crawlkit/page_spec.rb +114 -0
  35. data/spec/crawlkit/request_prototype_spec.rb +5 -0
  36. data/spec/crawlkit/request_spec.rb +41 -0
  37. data/spec/crawlkit/response_spec.rb +5 -0
  38. data/spec/crawlkit/titlecase_helper_spec.rb +30 -0
  39. data/spec/crawlkit/volume_helper_spec.rb +21 -0
  40. data/spec/crawlkit_spec.rb +5 -0
  41. data/spec/lcbo_spec.rb +38 -0
  42. data/spec/pages/inventory_pages.yml +1685 -0
  43. data/spec/pages/inventory_pages/1.html +11649 -0
  44. data/spec/pages/inventory_pages/2.html +495 -0
  45. data/spec/pages/product_list_pages.yml +108 -0
  46. data/spec/pages/product_list_pages/1.html +4866 -0
  47. data/spec/pages/product_pages.yml +258 -0
  48. data/spec/pages/product_pages/1.html +1319 -0
  49. data/spec/pages/product_pages/2.html +1343 -0
  50. data/spec/pages/product_pages/3.html +1336 -0
  51. data/spec/pages/product_pages/4.html +1319 -0
  52. data/spec/pages/product_pages/5.html +1324 -0
  53. data/spec/pages/product_pages/6.html +1319 -0
  54. data/spec/pages/product_pages/7.html +1314 -0
  55. data/spec/pages/store_pages.yml +80 -0
  56. data/spec/pages/store_pages/1.html +592 -0
  57. data/spec/pages/store_pages/2.html +592 -0
  58. data/spec/pages_spec.rb +34 -0
  59. data/spec/spec_helper.rb +77 -0
  60. metadata +205 -0
data/.gitignore ADDED
@@ -0,0 +1 @@
1
+ pkg
data/CHANGELOG.md ADDED
@@ -0,0 +1,3 @@
1
+ Version 0.9.0
2
+
3
+ * Initial public release
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ source 'http://rubygems.org'
2
+
3
+ gem 'rspec', '1.3.0'
4
+ gem 'typhoeus', '0.1.29'
5
+ gem 'addressable', '2.1.2'
6
+ gem 'nokogiri', '1.4.3.1'
data/Gemfile.lock ADDED
@@ -0,0 +1,18 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ addressable (2.1.2)
5
+ nokogiri (1.4.3.1)
6
+ rack (1.2.1)
7
+ rspec (1.3.0)
8
+ typhoeus (0.1.29)
9
+ rack
10
+
11
+ PLATFORMS
12
+ ruby
13
+
14
+ DEPENDENCIES
15
+ addressable (= 2.1.2)
16
+ nokogiri (= 1.4.3.1)
17
+ rspec (= 1.3.0)
18
+ typhoeus (= 0.1.29)
data/LICENSE ADDED
@@ -0,0 +1,18 @@
1
+ Copyright (c) 2010 Carsten Nielsen
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of
4
+ this software and associated documentation files (the "Software"), to deal in
5
+ the Software without restriction, including without limitation the rights to
6
+ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
7
+ of the Software, and to permit persons to whom the Software is furnished to do
8
+ so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in all
11
+ copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
15
+ FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
16
+ COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
17
+ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
18
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
1
+ # LCBO: The Ruby Gem
2
+
3
+ This library is used to gather data for [LCBO API](http://lcboapi.com). It allows you to request and parse store, product, inventory, and product list pages directly from the [LCBO](http://lcbo.com) website.
4
+
5
+ ## Synopsis
6
+
7
+ require 'lcbo'
8
+
9
+ LCBO.store(511)
10
+ # => { :store_no => 511, :name => "King & Spadina", ... }
11
+
12
+ LCBO.product(18)
13
+ # => { :product_no => 18, :name => "Heineken Lager", ... }
14
+
15
+ LCBO.inventory(18)
16
+ # => { :product_no => 18, :inventory_count => 40398, :inventories => [ ... ] }
17
+
18
+ LCBO.products_list(1)
19
+ # => { :page => 1, :final_page => 108, ..., :product_nos => [ ... ] }
20
+
21
+ ## Installation
22
+
23
+ Install with RubyGems: `gem install lcbo`
24
+
25
+ ## Links
26
+
27
+ * [Issue tracker](http://github.com/heycarsten/lcbo/issues)
28
+ * [Source code](http://github.com/heycarsten/lcbo)
29
+ * [License](http://github.com/heycarsten/lcbo/blob/master/LICENSE)
data/Rakefile ADDED
@@ -0,0 +1,62 @@
1
+ gem 'rspec', '1.3.0'
2
+ require 'rubygems/specification' unless defined?(Gem::Specification)
3
+ require 'spec/rake/spectask'
4
+ require 'rake/gempackagetask'
5
+
6
+ task :default => :spec
7
+
8
# Loads lcbo.gemspec (resolved against the current working directory) once
# and memoizes the resulting Gem::Specification for the tasks below.
def gemspec
  @gemspec ||= Gem::Specification.load(File.expand_path('lcbo.gemspec'))
end

desc 'Start an irb console'
task :console do
  system 'irb -I lib -r lcbo'
end

desc 'Validates the gemspec'
task :gemspec do
  gemspec.validate
end

desc 'Displays the current version'
task :version do
  puts "Current version: #{gemspec.version}"
end

desc 'Installs the gem locally'
task :install => :package do
  # The packaged artifact carries a .gem extension (the :release task below
  # pushes the same file); without it `gem install` does not find the file.
  sh "gem install pkg/#{gemspec.name}-#{gemspec.version}.gem"
end

desc 'Release the gem'
task :release => :package do
  sh "gem push pkg/#{gemspec.name}-#{gemspec.version}.gem"
end

Rake::GemPackageTask.new(gemspec) do |pkg|
  pkg.gem_spec = gemspec
end
task :gem => :gemspec
task :package => :gemspec

desc 'Run the specs'
Spec::Rake::SpecTask.new do |t|
  t.spec_files = FileList['spec/**/*_spec.rb']
  t.spec_opts = %w[-fs --color]
end

desc 'Download all HTML indicated in YAML assertion files'
task :download_support do
  require 'yaml'
  require 'open-uri'
  # Fixtures ship under spec/pages (see the packaged file list), not
  # spec/support.
  product_pages = YAML.load_file('./spec/pages/product_pages.yml')
  product_pages.each do |spec|
    # Block form closes the remote handle as soon as the body has been read.
    html = open(spec[:uri]) { |remote| remote.read }
    # 'w' instead of ?w: on Ruby 1.8 ?w is the Integer 119, not a mode string.
    File.open("./spec/pages/product_pages/#{spec[:file]}", 'w') { |file|
      file.print(html)
    }
  end
end
data/lcbo.gemspec ADDED
@@ -0,0 +1,29 @@
1
# coding: utf-8
require File.expand_path("../lib/lcbo/version", __FILE__)

# Gem specification for lcbo. The file lists are taken from git, so the gem
# must be built from inside a git checkout.
Gem::Specification.new do |s|
  s.name        = 'lcbo'
  s.version     = LCBO::VERSION
  s.platform    = Gem::Platform::RUBY
  s.authors     = ['Carsten Nielsen']
  s.email       = ['heycarsten@gmail.com']
  s.homepage    = 'http://github.com/heycarsten/lcbo'
  s.summary     = %q{A library for parsing HTML pages from http://lcbo.com}
  s.description = %q{Request and parse product, store, inventory, and product search pages directly from the official LCBO website.}

  s.required_rubygems_version = '>= 1.3.6'
  s.rubyforge_project         = 'lcbo'

  s.add_dependency 'typhoeus'
  s.add_dependency 'addressable'
  s.add_dependency 'nokogiri'

  s.add_development_dependency 'rspec', '1.3.0'

  # "\n" rather than ?\n: on Ruby 1.8 the character literal is the Integer 10,
  # which String#split rejects with a TypeError.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec}/*`.split("\n")
  s.require_paths = ['lib']

  # If you need an executable, add it here
  # s.executables = ['lcbo']
end
data/lib/lcbo.rb ADDED
@@ -0,0 +1,23 @@
1
module LCBO

  # Baseline settings; a shallow copy of this hash becomes the live
  # configuration whenever the configuration is (re)initialised.
  DEFAULT_CONFIG = {
    :user_agent => nil,
  }

  class << self
    # Replaces the live configuration with a fresh copy of DEFAULT_CONFIG
    # and returns that copy.
    def reset_config!
      @config = DEFAULT_CONFIG.dup
    end

    # Returns the live configuration hash, creating it from the defaults on
    # first access.
    def config
      @config || reset_config!
    end
  end

end
17
+
18
+ require 'lcbo/version'
19
+ require 'lcbo/ext'
20
+ require 'lcbo/helpers'
21
+ require 'lcbo/crawlkit'
22
+ require 'lcbo/pages'
23
+ require 'lcbo/crawlers'
@@ -0,0 +1,4 @@
1
+ require 'lcbo/crawlers/inventories_crawler'
2
+ require 'lcbo/crawlers/products_crawler'
3
+ require 'lcbo/crawlers/product_lists_crawler'
4
+ require 'lcbo/crawlers/stores_crawler'
@@ -0,0 +1,15 @@
1
module LCBO
  # Fetches the inventory page of each given product and hands the resulting
  # hash to the caller's block.
  class InventoriesCrawler

    # product_nos - enumerable of product numbers to look up.
    # block       - required; receives the hash for each product.
    #
    # Raises ArgumentError when no block is supplied. Returns whatever
    # product_nos.each returns.
    def self.run(product_nos, &block)
      unless block_given?
        raise ArgumentError, 'block expected'
      end

      product_nos.each do |number|
        begin
          inventory = InventoryRequest.parse(:product_no => number)
          yield inventory.as_hash
        rescue StandardError
          # NOTE(review): any StandardError, including one raised by the
          # caller's block, is silently dropped and the crawl moves on to
          # the next product. The sibling crawlers rescue a narrower list.
        end
      end
    end

  end
end
@@ -0,0 +1,23 @@
1
module LCBO
  # Walks the paginated product list, yielding the hash of every page until a
  # page reports no :next_page.
  class ProductListsCrawler

    # Number of consecutive timed-out fetches of one page that are retried
    # before giving up.
    MAX_RETRIES = 10

    class Error < StandardError; end
    # Raised when a single page timed out MAX_RETRIES times in a row.
    class EpicTimeoutError < Error; end

    # params - request parameters for the first page. On later pages only
    #          :page is replaced; every other key is carried over.
    # tries  - timeouts already counted against the first page (default 0).
    # block  - required; receives each page's hash.
    #
    # Raises ArgumentError without a block and EpicTimeoutError when a page
    # keeps timing out. Returns nil once the last page has been yielded.
    def self.run(params = {}, tries = 0, &block)
      raise ArgumentError, 'block expected' unless block_given?
      loop do
        begin
          payload = ProductListRequest.parse(params).as_hash
        rescue Errno::ETIMEDOUT, Timeout::Error
          # Only the fetch is retried; a timeout raised inside the caller's
          # block propagates instead of causing the page to be yielded twice.
          raise EpicTimeoutError if tries >= MAX_RETRIES
          tries += 1
          next
        end
        tries = 0 # the retry budget applies per page
        yield(payload)
        break unless payload[:next_page]
        params = params.merge(:page => payload[:next_page])
      end
    end

  end
end
@@ -0,0 +1,16 @@
1
module LCBO
  # Fetches the product page of each given product number and passes the
  # resulting hash to the caller's block.
  class ProductsCrawler

    # product_nos - enumerable of product numbers to fetch.
    # block       - required; receives the hash for each product.
    #
    # Raises ArgumentError when called without a block. Returns whatever
    # product_nos.each returns.
    def self.run(product_nos, &block)
      unless block_given?
        raise ArgumentError, 'block expected'
      end

      product_nos.each do |number|
        begin
          product = ProductRequest.parse(:product_no => number)
          yield product.as_hash
        rescue CrawlKit::MissingResourceError, Errno::ETIMEDOUT, Timeout::Error
          # Products without data and timed-out requests are skipped.
        end
      end
    end

  end
end
@@ -0,0 +1,16 @@
1
module LCBO
  # Fetches store pages by store number and passes each resulting hash to the
  # caller's block.
  class StoresCrawler

    # Highest store number probed when the caller gives no explicit list.
    MAX_STORE_NO = 720

    # store_nos - enumerable of store numbers to fetch; defaults to
    #             1..MAX_STORE_NO, the range that used to be hard-coded.
    # block     - required; receives the hash for each store.
    #
    # Raises ArgumentError when called without a block. Returns whatever
    # store_nos.each returns.
    def self.run(store_nos = (1..MAX_STORE_NO), &block)
      raise ArgumentError, 'block expected' unless block_given?
      store_nos.each do |store_no|
        begin
          yield StoreRequest.parse(:store_no => store_no).as_hash
        rescue CrawlKit::MissingResourceError, Errno::ETIMEDOUT, Timeout::Error
          # Ignore stores that don't exist and timeouts.
        end
      end
    end

  end
end
@@ -0,0 +1,24 @@
1
+ require 'addressable/template'
2
+ require 'addressable/uri'
3
+ require 'nokogiri'
4
+ require 'typhoeus'
5
+ require 'uri'
6
+
7
module LCBO
  module CrawlKit
    # User agent string for crawl requests. It is resolved when this file is
    # loaded and only if the constant does not already hold a value. The first
    # truthy candidate wins: LCBO.config[:user_agent], then the
    # LCBO_USER_AGENT environment variable, then Typhoeus::USER_AGENT.
    USER_AGENT ||= (
      LCBO.config[:user_agent] || ENV['LCBO_USER_AGENT'] || Typhoeus::USER_AGENT
    )
  end
end
16
+
17
+ require 'lcbo/crawlkit/eventable'
18
+ require 'lcbo/crawlkit/fastdate_helper'
19
+ require 'lcbo/crawlkit/page'
20
+ require 'lcbo/crawlkit/request'
21
+ require 'lcbo/crawlkit/response'
22
+ require 'lcbo/crawlkit/request_prototype'
23
+ require 'lcbo/crawlkit/titlecase_helper'
24
+ require 'lcbo/crawlkit/volume_helper'
@@ -0,0 +1,56 @@
1
module LCBO
  module CrawlKit
    # Mixin that gives a class a small callback registry: the class declares
    # handlers with `on`, and instances trigger them with `fire`.
    module Eventable

      # Valid event names. This is a word array (%w), so membership checks
      # are exact matches. The previous %[...] literal was a single String,
      # which made `include?` accept any substring such as "parse".
      EVENT_TYPES = %w[
        before_request
        after_request
        before_parse
        after_parse ]

      def self.included(mod)
        mod.extend(ClassMethods)
      end

      # Invokes, in registration order, every callback the receiver's class
      # registered for event_type. Returns the callbacks that were run.
      # Raises ArgumentError (from Callback#is_for?) when at least one
      # callback exists and event_type is not in EVENT_TYPES.
      def fire(event_type)
        self.class.callbacks.
          select { |callback| callback.is_for?(event_type) }.
          each   { |callback| callback.call_on(self) }
      end

      module ClassMethods
        # Registers each of method_names as a handler for event_type.
        def on(event_type, *method_names)
          @callbacks ||= []
          method_names.each do |method_name|
            @callbacks << Callback.new(event_type, method_name)
          end
        end

        # Callbacks registered on this class (an empty array when none).
        def callbacks
          @callbacks || []
        end
      end

      # Pairs an event type with the name of the method to call for it.
      class Callback
        attr_reader :event_type, :method_name

        def initialize(event_type, method_name)
          @event_type  = event_type.to_sym
          @method_name = method_name.to_sym
        end

        # True when this callback handles event_sym. Raises ArgumentError
        # for names that are not listed in EVENT_TYPES.
        def is_for?(event_sym)
          unless EVENT_TYPES.include?(event_sym.to_s)
            raise ArgumentError, "event_type: #{event_sym} is not valid"
          end
          event_type == event_sym.to_sym
        end

        # Calls the stored method on object.
        def call_on(object)
          object.send(method_name)
        end
      end

    end
  end
end
@@ -0,0 +1,40 @@
1
module LCBO
  module CrawlKit
    # Converts dates written as "Mon D, YYYY" (e.g. "Jan 5, 2010") into
    # "YYYY-MM-DD" strings by plain string manipulation.
    class FastDateHelper

      MONTH_NAMES_TO_NUMBERS = {
        'Jan' => '01',
        'Feb' => '02',
        'Mar' => '03',
        'Apr' => '04',
        'May' => '05',
        'Jun' => '06',
        'Jul' => '07',
        'Aug' => '08',
        'Sep' => '09',
        'Oct' => '10',
        'Nov' => '11',
        'Dec' => '12' }

      attr_reader :input

      def initialize(input_string)
        @input = input_string
      end

      # Shorthand for new(input_string).as_sql_date.
      def self.[](input_string)
        new(input_string).as_sql_date
      end

      # Returns the input as "YYYY-MM-DD", or nil when the input is nil, the
      # month abbreviation is unknown, or the day or year is missing.
      # Previously a missing day raised NoMethodError and a missing year
      # produced a string such as "-01-05".
      def as_sql_date
        return nil unless input
        month_name, day, year = input.delete(',').split
        month = MONTH_NAMES_TO_NUMBERS[month_name]
        return nil unless month && day && year
        "#{year}-#{month}-#{day.rjust(2, '0')}"
      end

    end
  end
end
@@ -0,0 +1,141 @@
1
module LCBO
  module CrawlKit
    # Mixin for describing a remote page declaratively. Including it gives
    # the class its own RequestPrototype and field list, the class-level DSL
    # in ClassMethods, and the instance request/parse life cycle below.
    module Page

      class Error < StandardError; end
      class MalformedDocumentError < Error; end
      class MissingResourceError < Error; end
      class RequestFailedError < Error; end

      def self.included(mod)
        mod.module_eval do
          include Eventable
          attr_reader :html, :query_params, :body_params, :response
          # Stored as class-level instance variables on the including class.
          instance_variable_set :@request_prototype, RequestPrototype.new
          instance_variable_set :@fields, []
        end
        mod.extend(ClassMethods)
      end

      module ClassMethods
        # Sets the request prototype's URI template when given a value,
        # otherwise returns the current one.
        def uri(value = nil)
          if value
            @request_prototype.uri_template = value
          else
            @request_prototype.uri_template
          end
        end

        # Sets or returns the request prototype's body params.
        def default_body_params(value = nil)
          if value
            @request_prototype.body_params = value
          else
            @request_prototype.body_params
          end
        end

        # Sets or returns the request prototype's HTTP method.
        def http_method(value = nil)
          if value
            @request_prototype.http_method = value
          else
            @request_prototype.http_method
          end
        end

        # Declares a field that as_hash will include. When a block is given
        # it becomes the instance method that computes the field; the block
        # runs in the instance's context and receives the field name.
        def emits(field, &block)
          fields << field.to_sym
          define_method(field) { instance_exec(field, &block) } if block_given?
        end

        # Builds an instance and performs only the request step.
        def request(query_params = {}, body_params = {})
          new(query_params, body_params).request
        end

        # Builds an instance from an existing response and parses it.
        def parse(response)
          new(nil, nil, response).parse
        end

        # Builds an instance, then requests and parses it.
        def process(query_params = {}, body_params = {})
          new(query_params, body_params).process
        end

        def fields
          @fields
        end

        def request_prototype
          @request_prototype
        end
      end

      # With a response (a Hash is wrapped in Response), the params and HTML
      # are taken from it; otherwise the given params are stored and the
      # HTML is fetched later by #request.
      def initialize(query_params = {}, body_params = {}, response = nil)
        if response
          @response     = response.is_a?(Hash) ? Response.new(response) : response
          @query_params = @response.query_params
          @body_params  = @response.body_params
          @html         = @response.body
        else
          @query_params = query_params
          @body_params  = body_params
        end
      end

      # Returns the value of one emitted field, looked up through as_hash.
      def [](field)
        as_hash[field.to_sym]
      end

      def request_prototype
        self.class.request_prototype
      end

      def fields
        self.class.fields
      end

      def http_method
        self.class.http_method
      end

      # Requests (if needed) and parses the page. Returns self.
      def process
        request
        parse
        self
      end

      # Fetches the page unless HTML is already present, firing
      # :before_request and :after_request around the fetch. Always returns
      # self; it used to return nil when the HTML was already loaded, which
      # broke call chaining.
      def request
        return self if @html
        fire :before_request
        @response = request_prototype.request(query_params, body_params)
        @html = @response.body
        fire :after_request
        self
      end

      # Parses the HTML with Nokogiri, firing :before_parse and :after_parse
      # around it. Returns self, including when the page was already parsed
      # (previously nil). Returns nil when there is no HTML to parse.
      def parse
        return self if is_parsed?
        return unless @html
        fire :before_parse
        @doc = Nokogiri::HTML(@html)
        fire :after_parse
        self
      end

      def is_parsed?
        doc ? true : false
      end

      # Memoized hash of every emitted field, built by calling the method of
      # the same name. A single hash is filled in place rather than merging
      # a new one per field.
      def as_hash
        @as_hash ||= fields.inject({}) do |hsh, field|
          hsh[field] = send(field)
          hsh
        end
      end

      protected

      def doc
        @doc
      end

    end
  end
end