lcbo 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (60) hide show
  1. data/.gitignore +1 -0
  2. data/CHANGELOG.md +3 -0
  3. data/Gemfile +6 -0
  4. data/Gemfile.lock +18 -0
  5. data/LICENSE +18 -0
  6. data/README.md +29 -0
  7. data/Rakefile +62 -0
  8. data/lcbo.gemspec +29 -0
  9. data/lib/lcbo.rb +23 -0
  10. data/lib/lcbo/crawlers.rb +4 -0
  11. data/lib/lcbo/crawlers/inventories_crawler.rb +15 -0
  12. data/lib/lcbo/crawlers/product_lists_crawler.rb +23 -0
  13. data/lib/lcbo/crawlers/products_crawler.rb +16 -0
  14. data/lib/lcbo/crawlers/stores_crawler.rb +16 -0
  15. data/lib/lcbo/crawlkit.rb +24 -0
  16. data/lib/lcbo/crawlkit/eventable.rb +56 -0
  17. data/lib/lcbo/crawlkit/fastdate_helper.rb +40 -0
  18. data/lib/lcbo/crawlkit/page.rb +141 -0
  19. data/lib/lcbo/crawlkit/request.rb +51 -0
  20. data/lib/lcbo/crawlkit/request_prototype.rb +31 -0
  21. data/lib/lcbo/crawlkit/response.rb +48 -0
  22. data/lib/lcbo/crawlkit/titlecase_helper.rb +97 -0
  23. data/lib/lcbo/crawlkit/volume_helper.rb +46 -0
  24. data/lib/lcbo/ext.rb +13 -0
  25. data/lib/lcbo/helpers.rb +34 -0
  26. data/lib/lcbo/pages.rb +4 -0
  27. data/lib/lcbo/pages/inventory_page.rb +60 -0
  28. data/lib/lcbo/pages/product_list_page.rb +85 -0
  29. data/lib/lcbo/pages/product_page.rb +296 -0
  30. data/lib/lcbo/pages/store_page.rb +196 -0
  31. data/lib/lcbo/version.rb +3 -0
  32. data/spec/crawlkit/eventable_spec.rb +23 -0
  33. data/spec/crawlkit/fastdate_helper_spec.rb +18 -0
  34. data/spec/crawlkit/page_spec.rb +114 -0
  35. data/spec/crawlkit/request_prototype_spec.rb +5 -0
  36. data/spec/crawlkit/request_spec.rb +41 -0
  37. data/spec/crawlkit/response_spec.rb +5 -0
  38. data/spec/crawlkit/titlecase_helper_spec.rb +30 -0
  39. data/spec/crawlkit/volume_helper_spec.rb +21 -0
  40. data/spec/crawlkit_spec.rb +5 -0
  41. data/spec/lcbo_spec.rb +38 -0
  42. data/spec/pages/inventory_pages.yml +1685 -0
  43. data/spec/pages/inventory_pages/1.html +11649 -0
  44. data/spec/pages/inventory_pages/2.html +495 -0
  45. data/spec/pages/product_list_pages.yml +108 -0
  46. data/spec/pages/product_list_pages/1.html +4866 -0
  47. data/spec/pages/product_pages.yml +258 -0
  48. data/spec/pages/product_pages/1.html +1319 -0
  49. data/spec/pages/product_pages/2.html +1343 -0
  50. data/spec/pages/product_pages/3.html +1336 -0
  51. data/spec/pages/product_pages/4.html +1319 -0
  52. data/spec/pages/product_pages/5.html +1324 -0
  53. data/spec/pages/product_pages/6.html +1319 -0
  54. data/spec/pages/product_pages/7.html +1314 -0
  55. data/spec/pages/store_pages.yml +80 -0
  56. data/spec/pages/store_pages/1.html +592 -0
  57. data/spec/pages/store_pages/2.html +592 -0
  58. data/spec/pages_spec.rb +34 -0
  59. data/spec/spec_helper.rb +77 -0
  60. metadata +205 -0
data/.gitignore ADDED
@@ -0,0 +1 @@
1
+ pkg
data/CHANGELOG.md ADDED
@@ -0,0 +1,3 @@
1
+ Version 0.9.0
2
+
3
+ * Initial public release
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ source 'http://rubygems.org'
2
+
3
+ gem 'rspec', '1.3.0'
4
+ gem 'typhoeus', '0.1.29'
5
+ gem 'addressable', '2.1.2'
6
+ gem 'nokogiri', '1.4.3.1'
data/Gemfile.lock ADDED
@@ -0,0 +1,18 @@
1
+ GEM
2
+ remote: http://rubygems.org/
3
+ specs:
4
+ addressable (2.1.2)
5
+ nokogiri (1.4.3.1)
6
+ rack (1.2.1)
7
+ rspec (1.3.0)
8
+ typhoeus (0.1.29)
9
+ rack
10
+
11
+ PLATFORMS
12
+ ruby
13
+
14
+ DEPENDENCIES
15
+ addressable (= 2.1.2)
16
+ nokogiri (= 1.4.3.1)
17
+ rspec (= 1.3.0)
18
+ typhoeus (= 0.1.29)
data/LICENSE ADDED
@@ -0,0 +1,18 @@
1
+ Copyright (c) 2010 Carsten Nielsen
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy of
4
+ this software and associated documentation files (the "Software"), to deal in
5
+ the Software without restriction, including without limitation the rights to
6
+ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
7
+ of the Software, and to permit persons to whom the Software is furnished to do
8
+ so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in all
11
+ copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
15
+ FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
16
+ COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
17
+ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
18
+ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
1
+ # LCBO: The Ruby Gem
2
+
3
+ This library is used to gather data for [LCBO API](http://lcboapi.com). It allows you to request and parse store, product, inventory, and product list pages directly from the [LCBO](http://lcbo.com) website.
4
+
5
+ ## Synopsis
6
+
7
+ require 'lcbo'
8
+
9
+ LCBO.store(511)
10
+ # => { :store_no => 511, :name => "King & Spadina", ... }
11
+
12
+ LCBO.product(18)
13
+ # => { :product_no => 18, :name => "Heineken Lager", ... }
14
+
15
+ LCBO.inventory(18)
16
+ # => { :product_no => 18, :inventory_count => 40398, :inventories => [ ... ] }
17
+
18
+ LCBO.products_list(1)
19
+ # => { :page => 1, :final_page => 108, ..., :product_nos => [ ... ] }
20
+
21
+ ## Installation
22
+
23
+ Use Ruby Gems: `gem install lcbo`
24
+
25
+ ## Links
26
+
27
+ * [Issue tracker](http://github.com/heycarsten/lcbo/issues)
28
+ * [Source code](http://github.com/heycarsten/lcbo)
29
+ * [License](http://github.com/heycarsten/lcbo/blob/master/LICENSE)
data/Rakefile ADDED
@@ -0,0 +1,62 @@
1
# Rake tasks for the lcbo gem: running specs, validating and packaging the
# gemspec, installing/releasing the built gem, and downloading HTML fixtures.
gem 'rspec', '1.3.0'
require 'rubygems/specification' unless defined?(Gem::Specification)
require 'spec/rake/spectask'
require 'rake/gempackagetask'

task :default => :spec

# Loads lcbo.gemspec from the current directory once and memoizes the
# resulting Gem::Specification for the other tasks.
def gemspec
  @gemspec ||= Gem::Specification.load(File.expand_path('lcbo.gemspec'))
end

desc 'Start an irb console'
task :console do
  system 'irb -I lib -r lcbo'
end

desc 'Validates the gemspec'
task :gemspec do
  gemspec.validate
end

desc 'Displays the current version'
task :version do
  puts "Current version: #{gemspec.version}"
end

desc 'Installs the gem locally'
task :install => :package do
  # Point at the packaged .gem file, the same path the release task pushes.
  sh "gem install pkg/#{gemspec.name}-#{gemspec.version}.gem"
end

desc 'Release the gem'
task :release => :package do
  sh "gem push pkg/#{gemspec.name}-#{gemspec.version}.gem"
end

Rake::GemPackageTask.new(gemspec) do |pkg|
  pkg.gem_spec = gemspec
end
task :gem => :gemspec
task :package => :gemspec

desc 'Run the specs'
Spec::Rake::SpecTask.new do |t|
  t.spec_files = FileList['spec/**/*_spec.rb']
  t.spec_opts = %w[-fs --color]
end

desc 'Download all HTML indicated in YAML assertion files'
task :download_support do
  require 'yaml'
  require 'open-uri'
  # The assertion files and their HTML fixtures live under spec/pages.
  product_pages = YAML.load_file('./spec/pages/product_pages.yml')
  product_pages.each do |spec|
    html = open(spec[:uri]).read
    # Use a String mode: the ?w character literal is an Integer on Ruby 1.8.
    File.open("./spec/pages/product_pages/#{spec[:file]}", 'w') do |file|
      file.print(html)
    end
  end
end
data/lcbo.gemspec ADDED
@@ -0,0 +1,29 @@
1
+ # coding: utf-8
2
+ require File.expand_path("../lib/lcbo/version", __FILE__)
3
+
4
# Gem specification for lcbo. The file lists are taken from git, so the gem
# must be built from inside a git checkout.
Gem::Specification.new do |s|
  s.name        = 'lcbo'
  s.version     = LCBO::VERSION
  s.platform    = Gem::Platform::RUBY
  s.authors     = ['Carsten Nielsen']
  s.email       = ['heycarsten@gmail.com']
  s.homepage    = 'http://github.com/heycarsten/lcbo'
  s.summary     = %q{A library for parsing HTML pages from http://lcbo.com}
  s.description = %q{Request and parse product, store, inventory, and product search pages directly from the official LCBO website.}

  s.required_rubygems_version = '>= 1.3.6'
  s.rubyforge_project         = 'lcbo'

  s.add_dependency 'typhoeus'
  s.add_dependency 'addressable'
  s.add_dependency 'nokogiri'

  s.add_development_dependency 'rspec', '1.3.0'

  # Split on a "\n" String rather than the ?\n character literal: on Ruby 1.8
  # the literal evaluates to the Integer 10, which String#split rejects.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec}/*`.split("\n")
  s.require_paths = ['lib']

  # If you need an executable, add it here
  # s.executables = ['lcbo']
end
data/lib/lcbo.rb ADDED
@@ -0,0 +1,23 @@
1
# Root namespace of the gem; also owns the configuration hash.
module LCBO

  # Template that every fresh configuration is copied from.
  DEFAULT_CONFIG = {
    :user_agent => nil,
  }

  class << self
    # Returns the current configuration hash, building it from
    # DEFAULT_CONFIG the first time it is asked for.
    def config
      @config || reset_config!
    end

    # Replaces the configuration with a shallow copy of DEFAULT_CONFIG and
    # returns that copy.
    def reset_config!
      @config = DEFAULT_CONFIG.dup
    end
  end

end
17
+
18
+ require 'lcbo/version'
19
+ require 'lcbo/ext'
20
+ require 'lcbo/helpers'
21
+ require 'lcbo/crawlkit'
22
+ require 'lcbo/pages'
23
+ require 'lcbo/crawlers'
@@ -0,0 +1,4 @@
1
+ require 'lcbo/crawlers/inventories_crawler'
2
+ require 'lcbo/crawlers/products_crawler'
3
+ require 'lcbo/crawlers/product_lists_crawler'
4
+ require 'lcbo/crawlers/stores_crawler'
@@ -0,0 +1,15 @@
1
module LCBO
  # Walks a list of product numbers and hands the inventory hash of each one
  # to the caller's block.
  class InventoriesCrawler

    # product_nos - any object responding to #each that yields product numbers.
    #
    # Yields the #as_hash result of InventoryRequest.parse for each product.
    # Raises ArgumentError when no block is supplied.
    # Returns whatever product_nos.each returns.
    def self.run(product_nos, &block)
      raise ArgumentError, 'block expected' if block.nil?
      product_nos.each do |product_no|
        begin
          inventory = InventoryRequest.parse(:product_no => product_no)
          yield inventory.as_hash
        rescue
          # Best effort: any StandardError raised while fetching, parsing or
          # inside the caller's block skips this product.
        end
      end
    end

  end
end
@@ -0,0 +1,23 @@
1
module LCBO
  # Walks product list pages, starting from the given params and following
  # each page's :next_page value until a page reports none.
  class ProductListsCrawler

    # Number of times a single page is re-requested after a timeout.
    MAX_RETRIES = 10

    class Error < StandardError; end
    # Raised when one page has timed out on the first attempt and on every
    # one of the MAX_RETRIES retries.
    class EpicTimeoutError < Error; end

    # params - Hash passed to ProductListRequest.parse for the first page.
    # tries  - number of retries already spent on the first page.
    #
    # Yields the #as_hash payload of each page in order.
    # Raises ArgumentError when no block is supplied and EpicTimeoutError
    # when a page keeps timing out.
    #
    # Pages are walked in a loop rather than recursively, so stack depth does
    # not grow with the page or retry count. Only the fetch is retried: a
    # timeout raised by the caller's block propagates instead of causing the
    # same page to be fetched and yielded again.
    def self.run(params = {}, tries = 0, &block)
      raise ArgumentError, 'block expected' unless block_given?
      loop do
        begin
          payload = ProductListRequest.parse(params).as_hash
        rescue Errno::ETIMEDOUT, Timeout::Error
          # On timeout, try the same page again, up to MAX_RETRIES times.
          raise EpicTimeoutError if tries >= MAX_RETRIES
          tries += 1
          next
        end
        yield(payload)
        break unless payload[:next_page]
        # Each new page starts with a fresh retry budget.
        params = { :page => payload[:next_page] }
        tries = 0
      end
    end

  end
end
@@ -0,0 +1,16 @@
1
module LCBO
  # Walks a list of product numbers and hands the parsed product hash of each
  # one to the caller's block.
  class ProductsCrawler

    # product_nos - any object responding to #each that yields product numbers.
    #
    # Yields the #as_hash result of ProductRequest.parse for each product.
    # Raises ArgumentError when no block is supplied.
    #
    # NOTE(review): the only MissingResourceError definition visible alongside
    # this class is nested under CrawlKit::Page; confirm that
    # CrawlKit::MissingResourceError resolves.
    def self.run(product_nos, &block)
      raise ArgumentError, 'block expected' if block.nil?
      product_nos.each do |product_no|
        begin
          product = ProductRequest.parse(:product_no => product_no)
          yield product.as_hash
        rescue CrawlKit::MissingResourceError, Errno::ETIMEDOUT, Timeout::Error
          # Products without data, and requests that time out, are skipped.
          next
        end
      end
    end

  end
end
@@ -0,0 +1,16 @@
1
module LCBO
  # Probes a range of store numbers and hands the parsed store hash of each
  # existing store to the caller's block.
  class StoresCrawler

    # Store numbers probed when the caller does not pass its own.
    DEFAULT_STORE_NOS = (1..720)

    # store_nos - any object responding to #each that yields store numbers
    #             (defaults to DEFAULT_STORE_NOS).
    #
    # Yields the #as_hash result of StoreRequest.parse for each store.
    # Raises ArgumentError when no block is supplied.
    # Returns whatever store_nos.each returns.
    def self.run(store_nos = DEFAULT_STORE_NOS, &block)
      raise ArgumentError, 'block expected' unless block_given?
      store_nos.each do |store_no|
        begin
          yield StoreRequest.parse(:store_no => store_no).as_hash
        rescue CrawlKit::MissingResourceError, Errno::ETIMEDOUT, Timeout::Error
          # Ignore stores that don't exist and timeouts.
        end
      end
    end

  end
end
@@ -0,0 +1,24 @@
1
+ require 'addressable/template'
2
+ require 'addressable/uri'
3
+ require 'nokogiri'
4
+ require 'typhoeus'
5
+ require 'uri'
6
+
7
module LCBO
  module CrawlKit
    # User agent string, resolved once when this file is loaded (unless the
    # constant already exists). The first truthy value wins, in this order:
    # LCBO.config[:user_agent], the LCBO_USER_AGENT environment variable,
    # Typhoeus::USER_AGENT.
    USER_AGENT ||= (
      LCBO.config[:user_agent] || ENV['LCBO_USER_AGENT'] || Typhoeus::USER_AGENT
    )
  end
end
16
+
17
+ require 'lcbo/crawlkit/eventable'
18
+ require 'lcbo/crawlkit/fastdate_helper'
19
+ require 'lcbo/crawlkit/page'
20
+ require 'lcbo/crawlkit/request'
21
+ require 'lcbo/crawlkit/response'
22
+ require 'lcbo/crawlkit/request_prototype'
23
+ require 'lcbo/crawlkit/titlecase_helper'
24
+ require 'lcbo/crawlkit/volume_helper'
@@ -0,0 +1,56 @@
1
module LCBO
  module CrawlKit
    # Mixin that lets a class register instance methods as callbacks for
    # named events (Class.on) and run them later on an instance (#fire).
    module Eventable

      # Event names accepted by #fire. This is an Array of Strings so that
      # membership checks match whole names only; a plain %[...] String would
      # accept any substring such as "request" or "parse".
      EVENT_TYPES = %w[
        before_request
        after_request
        before_parse
        after_parse ].freeze

      def self.included(mod)
        mod.extend(ClassMethods)
      end

      # Calls, in registration order, every callback registered on this
      # object's class for event_type.
      #
      # Raises ArgumentError (from Callback#is_for?) when callbacks exist and
      # event_type is not one of EVENT_TYPES.
      def fire(event_type)
        self.class.callbacks.
          select { |callback| callback.is_for?(event_type) }.
          each { |callback| callback.call_on(self) }
      end

      module ClassMethods
        # Registers each of method_names as a callback for event_type.
        def on(event_type, *method_names)
          @callbacks ||= []
          method_names.each do |method_name|
            @callbacks << Callback.new(event_type, method_name)
          end
        end

        # Returns the callbacks registered on this class, or an empty Array
        # when none have been registered.
        def callbacks
          @callbacks || []
        end
      end

      # Pairs an event type with the name of the method to invoke for it.
      class Callback
        attr_reader :event_type, :method_name

        def initialize(event_type, method_name)
          @event_type = event_type.to_sym
          @method_name = method_name.to_sym
        end

        # Returns true when this callback was registered for event_sym.
        #
        # Raises ArgumentError when event_sym is not listed in EVENT_TYPES.
        def is_for?(event_sym)
          unless EVENT_TYPES.include?(event_sym.to_s)
            raise ArgumentError, "event_type: #{event_sym} is not valid"
          end
          event_type == event_sym.to_sym
        end

        # Invokes the registered method on object.
        def call_on(object)
          object.send(method_name)
        end
      end

    end
  end
end
@@ -0,0 +1,40 @@
1
module LCBO
  module CrawlKit
    # Converts dates written like "Jan 5, 2010" into "YYYY-MM-DD" strings by
    # plain string manipulation.
    class FastDateHelper

      # Abbreviated English month name => zero-padded month number.
      MONTH_NAMES_TO_NUMBERS = {
        'Jan' => '01',
        'Feb' => '02',
        'Mar' => '03',
        'Apr' => '04',
        'May' => '05',
        'Jun' => '06',
        'Jul' => '07',
        'Aug' => '08',
        'Sep' => '09',
        'Oct' => '10',
        'Nov' => '11',
        'Dec' => '12' }.freeze

      attr_reader :input

      def initialize(input_string)
        @input = input_string
      end

      # Shorthand for new(input_string).as_sql_date.
      def self.[](input_string)
        new(input_string).as_sql_date
      end

      # Returns the input as "year-month-day" with a two-digit month and a
      # day left-padded to two digits, or nil when the input is nil, the
      # month name is not recognised, or the day or year is missing.
      def as_sql_date
        return nil unless input
        # Commas are dropped, then the first three whitespace-separated
        # tokens are read as month name, day and year.
        month_name, day, year = input.gsub(',', '').split
        month = MONTH_NAMES_TO_NUMBERS[month_name]
        # An incomplete date returns nil instead of raising on a nil day or
        # producing a string with an empty year.
        return nil unless month && day && year
        "#{year}-#{month}-#{day.rjust(2, '0')}"
      end

    end
  end
end
@@ -0,0 +1,141 @@
1
module LCBO
  module CrawlKit
    # Mixin for page classes. Including it gives the class a request
    # prototype, a list of emitted fields, the Eventable callbacks and the
    # request/parse/process lifecycle defined below.
    module Page

      class Error < StandardError; end
      class MalformedDocumentError < Error; end
      class MissingResourceError < Error; end
      class RequestFailedError < Error; end

      # Prepares the including class: mixes in Eventable, adds the readers,
      # gives the class its own RequestPrototype and empty field list, then
      # extends it with ClassMethods.
      def self.included(mod)
        mod.send(:include, Eventable)
        mod.send(:attr_reader, :html, :query_params, :body_params, :response)
        mod.instance_variable_set(:@request_prototype, RequestPrototype.new)
        mod.instance_variable_set(:@fields, [])
        mod.extend(ClassMethods)
      end

      module ClassMethods
        # With a truthy value, stores it as the prototype's URI template;
        # otherwise returns the current template.
        def uri(value = nil)
          return @request_prototype.uri_template unless value
          @request_prototype.uri_template = value
        end

        # With a truthy value, stores it as the prototype's body params;
        # otherwise returns the current body params.
        def default_body_params(value = nil)
          return @request_prototype.body_params unless value
          @request_prototype.body_params = value
        end

        # With a truthy value, stores it as the prototype's HTTP method;
        # otherwise returns the current HTTP method.
        def http_method(value = nil)
          return @request_prototype.http_method unless value
          @request_prototype.http_method = value
        end

        # Adds field to the emitted fields. When a block is given, also
        # defines an instance method named after the field that evaluates
        # the block on the instance, passing the field name to it.
        def emits(field, &block)
          fields.push(field.to_sym)
          return unless block
          define_method(field) { instance_exec(field, &block) }
        end

        # Builds a page from the params and performs its request.
        def request(query_params = {}, body_params = {})
          page = new(query_params, body_params)
          page.request
        end

        # Builds a page from an existing response and parses it.
        def parse(response)
          page = new(nil, nil, response)
          page.parse
        end

        # Builds a page from the params, then requests and parses it.
        def process(query_params = {}, body_params = {})
          page = new(query_params, body_params)
          page.process
        end

        def fields
          @fields
        end

        def request_prototype
          @request_prototype
        end
      end

      # When a response is given, the params and HTML are taken from it (a
      # Hash is first wrapped in Response); otherwise only the supplied
      # params are stored.
      def initialize(query_params = {}, body_params = {}, response = nil)
        unless response
          @query_params = query_params
          @body_params  = body_params
          return
        end
        @response     = response.is_a?(Hash) ? Response.new(response) : response
        @query_params = @response.query_params
        @body_params  = @response.body_params
        @html         = @response.body
      end

      # Looks a single emitted field up in #as_hash.
      def [](field)
        key = field.to_sym
        as_hash[key]
      end

      def request_prototype
        self.class.request_prototype
      end

      def fields
        self.class.fields
      end

      def http_method
        self.class.http_method
      end

      # Runs #request and then #parse; always returns self.
      def process
        request
        parse
        self
      end

      # Performs the request through the class's request prototype and keeps
      # the response and its body, firing the request events around it.
      # Returns self, or nil when HTML is already present.
      def request
        unless @html
          fire(:before_request)
          @response = request_prototype.request(query_params, body_params)
          @html = @response.body
          fire(:after_request)
          self
        end
      end

      # Builds the Nokogiri document from the HTML, firing the parse events
      # around it. Returns self, or nil when the page is already parsed or
      # has no HTML.
      def parse
        return nil if is_parsed? || !@html
        fire(:before_parse)
        @doc = Nokogiri::HTML(@html)
        fire(:after_parse)
        self
      end

      def is_parsed?
        !!doc
      end

      # Returns a memoized Hash mapping every emitted field to the result of
      # calling the instance method of the same name.
      def as_hash
        @as_hash ||= fields.inject({}) do |collected, field|
          collected[field] = send(field)
          collected
        end
      end

      protected

      def doc
        @doc
      end

    end
  end
end