bazaar_sources 0.2.1.1.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,127 @@
1
+ module ExternalUrl
2
+ require 'net/http'
3
+ require 'uri'
4
+
5
# Headers sent with every outbound request. A browser-like User-Agent is used
# because it helps some merchants feel more comfortable with our bot.
# Frozen so the shared constant cannot be mutated by accident.
REQUEST_HEADERS = {
  'User-Agent' => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11'
}.freeze
8
+
9
# Reduces an HTML page to (roughly) its text content: HTML comments, <style>
# blocks, tags and character entities are stripped and every run of
# whitespace is collapsed to a single whitespace character.
#
# Note: this method is only suitable for the Validator. The original source
# also marked it "No longer used", so it may have no remaining callers.
#
# page - String containing the HTML of a page.
#
# Returns a new String; +page+ itself is not modified.
def self.just_page_content(page)
  comments = /(<!--.*?-->)(.{1,40})/m
  nostyle = /<style.*?<\/style>/im
  notags = /<.*?>/im
  noentities = /&.*?;/
  noextrawhitespace = /(\s)+/im

  # Remove comments, unless inside of JavaScript (because frequently
  # JavaScript has good matches for model numbers, etc.). Each match covers
  # the comment plus up to 40 following characters; the comment is kept only
  # when a closing script tag shows up in that trailing text.
  without_comments = page.gsub(comments) do
    comment = Regexp.last_match(1)
    trailing = Regexp.last_match(2)
    trailing =~ /<\/script/ ? comment + trailing : trailing
  end

  without_comments.
    gsub(nostyle, ' ').
    gsub(notags, '').
    gsub(noentities, ' ').
    gsub(noextrawhitespace, '\1')
end
29
+
30
# Fetches +url+ with an HTTP GET, following redirects.
#
# url   - absolute URL (String, or anything whose to_s is the URL).
# limit - how many more requests may be made; each redirect hop calls this
#         method again with limit - 1, and a limit of 0 fails immediately.
# debug - when true, progress is printed and a successful page is also
#         written to disk through ExternalUrl.to_file.
#
# Returns a Hash that always carries :success and :final_uri (the URL of the
# last request, e.g. after redirects). When :success is true, :response holds
# the Net::HTTP response (thus response.body). When it is false, :message
# describes the problem and :response is the HTTP response if one was
# received, otherwise nil.
def self.fetch_response(url, limit = 10, debug = false)
  if limit == 0
    return {:success => false, :response => nil, :message => "Redirected too many times", :final_uri => url}
  end

  message = self.invalid_uri(url)
  if message
    return {:success => false, :response => nil, :message => message, :final_uri => url}
  end

  uri = URI.safe_parse(url.to_s)

  # Connect to the port named in the URL (not always 80) and speak TLS for
  # https URLs; otherwise https pages and non-default ports are unreachable.
  http_request = Net::HTTP.new(uri.host, uri.port)
  http_request.use_ssl = true if uri.scheme == 'https'
  puts "http request: #{http_request.inspect}" if debug

  # request_uri is the path plus query string ("/" when the path is empty),
  # which is what belongs on the request line; the fragment is not sent.
  request_path = uri.request_uri
  puts "http request to: #{request_path}" if debug

  # Adding user agent header helps some merchants feel more comfortable with our bot
  response = http_request.get(request_path, REQUEST_HEADERS)
  puts "http response: #{response.inspect}" if debug

  case response
  when Net::HTTPSuccess
    if debug
      puts "Success, final url: #{url}"
      ExternalUrl.to_file(url, response.body, "html")
      ExternalUrl.to_file(url, ExternalUrl.just_page_content(response.body), "txt")
    end
    {:success => true, :response => response, :final_uri => url}
  when Net::HTTPRedirection
    redirect_url = to_absolute_url(response['location'], url)
    puts "Redirecting to #{redirect_url}" if debug
    self.fetch_response(redirect_url, limit - 1, debug)
  else
    {:success => false, :response => response,
     :message => "Unexpected response: #{response.code} #{response.message}", :final_uri => url}
  end
rescue StandardError, Timeout::Error => exp
  # Timeout::Error is listed explicitly because on old Rubies it is not a
  # StandardError. Signals and exit requests are deliberately not swallowed.
  {:success => false, :response => nil, :message => exp.message, :final_uri => url}
end
78
+
79
+ private
80
+
81
# Debug helper: writes +content+ to a file in the current working directory
# and prints where it went.
#
# url     - URL the content came from; its host, path and query form the file
#           name, with every run of non-word characters replaced by "-".
# content - String to write.
# ext     - file extension without the dot (default "html").
#
# Returns nil.
def self.to_file(url, content, ext = "html")
  uri = URI.safe_parse(url.to_s)
  filename_base = "#{uri.host}.#{uri.path}?#{uri.query}"
  filename = filename_base.gsub(/(\W)+/,"-") + ".#{ext}"
  # Block form closes the file even if the write raises.
  File.open(filename, "w") { |file| file.write(content) }
  puts "Wrote content to file #{filename} in #{Dir.pwd}"
end
90
+
91
# Checks whether +url+ can be used as a complete URI.
#
# url - String to check.
#
# Returns nil if the URL is acceptable, else a String message describing the
# problem. Scheme, host and path must all be non-nil.
# NOTE(review): Ruby's URI usually reports a missing path as "" rather than
# nil, so a URL without a path presumably still passes - confirm whether a
# path is really meant to be required.
def self.invalid_uri(url)
  return 'No URL' if url.nil? || url.empty?

  begin
    uri = URI.safe_parse(url)
    return "URL is not well formed" if uri.nil?

    if uri.scheme.nil? || uri.host.nil? || uri.path.nil?
      scheme = uri.scheme.nil? ? "missing" : uri.scheme
      host = uri.host.nil? ? "missing" : uri.host
      path = uri.path.nil? ? "missing" : uri.path
      return "URL incomplete: scheme is #{scheme}, host is #{host} and path is #{path}"
    end
  rescue StandardError => exp
    # Parse failures are reported as a message; signals and exit requests
    # are not intercepted.
    return "URI improperly formed: #{exp.message}"
  end

  nil
end
108
+
109
# Resolves +url+ (for example a redirect Location header) against the URL of
# the page it came from.
#
# url         - String or URI, absolute or relative.
# current_url - String or URI of the current page, used as the base.
#
# Returns the absolute URL as a String. An already absolute +url+ is returned
# as is; so is a relative +url+ when +current_url+ is itself relative, since
# there is nothing to resolve it against.
def self.to_absolute_url(url, current_url)
  url = URI.safe_parse(url) unless url.is_a? URI
  return url.to_s unless url.relative?

  current_url = URI.safe_parse(current_url) unless current_url.is_a? URI
  return url.to_s unless current_url.absolute?

  # URI#merge performs standard reference resolution, so paths without a
  # leading slash, query-only references and scheme-relative URLs are handled
  # correctly, and a non-default port of the base is kept.
  current_url.merge(url).to_s
end
127
+ end
@@ -0,0 +1,14 @@
1
+ require 'nokogiri'
2
+
3
# HTTParty parser subclass that turns XML and HTML response bodies into
# Nokogiri documents.
# NOTE(review): +body+ is not defined in this file; it presumably comes from
# HTTParty::Parser - confirm against the HTTParty version in use.
class HttpartyNokogiriParser < HTTParty::Parser

  protected

  # Parses the response body as XML with Nokogiri and returns the result.
  def xml
    Nokogiri::XML(body)
  end

  # Parses the response body as HTML with Nokogiri and returns the result.
  def html
    Nokogiri::HTML(body)
  end
end
@@ -0,0 +1,174 @@
1
require 'cgi'
require 'ostruct'
require 'hpricot'
require 'api_helpers/external_url'
4
+
5
+ module ResellerRatingsAPI
6
# Extracts the alternative merchant code from a ResellerRatings merchant page
# URL, i.e. the text following "resellerratings.com/store/" up to the next
# "/", "?" or "#".
#
# merchant_source_page_url - String URL of the merchant page.
#
# Returns the code as a String (possibly empty), or nil when the URL does not
# contain the store path.
def self.alt_code_from_merchant_source_page_url(merchant_source_page_url)
  store_match = merchant_source_page_url.match(/resellerratings\.com\/store\/([^\/\?#]*)/)
  store_match.nil? ? nil : store_match[1]
end
13
+
14
# Runs a ResellerRatings keyword search and returns merchant suggestions.
#
# search_text - String keyword(s) to search for.
# limit       - maximum number of results taken from a search results page.
#
# Returns an Array of Hashes with :merchant_page_url, :merchant_code and
# :merchant_name. The Array is empty when the page could not be fetched or
# the final URL is neither the search results page nor a /store/ page.
# NOTE(review): when the search lands directly on a merchant page,
# :merchant_page_url is the parsed URI object, whereas results taken from the
# search page carry the link's href attribute - confirm callers accept both.
def self.fetch_suggestions(search_text, limit)
  # CGI.escape encodes every reserved character (including "&", "=" and "?"),
  # so the search text cannot break out of the keyword_search parameter.
  rr_search_url = "http://www.resellerratings.com/reseller_list.pl?keyword_search=#{CGI.escape(search_text)}"
  doc_and_final_uri = open_doc(rr_search_url)
  if doc_and_final_uri.nil? || doc_and_final_uri[:final_uri].nil? || doc_and_final_uri[:final_uri].empty?
    return []
  end

  final_uri = URI.safe_parse(doc_and_final_uri[:final_uri])
  if final_uri.path == '/reseller_list.pl'
    # got the search results page with more than one result
    convert_search_results_page_to_merchant_array(doc_and_final_uri[:doc], limit)
  elsif (store_match = final_uri.path.match(/^\/store\/(.+)$/))
    # got merchant page back
    merchant_code = store_match[1]
    [{ :merchant_page_url => final_uri, :merchant_code => merchant_code, :merchant_name => merchant_code.gsub('_', ' ') }]
  else
    # don't know where we ended up
    []
  end
end
32
+
33
# Searches ResellerRatings for merchants matching +search_text+.
#
# search_text - String keyword(s) to search for.
# limit       - maximum number of entries collected from a search results
#               page (default 15).
#
# Returns an Array. When the search lands directly on a merchant page it
# holds that page's converted merchant source (or is empty if the page could
# not be converted). Otherwise it holds one entry per result row: the
# existing MerchantSource record when one is found for the row's alt code,
# else an OpenStruct with :source, :name and :alt_code.
def self.search_for_merchant_source(search_text, limit=15)
  merchant_sources = []
  # CGI.escape encodes every reserved character (including "&", "=" and "?"),
  # so the search text cannot break out of the keyword_search parameter.
  rr_search_url = "http://www.resellerratings.com/reseller_list.pl?keyword_search=#{CGI.escape(search_text)}"
  doc_and_final_uri = open_doc(rr_search_url)
  return merchant_sources if doc_and_final_uri.nil?

  if merchant_page_url?(doc_and_final_uri[:final_uri])
    # The conversion yields nil when no merchant code is found on the page;
    # keep such a nil out of the result list.
    merchant_source = convert_merchant_page_to_merchant_source(doc_and_final_uri)
    merchant_sources << merchant_source unless merchant_source.nil?
  else
    doc_and_final_uri[:doc].search('tr/td/font/a[text() = "Read Reviews"]/../../..').each do |result_row|
      element = result_row.at('td//a')
      next if element.nil?

      # Skip rows whose link does not point at a /store/ page instead of
      # failing on the missing match.
      store_match = element.attributes['href'].to_s.match(/\/store\/(.+)$/)
      next if store_match.nil?

      name = element.inner_text.strip
      alt_merchant_code = store_match[1]
      existing_merchant_source = MerchantSource.find_by_source_and_alt_code(Source.reseller_ratings_source, alt_merchant_code)
      if existing_merchant_source.nil?
        merchant_sources << OpenStruct.new({:source => Source.reseller_ratings_source, :name => name, :alt_code => alt_merchant_code})
      else
        merchant_sources << existing_merchant_source
      end
      break if merchant_sources.length >= limit
    end
  end
  merchant_sources
end
57
+
58
# Runs a ResellerRatings keyword search and converts whatever page the search
# ends up on into a merchant source.
#
# search_text - String keyword(s) to search for.
#
# Returns the result of fetch_merchant_source for the search URL.
def self.search_for_merchant_source_best_match(search_text)
  # CGI.escape encodes every reserved character (including "&", "=" and "?"),
  # so the search text cannot break out of the keyword_search parameter.
  rr_search_url = "http://www.resellerratings.com/reseller_list.pl?keyword_search=#{CGI.escape(search_text)}"
  fetch_merchant_source(rr_search_url)
end
62
+
63
# Fetches a merchant page and converts it into a merchant source.
#
# merchant_source_page_url - String URL of the page to fetch.
#
# Returns the converted merchant source, or nil when the page could not be
# opened.
def self.fetch_merchant_source(merchant_source_page_url)
  page = open_doc(merchant_source_page_url)
  return nil if page.nil?

  convert_merchant_page_to_merchant_source(page)
end
71
+
72
# Fetches the merchant source for a ResellerRatings alternative merchant code
# by building the store page URL from it.
#
# alt_merchant_code - code as it appears after /store/ in the merchant URL.
#
# Returns the result of fetch_merchant_source for that URL.
def self.fetch_merchant_source_by_alt_merchant_code(alt_merchant_code)
  fetch_merchant_source("http://www.resellerratings.com/store/#{alt_merchant_code}")
end
76
+
77
# Tells whether +url+ looks like a merchant page URL, i.e. contains "/store/"
# followed by at least one more character.
#
# url - String URL, or nil.
#
# Returns true or false (false for nil).
def self.merchant_page_url?(url)
  return false if url.nil?

  !url.match(/\/store\/.+$/).nil?
end
80
+
81
+ private
82
+
83
# Scrapes a fetched ResellerRatings merchant page into an OpenStruct.
#
# doc_and_final_uri - Hash with :doc (the parsed page; must respond to
#                     +search+) and :final_uri (String URL the page was
#                     finally served from), or nil.
#
# Returns nil when the argument is nil or no merchant code can be found on
# the page. Otherwise returns an OpenStruct with :source and :code, plus
# whichever of :alt_code, :name, :homepage, :merchant_rating,
# :num_merchant_reviews, :merchant_rating_lifetime and
# :num_merchant_reviews_lifetime could be located. Ratings are the number in
# front of the "/" multiplied by 10 and rounded; review counts are kept as
# the stripped page text.
def self.convert_merchant_page_to_merchant_source(doc_and_final_uri)
  return nil if doc_and_final_uri.nil?
  merchant_source = OpenStruct.new
  merchant_source.source = Source.reseller_ratings_source
  doc = doc_and_final_uri[:doc]

  # Merchant Code: first run of digits in the "write a review" link.
  elements = doc.search('img[@src="http://images.resellerratings.com/images/write_a_review.gif"]/..')
  unless elements.empty?
    code_match = elements.first[:href].to_s.match(/^.*?([0-9]+).*?$/)
    merchant_source.code = code_match[1] unless code_match.nil?
  end

  # Use a blank 'code' to indicate we didn't find the merchant page
  return nil if merchant_source.code.nil? || merchant_source.code.empty?

  # Alternative Merchant Code
  final_uri = doc_and_final_uri[:final_uri]
  unless final_uri.nil? || final_uri.empty?
    merchant_source.alt_code = alt_code_from_merchant_source_page_url(final_uri)
  end

  # Merchant Name
  name = first_inner_text(doc, 'img[@src="http://images.resellerratings.com/images/small-storefront-rev.gif"]/../..')
  merchant_source.name = name.strip unless name.nil?

  # Merchant Homepage
  homepage = first_inner_text(doc, 'font[text() *= "Homepage:"]/a/font')
  merchant_source.homepage = homepage.strip unless homepage.nil?

  # Merchant Rating
  merchant_rating = scaled_rating(first_inner_text(doc, 'font[text() *= "Six-Month Rating:"]/../font[2]'))
  merchant_source.merchant_rating = merchant_rating unless merchant_rating.nil?

  # Num Merchant Reviews
  num_merchant_reviews = first_inner_text(doc, 'font[text() *= "Six-Month Reviews:"]/../../td[2]')
  merchant_source.num_merchant_reviews = num_merchant_reviews.strip unless num_merchant_reviews.nil?

  # Merchant Rating Lifetime
  merchant_rating_lifetime = scaled_rating(first_inner_text(doc, 'font[text() *= "Lifetime Rating:"]/../font[2]'))
  merchant_source.merchant_rating_lifetime = merchant_rating_lifetime unless merchant_rating_lifetime.nil?

  # Num Merchant Reviews Lifetime
  num_merchant_reviews_lifetime = first_inner_text(doc, "font[text() *= 'Lifetime\nReviews:']/../../td[2]")
  merchant_source.num_merchant_reviews_lifetime = num_merchant_reviews_lifetime.strip unless num_merchant_reviews_lifetime.nil?

  merchant_source
end

# Returns the unstripped inner text of the first element matching +selector+
# in +doc+, or nil when nothing matches.
def self.first_inner_text(doc, selector)
  elements = doc.search(selector)
  elements.empty? ? nil : elements.first.inner_text
end

# Converts rating text such as " 9.54 / 10" into an Integer (the number
# before the slash times 10, rounded). Returns nil when +text+ is nil or has
# no slash, instead of failing on the missing match.
def self.scaled_rating(text)
  return nil if text.nil?

  rating_match = text.match(/\s*(.*?)\s*\/.*?/)
  rating_match.nil? ? nil : (rating_match[1].to_f * 10.0).round
end
150
+
151
# Turns a ResellerRatings search results page into a list of merchants.
#
# search_results_doc - parsed results page; must respond to +search+.
# limit              - maximum number of merchants to return.
#
# Returns an Array of Hashes with :merchant_page_url (the link's href),
# :merchant_code (the text after /store/ in the href, or nil when the href
# has no such part) and :merchant_name (the stripped link text).
def self.convert_search_results_page_to_merchant_array(search_results_doc, limit)
  merchant_links = search_results_doc.search('tr[/td/font/a/font/b[text() = "Store Name"]]../tr/td/a')
  merchants = []
  merchant_links.each_with_index do |merchant_link, index|
    break if index > limit-1

    store_match = merchant_link.attributes['href'].match(/^.*\/store\/(.+)$/)
    merchants << {
      :merchant_page_url => merchant_link.attributes['href'],
      :merchant_code => store_match.nil? ? nil : store_match[1],
      :merchant_name => merchant_link.inner_text.strip
    }
  end
  merchants
end
163
+
164
# Fetches +url+ through ExternalUrl.fetch_response and parses the body with
# Hpricot.
#
# url - String URL to fetch.
#
# Returns a Hash with :doc (the Hpricot document) and :final_uri (the
# :final_uri reported by the fetch), or nil when the fetch did not succeed.
def self.open_doc(url)
  fetched = ExternalUrl.fetch_response(url)
  return nil unless fetched[:success]

  {:doc => Hpricot(fetched[:response].body), :final_uri => fetched[:final_uri]}
end
174
+ end
@@ -0,0 +1,224 @@
1
+ require File.join(File.dirname(__FILE__), 'api_helper')
2
+ require 'rubygems'
3
+ gem 'httparty', '>= 0.5.0'
4
+ require 'httparty'
5
+ require File.join(File.dirname(__FILE__), 'httparty_nokogiri_parser')
6
+
7
+ module Shopping
8
# Error built from an exception entry of a Shopping.com API response; carries
# the numeric code alongside the message.
class Error < StandardError
  # Code passed to the constructor.
  attr_reader :code

  # message - String error message.
  # code    - error code to expose through #code.
  def initialize(message, code)
    @code = code
    super(message)
  end
end
15
+
16
+ class Publisher
17
+ include ApiHelper
18
+ include HTTParty
19
+ parser HttpartyNokogiriParser
20
+ format :xml
21
+ base_uri 'http://publisher.api.shopping.com/publisher/3.0/rest/'
22
+ default_params 'trackingId' => '3068547', 'apiKey' => SHOPPING_API_KEY
23
+
24
+ MAX_OFFERS = 20
25
+ PUTS_API_URL = false
26
+
27
# Looks up a single product through the GeneralSearch API call.
#
# product_id     - String product ID (surrounding whitespace is stripped).
# include_specs  - when true, product specifications are requested.
# include_offers - when true, up to MAX_OFFERS offers are requested and
#                  converted along with the product.
#
# Returns the result of convert_product_node for the first product node in
# the response.
def fetch_product(product_id, include_specs=false, include_offers=false)
  # Won't get pseudo-redirected to new product ID unless we request at least one offer (very strange)
  num_items = include_offers ? MAX_OFFERS : 1
  query = {
    'numItems' => num_items,
    'productId' => product_id.strip,
    'showProductSpecs' => include_specs ? 'true' : 'false'
  }
  call_api('/GeneralSearch', {:query => query}) do |doc|
    convert_product_node(doc.at('GeneralSearchResponse/categories/category/items/product'), include_offers)
  end
end
35
+
36
# Fetches the offers for a product through the GeneralSearch API call,
# requesting offers only and at most MAX_OFFERS of them.
#
# product_id - String product ID (surrounding whitespace is stripped).
#
# Returns the result of convert_offers_collection_node for the items node of
# the response.
def fetch_offers(product_id)
  query = {
    'numItems' => MAX_OFFERS,
    'showOffersOnly' => 'true',
    'productId' => product_id.strip
  }
  call_api('/GeneralSearch', {:query => query}) do |doc|
    convert_offers_collection_node(doc.at('GeneralSearchResponse/categories/category/items'))
  end
end
43
+
44
# Searches for products by keyword through the GeneralSearch API call, with
# product offers and skipping switched off.
#
# keyword     - String search text (surrounding whitespace is stripped).
# max_results - number of items to request (default 10).
#
# Returns an Array with one convert_product_node result per product node in
# the response.
def search_for_product(keyword, max_results=10)
  query = {
    'doSkipping' => 'false',
    'showProductOffers' => 'false',
    'numAttributes' => 0,
    'numItems' => max_results,
    'keyword' => keyword.strip
  }
  call_api('/GeneralSearch', {:query => query}) do |doc|
    doc.search('GeneralSearchResponse/categories/category/items/product').collect do |product_node|
      convert_product_node(product_node)
    end
  end
end
51
+
52
+ protected
53
+
54
# Performs a GET against the API and hands the response document to the
# caller's block, unless the response reports errors.
#
# path    - request path relative to the class's base URI.
# options - request options Hash passed to the class-level +get+
#           (e.g. :query).
#
# Yields the response document and returns the block's value when get_errors
# finds no errors; otherwise passes the errors to raise_exception.
def call_api(path, options, &block)
  if PUTS_API_URL
    # Debug aid: print the full request URL, default options included.
    merged_options = self.class.default_options.dup.merge(options)
    puts "Shopping.com API URL: #{HTTParty::Request.new(Net::HTTP::Get, path, merged_options).uri}"
  end

  doc = self.class.get(path, options)
  errors = get_errors(doc)
  return raise_exception(errors) unless errors.empty?

  yield doc
end
67
+
68
# Collects the exceptions reported in an API response.
#
# doc - response document; must respond to +search+.
#
# Returns an Array with one Shopping::Error (message plus integer code) per
# GenericResponse/exceptions/exception node; empty when there are none.
def get_errors(doc)
  # Iterate over the search result explicitly: a block handed straight to
  # +search+ is not guaranteed to be called for each node, which would leave
  # the list permanently empty.
  doc.search('GenericResponse/exceptions/exception').collect do |exception_node|
    message = exception_node.at('message').text
    code = exception_node.at('code').text.to_i
    Shopping::Error.new(message, code)
  end
end
77
+
78
# Raises the first of the collected API errors.
#
# errors - Array of exception objects, expected to be non-empty.
#
# Raises the first element of +errors+.
def raise_exception(errors)
  first_error = errors.first
  raise first_error
end
81
+
82
# Converts a product node of an API response into a Hash.
#
# product_node   - product node; must respond to [], +at+ and +search+.
# include_offers - when true, the product's offers are converted as well.
#
# Returns a Hash with :product_id, :name, :description (full description,
# else short description, else ''), :images (:small_image, :medium_image and
# :large_image, each a Hash of :width, :height and :url, or nil),
# :num_reviews, :rating, and, when the corresponding nodes are present,
# :offers and :specifications.
def convert_product_node(product_node, include_offers=false)
  product = {}
  product[:product_id] = product_node['id']
  product[:name] = product_node.at('name').text

  # Take the first description that is present and non-empty. Either node may
  # be missing from the response, so the node is checked before its text.
  description = ''
  ['fullDescription', 'shortDescription'].each do |description_tag|
    description_node = product_node.at(description_tag)
    description_text = description_node.nil? ? nil : description_node.text
    unless description_text.nil? || description_text.empty?
      description = description_text
      break
    end
  end
  product[:description] = description

  # Available images ordered by area, smallest first.
  image_nodes = product_node.search('images/image[@available="true"]')
  images = image_nodes.collect{|x|
    {
      :width => x['width'].to_i,
      :height => x['height'].to_i,
      :url => x.at('sourceURL').text
    }
  }.sort_by{|x| x[:width] * x[:height] }

  product[:images] = {
    :small_image => images[0],
    :medium_image => images[1],
    :large_image => images[2]
  }

  # possible_manufacturers = (product / 'offer > manufacturer').collect{|x| x.text}.compact.uniq
  #
  # if possible_manufacturers.length == 1
  #   product[:manufacturer] = possible_manufacturers.first # easy peasy lemon squeezy
  # elsif possible_manufacturers.length > 1
  #   # figure out which manufacturer is the most popular
  #   manufacturers_popularity_index = possible_manufacturers.inject({}) {|ha, manufacturer| ha[manufacturer] ||= 0; ha[manufacturer] += 1; ha }
  #   product[:manufacturer] = manufacturers_popularity_index.sort_by{|key, val| val }.last.first
  # else
  #   product[:manufacturer] = nil # zip, zero, doodad :(
  # end

  # rating
  review_count_node = product_node.at('rating/reviewCount')
  product[:num_reviews] = review_count_node.nil? ? 0 : review_count_node.text.to_i
  rating_value_node = product_node.at('rating/rating')
  product[:rating] = rating_value_node.nil? ? nil : normalize_product_rating(rating_value_node.text.to_f)

  # offers
  if include_offers
    offers_node = product_node.at('offers')
    product[:offers] = convert_offers_collection_node(offers_node) unless offers_node.nil?
  end

  # specifications
  specifications_node = product_node.at('specifications')
  product[:specifications] = convert_specifications_node(specifications_node) unless specifications_node.nil?

  product
end
138
+
139
# Converts the offer nodes under +offers_collection_node+ into offer Hashes.
#
# Offers whose stock status is 'out-of-stock' or 'back-order' are ignored.
# Only one offer is kept per store: a later offer replaces an earlier one
# from the same store unless the earlier one has a strictly lower total
# (price plus shipping, unknown shipping counted as 0).
#
# offers_collection_node - node containing 'offer' nodes, or nil.
#
# Returns an Array of Hashes sorted by price plus shipping, each with
# :original_index, :merchant_code, :merchant_name, :merchant_logo_url, :cpc,
# :price, :shipping, :offer_url, :offer_tier, :merchant_rating and
# :num_merchant_reviews. Returns [] when the node is nil.
def convert_offers_collection_node(offers_collection_node)
  return [] if offers_collection_node.nil?
  offer_nodes = offers_collection_node.search('offer')
  return [] if offer_nodes.nil?

  offers = {}
  offer_nodes.each_with_index do |offer, offer_index|
    # in-stock
    stock_status = offer.at('stockStatus').text
    next if stock_status == 'out-of-stock' || stock_status == 'back-order'

    store = offer.at('store')
    store_hash = {
      :id => store['id'],
      :name => store.at('name').text,
      :trusted => store['trusted'] == "true",
      :authorized_reseller => store['authorizedReseller'] == 'true'
    }

    # The logo is only recorded when the API flags it as available.
    store_logo = store.at('logo')
    store_hash[:logo] =
      if store_logo['available'] == 'true'
        {
          :width => store_logo['width'],
          :height => store_logo['height'],
          :url => store_logo.at('sourceURL').text
        }
      else
        nil
      end

    # store rating
    store_rating = store.at('ratingInfo')
    rating_node = store_rating.at('rating')
    review_url_node = store_rating.at('reviewURL')
    store_hash[:rating] = {
      :number => rating_node.nil? ? nil : normalize_merchant_rating(rating_node.text.to_f),
      :count => store_rating.at('reviewCount').text.to_i,
      :url => review_url_node.nil? ? nil : review_url_node.text
    }

    # prices
    cpc_node = offer.at('cpc')
    cpc = cpc_node.nil? ? nil : (cpc_node.text.to_f*100).to_i
    base_price = to_d_or_nil(offer.at('basePrice').text)
    shipping_node = offer.at('shippingCost')
    # A 'checkSite' shipping cost means the amount is not given here.
    shipping_cost = shipping_node['checkSite'] == 'true' ? nil : to_d_or_nil(shipping_node.text)

    # skip this offer if we already have one from same merchant and it has a lower total price
    existing_offer = offers[store_hash[:id]]
    if !existing_offer.nil? &&
       existing_offer[:price] + (existing_offer[:shipping] || 0.0) < base_price + (shipping_cost || 0.0)
      next
    end

    logo = store_hash[:logo]
    offers[store_hash[:id]] = {
      :original_index => offer_index,
      :merchant_code => store_hash[:id],
      :merchant_name => store_hash[:name],
      :merchant_logo_url => logo.nil? ? nil : logo[:url],
      :cpc => cpc,
      :price => base_price,
      :shipping => shipping_cost,
      :offer_url => offer.at('offerURL').text,
      :offer_tier => 1,
      :merchant_rating => store_hash[:rating][:number],
      :num_merchant_reviews => store_hash[:rating][:count]
    }
  end

  offers.values.sort_by{|x| x[:price] + (x[:shipping] || 0) }
end
201
+
202
# Converts a specifications node into a Hash keyed by feature name.
#
# specifications_node - node containing 'feature' nodes, each with a 'name'
#                       node and zero or more 'value' nodes.
#
# Returns a Hash mapping each feature name to its value text when the feature
# has exactly one value, or to an Array of value texts when it has several.
# Features without values are left out.
def convert_specifications_node(specifications_node)
  specifications = {}
  specifications_node.search('feature').each do |feature_node|
    feature_name = feature_node.at('name').text
    value_nodes = feature_node.search('value')
    value_count = value_nodes.length
    next if value_count < 1

    specifications[feature_name] =
      if value_count == 1
        value_nodes.first.text
      else
        value_nodes.collect{|value_node| value_node.text}
      end
  end
  specifications
end
215
+
216
# Scales a product rating by 20 and rounds it to an Integer
# (presumably mapping a 0-5 rating onto 0-100 - confirm against the API).
#
# product_rating - Numeric rating, or nil.
#
# Returns the rounded Integer, or nil when given nil.
def normalize_product_rating(product_rating)
  return nil if product_rating.nil?

  (product_rating * 20.0).round
end
219
+
220
# Scales a merchant rating by 20 and rounds it to an Integer
# (presumably mapping a 0-5 rating onto 0-100 - confirm against the API).
#
# merchant_rating - Numeric rating, or nil.
#
# Returns the rounded Integer, or nil when given nil.
def normalize_merchant_rating(merchant_rating)
  return nil if merchant_rating.nil?

  (merchant_rating * 20.0).round
end
223
+ end
224
+ end