bazaar_sources 0.2.1.1.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +7 -0
- data/init.rb +1 -0
- data/lib/api_helpers/amazon.rb +606 -0
- data/lib/api_helpers/api_helper.rb +9 -0
- data/lib/api_helpers/external_url.rb +127 -0
- data/lib/api_helpers/httparty_nokogiri_parser.rb +14 -0
- data/lib/api_helpers/reseller_ratings_api.rb +174 -0
- data/lib/api_helpers/shopping.rb +224 -0
- data/lib/api_helpers/shopping_bulk_api.rb +514 -0
- data/lib/api_helpers/shopzilla_api.rb +230 -0
- data/lib/bazaar_sources.rb +35 -0
- data/lib/sources/amazon_source.rb +94 -0
- data/lib/sources/buy_source.rb +34 -0
- data/lib/sources/ebay_source.rb +35 -0
- data/lib/sources/epinions_source.rb +80 -0
- data/lib/sources/google_source.rb +119 -0
- data/lib/sources/price_grabber_source.rb +94 -0
- data/lib/sources/reseller_ratings_source.rb +47 -0
- data/lib/sources/shopping_source.rb +136 -0
- data/lib/sources/shopzilla_source.rb +108 -0
- data/lib/sources/simple_sources.yml +71 -0
- data/lib/sources/source.rb +242 -0
- metadata +137 -0
data/README.rdoc
ADDED
data/init.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require 'bazaar_sources'
|
@@ -0,0 +1,606 @@
|
|
1
|
+
require 'hpricot'
require 'open-uri'
require 'cgi'
require 'digest/sha2'
require 'openssl'
|
5
|
+
|
6
|
+
module Amazon
|
7
|
+
# Error carrying the ASIN whose lookup failed. In this file it is raised by
# scrape_offer_listing_page when the offer listing request fails with a
# message starting with "404".
class AsinNotFoundError < StandardError
  # Returns the ASIN the failed lookup was made for.
  def asin
    @asin
  end

  # message - error text, passed through to StandardError.
  # asin    - the ASIN that could not be found.
  def initialize(message, asin)
    @asin = asin
    super(message)
  end
end
|
14
|
+
|
15
|
+
# Error carrying the ASIN whose lookup failed fatally. In this file it is
# raised by scrape_offer_listing_page when the offer listing request fails
# with Net::HTTPFatalError.
class AsinFatalError < StandardError
  # Returns the ASIN the failed lookup was made for.
  def asin
    @asin
  end

  # message - error text, passed through to StandardError.
  # asin    - the ASIN being fetched when the failure happened.
  def initialize(message, asin)
    @asin = asin
    super(message)
  end
end
|
22
|
+
|
23
|
+
class ProductAdvertising
|
24
|
+
# Returns the associate tag appended to outgoing Amazon links. The value is
# the AMAZON_ASSOCIATE_TAG constant, which is not defined in this file.
def associate_tag
  AMAZON_ASSOCIATE_TAG
end
|
27
|
+
|
28
|
+
# Builds the URL of a seller's "at a glance" page.
#
# seller_id - seller identifier, interpolated into the query string as-is
#             (no escaping is applied).
def at_a_glance_url(seller_id)
  page = 'http://www.amazon.com/gp/help/seller/at-a-glance.html'
  "#{page}?seller=#{seller_id}"
end
|
31
|
+
|
32
|
+
# Builds the tagged product URL for one merchant's offer.
#
# asin          - product ASIN.
# merchant_type - 'seller' selects the "seller" query parameter; any other
#                 value selects "m".
# merchant_id   - id placed in that parameter.
#
# The associate tag is appended as the "tag" parameter.
def offer_url(asin, merchant_type, merchant_id)
  id_param = merchant_type == 'seller' ? 'seller' : 'm'
  "http://www.amazon.com/exec/obidos/ASIN/#{asin}/?#{id_param}=#{merchant_id}&tag=#{associate_tag}"
end
|
35
|
+
|
36
|
+
# Builds the URL of the offer listing page for an ASIN, restricted to
# condition=new.
def offer_listing_url(asin)
  listing = "http://www.amazon.com/gp/offer-listing/#{asin}"
  "#{listing}?condition=new"
end
|
39
|
+
|
40
|
+
# Builds a tagged redirect URL that lands on the accessories section of a
# product page.
#
# The accessories page URL is CGI-escaped and passed as the "location"
# parameter of Amazon's redirect endpoint, together with the associate tag.
def accessories_url(asin)
  destination = "http://www.amazon.com/dp/accessories/#{asin}/#accessories"
  query = "ie=UTF8&tag=#{associate_tag}&linkCode=ur2&camp=1789&creative=9325&location=#{CGI::escape(destination)}"
  "http://www.amazon.com/gp/redirect.html?#{query}"
end
|
44
|
+
|
45
|
+
# Returns the offers for an ASIN as a hash keyed by merchant code.
#
# asin                    - product ASIN.
# featured_merchants_only - passed through to the scraper.
#
# Offers come from scraping the offer listing page; the API-based variant
# (find_offers_by_asin_via_api) is not called here.
def find_offers_by_asin(asin, featured_merchants_only=false)
  offers_by_merchant = scrape_offer_listing_page_to_hash(asin, featured_merchants_only)
  offers_by_merchant
end
|
49
|
+
|
50
|
+
# Requests the 'Reviews' response group for an ASIN and returns whatever
# make_amazon_api_request_raw returns (the unparsed response).
#
# asin - ASIN string; surrounding whitespace is stripped.
def find_product_review_info_by_asin_raw(asin)
  lookup = {}
  lookup['Operation'] = 'ItemLookup'
  lookup['ResponseGroup'] = 'Reviews'
  lookup['ItemId'] = asin.strip
  lookup['IdType'] = 'ASIN'
  make_amazon_api_request_raw(lookup)
end
|
57
|
+
|
58
|
+
# Looks up one product through the API (ItemLookup, 'Medium' response group)
# and flattens the response into a plain hash.
#
# asin - ASIN string; surrounding whitespace is stripped before the lookup.
#
# Returns a hash with :asin, :name, :list_price (ListPrice Amount / 100.0, or
# 0 when no amount is present), :model, :mpn, :upc, :manufacturer,
# :small_image / :medium_image / :large_image (each a hash of :url, :width
# and :height strings, or nil when that image element is absent), :features
# (array of strings) and :editorial_reviews (hash of source => content).
def find_product_by_asin(asin)
  request = {'Operation' => 'ItemLookup',
             'ResponseGroup' => 'Medium',
             'ItemId' => asin.strip,
             'IdType' => 'ASIN'}
  res = make_amazon_api_request request

  item = res / 'Items' / 'Item'
  item_attributes = item / 'ItemAttributes'

  list_price = (item_attributes / 'ListPrice' / 'Amount').inner_html
  if list_price.nil? || list_price.empty?
    list_price = 0
  else
    list_price = list_price.to_f / 100.0 # Amount is divided by 100 (e.g. 1999 -> 19.99)
  end

  # One entry per EditorialReview element. Searching each review on its own
  # keeps several reviews from being concatenated into a single key/value.
  editorial_reviews = {}
  (item / 'EditorialReviews' / 'EditorialReview').each do |review|
    editorial_reviews[(review / 'Source').inner_html] = (review / 'Content').inner_html
  end

  # Builds the :url/:width/:height hash for an image element, or nil when the
  # element is missing (checked explicitly instead of rescuing the error).
  image_for = lambda do |tag|
    node = item.at(tag)
    if node.nil?
      nil
    else
      {:url => (node / 'URL').inner_html,
       :width => (node / 'Width').inner_html,
       :height => (node / 'Height').inner_html}
    end
  end

  {:asin => (item / 'ASIN').inner_html,
   :name => (item_attributes / 'Title').inner_html,
   :list_price => list_price,
   :model => (item_attributes / 'Model').inner_html,
   :mpn => (item_attributes / 'MPN').inner_html,
   :upc => (item_attributes / 'UPC').inner_html,
   :manufacturer => (item_attributes / 'Manufacturer').inner_html,
   :small_image => image_for.call('SmallImage'),
   :medium_image => image_for.call('MediumImage'),
   :large_image => image_for.call('LargeImage'),
   :features => (item_attributes / 'Feature').map { |feature| feature.inner_html }, # specifications
   :editorial_reviews => editorial_reviews}
end
|
120
|
+
|
121
|
+
# Runs a keyword ItemSearch across all search indexes.
#
# search_terms - value sent as the 'Keywords' parameter.
#
# Returns an array of hashes with :asin, :name and :small_image_url (an empty
# string when the item has no SmallImage element). An item whose parsing
# raises is skipped.
def item_search(search_terms)
  res = make_amazon_api_request({'Operation' => 'ItemSearch',
                                 'Keywords' => search_terms,
                                 'SearchIndex' => 'All',
                                 'ResponseGroup' => 'Images,ItemAttributes'})
  products = []
  (res / 'Items' / 'Item').each do |item|
    begin
      thumbnail = item.at('SmallImage')
      thumbnail_url = thumbnail.nil? ? '' : (thumbnail / 'URL').inner_html
      products << {
        :asin => (item / 'ASIN').inner_html,
        :name => (item / 'ItemAttributes' / 'Title').inner_html,
        :small_image_url => thumbnail_url
      }
    rescue
      # best effort: skip this item and carry on with the rest
    end
  end
  products
end
|
147
|
+
|
148
|
+
# Looks up a seller through the API (SellerLookup) and supplements the result
# with details scraped from the seller's "at a glance" page.
#
# seller_id - id sent as the 'SellerId' parameter.
#
# Returns a hash with :seller_id, :merchant_name, :glance_page_url,
# :average_feedback_rating (Float, 0.0 when absent), :total_feedback
# (Integer, 0 when absent), :logo_url and :homepage. The name is taken from
# SellerName, then Nickname, then the scraped page, and finally falls back to
# "Amazon merchant (<seller_id>)". Any error while scraping is swallowed, in
# which case :logo_url and :homepage are nil.
def seller_lookup(seller_id)
  res = make_amazon_api_request({ 'Operation' => 'SellerLookup',
                                  'SellerId' => seller_id })
  seller_path = '/SellerLookupResponse/Sellers/Seller'

  name_node = res.at("#{seller_path}/SellerName")
  name_node = res.at("#{seller_path}/Nickname") if name_node.nil?
  merchant_name = name_node.nil? ? nil : name_node.inner_text

  logo_url = nil
  homepage = nil
  begin
    details = scrape_at_a_glance_page(seller_id)
    logo_url = details[:logo_url]
    merchant_name = details[:merchant_name] if merchant_name.nil? || merchant_name.empty?
    homepage = details[:homepage]
  rescue
    # scraping is best effort; keep whatever the API provided
  end

  merchant_name = "Amazon merchant (#{seller_id})" if merchant_name.nil? || merchant_name.empty?

  glance_node = res.at("#{seller_path}/GlancePage")
  rating_node = res.at("#{seller_path}/AverageFeedbackRating")
  feedback_node = res.at("#{seller_path}/TotalFeedback")

  { :seller_id => seller_id,
    :merchant_name => merchant_name,
    :glance_page_url => (glance_node.nil? ? nil : glance_node.inner_text),
    :average_feedback_rating => (rating_node.nil? ? 0.0 : rating_node.inner_text.to_f),
    :total_feedback => (feedback_node.nil? ? 0 : feedback_node.inner_text.to_i),
    :logo_url => logo_url,
    :homepage => homepage }
end
|
189
|
+
|
190
|
+
private
|
191
|
+
|
192
|
+
# Collects the "New" offers for an ASIN through the API (ItemLookup with the
# 'Large,OfferFull' response group), walking every offer page.
#
# asin                    - ASIN string; a stripped copy is used, the
#                           caller's string is left untouched.
# featured_merchants_only - when true only 'Featured' merchants are
#                           requested, otherwise 'All'.
#
# Returns a hash keyed by merchant/seller id. Each value holds
# :merchant_code, :merchant_name, :merchant_logo_url (nil), :cpc (nil),
# :price (BigDecimal), :shipping (nil), :offer_url, :offer_tier (2 for
# sellers, 1 for merchants) and :merchant_type. When the same id shows up
# more than once, the lowest price wins. Offers whose Quantity is 0 are
# skipped; offers without a Quantity element are kept.
def find_offers_by_asin_via_api(asin, featured_merchants_only=false)
  asin = asin.strip # non-destructive: safe for frozen strings
  request = {'Operation' => 'ItemLookup',
             'ResponseGroup' => 'Large,OfferFull',
             'ItemId' => asin,
             'IdType' => 'ASIN',
             'MerchantId' => featured_merchants_only ? 'Featured' : 'All',
             'Condition' => 'New',
             'OfferPage' => 1}
  req = make_amazon_api_request request
  offers = {}

  total_offer_pages = (req / 'Items' / 'Offers' / 'TotalOfferPages').inner_html.to_i

  # enumerate through all the offer pages
  1.upto(total_offer_pages) do |page|
    # page 1 was already fetched above; only later pages need a new request
    if page != 1
      request['OfferPage'] = page
      req = make_amazon_api_request request
    end

    # loop through all the offers
    (req / 'Items' / 'Offers' / 'Offer').each do |offer|
      # find either the seller id or the merchant id
      id = (offer / 'Merchant' / 'MerchantId').inner_html
      if id.nil? || id.empty?
        id = (offer / 'Seller' / 'SellerId').inner_html
        name = (offer / 'Seller' / 'Nickname').inner_html
        type = 'seller'
      else
        name = (offer / 'Merchant' / 'Name').inner_html
        type = 'merchant'
      end

      listing = offer / 'OfferListing'
      price_nodes = listing / 'SalePrice' # sometimes we get a SalePrice
      price_nodes = listing / 'Price' unless price_nodes.size > 0 # most of the time just Price
      unformatted_price = (price_nodes / 'Amount').inner_html
      formatted_price = (price_nodes / 'FormattedPrice').inner_html

      if formatted_price == 'Too low to display'
        offer_listing_id = (listing / 'OfferListingId').inner_html
        unformatted_price, formatted_price = reveal_too_low_to_display_price_from_offer_listing_id(offer_listing_id)
      end

      # A search result is always truthy, so test for presence explicitly;
      # a missing Quantity element means "unknown", not zero.
      quantity_nodes = listing / 'Quantity'
      quantity = quantity_nodes.size > 0 ? quantity_nodes.inner_html.to_i : nil

      if !unformatted_price.nil? && !unformatted_price.empty?
        # convert 21995 to 219.95 without going through Float
        price = BigDecimal(unformatted_price.to_i.to_s) / 100
      elsif !formatted_price.nil? && !formatted_price.empty? # sometimes we only get a formatted price and no amount
        price = BigDecimal(formatted_price.gsub(/[$,]/, '').to_f.to_s)
      else
        price = BigDecimal('0') # should never get here.
      end

      next unless quantity.nil? || quantity > 0

      # only a lower price may override an entry we already have
      if !offers[id] || offers[id][:price] > price
        offers[id] = { :merchant_code => id,
                       :merchant_name => CGI::unescapeHTML(name),
                       :merchant_logo_url => nil,
                       :cpc => nil,
                       :price => price,
                       :shipping => nil,
                       :offer_url => offer_url(asin, type, id),
                       :offer_tier => type == 'seller' ? 2 : 1,
                       :merchant_type => type }
      end
    end
  end
  offers
end
|
279
|
+
|
280
|
+
# Scrapes the offer listing page and indexes the offers by :merchant_code.
# When several offers share a merchant code, the last one scraped wins.
def scrape_offer_listing_page_to_hash(asin, featured_merchants_only=false)
  scraped = scrape_offer_listing_page(asin, featured_merchants_only)
  scraped.inject({}) do |by_merchant, offer|
    by_merchant[offer[:merchant_code]] = offer
    by_merchant
  end
end
|
288
|
+
|
289
|
+
# Fetches the offer listing page for an ASIN and parses its offer tables.
#
# asin                    - product ASIN.
# featured_merchants_only - when true, only tables whose text includes
#                           'Featured Merchants' are parsed; otherwise tables
#                           whose text includes 'New' are parsed as well.
#
# Returns an array of offer hashes (see parse_offer_listing_rows); an empty
# array when the page has no 'div.resultsset' element.
#
# Raises Amazon::AsinNotFoundError when the request fails with a
# Net::HTTPServerException whose message starts with "404" (other
# Net::HTTPServerException errors are re-raised unchanged), and
# Amazon::AsinFatalError on Net::HTTPFatalError.
def scrape_offer_listing_page(asin, featured_merchants_only=false)
  begin
    url = offer_listing_url(asin)
    doc = scrape_page(url, Source.amazon_source.offer_ttl_seconds / 2, 'offer-listing')
  rescue Net::HTTPServerException => ex
    raise Amazon::AsinNotFoundError.new(ex.message, asin) if ex.message =~ /^404/
    raise ex
  rescue Net::HTTPFatalError => ex
    raise Amazon::AsinFatalError.new(ex.message, asin)
  end

  offers = []
  offers_box_element = doc.at('div.resultsset')
  # No results container on the page: report no offers instead of failing
  # with NoMethodError on nil.
  return offers if offers_box_element.nil?

  offers_box_element.search('table').each do |offer_type_header_table|
    inner_text = offer_type_header_table.inner_text
    if inner_text.include?('Featured Merchants')
      featured_offer_rows = offer_type_header_table.search('tbody.result/tr')
      offers += parse_offer_listing_rows(asin, featured_offer_rows, true)
    elsif !featured_merchants_only && inner_text.include?('New')
      other_offer_rows = offer_type_header_table.search('tbody.result/tr')
      # offset keeps :original_index increasing across tables
      offers += parse_offer_listing_rows(asin, other_offer_rows, false, offers.length)
    end
  end
  offers
end
|
334
|
+
|
335
|
+
# Turns the rows of one offer table into offer hashes.
#
# asin                - ASIN the rows belong to (used for URLs and messages).
# offer_listing_rows  - row elements of the table.
# featured_merchants  - true when the rows come from the featured merchants
#                       table; sets :offer_tier to 1 instead of 3.
# offer_index_offset  - added to each row's index to form :original_index.
#
# Returns an array of hashes with :original_index, :merchant_code,
# :merchant_name, :merchant_logo_url, :cpc, :price, :shipping, :offer_url,
# :offer_tier, :merchant_rating, :num_merchant_reviews and :merchant_type.
# Rows are skipped (with a message on stdout where the original code printed
# one) when no price or seller id can be found, when the row has no seller
# information cell, or when the offer is judged out of stock.
def parse_offer_listing_rows(asin, offer_listing_rows, featured_merchants, offer_index_offset=0)
  seller_id_pattern = /seller=([^&#]+)/
  # Extracts the seller id from a link's href; nil when the link is missing
  # or its href carries no seller= parameter (instead of raising).
  seller_id_from = lambda do |link|
    link.nil? ? nil : link.attributes['href'].to_s[seller_id_pattern, 1]
  end

  offers = []
  offer_listing_rows.each_with_index do |row, offer_index|
    # Offer Listing ID
    offer_listing_id = nil
    offer_listing_tag = row.at("td.readytobuy/form/input[@name *= 'offering-id.']")
    unless offer_listing_tag.nil?
      offer_listing_id = offer_listing_tag.attributes['name'].sub('offering-id.', '')
    end

    # Price
    price = nil
    price_element = row.at("span.price")
    price = price_to_f(price_element.inner_text) unless price_element.nil?
    add_to_cart_span = row.at("td/span[text() *= 'Add to cart to see price.']")
    add_to_cart_span = row.at("td/span[text() *= 'Price not displayed.']") if add_to_cart_span.nil?
    if add_to_cart_span && !offer_listing_id.nil? && !offer_listing_id.empty?
      # The helper returns [amount, formatted price]; index 1 is the
      # formatted price (plain Array#[] rather than ActiveSupport's #second).
      price = price_to_f(reveal_too_low_to_display_price_from_offer_listing_id(offer_listing_id)[1])
    end
    if price.nil?
      puts "Failed to find offer price while scraping the offer listing page; ASIN: #{asin}. Skipping."
      next
    end

    # Shipping: an explicit shipping price, 0.0 for super saver, else unknown
    shipping = nil
    shipping_element = row.at("div.shipping_block/span.price_shipping")
    if shipping_element.nil?
      shipping = 0.0 unless row.at("span.supersaver").nil?
    else
      shipping = price_to_f(shipping_element.inner_text)
    end

    # Without the seller information cell nothing below can be determined.
    seller_info = row.at("td[/ul.sellerInformation]")
    next if seller_info.nil?

    # Number of merchant reviews
    num_merchant_reviews = nil
    rating_block = seller_info.at("div.rating")
    if !rating_block.nil? && rating_block.inner_text =~ /\((\d+) ratings\)/
      num_merchant_reviews = $1.to_i
    end

    # Merchant rating: "stars-4-5" in the image src becomes 4.5 * 20 = 90
    merchant_rating = nil
    rating_img = seller_info.at("div.rating/img")
    unless rating_img.nil?
      stars = rating_img.attributes['src'].to_s[/stars\-([\d\-]+)/, 1]
      merchant_rating = (stars.sub(/\-/, '.').to_f * 20).to_i unless stars.nil?
    end

    # Seller ID: try the rating link, then the shipping rates link, then the
    # seller profile link.
    seller_id = seller_id_from.call(seller_info.at("div.rating/a"))
    seller_id ||= seller_id_from.call(seller_info.at("div.availability/a[text() *= 'shipping rates']"))
    seller_id ||= seller_id_from.call(seller_info.at("div.rating//a[text() = 'Seller Profile']"))
    if seller_id.nil?
      puts "Failed to find seller_id while scraping the offer listing page; ASIN: #{asin}, seller info: #{seller_info.inner_html}"
      next
    end

    # Seller's Name & logo URL
    name = nil
    logo_url = nil
    merchant_type = 'merchant'
    seller_label_link = seller_info.at('div.seller/a')
    if seller_label_link.nil?
      seller_logo_img = seller_info.at('a/img')
      seller_logo_img = seller_info.at('img') if seller_logo_img.nil?
      unless seller_logo_img.nil?
        name = safe_strip(seller_logo_img.attributes['alt'])
        logo_url = seller_logo_img.attributes['src']
      end
    else
      name = safe_strip(seller_label_link.inner_text)
      merchant_type = 'seller'
    end

    # Availability: in stock unless the text says otherwise
    in_stock = true
    availability_element = seller_info.at("div.availability")
    unless availability_element.nil?
      availability_info = availability_element.inner_text
      if availability_info.match(/out of stock/i)
        in_stock = false
      elsif availability_info.match(/Usually ships within .+ days/i)
        in_stock = true
      elsif availability_info.match(/Usually ships within .+ months/i)
        in_stock = false
      elsif availability_info.match(/In Stock/i)
        in_stock = true
      end
    end
    next unless in_stock

    offers << { :original_index => offer_index + offer_index_offset,
                :merchant_code => seller_id,
                # name can be nil when neither a label link nor a logo exists
                :merchant_name => name.nil? ? nil : CGI::unescapeHTML(name),
                :merchant_logo_url => logo_url,
                :cpc => Source.amazon_source.cpc,
                :price => BigDecimal(price.to_s),
                :shipping => shipping.nil? ? nil : BigDecimal(shipping.to_s),
                :offer_url => offer_url(asin, merchant_type, seller_id),
                :offer_tier => featured_merchants ? 1 : 3,
                :merchant_rating => merchant_rating,
                :num_merchant_reviews => num_merchant_reviews,
                :merchant_type => merchant_type }
  end
  offers
end
|
465
|
+
|
466
|
+
# reveal a too low to display price by adding it to the cart
|
467
|
+
# returns the amount (in pennies) and the formatted price
|
468
|
+
# Issues a CartCreate request for one unit of the given offer listing and
# reads the price back from the cart's SubTotal.
#
# offer_listing_id - value sent as 'Item.1.OfferListingId'.
#
# Returns a two-element array: [SubTotal Amount, SubTotal FormattedPrice],
# both as strings taken from the response.
def reveal_too_low_to_display_price_from_offer_listing_id(offer_listing_id)
  cart = make_amazon_api_request({'Operation' => 'CartCreate',
                                  'AssociateTag' => AMAZON_ASSOCIATE_TAG,
                                  'Item.1.OfferListingId' => offer_listing_id,
                                  'Item.1.Quantity' => 1})
  subtotal = cart / 'Cart' / 'CartItems' / 'SubTotal'
  amount = (subtotal / 'Amount').inner_html
  display_price = (subtotal / 'FormattedPrice').inner_html
  [amount, display_price]
end
|
478
|
+
|
479
|
+
# Scrapes a seller's "at a glance" page for display details.
#
# seller_id - seller whose page is fetched.
#
# Returns a hash with :merchant_name (stripped text of the description box's
# h1/strong), :logo_url (src of the box's first img) and :homepage (text of
# the matching external link). Each value is nil when its element is missing.
def scrape_at_a_glance_page(seller_id)
  url = at_a_glance_url(seller_id)
  # 10 minutes, written as plain seconds so this does not depend on
  # ActiveSupport's Integer#minutes.
  doc = scrape_page(url, 10 * 60, 'seller')
  merchant_description_box_element = doc.at('//table//tr//td//h1[@class = "sans"]/strong/../..')

  merchant_name = nil
  merchant_logo_url = nil
  unless merchant_description_box_element.nil?
    element = merchant_description_box_element.at('//h1/strong')
    merchant_name = element.inner_text.strip unless element.nil?

    element = merchant_description_box_element.at('//img')
    merchant_logo_url = element.attributes['src'] unless element.nil?
  end

  # the homepage link is the one whose visible text equals its own href
  homepage_link = doc.at('//tr[@class = "tiny"]/td/a[@target = "_blank" and @href = text()]')
  homepage = homepage_link.nil? ? nil : homepage_link.inner_text

  { :merchant_name => merchant_name,
    :logo_url => merchant_logo_url,
    :homepage => homepage }
end
|
499
|
+
|
500
|
+
# Fetches a page and parses it with Hpricot.
#
# url          - address to fetch via do_api_request.
# cache_ttl    - accepted but not used by this implementation.
# context_name - accepted but not used by this implementation.
def scrape_page(url, cache_ttl, context_name=nil)
  Hpricot(do_api_request(url))
end
|
505
|
+
|
506
|
+
# Returns the cache store used for API responses: the CACHE constant when one
# is defined, otherwise nil. A non-nil result is memoized in @cache.
#
# The constant is probed with defined? rather than eval + rescue, so no code
# string is evaluated and unrelated errors are not swallowed.
def cache
  @cache ||= (defined?(CACHE) ? CACHE : nil)
end
|
509
|
+
|
510
|
+
# make any API request given a hash of querystring parameters
|
511
|
+
# Makes an API request from a hash of query string parameters and parses the
# response with Hpricot.XML.
#
# Returns the parsed document, or nil when the raw request yields nothing.
def make_amazon_api_request(user_params)
  xml = make_amazon_api_request_raw(user_params)
  return nil unless xml
  Hpricot.XML(xml)
end
|
515
|
+
|
516
|
+
# make API request, but don't process through Hpricot so the caller can
|
517
|
+
# process (with, say, Nokogiri)
|
518
|
+
# Performs an API request and returns the unparsed response body.
#
# user_params - hash of query string parameters; merged over the defaults
#               (Service, Version, AWSAccessKeyId).
#
# The response is looked up in, and stored to, the cache (when one is
# available) under a key derived from the MD5 of the sorted query string.
# The timestamp and signature are added later by sign_url, so they are not
# part of the key. Entries are stored with the Amazon source's
# offer_ttl_seconds.
def make_amazon_api_request_raw(user_params)
  params = {'Service' => 'AWSECommerceService',
            'Version' => '2007-07-16',
            'AWSAccessKeyId' => AMAZON_ACCESS_KEY_ID}.merge(user_params)

  # hash order is not relied upon: sort by parameter name, then build the
  # query string from the resulting pairs
  pairs = params.sort_by { |name, _value| name }
  query_string = pairs.map { |name, value| "#{name}=#{CGI::escape(CGI::unescape(value.to_s))}" }.join('&')

  key = "amazon-api-#{Digest::MD5.hexdigest(query_string)}-v2"
  result = cache ? cache.get(key) : nil
  return result if result

  # nothing cached: sign the URL, shoot off the request and remember the reply
  url = sign_url('ecs.amazonaws.com', '/onca/xml', params)
  result = do_api_request(url)
  cache.set(key, result, Source.amazon_source.offer_ttl_seconds) if cache
  result
end
|
541
|
+
|
542
|
+
# create the Net::HTTP object to actually do the request
|
543
|
+
# Fetches a URL over HTTP (port 80) and returns the response body.
#
# url         - address to fetch.
# retry_num   - number of attempts already spent (used by the recursion).
# max_retries - attempt budget shared by timeouts, redirects and 503s.
#
# Timeouts are retried, redirects are followed and "service unavailable"
# responses are retried after a two second sleep; each of these consumes one
# attempt. Raises StandardError once the budget is exhausted or when the
# response is of an unknown kind; other client/server error responses raise
# through Net::HTTPResponse#error!.
def do_api_request(url, retry_num=0, max_retries=10)
  if retry_num >= max_retries
    raise StandardError, "Failed to get Amazon URL with after #{max_retries} tries for url: #{url.inspect}"
  end

  req_url = URI.safe_parse(url)
  http = Net::HTTP.new(req_url.host, 80)
  http.read_timeout = 5 # 5 second timeout
  resp = nil
  begin
    http.start do |web|
      resp = web.get("#{req_url.path}?#{req_url.query}")
    end
  rescue Timeout::Error
    # Timed out: try again and hand back the retry's result. Without the
    # return the retried body was discarded and control fell through to the
    # "unknown error" branch below with resp still nil.
    return do_api_request(url, retry_num + 1, max_retries)
  end

  case resp
  when Net::HTTPSuccess
    resp.body
  when Net::HTTPRedirection
    do_api_request(resp['location'], retry_num + 1, max_retries)
  when Net::HTTPServiceUnavailable
    puts "GOT Net::HTTPServiceUnavailable FROM AMAZON; SLEEPING AND TRYING IN TWO SECONDS. RETRY NUM #{retry_num}."
    sleep(2)
    do_api_request(url, retry_num + 1, max_retries)
  when Net::HTTPClientError, Net::HTTPServerError
    puts "GOT #{resp.class.name} FROM AMAZON."
    resp.error!
  else
    raise StandardError, "Failed to get Amazon URL with unknown error: #{resp.inspect} For url: #{url.inspect}"
  end
end
|
582
|
+
|
583
|
+
# Strips surrounding whitespace from value; nil is passed through as nil.
def safe_strip(value)
  return nil if value.nil?
  value.strip
end
|
586
|
+
|
587
|
+
# Converts price text such as "$1,299.99" to a Float.
#
# Every character other than digits and dots is removed first, then the
# first number found in what remains is converted. Returns nil for nil or
# empty input, and nil when no number can be extracted (any StandardError
# raised during extraction is also turned into nil).
def price_to_f(value)
  return nil if value.nil? || value.empty?
  begin
    numeric_text = value.gsub(/[^\d\.]/, '')
    numeric_text.match(/(\d*\.?\d+)/)[1].to_f
  rescue StandardError
    nil
  end
end
|
591
|
+
|
592
|
+
# Builds a signed request URL: the parameters plus a UTC 'Timestamp' are
# sorted by name, escaped, and signed with HMAC-SHA256 using
# AMAZON_SECRET_ACCESS_KEY.
#
# host   - host name, used both in the string to sign and in the URL.
# path   - request path, used both in the string to sign and in the URL.
# params - hash of query parameters; it is not modified (the timestamp is
#          added to a copy).
#
# Returns "http://<host><path>?<sorted params>&Signature=<signature>".
def sign_url(host, path, params)
  # The value is escaped below together with all other parameters.
  timestamp = Time.now.utc.strftime('%Y-%m-%dT%H:%M:%SZ')
  signed_params = params.merge('Timestamp' => timestamp)

  pairs = signed_params.sort_by { |name, _value| name }
  params_string = pairs.map { |name, value| "#{name}=#{CGI::escape(CGI::unescape(value.to_s))}" }.join('&')
  # CGI::escape encodes spaces as '+'; the signed string uses '%20' instead
  params_string = params_string.gsub('+', '%20')

  query = "GET\n#{host}\n#{path}\n#{params_string}"

  # OpenSSL::HMAC replaces Digest::HMAC, which is no longer part of Ruby's
  # standard library.
  hmac = OpenSSL::HMAC.digest(OpenSSL::Digest.new('sha256'), AMAZON_SECRET_ACCESS_KEY, query)
  base64_hmac = [hmac].pack('m0') # Base64 without line breaks
  signature = CGI::escape(base64_hmac)
  "http://#{host}#{path}?#{params_string}&Signature=#{signature}"
end
|
605
|
+
end
|
606
|
+
end
|