amazon_deets 0.0.3 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/amazon_deets.rb +1 -119
- data/lib/amazon_deets/core.rb +85 -0
- data/lib/amazon_deets/factories.rb +18 -0
- data/lib/amazon_deets/general_merchandise.rb +95 -0
- data/lib/amazon_deets/kindle.rb +90 -0
- metadata +6 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f4e516ca707d2e7250a0b203395e778984ac40bc
|
4
|
+
data.tar.gz: da8ad327292bdd9fee2c5dcd1a0135ea1a1c970d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 96d5ce3798cefd24d568dbea661d92c77cbabc68bef6272e3b9e54406fce8d4cf4f4faaf8283e85a259fe9303aea263176e9c92b66c3b23db5245ff5339a5498
|
7
|
+
data.tar.gz: 35996047d38fcb28a35a8e1029f7889d10d8a1b0723694ce5902ab9f90c297f7f6b68026b997b361f8a175d7d77684496573b97437edc4cfa98e203931bd214a
|
data/lib/amazon_deets.rb
CHANGED
@@ -1,120 +1,2 @@
|
|
1
|
-
require 'logbert'
|
2
|
-
require 'mechanize'
|
3
|
-
|
4
|
-
module AmazonDeets
|
5
|
-
|
6
|
-
class Grabber
|
7
|
-
LOG = Logbert[self]
|
8
|
-
|
9
|
-
RatingRegex = /(.+)\s+out\sof/
|
10
|
-
ReviewsRegex = /(\d+)/
|
11
|
-
|
12
|
-
attr_accessor :agent
|
13
|
-
|
14
|
-
def initialize(agent: Mechanize.new)
|
15
|
-
@agent = agent
|
16
|
-
end
|
17
|
-
|
18
|
-
def title
|
19
|
-
result = agent.page.search("//h1[@id='title']").first
|
20
|
-
if result
|
21
|
-
return result.text.strip
|
22
|
-
end
|
23
|
-
|
24
|
-
result = agent.page.search("span#btAsinTitle").first
|
25
|
-
if result
|
26
|
-
return result.text.strip
|
27
|
-
end
|
28
|
-
|
29
|
-
return nil
|
30
|
-
end
|
31
|
-
|
32
|
-
|
33
|
-
def url
|
34
|
-
agent.page.uri.to_s
|
35
|
-
end
|
36
|
-
|
37
|
-
|
38
|
-
def list_price
|
39
|
-
lp_element = agent.page.search("//span[@id='priceblock_ourprice']").first
|
40
|
-
if lp_element.nil?
|
41
|
-
lp_element = agent.page.search("//td[text()='Price:']/following-sibling::td")
|
42
|
-
end
|
43
|
-
|
44
|
-
if lp_element
|
45
|
-
return lp_element.text.gsub(/[^.\d]/, "")
|
46
|
-
else
|
47
|
-
return nil
|
48
|
-
end
|
49
|
-
|
50
|
-
end
|
51
|
-
|
52
|
-
def current_price
|
53
|
-
current_price_element = agent.page.search("//span[@id='priceblock_saleprice']").first
|
54
|
-
if current_price_element
|
55
|
-
return current_price_element.text
|
56
|
-
else
|
57
|
-
LOG.debug "Looks like no sale is going on. Returning list price"
|
58
|
-
return list_price
|
59
|
-
end
|
60
|
-
end
|
61
|
-
|
62
|
-
|
63
|
-
def rating_text
|
64
|
-
result = agent.page.search("//div[@id='averageCustomerReviews']//span[@title]").first
|
65
|
-
if result
|
66
|
-
return result[:title]
|
67
|
-
end
|
68
|
-
|
69
|
-
result = agent.page.search("div.acrRating").first
|
70
|
-
if result
|
71
|
-
return result.text
|
72
|
-
end
|
73
|
-
|
74
|
-
return nil
|
75
|
-
end
|
76
|
-
|
77
|
-
def rating
|
78
|
-
text = rating_text
|
79
|
-
if text
|
80
|
-
m = RatingRegex.match(text)
|
81
|
-
if m and m[1]
|
82
|
-
return m[1].to_f
|
83
|
-
end
|
84
|
-
end
|
85
|
-
|
86
|
-
return nil
|
87
|
-
end
|
88
|
-
|
89
|
-
def reviews
|
90
|
-
reviews_element = agent.page.search("//div[@id='summaryStars']/a")
|
91
|
-
if reviews_element
|
92
|
-
text = reviews_element.text.gsub(/[^\d]/, "")
|
93
|
-
|
94
|
-
return text.to_i unless text.empty?
|
95
|
-
end
|
96
|
-
return nil
|
97
|
-
end
|
98
|
-
|
99
|
-
|
100
|
-
def details_hash
|
101
|
-
return {
|
102
|
-
title: title,
|
103
|
-
url: url,
|
104
|
-
list_price: list_price,
|
105
|
-
current_price: current_price,
|
106
|
-
rating: rating,
|
107
|
-
reviews: reviews
|
108
|
-
}
|
109
|
-
end
|
110
|
-
|
111
|
-
|
112
|
-
def grab(url)
|
113
|
-
agent.get(url)
|
114
|
-
details_hash
|
115
|
-
end
|
116
|
-
|
117
|
-
end
|
118
|
-
|
119
|
-
end
|
120
1
|
|
2
|
+
require 'amazon_deets/factories'
|
@@ -0,0 +1,85 @@
|
|
1
|
+
|
2
|
+
require 'logbert'
|
3
|
+
require 'mechanize'
|
4
|
+
|
5
|
+
|
6
|
+
module AmazonDeets
|
7
|
+
|
8
|
+
# Basic interface for the scrapers. Point it to
|
9
|
+
# a URL, and it does the scrape. BOOM!
|
10
|
+
class AbstractScraper
|
11
|
+
|
12
|
+
def scrape(url)
|
13
|
+
raise NotImplementedError
|
14
|
+
end
|
15
|
+
|
16
|
+
end
|
17
|
+
|
18
|
+
|
19
|
+
|
20
|
+
class MechanizedScraper < AbstractScraper
|
21
|
+
|
22
|
+
attr_accessor :agent
|
23
|
+
attr_accessor :fragments
|
24
|
+
|
25
|
+
def initialize(agent: Mechanize.new, fragments: Array.new)
|
26
|
+
@agent = agent
|
27
|
+
@fragments = fragments
|
28
|
+
end
|
29
|
+
|
30
|
+
def scrape(url)
|
31
|
+
agent.get(url)
|
32
|
+
fragments.each do |f|
|
33
|
+
if f.applicable?(agent)
|
34
|
+
return f.scrape(agent)
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
end
|
40
|
+
|
41
|
+
|
42
|
+
# Amazon renders different HTML depending upon
|
43
|
+
# the type of product that you are viewing. This
|
44
|
+
# means that the scraper queries need to change
|
45
|
+
# depending upon whether you want the data for a
|
46
|
+
# Kindle book or some general merchandise. Rather
|
47
|
+
# than building one super-complicated scraper, we'll
|
48
|
+
# break the code into multiple simple scrapers that
|
49
|
+
# focus on solving specific problems.
|
50
|
+
#
|
51
|
+
class MechanizedFragment
|
52
|
+
|
53
|
+
# Decides whether or not this MechanizedFragment
|
54
|
+
# is applicable
|
55
|
+
def applicable?(agent)
|
56
|
+
raise NotImplementedError
|
57
|
+
end
|
58
|
+
|
59
|
+
def scrape(agent)
|
60
|
+
raise NotImplementedError
|
61
|
+
end
|
62
|
+
|
63
|
+
end
|
64
|
+
|
65
|
+
|
66
|
+
# A MechanizedContext is similar to a scraper, but it
|
67
|
+
# assumes that the @agent has already navigated to
|
68
|
+
# the URL that is going to be scraped.
|
69
|
+
class MechanizedContext
|
70
|
+
|
71
|
+
attr_accessor :agent
|
72
|
+
|
73
|
+
def initialize(agent: Mechanize.new)
|
74
|
+
@agent = agent
|
75
|
+
end
|
76
|
+
|
77
|
+
|
78
|
+
def scrape
|
79
|
+
raise NotImplementedError
|
80
|
+
end
|
81
|
+
|
82
|
+
end
|
83
|
+
|
84
|
+
end
|
85
|
+
|
@@ -0,0 +1,18 @@
|
|
1
|
+
|
2
|
+
require 'amazon_deets/general_merchandise'
|
3
|
+
require 'amazon_deets/kindle'
|
4
|
+
|
5
|
+
module AmazonDeets
|
6
|
+
|
7
|
+
def self.create_scraper(agent: Mechanize.new)
|
8
|
+
MechanizedScraper.new(
|
9
|
+
agent: agent,
|
10
|
+
fragments: [
|
11
|
+
KindleFragment.new,
|
12
|
+
GeneralMerchandiseFragment.new
|
13
|
+
]
|
14
|
+
)
|
15
|
+
end
|
16
|
+
|
17
|
+
end
|
18
|
+
|
@@ -0,0 +1,95 @@
|
|
1
|
+
|
2
|
+
require 'logbert'
|
3
|
+
require 'mechanize'
|
4
|
+
|
5
|
+
require 'amazon_deets/core'
|
6
|
+
|
7
|
+
module AmazonDeets
|
8
|
+
|
9
|
+
class GeneralMerchandiseFragment < MechanizedFragment
|
10
|
+
|
11
|
+
def applicable?(agent)
|
12
|
+
agent.page.search("h1#title").any?
|
13
|
+
end
|
14
|
+
|
15
|
+
def scrape(agent)
|
16
|
+
context = Context.new(agent: agent)
|
17
|
+
return context.scrape
|
18
|
+
end
|
19
|
+
|
20
|
+
|
21
|
+
class Context < MechanizedContext
|
22
|
+
LOG = Logbert[self]
|
23
|
+
|
24
|
+
RatingRegex = /(.+)\s+out\sof/
|
25
|
+
|
26
|
+
def title
|
27
|
+
result = agent.page.search("//h1[@id='title']").first
|
28
|
+
if result
|
29
|
+
return result.text.strip
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
def url
|
34
|
+
agent.page.uri.to_s
|
35
|
+
end
|
36
|
+
|
37
|
+
def list_price
|
38
|
+
lp_element = agent.page.search("//span[@id='priceblock_ourprice']").first
|
39
|
+
if lp_element.nil?
|
40
|
+
lp_element = agent.page.search("//td[text()='Price:']/following-sibling::td")
|
41
|
+
end
|
42
|
+
|
43
|
+
if lp_element
|
44
|
+
return lp_element.text.gsub(/[^.\d]/, "")
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
def current_price
|
49
|
+
cp_element = agent.page.search("//span[@id='priceblock_saleprice']").first
|
50
|
+
if cp_element
|
51
|
+
return cp_element.text
|
52
|
+
else
|
53
|
+
LOG.debug "Looks like no sale is going on. Returning list price"
|
54
|
+
return list_price
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
def rating
|
59
|
+
result = agent.page.search("//div[@id='averageCustomerReviews']//span[@title]").first
|
60
|
+
if result
|
61
|
+
m = RatingRegex.match result[:title]
|
62
|
+
if m and m[1]
|
63
|
+
return m[1]
|
64
|
+
end
|
65
|
+
end
|
66
|
+
end
|
67
|
+
|
68
|
+
def reviews
|
69
|
+
reviews_element = agent.page.search("//div[@id='averageCustomerReviews']//a[contains(text(), 'reviews')]")
|
70
|
+
if reviews_element
|
71
|
+
text = reviews_element.text.gsub(/[^\d]/, "")
|
72
|
+
return text.to_i unless text.empty?
|
73
|
+
else
|
74
|
+
LOG.warning "Reviews element could not be found"
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
|
79
|
+
def scrape
|
80
|
+
return {
|
81
|
+
title: title,
|
82
|
+
url: url,
|
83
|
+
list_price: list_price,
|
84
|
+
current_price: current_price,
|
85
|
+
rating: rating,
|
86
|
+
reviews: reviews
|
87
|
+
}
|
88
|
+
end
|
89
|
+
|
90
|
+
end
|
91
|
+
|
92
|
+
end
|
93
|
+
|
94
|
+
end
|
95
|
+
|
@@ -0,0 +1,90 @@
|
|
1
|
+
|
2
|
+
require 'logbert'
|
3
|
+
require 'mechanize'
|
4
|
+
|
5
|
+
require 'amazon_deets/core'
|
6
|
+
|
7
|
+
module AmazonDeets
|
8
|
+
|
9
|
+
class KindleFragment < MechanizedFragment
|
10
|
+
|
11
|
+
def applicable?(agent)
|
12
|
+
agent.page.search("div.kindleBanner").any?
|
13
|
+
end
|
14
|
+
|
15
|
+
def scrape(agent)
|
16
|
+
context = Context.new(agent: agent)
|
17
|
+
return context.scrape
|
18
|
+
end
|
19
|
+
|
20
|
+
|
21
|
+
class Context < MechanizedContext
|
22
|
+
LOG = Logbert[self]
|
23
|
+
|
24
|
+
RatingRegex = /(.+)\s+out\sof/
|
25
|
+
|
26
|
+
def title
|
27
|
+
result = agent.page.search("span#btAsinTitle").first
|
28
|
+
if result
|
29
|
+
return result.text.strip
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
def url
|
34
|
+
agent.page.uri.to_s
|
35
|
+
end
|
36
|
+
|
37
|
+
def list_price
|
38
|
+
lp_element = agent.page.search("td.listPrice").first
|
39
|
+
if lp_element
|
40
|
+
return lp_element.text.gsub(/[^.\d]/, "")
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
def current_price
|
45
|
+
cp_element = agent.page.search("td b.priceLarge").first
|
46
|
+
if cp_element
|
47
|
+
return cp_element.text.gsub(/[^.\d]/, "")
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
def rating
|
52
|
+
result = agent.page.search("span.crAvgStars span[title$='5 stars']").first
|
53
|
+
if result
|
54
|
+
m = RatingRegex.match result[:title]
|
55
|
+
LOG.info result[:title]
|
56
|
+
if m and m[1]
|
57
|
+
return m[1]
|
58
|
+
end
|
59
|
+
else
|
60
|
+
LOG.warning "Unable to locate rating element"
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
def reviews
|
65
|
+
reviews_element = agent.page.search("//span[@class='crAvgStars']/a[contains(text(), 'reviews')]")
|
66
|
+
if reviews_element
|
67
|
+
text = reviews_element.text.gsub(/[^\d]/, "")
|
68
|
+
return text.to_i unless text.empty?
|
69
|
+
else
|
70
|
+
LOG.warning "Reviews element could not be found"
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
74
|
+
def scrape
|
75
|
+
return {
|
76
|
+
title: title,
|
77
|
+
url: url,
|
78
|
+
list_price: list_price,
|
79
|
+
current_price: current_price,
|
80
|
+
rating: rating,
|
81
|
+
reviews: reviews
|
82
|
+
}
|
83
|
+
end
|
84
|
+
|
85
|
+
end
|
86
|
+
|
87
|
+
end
|
88
|
+
|
89
|
+
end
|
90
|
+
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: amazon_deets
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Brian Lauber
|
@@ -44,8 +44,12 @@ executables: []
|
|
44
44
|
extensions: []
|
45
45
|
extra_rdoc_files: []
|
46
46
|
files:
|
47
|
+
- lib/amazon_deets/core.rb
|
48
|
+
- lib/amazon_deets/factories.rb
|
49
|
+
- lib/amazon_deets/general_merchandise.rb
|
50
|
+
- lib/amazon_deets/kindle.rb
|
47
51
|
- lib/amazon_deets.rb
|
48
|
-
homepage:
|
52
|
+
homepage: https://github.com/briandamaged/amazon_deets
|
49
53
|
licenses:
|
50
54
|
- MIT
|
51
55
|
metadata: {}
|