amazon_deets 0.0.3 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/amazon_deets.rb +1 -119
- data/lib/amazon_deets/core.rb +85 -0
- data/lib/amazon_deets/factories.rb +18 -0
- data/lib/amazon_deets/general_merchandise.rb +95 -0
- data/lib/amazon_deets/kindle.rb +90 -0
- metadata +6 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f4e516ca707d2e7250a0b203395e778984ac40bc
+  data.tar.gz: da8ad327292bdd9fee2c5dcd1a0135ea1a1c970d
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 96d5ce3798cefd24d568dbea661d92c77cbabc68bef6272e3b9e54406fce8d4cf4f4faaf8283e85a259fe9303aea263176e9c92b66c3b23db5245ff5339a5498
+  data.tar.gz: 35996047d38fcb28a35a8e1029f7889d10d8a1b0723694ce5902ab9f90c297f7f6b68026b997b361f8a175d7d77684496573b97437edc4cfa98e203931bd214a
data/lib/amazon_deets.rb
CHANGED
@@ -1,120 +1,2 @@
-require 'logbert'
-require 'mechanize'
-
-module AmazonDeets
-
-  class Grabber
-    LOG = Logbert[self]
-
-    RatingRegex = /(.+)\s+out\sof/
-    ReviewsRegex = /(\d+)/
-
-    attr_accessor :agent
-
-    def initialize(agent: Mechanize.new)
-      @agent = agent
-    end
-
-    def title
-      result = agent.page.search("//h1[@id='title']").first
-      if result
-        return result.text.strip
-      end
-
-      result = agent.page.search("span#btAsinTitle").first
-      if result
-        return result.text.strip
-      end
-
-      return nil
-    end
-
-
-    def url
-      agent.page.uri.to_s
-    end
-
-
-    def list_price
-      lp_element = agent.page.search("//span[@id='priceblock_ourprice']").first
-      if lp_element.nil?
-        lp_element = agent.page.search("//td[text()='Price:']/following-sibling::td")
-      end
-
-      if lp_element
-        return lp_element.text.gsub(/[^.\d]/, "")
-      else
-        return nil
-      end
-
-    end
-
-    def current_price
-      current_price_element = agent.page.search("//span[@id='priceblock_saleprice']").first
-      if current_price_element
-        return current_price_element.text
-      else
-        LOG.debug "Looks like no sale is going on. Returning list price"
-        return list_price
-      end
-    end
-
-
-    def rating_text
-      result = agent.page.search("//div[@id='averageCustomerReviews']//span[@title]").first
-      if result
-        return result[:title]
-      end
-
-      result = agent.page.search("div.acrRating").first
-      if result
-        return result.text
-      end
-
-      return nil
-    end
-
-    def rating
-      text = rating_text
-      if text
-        m = RatingRegex.match(text)
-        if m and m[1]
-          return m[1].to_f
-        end
-      end
-
-      return nil
-    end
-
-    def reviews
-      reviews_element = agent.page.search("//div[@id='summaryStars']/a")
-      if reviews_element
-        text = reviews_element.text.gsub(/[^\d]/, "")
-
-        return text.to_i unless text.empty?
-      end
-      return nil
-    end
-
-
-    def details_hash
-      return {
-        title: title,
-        url: url,
-        list_price: list_price,
-        current_price: current_price,
-        rating: rating,
-        reviews: reviews
-      }
-    end
-
-
-    def grab(url)
-      agent.get(url)
-      details_hash
-    end
-
-  end
-
-end
 
+require 'amazon_deets/factories'
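The removed Grabber class combined the fetch and the parse in a single object; in 0.2.0, lib/amazon_deets.rb only loads the factories shown further down. A hedged before/after sketch of the call site (the URL is a hypothetical placeholder, not from this diff):

```ruby
require 'amazon_deets'

url = "https://www.amazon.com/dp/SOME-ASIN"  # hypothetical placeholder URL

# 0.0.3: the removed Grabber fetched the page and returned the details hash itself.
# details = AmazonDeets::Grabber.new.grab(url)

# 0.2.0: the equivalent call goes through the factory defined in factories.rb below.
details = AmazonDeets.create_scraper.scrape(url)
```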
data/lib/amazon_deets/core.rb
ADDED
@@ -0,0 +1,85 @@
+
+require 'logbert'
+require 'mechanize'
+
+
+module AmazonDeets
+
+  # Basic interface for the scrapers. Point it to
+  # a URL, and it does the scrape. BOOM!
+  class AbstractScraper
+
+    def scrape(url)
+      raise NotImplementedError
+    end
+
+  end
+
+
+
+  class MechanizedScraper < AbstractScraper
+
+    attr_accessor :agent
+    attr_accessor :fragments
+
+    def initialize(agent: Mechanize.new, fragments: Array.new)
+      @agent = agent
+      @fragments = fragments
+    end
+
+    def scrape(url)
+      agent.get(url)
+      fragments.each do |f|
+        if f.applicable?(agent)
+          return f.scrape(agent)
+        end
+      end
+    end
+
+  end
+
+
+  # Amazon renders different HTML dependending upon
+  # the type of product that you are viewing. This
+  # means that the scraper queries need to change
+  # depending upon whether you want the data for a
+  # Kindle book or some general merchandise. Rather
+  # than building one super-complicated scraper, we'll
+  # break the code into multiple simple scrapers that
+  # focus on solving specific problems.
+  #
+  class MechanizedFragment
+
+    # Decides whether or not this MechanizedFragment
+    # is applicable
+    def applicable?(agent)
+      raise NotImplementedError
+    end
+
+    def scrape(agent)
+      raise NotImplementedError
+    end
+
+  end
+
+
+  # A MechanizedContext is similar to a scraper, but it
+  # assumes that the @agent has already navigated to
+  # the URL that is going to be scraped.
+  class MechanizedContext
+
+    attr_accessor :agent
+
+    def initialize(agent: Mechanized.new)
+      @agent = agent
+    end
+
+
+    def scrape
+      raise NotImplementedError
+    end
+
+  end
+
+end
+
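The comments in core.rb describe the plug-in model: each fragment decides via applicable? whether it understands the current page, and MechanizedScraper hands the page to the first fragment that does. A minimal sketch of a custom fragment wired into the scraper; TitleOnlyFragment and the example URL are hypothetical and exist only for illustration, while MechanizedFragment and MechanizedScraper come from the file above:

```ruby
require 'amazon_deets/core'

# Hypothetical fragment: claims every page and returns only the title,
# following the applicable?/scrape contract defined by MechanizedFragment.
class TitleOnlyFragment < AmazonDeets::MechanizedFragment
  def applicable?(agent)
    true
  end

  def scrape(agent)
    { title: agent.page.title, url: agent.page.uri.to_s }
  end
end

# The scraper tries each fragment in order; the first applicable one scrapes.
scraper = AmazonDeets::MechanizedScraper.new(fragments: [TitleOnlyFragment.new])
# scraper.scrape("https://www.example.com/")  # => { title: ..., url: ... }
```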
data/lib/amazon_deets/factories.rb
ADDED
@@ -0,0 +1,18 @@
+
+require 'amazon_deets/general_merchandise'
+require 'amazon_deets/kindle'
+
+module AmazonDeets
+
+  def self.create_scraper(agent: Mechanize.new)
+    MechanizedScraper.new(
+      agent: agent,
+      fragments: [
+        KindleFragment.new,
+        GeneralMerchandiseFragment.new
+      ]
+    )
+  end
+
+end
+
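The factory accepts a pre-configured Mechanize agent via the agent: keyword, which callers can use to set things like a user-agent alias before scraping. A minimal sketch under that assumption; the 'Mac Safari' alias and the commented URL are examples only, not part of this diff:

```ruby
require 'mechanize'
require 'amazon_deets'

# Supply a custom Mechanize instance instead of the default one.
agent = Mechanize.new
agent.user_agent_alias = 'Mac Safari'  # example configuration only

scraper = AmazonDeets.create_scraper(agent: agent)
# scraper.scrape("https://www.amazon.com/dp/SOME-ASIN")  # hypothetical URL
```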
data/lib/amazon_deets/general_merchandise.rb
ADDED
@@ -0,0 +1,95 @@
+
+require 'logbert'
+require 'mechanize'
+
+require 'amazon_deets/core'
+
+module AmazonDeets
+
+  class GeneralMerchandiseFragment < MechanizedFragment
+
+    def applicable?(agent)
+      agent.page.search("h1#title").any?
+    end
+
+    def scrape(agent)
+      context = Context.new(agent: agent)
+      return context.scrape
+    end
+
+
+    class Context < MechanizedContext
+      LOG = Logbert[self]
+
+      RatingRegex = /(.+)\s+out\sof/
+
+      def title
+        result = agent.page.search("//h1[@id='title']").first
+        if result
+          return result.text.strip
+        end
+      end
+
+      def url
+        agent.page.uri.to_s
+      end
+
+      def list_price
+        lp_element = agent.page.search("//span[@id='priceblock_ourprice']").first
+        if lp_element.nil?
+          lp_element = agent.page.search("//td[text()='Price:']/following-sibling::td")
+        end
+
+        if lp_element
+          return lp_element.text.gsub(/[^.\d]/, "")
+        end
+      end
+
+      def current_price
+        cp_element = agent.page.search("//span[@id='priceblock_saleprice']").first
+        if cp_element
+          return cp_element.text
+        else
+          LOG.debug "Looks like no sale is going on. Returning list price"
+          return list_price
+        end
+      end
+
+      def rating
+        result = agent.page.search("//div[@id='averageCustomerReviews']//span[@title]").first
+        if result
+          m = RatingRegex.match result[:title]
+          if m and m[1]
+            return m[1]
+          end
+        end
+      end
+
+      def reviews
+        reviews_element = agent.page.search("//div[@id='averageCustomerReviews']//a[contains(text(), 'reviews')]")
+        if reviews_element
+          text = reviews_element.text.gsub(/[^\d]/, "")
+          return text.to_i unless text.empty?
+        else
+          LOG.warning "Reviews element could not be found"
+        end
+      end
+
+
+      def scrape
+        return {
+          title: title,
+          url: url,
+          list_price: list_price,
+          current_price: current_price,
+          rating: rating,
+          reviews: reviews
+        }
+      end
+
+    end
+
+  end
+
+end
+
data/lib/amazon_deets/kindle.rb
ADDED
@@ -0,0 +1,90 @@
+
+require 'logbert'
+require 'mechanize'
+
+require 'amazon_deets/core'
+
+module AmazonDeets
+
+  class KindleFragment < MechanizedFragment
+
+    def applicable?(agent)
+      agent.page.search("div.kindleBanner").any?
+    end
+
+    def scrape(agent)
+      context = Context.new(agent: agent)
+      return context.scrape
+    end
+
+
+    class Context < MechanizedContext
+      LOG = Logbert[self]
+
+      RatingRegex = /(.+)\s+out\sof/
+
+      def title
+        result = agent.page.search("span#btAsinTitle").first
+        if result
+          return result.text.strip
+        end
+      end
+
+      def url
+        agent.page.uri.to_s
+      end
+
+      def list_price
+        lp_element = agent.page.search("td.listPrice").first
+        if lp_element
+          return lp_element.text.gsub(/[^.\d]/, "")
+        end
+      end
+
+      def current_price
+        cp_element = agent.page.search("td b.priceLarge").first
+        if cp_element
+          return cp_element.text.gsub(/[^.\d]/, "")
+        end
+      end
+
+      def rating
+        result = agent.page.search("span.crAvgStars span[title$='5 stars']").first
+        if result
+          m = RatingRegex.match result[:title]
+          LOG.info result[:title]
+          if m and m[1]
+            return m[1]
+          end
+        else
+          LOG.warning "Unable to locate rating element"
+        end
+      end
+
+      def reviews
+        reviews_element = agent.page.search("//span[@class='crAvgStars']/a[contains(text(), 'reviews')]")
+        if reviews_element
+          text = reviews_element.text.gsub(/[^\d]/, "")
+          return text.to_i unless text.empty?
+        else
+          LOG.warning "Reviews element could not be found"
+        end
+      end
+
+      def scrape
+        return {
+          title: title,
+          url: url,
+          list_price: list_price,
+          current_price: current_price,
+          rating: rating,
+          reviews: reviews
+        }
+      end
+
+    end
+
+  end
+
+end
+
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: amazon_deets
 version: !ruby/object:Gem::Version
-  version: 0.0.3
+  version: 0.2.0
 platform: ruby
 authors:
 - Brian Lauber
@@ -44,8 +44,12 @@ executables: []
 extensions: []
 extra_rdoc_files: []
 files:
+- lib/amazon_deets/core.rb
+- lib/amazon_deets/factories.rb
+- lib/amazon_deets/general_merchandise.rb
+- lib/amazon_deets/kindle.rb
 - lib/amazon_deets.rb
-homepage:
+homepage: https://github.com/briandamaged/amazon_deets
 licenses:
 - MIT
 metadata: {}