amazon_deets 0.0.3 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: be524b56abcc6dab4e880b8b27ad49a39ac917b2
4
- data.tar.gz: 60047f9e529b38ad24d11e2562cb2bf5ba1ee590
3
+ metadata.gz: f4e516ca707d2e7250a0b203395e778984ac40bc
4
+ data.tar.gz: da8ad327292bdd9fee2c5dcd1a0135ea1a1c970d
5
5
  SHA512:
6
- metadata.gz: e9cedb5fa453d9f082a403998154297bd1b1fd7af47cb23f53f4929ad849af96b92572b307e3f0e4fe3da4314cfb21f0b723c9834b351ee77f5fce08337f1eb7
7
- data.tar.gz: fb0a0f570667b9c583e1e22240d3738e7397c97ca1304ecc4de80169f00db5564a818d0faa2fed3031190810d3dcfcdbb61ce025adaa45af9292b372fe91aad4
6
+ metadata.gz: 96d5ce3798cefd24d568dbea661d92c77cbabc68bef6272e3b9e54406fce8d4cf4f4faaf8283e85a259fe9303aea263176e9c92b66c3b23db5245ff5339a5498
7
+ data.tar.gz: 35996047d38fcb28a35a8e1029f7889d10d8a1b0723694ce5902ab9f90c297f7f6b68026b997b361f8a175d7d77684496573b97437edc4cfa98e203931bd214a
@@ -1,120 +1,2 @@
1
- require 'logbert'
2
- require 'mechanize'
3
-
4
- module AmazonDeets
5
-
6
- class Grabber
7
- LOG = Logbert[self]
8
-
9
- RatingRegex = /(.+)\s+out\sof/
10
- ReviewsRegex = /(\d+)/
11
-
12
- attr_accessor :agent
13
-
14
- def initialize(agent: Mechanize.new)
15
- @agent = agent
16
- end
17
-
18
- def title
19
- result = agent.page.search("//h1[@id='title']").first
20
- if result
21
- return result.text.strip
22
- end
23
-
24
- result = agent.page.search("span#btAsinTitle").first
25
- if result
26
- return result.text.strip
27
- end
28
-
29
- return nil
30
- end
31
-
32
-
33
- def url
34
- agent.page.uri.to_s
35
- end
36
-
37
-
38
- def list_price
39
- lp_element = agent.page.search("//span[@id='priceblock_ourprice']").first
40
- if lp_element.nil?
41
- lp_element = agent.page.search("//td[text()='Price:']/following-sibling::td")
42
- end
43
-
44
- if lp_element
45
- return lp_element.text.gsub(/[^.\d]/, "")
46
- else
47
- return nil
48
- end
49
-
50
- end
51
-
52
- def current_price
53
- current_price_element = agent.page.search("//span[@id='priceblock_saleprice']").first
54
- if current_price_element
55
- return current_price_element.text
56
- else
57
- LOG.debug "Looks like no sale is going on. Returning list price"
58
- return list_price
59
- end
60
- end
61
-
62
-
63
- def rating_text
64
- result = agent.page.search("//div[@id='averageCustomerReviews']//span[@title]").first
65
- if result
66
- return result[:title]
67
- end
68
-
69
- result = agent.page.search("div.acrRating").first
70
- if result
71
- return result.text
72
- end
73
-
74
- return nil
75
- end
76
-
77
- def rating
78
- text = rating_text
79
- if text
80
- m = RatingRegex.match(text)
81
- if m and m[1]
82
- return m[1].to_f
83
- end
84
- end
85
-
86
- return nil
87
- end
88
-
89
- def reviews
90
- reviews_element = agent.page.search("//div[@id='summaryStars']/a")
91
- if reviews_element
92
- text = reviews_element.text.gsub(/[^\d]/, "")
93
-
94
- return text.to_i unless text.empty?
95
- end
96
- return nil
97
- end
98
-
99
-
100
- def details_hash
101
- return {
102
- title: title,
103
- url: url,
104
- list_price: list_price,
105
- current_price: current_price,
106
- rating: rating,
107
- reviews: reviews
108
- }
109
- end
110
-
111
-
112
- def grab(url)
113
- agent.get(url)
114
- details_hash
115
- end
116
-
117
- end
118
-
119
- end
120
1
 
2
+ require 'amazon_deets/factories'
@@ -0,0 +1,85 @@
1
+
2
+ require 'logbert'
3
+ require 'mechanize'
4
+
5
+
6
+ module AmazonDeets
7
+
8
+ # Basic interface for the scrapers. Point it to
9
+ # a URL, and it does the scrape. BOOM!
10
+ class AbstractScraper
11
+
12
+ def scrape(url)
13
+ raise NotImplementedError
14
+ end
15
+
16
+ end
17
+
18
+
19
+
20
+ class MechanizedScraper < AbstractScraper
21
+
22
+ attr_accessor :agent
23
+ attr_accessor :fragments
24
+
25
+ def initialize(agent: Mechanize.new, fragments: Array.new)
26
+ @agent = agent
27
+ @fragments = fragments
28
+ end
29
+
30
+ def scrape(url)
31
+ agent.get(url)
32
+ fragments.each do |f|
33
+ if f.applicable?(agent)
34
+ return f.scrape(agent)
35
+ end
36
+ end
37
+ end
38
+
39
+ end
40
+
41
+
42
+ # Amazon renders different HTML depending upon
43
+ # the type of product that you are viewing. This
44
+ # means that the scraper queries need to change
45
+ # depending upon whether you want the data for a
46
+ # Kindle book or some general merchandise. Rather
47
+ # than building one super-complicated scraper, we'll
48
+ # break the code into multiple simple scrapers that
49
+ # focus on solving specific problems.
50
+ #
51
+ class MechanizedFragment
52
+
53
+ # Decides whether or not this MechanizedFragment
54
+ # is applicable
55
+ def applicable?(agent)
56
+ raise NotImplementedError
57
+ end
58
+
59
+ def scrape(agent)
60
+ raise NotImplementedError
61
+ end
62
+
63
+ end
64
+
65
+
66
+ # A MechanizedContext is similar to a scraper, but it
67
+ # assumes that the @agent has already navigated to
68
+ # the URL that is going to be scraped.
69
+ class MechanizedContext
70
+
71
+ attr_accessor :agent
72
+
73
+ def initialize(agent: Mechanized.new)
74
+ @agent = agent
75
+ end
76
+
77
+
78
+ def scrape
79
+ raise NotImplementedError
80
+ end
81
+
82
+ end
83
+
84
+ end
85
+
@@ -0,0 +1,18 @@
1
+
2
+ require 'amazon_deets/general_merchandise'
3
+ require 'amazon_deets/kindle'
4
+
5
+ module AmazonDeets
6
+
7
+ def self.create_scraper(agent: Mechanize.new)
8
+ MechanizedScraper.new(
9
+ agent: agent,
10
+ fragments: [
11
+ KindleFragment.new,
12
+ GeneralMerchandiseFragment.new
13
+ ]
14
+ )
15
+ end
16
+
17
+ end
18
+
@@ -0,0 +1,95 @@
1
+
2
+ require 'logbert'
3
+ require 'mechanize'
4
+
5
+ require 'amazon_deets/core'
6
+
7
+ module AmazonDeets
8
+
9
+ class GeneralMerchandiseFragment < MechanizedFragment
10
+
11
+ def applicable?(agent)
12
+ agent.page.search("h1#title").any?
13
+ end
14
+
15
+ def scrape(agent)
16
+ context = Context.new(agent: agent)
17
+ return context.scrape
18
+ end
19
+
20
+
21
+ class Context < MechanizedContext
22
+ LOG = Logbert[self]
23
+
24
+ RatingRegex = /(.+)\s+out\sof/
25
+
26
+ def title
27
+ result = agent.page.search("//h1[@id='title']").first
28
+ if result
29
+ return result.text.strip
30
+ end
31
+ end
32
+
33
+ def url
34
+ agent.page.uri.to_s
35
+ end
36
+
37
+ def list_price
38
+ lp_element = agent.page.search("//span[@id='priceblock_ourprice']").first
39
+ if lp_element.nil?
40
+ lp_element = agent.page.search("//td[text()='Price:']/following-sibling::td")
41
+ end
42
+
43
+ if lp_element
44
+ return lp_element.text.gsub(/[^.\d]/, "")
45
+ end
46
+ end
47
+
48
+ def current_price
49
+ cp_element = agent.page.search("//span[@id='priceblock_saleprice']").first
50
+ if cp_element
51
+ return cp_element.text
52
+ else
53
+ LOG.debug "Looks like no sale is going on. Returning list price"
54
+ return list_price
55
+ end
56
+ end
57
+
58
+ def rating
59
+ result = agent.page.search("//div[@id='averageCustomerReviews']//span[@title]").first
60
+ if result
61
+ m = RatingRegex.match result[:title]
62
+ if m and m[1]
63
+ return m[1]
64
+ end
65
+ end
66
+ end
67
+
68
+ def reviews
69
+ reviews_element = agent.page.search("//div[@id='averageCustomerReviews']//a[contains(text(), 'reviews')]")
70
+ if reviews_element
71
+ text = reviews_element.text.gsub(/[^\d]/, "")
72
+ return text.to_i unless text.empty?
73
+ else
74
+ LOG.warning "Reviews element could not be found"
75
+ end
76
+ end
77
+
78
+
79
+ def scrape
80
+ return {
81
+ title: title,
82
+ url: url,
83
+ list_price: list_price,
84
+ current_price: current_price,
85
+ rating: rating,
86
+ reviews: reviews
87
+ }
88
+ end
89
+
90
+ end
91
+
92
+ end
93
+
94
+ end
95
+
@@ -0,0 +1,90 @@
1
+
2
+ require 'logbert'
3
+ require 'mechanize'
4
+
5
+ require 'amazon_deets/core'
6
+
7
+ module AmazonDeets
8
+
9
+ class KindleFragment < MechanizedFragment
10
+
11
+ def applicable?(agent)
12
+ agent.page.search("div.kindleBanner").any?
13
+ end
14
+
15
+ def scrape(agent)
16
+ context = Context.new(agent: agent)
17
+ return context.scrape
18
+ end
19
+
20
+
21
+ class Context < MechanizedContext
22
+ LOG = Logbert[self]
23
+
24
+ RatingRegex = /(.+)\s+out\sof/
25
+
26
+ def title
27
+ result = agent.page.search("span#btAsinTitle").first
28
+ if result
29
+ return result.text.strip
30
+ end
31
+ end
32
+
33
+ def url
34
+ agent.page.uri.to_s
35
+ end
36
+
37
+ def list_price
38
+ lp_element = agent.page.search("td.listPrice").first
39
+ if lp_element
40
+ return lp_element.text.gsub(/[^.\d]/, "")
41
+ end
42
+ end
43
+
44
+ def current_price
45
+ cp_element = agent.page.search("td b.priceLarge").first
46
+ if cp_element
47
+ return cp_element.text.gsub(/[^.\d]/, "")
48
+ end
49
+ end
50
+
51
+ def rating
52
+ result = agent.page.search("span.crAvgStars span[title$='5 stars']").first
53
+ if result
54
+ m = RatingRegex.match result[:title]
55
+ LOG.info result[:title]
56
+ if m and m[1]
57
+ return m[1]
58
+ end
59
+ else
60
+ LOG.warning "Unable to locate rating element"
61
+ end
62
+ end
63
+
64
+ def reviews
65
+ reviews_element = agent.page.search("//span[@class='crAvgStars']/a[contains(text(), 'reviews')]")
66
+ if reviews_element
67
+ text = reviews_element.text.gsub(/[^\d]/, "")
68
+ return text.to_i unless text.empty?
69
+ else
70
+ LOG.warning "Reviews element could not be found"
71
+ end
72
+ end
73
+
74
+ def scrape
75
+ return {
76
+ title: title,
77
+ url: url,
78
+ list_price: list_price,
79
+ current_price: current_price,
80
+ rating: rating,
81
+ reviews: reviews
82
+ }
83
+ end
84
+
85
+ end
86
+
87
+ end
88
+
89
+ end
90
+
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: amazon_deets
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Brian Lauber
@@ -44,8 +44,12 @@ executables: []
44
44
  extensions: []
45
45
  extra_rdoc_files: []
46
46
  files:
47
+ - lib/amazon_deets/core.rb
48
+ - lib/amazon_deets/factories.rb
49
+ - lib/amazon_deets/general_merchandise.rb
50
+ - lib/amazon_deets/kindle.rb
47
51
  - lib/amazon_deets.rb
48
- homepage:
52
+ homepage: https://github.com/briandamaged/amazon_deets
49
53
  licenses:
50
54
  - MIT
51
55
  metadata: {}