amazon_deets 0.0.3 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: be524b56abcc6dab4e880b8b27ad49a39ac917b2
4
- data.tar.gz: 60047f9e529b38ad24d11e2562cb2bf5ba1ee590
3
+ metadata.gz: f4e516ca707d2e7250a0b203395e778984ac40bc
4
+ data.tar.gz: da8ad327292bdd9fee2c5dcd1a0135ea1a1c970d
5
5
  SHA512:
6
- metadata.gz: e9cedb5fa453d9f082a403998154297bd1b1fd7af47cb23f53f4929ad849af96b92572b307e3f0e4fe3da4314cfb21f0b723c9834b351ee77f5fce08337f1eb7
7
- data.tar.gz: fb0a0f570667b9c583e1e22240d3738e7397c97ca1304ecc4de80169f00db5564a818d0faa2fed3031190810d3dcfcdbb61ce025adaa45af9292b372fe91aad4
6
+ metadata.gz: 96d5ce3798cefd24d568dbea661d92c77cbabc68bef6272e3b9e54406fce8d4cf4f4faaf8283e85a259fe9303aea263176e9c92b66c3b23db5245ff5339a5498
7
+ data.tar.gz: 35996047d38fcb28a35a8e1029f7889d10d8a1b0723694ce5902ab9f90c297f7f6b68026b997b361f8a175d7d77684496573b97437edc4cfa98e203931bd214a
@@ -1,120 +1,2 @@
1
- require 'logbert'
2
- require 'mechanize'
3
-
4
- module AmazonDeets
5
-
6
- class Grabber
7
- LOG = Logbert[self]
8
-
9
- RatingRegex = /(.+)\s+out\sof/
10
- ReviewsRegex = /(\d+)/
11
-
12
- attr_accessor :agent
13
-
14
- def initialize(agent: Mechanize.new)
15
- @agent = agent
16
- end
17
-
18
- def title
19
- result = agent.page.search("//h1[@id='title']").first
20
- if result
21
- return result.text.strip
22
- end
23
-
24
- result = agent.page.search("span#btAsinTitle").first
25
- if result
26
- return result.text.strip
27
- end
28
-
29
- return nil
30
- end
31
-
32
-
33
- def url
34
- agent.page.uri.to_s
35
- end
36
-
37
-
38
- def list_price
39
- lp_element = agent.page.search("//span[@id='priceblock_ourprice']").first
40
- if lp_element.nil?
41
- lp_element = agent.page.search("//td[text()='Price:']/following-sibling::td")
42
- end
43
-
44
- if lp_element
45
- return lp_element.text.gsub(/[^.\d]/, "")
46
- else
47
- return nil
48
- end
49
-
50
- end
51
-
52
- def current_price
53
- current_price_element = agent.page.search("//span[@id='priceblock_saleprice']").first
54
- if current_price_element
55
- return current_price_element.text
56
- else
57
- LOG.debug "Looks like no sale is going on. Returning list price"
58
- return list_price
59
- end
60
- end
61
-
62
-
63
- def rating_text
64
- result = agent.page.search("//div[@id='averageCustomerReviews']//span[@title]").first
65
- if result
66
- return result[:title]
67
- end
68
-
69
- result = agent.page.search("div.acrRating").first
70
- if result
71
- return result.text
72
- end
73
-
74
- return nil
75
- end
76
-
77
- def rating
78
- text = rating_text
79
- if text
80
- m = RatingRegex.match(text)
81
- if m and m[1]
82
- return m[1].to_f
83
- end
84
- end
85
-
86
- return nil
87
- end
88
-
89
- def reviews
90
- reviews_element = agent.page.search("//div[@id='summaryStars']/a")
91
- if reviews_element
92
- text = reviews_element.text.gsub(/[^\d]/, "")
93
-
94
- return text.to_i unless text.empty?
95
- end
96
- return nil
97
- end
98
-
99
-
100
- def details_hash
101
- return {
102
- title: title,
103
- url: url,
104
- list_price: list_price,
105
- current_price: current_price,
106
- rating: rating,
107
- reviews: reviews
108
- }
109
- end
110
-
111
-
112
- def grab(url)
113
- agent.get(url)
114
- details_hash
115
- end
116
-
117
- end
118
-
119
- end
120
1
 
2
+ require 'amazon_deets/factories'
@@ -0,0 +1,85 @@
1
+
2
+ require 'logbert'
3
+ require 'mechanize'
4
+
5
+
6
+ module AmazonDeets
7
+
8
+ # Basic interface for the scrapers. Point it to
9
+ # a URL, and it does the scrape. BOOM!
10
+ class AbstractScraper
11
+
12
+ def scrape(url)
13
+ raise NotImplementedError
14
+ end
15
+
16
+ end
17
+
18
+
19
+
20
+ class MechanizedScraper < AbstractScraper
21
+
22
+ attr_accessor :agent
23
+ attr_accessor :fragments
24
+
25
+ def initialize(agent: Mechanize.new, fragments: Array.new)
26
+ @agent = agent
27
+ @fragments = fragments
28
+ end
29
+
30
+ def scrape(url)
31
+ agent.get(url)
32
+ fragments.each do |f|
33
+ if f.applicable?(agent)
34
+ return f.scrape(agent)
35
+ end
36
+ end
37
+ end
38
+
39
+ end
40
+
41
+
42
+ # Amazon renders different HTML depending upon
43
+ # the type of product that you are viewing. This
44
+ # means that the scraper queries need to change
45
+ # depending upon whether you want the data for a
46
+ # Kindle book or some general merchandise. Rather
47
+ # than building one super-complicated scraper, we'll
48
+ # break the code into multiple simple scrapers that
49
+ # focus on solving specific problems.
50
+ #
51
+ class MechanizedFragment
52
+
53
+ # Decides whether or not this MechanizedFragment
54
+ # is applicable
55
+ def applicable?(agent)
56
+ raise NotImplementedError
57
+ end
58
+
59
+ def scrape(agent)
60
+ raise NotImplementedError
61
+ end
62
+
63
+ end
64
+
65
+
66
+ # A MechanizedContext is similar to a scraper, but it
67
+ # assumes that the @agent has already navigated to
68
+ # the URL that is going to be scraped.
69
+ class MechanizedContext
70
+
71
+ attr_accessor :agent
72
+
73
+ def initialize(agent: Mechanized.new)
74
+ @agent = agent
75
+ end
76
+
77
+
78
+ def scrape
79
+ raise NotImplementedError
80
+ end
81
+
82
+ end
83
+
84
+ end
85
+
@@ -0,0 +1,18 @@
1
+
2
+ require 'amazon_deets/general_merchandise'
3
+ require 'amazon_deets/kindle'
4
+
5
+ module AmazonDeets
6
+
7
+ def self.create_scraper(agent: Mechanize.new)
8
+ MechanizedScraper.new(
9
+ agent: agent,
10
+ fragments: [
11
+ KindleFragment.new,
12
+ GeneralMerchandiseFragment.new
13
+ ]
14
+ )
15
+ end
16
+
17
+ end
18
+
@@ -0,0 +1,95 @@
1
+
2
+ require 'logbert'
3
+ require 'mechanize'
4
+
5
+ require 'amazon_deets/core'
6
+
7
+ module AmazonDeets
8
+
9
+ class GeneralMerchandiseFragment < MechanizedFragment
10
+
11
+ def applicable?(agent)
12
+ agent.page.search("h1#title").any?
13
+ end
14
+
15
+ def scrape(agent)
16
+ context = Context.new(agent: agent)
17
+ return context.scrape
18
+ end
19
+
20
+
21
+ class Context < MechanizedContext
22
+ LOG = Logbert[self]
23
+
24
+ RatingRegex = /(.+)\s+out\sof/
25
+
26
+ def title
27
+ result = agent.page.search("//h1[@id='title']").first
28
+ if result
29
+ return result.text.strip
30
+ end
31
+ end
32
+
33
+ def url
34
+ agent.page.uri.to_s
35
+ end
36
+
37
+ def list_price
38
+ lp_element = agent.page.search("//span[@id='priceblock_ourprice']").first
39
+ if lp_element.nil?
40
+ lp_element = agent.page.search("//td[text()='Price:']/following-sibling::td")
41
+ end
42
+
43
+ if lp_element
44
+ return lp_element.text.gsub(/[^.\d]/, "")
45
+ end
46
+ end
47
+
48
+ def current_price
49
+ cp_element = agent.page.search("//span[@id='priceblock_saleprice']").first
50
+ if cp_element
51
+ return cp_element.text
52
+ else
53
+ LOG.debug "Looks like no sale is going on. Returning list price"
54
+ return list_price
55
+ end
56
+ end
57
+
58
+ def rating
59
+ result = agent.page.search("//div[@id='averageCustomerReviews']//span[@title]").first
60
+ if result
61
+ m = RatingRegex.match result[:title]
62
+ if m and m[1]
63
+ return m[1]
64
+ end
65
+ end
66
+ end
67
+
68
+ def reviews
69
+ reviews_element = agent.page.search("//div[@id='averageCustomerReviews']//a[contains(text(), 'reviews')]")
70
+ if reviews_element
71
+ text = reviews_element.text.gsub(/[^\d]/, "")
72
+ return text.to_i unless text.empty?
73
+ else
74
+ LOG.warning "Reviews element could not be found"
75
+ end
76
+ end
77
+
78
+
79
+ def scrape
80
+ return {
81
+ title: title,
82
+ url: url,
83
+ list_price: list_price,
84
+ current_price: current_price,
85
+ rating: rating,
86
+ reviews: reviews
87
+ }
88
+ end
89
+
90
+ end
91
+
92
+ end
93
+
94
+ end
95
+
@@ -0,0 +1,90 @@
1
+
2
+ require 'logbert'
3
+ require 'mechanize'
4
+
5
+ require 'amazon_deets/core'
6
+
7
+ module AmazonDeets
8
+
9
+ class KindleFragment < MechanizedFragment
10
+
11
+ def applicable?(agent)
12
+ agent.page.search("div.kindleBanner").any?
13
+ end
14
+
15
+ def scrape(agent)
16
+ context = Context.new(agent: agent)
17
+ return context.scrape
18
+ end
19
+
20
+
21
+ class Context < MechanizedContext
22
+ LOG = Logbert[self]
23
+
24
+ RatingRegex = /(.+)\s+out\sof/
25
+
26
+ def title
27
+ result = agent.page.search("span#btAsinTitle").first
28
+ if result
29
+ return result.text.strip
30
+ end
31
+ end
32
+
33
+ def url
34
+ agent.page.uri.to_s
35
+ end
36
+
37
+ def list_price
38
+ lp_element = agent.page.search("td.listPrice").first
39
+ if lp_element
40
+ return lp_element.text.gsub(/[^.\d]/, "")
41
+ end
42
+ end
43
+
44
+ def current_price
45
+ cp_element = agent.page.search("td b.priceLarge").first
46
+ if cp_element
47
+ return cp_element.text.gsub(/[^.\d]/, "")
48
+ end
49
+ end
50
+
51
+ def rating
52
+ result = agent.page.search("span.crAvgStars span[title$='5 stars']").first
53
+ if result
54
+ m = RatingRegex.match result[:title]
55
+ LOG.info result[:title]
56
+ if m and m[1]
57
+ return m[1]
58
+ end
59
+ else
60
+ LOG.warning "Unable to locate rating element"
61
+ end
62
+ end
63
+
64
+ def reviews
65
+ reviews_element = agent.page.search("//span[@class='crAvgStars']/a[contains(text(), 'reviews')]")
66
+ if reviews_element
67
+ text = reviews_element.text.gsub(/[^\d]/, "")
68
+ return text.to_i unless text.empty?
69
+ else
70
+ LOG.warning "Reviews element could not be found"
71
+ end
72
+ end
73
+
74
+ def scrape
75
+ return {
76
+ title: title,
77
+ url: url,
78
+ list_price: list_price,
79
+ current_price: current_price,
80
+ rating: rating,
81
+ reviews: reviews
82
+ }
83
+ end
84
+
85
+ end
86
+
87
+ end
88
+
89
+ end
90
+
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: amazon_deets
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Brian Lauber
@@ -44,8 +44,12 @@ executables: []
44
44
  extensions: []
45
45
  extra_rdoc_files: []
46
46
  files:
47
+ - lib/amazon_deets/core.rb
48
+ - lib/amazon_deets/factories.rb
49
+ - lib/amazon_deets/general_merchandise.rb
50
+ - lib/amazon_deets/kindle.rb
47
51
  - lib/amazon_deets.rb
48
- homepage:
52
+ homepage: https://github.com/briandamaged/amazon_deets
49
53
  licenses:
50
54
  - MIT
51
55
  metadata: {}