amazoned 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: c4ff0dac060161cb2ba774302dcadfa145332a1f
4
+ data.tar.gz: 30a81104eb62ef099d4515a3b6effbdde23520b3
5
+ SHA512:
6
+ metadata.gz: 134f051f0e192d2490422a3ee8b061787234a2b0c1cbd2b9af2b3a0d3d9d5745133f2b166e63458289b960667e21cdc58a30bdcca6539dcae7253365a24df71d
7
+ data.tar.gz: 7d88efe96872019f2258cd95b9e869c58a8d59dde6028db3bf91a55794f59a578605151941f64b2d66ec5ec10c168f976adf437ed158eb4ecdc806ac66790b28
@@ -0,0 +1,11 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+
10
+ # rspec failure tracking
11
+ .rspec_status
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
@@ -0,0 +1,5 @@
1
+ sudo: false
2
+ language: ruby
3
+ rvm:
4
+ - 2.5.1
5
+ before_install: gem install bundler -v 1.16.2
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ source "https://rubygems.org"
2
+
3
+ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
4
+
5
+ # Specify your gem's dependencies in amazoned.gemspec
6
+ gemspec
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2018 kelseydh
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,39 @@
1
+ # Amazoned
2
+
3
+ Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/amazoned`. To experiment with that code, run `bin/console` for an interactive prompt.
4
+
5
+ TODO: Delete this and the text above, and describe your gem
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ ```ruby
12
+ gem 'amazoned'
13
+ ```
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install amazoned
22
+
23
+ ## Usage
24
+
25
+ TODO: Write usage instructions here
26
+
27
+ ## Development
28
+
29
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
30
+
31
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
32
+
33
+ ## Contributing
34
+
35
+ Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/amazoned.
36
+
37
+ ## License
38
+
39
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
@@ -0,0 +1,32 @@
1
+
2
+ lib = File.expand_path("../lib", __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require "amazoned/version"
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "amazoned"
8
+ spec.version = Amazoned::VERSION
9
+ spec.authors = ["kelseydh"]
10
+ spec.email = ["kelseyh@gmail.com"]
11
+
12
+ spec.summary = %q{A manual scraper for Amazon ASIN product data}
13
+ spec.description = %q{This gem allows you to scrap product information from Amazon without the need to register for Amazon's API}
14
+ spec.homepage = "http://twitter.com/kelsoh"
15
+ spec.license = "MIT"
16
+
17
+
18
+ # Specify which files should be added to the gem when it is released.
19
+ # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
20
+ spec.files = Dir.chdir(File.expand_path('..', __FILE__)) do
21
+ `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
22
+ end
23
+ spec.bindir = "exe"
24
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
25
+ spec.require_paths = ["lib"]
26
+
27
+ spec.add_development_dependency "bundler", "~> 1.16"
28
+ spec.add_development_dependency "rake", "~> 10.0"
29
+ spec.add_development_dependency "rspec", "~> 3.0"
30
+ spec.add_development_dependency "mechanize", '~> 2.7', '>= 2.7.6'
31
+ spec.add_development_dependency "activesupport", '~> 5.2', '>= 4.2.0'
32
+ end
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "amazoned"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start(__FILE__)
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,26 @@
1
+ require 'active_support/core_ext/string'
2
+ require 'amazoned/version'
3
+ require 'amazoned/client'
4
+ require 'amazoned/parser'
5
+ require 'mechanize'
6
+ require 'nokogiri'
7
+ require 'byebug'
8
+
9
+ module Amazoned
10
+ @max_network_retries = 3 # default retry ceiling; can be changed through max_network_retries=
11
+ @max_network_retry_delay = 2.3 # upper bound (seconds) applied to the computed back-off delay
12
+ @initial_network_retry_delay = 1.1 # requests under a second could flag us as a scraping bot
13
+ # The three settings above are module-level instance variables, read through the singleton methods below.
14
+
15
+ class << self
16
+ attr_reader :max_network_retry_delay, :initial_network_retry_delay
17
+ end
18
+ # Returns the current retry ceiling (read by Amazoned::Client#get_product).
19
+ def self.max_network_retries
20
+ @max_network_retries
21
+ end
22
+ # Sets the retry ceiling; the given value is coerced with #to_i.
23
+ def self.max_network_retries=(val)
24
+ @max_network_retries = val.to_i
25
+ end
26
+ end
@@ -0,0 +1,76 @@
1
+ class Amazoned::ProductNotFoundError < StandardError; end # raised by Client#get_product when Mechanize raises ResponseCodeError
2
+ class Amazoned::BotDeniedAccessError < StandardError; end # raised by Client#handle_failed_request! when the CAPTCHA page is still served
3
+ class Amazoned::Client # Fetches the Amazon product page for one ASIN and hands it to Amazoned::Parser.
4
+   attr_reader :asin
5
+
6
+   def initialize(asin)
7
+     @asin = asin
8
+   end
9
+   # Downloads the product page and returns the result of Amazoned::Parser#call for it.
10
+   def call
11
+     response = get_product
12
+     Amazoned::Parser.new(response).call
13
+   end
14
+   # Fetches the page, retrying CAPTCHA responses; num_retries is the 1-based attempt counter that drives the back-off.
15
+   def get_product(num_retries = 1)
16
+     agent = Mechanize.new.tap do |web|
17
+       web.html_parser = HtmlParser # Avoid encoding issues: https://stackoverflow.com/a/20666246/3448554
18
+       web.user_agent_alias = (Mechanize::AGENT_ALIASES.keys - ['Mechanize']).sample # spoof every request with a random User Agent as a way to hit fewer CAPTCHA walls
19
+     end
20
+     # Raises ProductNotFoundError on Mechanize::ResponseCodeError, BotDeniedAccessError once retries are exhausted.
21
+     begin
22
+       # Start GET request of Amazon page using ASIN.
23
+       response = agent.get("https://www.amazon.com/dp/#{asin}")
24
+       if request_failed(response)
25
+         # On failure, recursively try again to be resilient against one-off failures
26
+         if num_retries <= Amazoned.max_network_retries
27
+           puts "Request failed! Trying again..." # only announced when a retry actually follows
28
+           sleep self.class.sleep_time(num_retries)
29
+           get_product(num_retries + 1)
30
+         else
31
+           handle_failed_request!(response)
32
+         end
33
+       else
34
+         response
35
+       end
36
+     rescue Mechanize::ResponseCodeError
37
+       raise Amazoned::ProductNotFoundError
38
+     end
39
+   end
40
+   # Returns true when the page contains Amazon's CAPTCHA prompt text, false otherwise.
41
+   def request_failed(response)
42
+     return true if response.xpath('//p[contains(text(), "Sorry, we just need to make sure")]').any? # captcha hit
43
+     false
44
+   end
45
+   # Called once retries are exhausted; returns nil when the page is not the CAPTCHA prompt.
46
+   def handle_failed_request!(response)
47
+     # Raise this error when we can't penetrate Amazon's CAPTCHA wall
48
+     raise Amazoned::BotDeniedAccessError if request_failed(response)
49
+   end
50
+
51
+   # Taken from Stripe API
52
+   # Stripe uses jitter to smooth server load; we use it to obfuscate timing detection of our scraper bot
53
+   # https://github.com/stripe/stripe-ruby/blob/ec66c3f0f44274f885de8d13de5dce2657932121/lib/stripe/stripe_client.rb#L80
54
+   def self.sleep_time(num_retries)
55
+     # Apply exponential backoff with initial_network_retry_delay on the
56
+     # number of num_retries so far as inputs. Do not allow the number to exceed
57
+     # max_network_retry_delay.
58
+     sleep_seconds = [Amazoned.initial_network_retry_delay * (2**(num_retries - 1)), Amazoned.max_network_retry_delay].min
59
+
60
+     # Apply some jitter by randomizing the value in the range of (sleep_seconds
61
+     # / 2) to (sleep_seconds).
62
+     sleep_seconds *= (0.5 * (1 + rand))
63
+
64
+     # But never sleep less than the base sleep seconds.
65
+     sleep_seconds = [Amazoned.initial_network_retry_delay, sleep_seconds].max
66
+
67
+     sleep_seconds
68
+   end
69
+ end
70
+
71
+ class HtmlParser # assigned as the Mechanize agent's html_parser in Amazoned::Client#get_product
72
+ def self.parse(body, url, encoding)
73
+ body.encode!('UTF-8', encoding, invalid: :replace, undef: :replace, replace: '') # transcode in place to UTF-8, replacing invalid/undefined characters with ''
74
+ Nokogiri::HTML::Document.parse(body, url, 'UTF-8') # parse the now-UTF-8 body and return the Nokogiri document
75
+ end
76
+ end
@@ -0,0 +1,109 @@
1
+ class Amazoned::Parser
2
+ attr_accessor :product_hash
3
+ attr_reader :html_doc, :response
4
+
5
+ def initialize(response)
6
+ @product_hash = Hash.new
7
+ @response = response
8
+ @html_doc = Nokogiri::HTML(response.body)
9
+ end
10
+
11
+ def call
12
+ parse_response_for_product_details( response )
13
+ end
14
+
15
+ def parse_response_for_product_details(response)
16
+ product_hash[:best_sellers_rank] = []
17
+
18
+ ########
19
+ # # Parent category Seller Rank Parser
20
+ ########
21
+ parsed_parent_category = html_doc.css('#SalesRank').text.partition("(").first.chop.partition("#").last.partition("in").map(&:strip) - ["in"]
22
+ product_hash[:rank] = parsed_parent_category.first.delete(',').to_i # "903,610" -> 903610
23
+ product_hash[:category] = parsed_parent_category.last
24
+
25
+ ########
26
+ # # Subcategory Seller Rank Parser
27
+ ########
28
+ extract_subcategory_rankings( html_doc.css('.zg_hrsr_item') )
29
+
30
+ ########
31
+ # # Package Dimension Parser
32
+ ########
33
+ # Package Dimension Parsing Strategy 1:
34
+ product_hash[:package_dimensions] = html_doc.css('.size-weight').children.map{|r| r.text}.reject{|r| !r.match?("inches")}.first
35
+
36
+ # Package Dimension Parsing Strategy 2:
37
+ if product_hash[:package_dimensions].blank?
38
+
39
+ # Find an index for the string "Package Dimensions" within a string text extraction of the page
40
+ str_index = html_doc.inner_text.index("Package Dimensions")
41
+
42
+ unless str_index.nil?
43
+
44
+ # Reduce string representing the html page down to a smaller target string including "Package Dimensions" and the weights
45
+ str = html_doc.inner_text[str_index .. str_index + 150]
46
+
47
+ # Find within target string an index for where the word "inches" appears, then grab characters around it
48
+ product_hash[:package_dimensions] = str[str.index("inches")- 20.. str.index("inches")+8].strip
49
+ end
50
+ end
51
+
52
+ # Package Dimension Parsing Strategy 3:
53
+ response.search('.//*[@class="a-color-secondary a-size-base prodDetSectionEntry"]').map{|n| n.parent}.each do |n|
54
+
55
+ # Parse html in each row of Amazon's product details table to get back a string. E.g: "\n \n Best Sellers Rank\n \n \n \n \n #63 in Toys & Games (See Top 100 in Toys & Games)\n \n \n #3 in Toys & Games > Baby & Toddler Toys > Teethers\n \n \n \n \n "
56
+ str = n.children.inner_text
57
+ if product_hash[:best_sellers_rank].blank?
58
+ str.match("Best Sellers Rank") do |m|
59
+ # Gnarly string manipulation extracts the array: ["63", "in", "Toys & Games"]
60
+ parsed_parent_category = str.partition("(").first.chop.partition("#").last.partition("in").map(&:strip)
61
+
62
+ # From ["63", "in", "Toys & Games"] we only care about first & last parts of this array
63
+ product_hash[:rank] = parsed_parent_category.first.delete(',').to_i
64
+ product_hash[:category] = parsed_parent_category.last
65
+
66
+ parsed_category = str.partition(")").last.partition("in").map(&:strip).map{|i| i.gsub("#", "")} - ["in"]
67
+
68
+ hsh = {}
69
+ hsh[:rank] = parsed_category.first.delete(',').to_i
70
+ hsh[:ladder] = parsed_category.last
71
+ product_hash[:best_sellers_rank] << hsh
72
+ end
73
+ end
74
+
75
+ if product_hash[:product_dimensions].blank?
76
+
77
+ # Use pattern matching to extract the product details we care about
78
+ str.match("Product Dimensions") do |m|
79
+ product_hash[:package_dimensions] = str[str.index("inches")- 20.. str.index("inches")+8].strip
80
+ end
81
+ end
82
+ end
83
+ product_hash
84
+ end
85
+
86
+ def extract_subcategory_rankings(nokogiri_html)
87
+ # Below is gnarly string manipulation to parse text strings like:
88
+ # "\n #2\n in Baby > Baby Care > Health\n \n #2\n in Baby > Baby Care > Pacifiers, Teethers & Teething Relief > Teethers\n "
89
+ # into:
90
+ # [["2", "Baby > Baby Care > Health"], ["2", "Baby > Baby Care > Pacifiers, Teethers & Teething Relief > Teethers"]]
91
+ nokogiri_html
92
+ .map{|i| i.text}
93
+ .map{|i| i.partition("in")
94
+ .map(&:strip)}
95
+ .map{|i| i - ["in"] }
96
+ .map{|i|
97
+ i.map{|ii|
98
+ ii.gsub("#", "") # remove '#' from '#2'
99
+ .gsub("\u00A0", "") # remove No-Break Space Unicode characters (U+00A0) since Ruby's .strip command won't remove them
100
+ }
101
+ }.each do |i|
102
+ hsh = {}
103
+ hsh[:rank] = i.first.to_i
104
+ hsh[:ladder] = i.last
105
+ product_hash[:best_sellers_rank] << hsh
106
+ end
107
+ end
108
+
109
+ end
@@ -0,0 +1,3 @@
1
+ module Amazoned
2
+ VERSION = "0.1.0" # gem version string; read by amazoned.gemspec as spec.version
3
+ end
metadata ADDED
@@ -0,0 +1,141 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: amazoned
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - kelseydh
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2018-07-08 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.16'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.16'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '3.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '3.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: mechanize
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '2.7'
62
+ - - ">="
63
+ - !ruby/object:Gem::Version
64
+ version: 2.7.6
65
+ type: :development
66
+ prerelease: false
67
+ version_requirements: !ruby/object:Gem::Requirement
68
+ requirements:
69
+ - - "~>"
70
+ - !ruby/object:Gem::Version
71
+ version: '2.7'
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ version: 2.7.6
75
+ - !ruby/object:Gem::Dependency
76
+ name: activesupport
77
+ requirement: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - "~>"
80
+ - !ruby/object:Gem::Version
81
+ version: '5.2'
82
+ - - ">="
83
+ - !ruby/object:Gem::Version
84
+ version: 4.2.0
85
+ type: :development
86
+ prerelease: false
87
+ version_requirements: !ruby/object:Gem::Requirement
88
+ requirements:
89
+ - - "~>"
90
+ - !ruby/object:Gem::Version
91
+ version: '5.2'
92
+ - - ">="
93
+ - !ruby/object:Gem::Version
94
+ version: 4.2.0
95
+ description: This gem allows you to scrap product information from Amazon without
96
+ the need to register for Amazon's API
97
+ email:
98
+ - kelseyh@gmail.com
99
+ executables: []
100
+ extensions: []
101
+ extra_rdoc_files: []
102
+ files:
103
+ - ".gitignore"
104
+ - ".rspec"
105
+ - ".travis.yml"
106
+ - Gemfile
107
+ - LICENSE.txt
108
+ - README.md
109
+ - Rakefile
110
+ - amazoned.gemspec
111
+ - bin/console
112
+ - bin/setup
113
+ - lib/amazoned.rb
114
+ - lib/amazoned/client.rb
115
+ - lib/amazoned/parser.rb
116
+ - lib/amazoned/version.rb
117
+ homepage: http://twitter.com/kelsoh
118
+ licenses:
119
+ - MIT
120
+ metadata: {}
121
+ post_install_message:
122
+ rdoc_options: []
123
+ require_paths:
124
+ - lib
125
+ required_ruby_version: !ruby/object:Gem::Requirement
126
+ requirements:
127
+ - - ">="
128
+ - !ruby/object:Gem::Version
129
+ version: '0'
130
+ required_rubygems_version: !ruby/object:Gem::Requirement
131
+ requirements:
132
+ - - ">="
133
+ - !ruby/object:Gem::Version
134
+ version: '0'
135
+ requirements: []
136
+ rubyforge_project:
137
+ rubygems_version: 2.6.13
138
+ signing_key:
139
+ specification_version: 4
140
+ summary: A manual scraper for Amazon ASIN product data
141
+ test_files: []