amazoned 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: c4ff0dac060161cb2ba774302dcadfa145332a1f
4
+ data.tar.gz: 30a81104eb62ef099d4515a3b6effbdde23520b3
5
+ SHA512:
6
+ metadata.gz: 134f051f0e192d2490422a3ee8b061787234a2b0c1cbd2b9af2b3a0d3d9d5745133f2b166e63458289b960667e21cdc58a30bdcca6539dcae7253365a24df71d
7
+ data.tar.gz: 7d88efe96872019f2258cd95b9e869c58a8d59dde6028db3bf91a55794f59a578605151941f64b2d66ec5ec10c168f976adf437ed158eb4ecdc806ac66790b28
@@ -0,0 +1,11 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /_yardoc/
4
+ /coverage/
5
+ /doc/
6
+ /pkg/
7
+ /spec/reports/
8
+ /tmp/
9
+
10
+ # rspec failure tracking
11
+ .rspec_status
data/.rspec ADDED
@@ -0,0 +1,3 @@
1
+ --format documentation
2
+ --color
3
+ --require spec_helper
@@ -0,0 +1,5 @@
1
+ sudo: false
2
+ language: ruby
3
+ rvm:
4
+ - 2.5.1
5
+ before_install: gem install bundler -v 1.16.2
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ source "https://rubygems.org"
2
+
3
+ git_source(:github) {|repo_name| "https://github.com/#{repo_name}" }
4
+
5
+ # Specify your gem's dependencies in amazoned.gemspec
6
+ gemspec
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2018 kelseydh
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,39 @@
1
+ # Amazoned
2
+
3
+ Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/amazoned`. To experiment with that code, run `bin/console` for an interactive prompt.
4
+
5
+ TODO: Delete this and the text above, and describe your gem
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ ```ruby
12
+ gem 'amazoned'
13
+ ```
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install amazoned
22
+
23
+ ## Usage
24
+
25
+ TODO: Write usage instructions here
26
+
27
+ ## Development
28
+
29
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
30
+
31
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
32
+
33
+ ## Contributing
34
+
35
+ Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/amazoned.
36
+
37
+ ## License
38
+
39
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
# Define the `spec` Rake task, which runs the RSpec suite.
RSpec::Core::RakeTask.new(:spec)

# Running `rake` with no arguments runs the `spec` task.
task default: :spec
@@ -0,0 +1,32 @@
1
+
2
+ lib = File.expand_path("../lib", __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require "amazoned/version"
5
+
6
# Gem specification for amazoned.
Gem::Specification.new do |spec|
  spec.name          = "amazoned"
  spec.version       = Amazoned::VERSION
  spec.authors       = ["kelseydh"]
  spec.email         = ["kelseyh@gmail.com"]

  spec.summary       = %q{A manual scraper for Amazon ASIN product data}
  spec.description   = %q{This gem allows you to scrape product information from Amazon without the need to register for Amazon's API}
  spec.homepage      = "http://twitter.com/kelsoh"
  spec.license       = "MIT"

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  spec.files         = Dir.chdir(File.expand_path('..', __FILE__)) do
    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  end
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  # Runtime dependencies: the library requires `mechanize` and
  # `active_support/core_ext/string` when it is loaded, so they must be
  # installed together with the gem, not only in development.
  # NOTE(review): ">= 4.2.0" adds nothing next to "~> 5.2" (which already means
  # >= 5.2, < 6.0) - confirm which range is actually intended.
  spec.add_dependency "mechanize", '~> 2.7', '>= 2.7.6'
  spec.add_dependency "activesupport", '~> 5.2', '>= 4.2.0'
  # FIXME: the library's entry file also does `require 'byebug'`, which is not
  # declared here; either drop that require or declare the gem.

  spec.add_development_dependency "bundler", "~> 1.16"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "rspec", "~> 3.0"
end
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "amazoned"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start(__FILE__)
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,26 @@
1
+ require 'active_support/core_ext/string'
2
+ require 'amazoned/version'
3
+ require 'amazoned/client'
4
+ require 'amazoned/parser'
5
+ require 'mechanize'
6
+ require 'nokogiri'
7
+ require 'byebug'
8
+
9
+ module Amazoned
10
+ @max_network_retries = 3
11
+ @max_network_retry_delay = 2.3
12
+ @initial_network_retry_delay = 1.1 # requests under a second could flag us as a scraping bot
13
+
14
+
15
+ class << self
16
+ attr_reader :max_network_retry_delay, :initial_network_retry_delay
17
+ end
18
+
19
+ def self.max_network_retries
20
+ @max_network_retries
21
+ end
22
+
23
+ def self.max_network_retries=(val)
24
+ @max_network_retries = val.to_i
25
+ end
26
+ end
@@ -0,0 +1,76 @@
1
# Reopen the namespace so these definitions do not depend on Amazoned having
# been defined by an earlier require.
module Amazoned
  # Raised by Amazoned::Client#get_product when the page request ends in an
  # HTTP error response.
  class ProductNotFoundError < StandardError; end

  # Raised by Amazoned::Client when Amazon keeps answering with its CAPTCHA
  # page after all retries are used up.
  class BotDeniedAccessError < StandardError; end
end
3
# Fetches the Amazon product page for one ASIN and hands it to
# Amazoned::Parser. Requests that land on Amazon's CAPTCHA page are retried
# with a jittered, exponentially growing delay.
class Amazoned::Client
  # XPath matching the paragraph shown on Amazon's CAPTCHA page.
  CAPTCHA_XPATH = '//p[contains(text(), "Sorry, we just need to make sure")]'.freeze

  attr_reader :asin

  # @param asin the product identifier interpolated into the product URL
  def initialize(asin)
    @asin = asin
  end

  # Fetches the product page and returns whatever Amazoned::Parser#call
  # produces for it.
  def call
    response = get_product
    Amazoned::Parser.new(response).call
  end

  # Requests the product page, retrying while the CAPTCHA page is returned.
  #
  # @param num_retries [Integer] 1-based attempt counter; callers normally
  #   leave it at the default, the method passes it on when it retries
  # @return the page object returned by Mechanize for a successful request
  # @raise [Amazoned::BotDeniedAccessError] when the CAPTCHA page is still
  #   returned after Amazoned.max_network_retries retries
  # @raise [Amazoned::ProductNotFoundError] when Mechanize raises
  #   Mechanize::ResponseCodeError
  def get_product(num_retries = 1)
    agent = Mechanize.new.tap do |web|
      web.html_parser = HtmlParser # Avoid encoding issues: https://stackoverflow.com/a/20666246/3448554
      # Spoof every request with a random User Agent as a way to hit fewer CAPTCHA walls
      web.user_agent_alias = (Mechanize::AGENT_ALIASES.keys - ['Mechanize']).sample
    end

    begin
      # Start GET request of Amazon page using ASIN.
      response = agent.get("https://www.amazon.com/dp/#{asin}")
      return response unless request_failed(response)

      if num_retries <= Amazoned.max_network_retries
        # Only announce a retry when one is actually going to happen.
        puts "Request failed! Trying again..."
        # Recursively try again to be resilient against one-off failures
        sleep self.class.sleep_time(num_retries)
        get_product(num_retries + 1)
      else
        handle_failed_request!(response)
      end
    rescue Mechanize::ResponseCodeError
      raise Amazoned::ProductNotFoundError
    end
  end

  # @return [Boolean] true when the response is the CAPTCHA page
  def request_failed(response)
    captcha_wall?(response)
  end

  # Raise this error when we can't penetrate Amazon's CAPTCHA wall.
  #
  # @raise [Amazoned::BotDeniedAccessError] when the response is the CAPTCHA page
  def handle_failed_request!(response)
    raise Amazoned::BotDeniedAccessError if captcha_wall?(response)
  end

  # Taken from Stripe API
  # Stripe uses jitter to smooth server load; we use it to obfuscate timing detection of our scraper bot
  # https://github.com/stripe/stripe-ruby/blob/ec66c3f0f44274f885de8d13de5dce2657932121/lib/stripe/stripe_client.rb#L80
  #
  # @param num_retries [Integer] 1-based attempt counter
  # @return [Float] seconds to sleep before the next attempt
  def self.sleep_time(num_retries)
    # Exponential backoff starting at initial_network_retry_delay, capped at
    # max_network_retry_delay.
    backoff = Amazoned.initial_network_retry_delay * (2**(num_retries - 1))
    sleep_seconds = [backoff, Amazoned.max_network_retry_delay].min

    # Apply some jitter by randomizing the value in the range of
    # (sleep_seconds / 2) to (sleep_seconds).
    sleep_seconds *= (0.5 * (1 + rand))

    # But never sleep less than the base sleep seconds.
    [Amazoned.initial_network_retry_delay, sleep_seconds].max
  end

  private

  # True when the page contains the paragraph Amazon shows on its CAPTCHA page.
  def captcha_wall?(response)
    response.xpath(CAPTCHA_XPATH).any?
  end
end
70
+
71
# HTML parser plugged into Mechanize by Amazoned::Client. It transcodes the
# page body to UTF-8 before handing it to Nokogiri.
class HtmlParser
  class << self
    # Transcodes +body+ in place from +encoding+ to UTF-8, dropping invalid or
    # undefined characters, then parses it as an HTML document.
    #
    # @param body [String] raw page body; it is mutated by the transcoding
    # @param url the document URL passed through to Nokogiri
    # @param encoding the encoding the body is converted from
    # @return the document returned by Nokogiri::HTML::Document.parse
    def parse(body, url, encoding)
      transcode_options = { invalid: :replace, undef: :replace, replace: '' }
      utf8_body = body.encode!('UTF-8', encoding, **transcode_options)
      Nokogiri::HTML::Document.parse(utf8_body, url, 'UTF-8')
    end
  end
end
@@ -0,0 +1,109 @@
1
# Turns an Amazon product page into a hash of sales-rank and dimension data.
#
# The resulting hash (also readable through #product_hash) has the keys
# :rank, :category, :best_sellers_rank (an array of { rank:, ladder: } hashes)
# and :package_dimensions.
class Amazoned::Parser
  attr_accessor :product_hash
  attr_reader :html_doc, :response

  # @param response page object that responds to #body and #search
  def initialize(response)
    @product_hash = {}
    @response = response
    @html_doc = Nokogiri::HTML(response.body)
  end

  # Parses the stored response and returns the product hash.
  def call
    parse_response_for_product_details(response)
  end

  # Fills #product_hash from the page and returns it.
  #
  # @param response page object that responds to #search
  # @return [Hash] the populated product hash
  def parse_response_for_product_details(response)
    product_hash[:best_sellers_rank] = []

    ########
    # # Parent category Seller Rank Parser
    ########
    # Cuts "#903,610 in Some Category (See Top 100 ...)" down to ["903,610", "Some Category"].
    parsed_parent_category = html_doc.css('#SalesRank').text.partition("(").first.chop.partition("#").last.partition("in").map(&:strip) - ["in"]
    product_hash[:rank] = parsed_parent_category.first.delete(',').to_i # "903,610" -> 903610
    product_hash[:category] = parsed_parent_category.last

    ########
    # # Subcategory Seller Rank Parser
    ########
    extract_subcategory_rankings(html_doc.css('.zg_hrsr_item'))

    ########
    # # Package Dimension Parser
    ########
    # Strategy 1: first text node under .size-weight that mentions "inches".
    product_hash[:package_dimensions] =
      html_doc.css('.size-weight').children.map(&:text).find { |text| text.match?("inches") }

    # Strategy 2: look for the words "Package Dimensions" in the page text.
    if product_hash[:package_dimensions].blank?
      page_text = html_doc.inner_text
      str_index = page_text.index("Package Dimensions")

      unless str_index.nil?
        # Reduce the page text to a small window starting at "Package Dimensions"
        str = page_text[str_index..str_index + 150]
        product_hash[:package_dimensions] = dimensions_near_inches(str)
      end
    end

    # Strategy 3 (and a second source for the sales rank): walk the rows of
    # Amazon's product details table.
    response.search('.//*[@class="a-color-secondary a-size-base prodDetSectionEntry"]').map(&:parent).each do |row|
      # Row text looks like: "Best Sellers Rank ... #63 in Toys & Games (See Top 100 in Toys & Games) ... #3 in Toys & Games > Baby & Toddler Toys > Teethers"
      str = row.children.inner_text

      if product_hash[:best_sellers_rank].blank? && str.include?("Best Sellers Rank")
        # Gnarly string manipulation extracts the array: ["63", "in", "Toys & Games"]
        parsed_parent_category = str.partition("(").first.chop.partition("#").last.partition("in").map(&:strip)

        # From ["63", "in", "Toys & Games"] we only care about first & last parts of this array
        product_hash[:rank] = parsed_parent_category.first.delete(',').to_i
        product_hash[:category] = parsed_parent_category.last

        parsed_category = str.partition(")").last.partition("in").map(&:strip).map { |part| part.gsub("#", "") } - ["in"]

        product_hash[:best_sellers_rank] << {
          rank: parsed_category.first.delete(',').to_i,
          ladder: parsed_category.last
        }
      end

      # Only fall back to the "Product Dimensions" row when the earlier
      # strategies found nothing. The guard has to look at the key that is
      # written, otherwise an earlier result is always overwritten.
      if product_hash[:package_dimensions].blank? && str.include?("Product Dimensions")
        product_hash[:package_dimensions] = dimensions_near_inches(str)
      end
    end

    product_hash
  end

  # Appends one { rank:, ladder: } hash per node to product_hash[:best_sellers_rank].
  #
  # Below is gnarly string manipulation to parse text strings like:
  #   "\n #2\n in Baby > Baby Care > Health\n "
  # into:
  #   ["2", "Baby > Baby Care > Health"]
  #
  # @param nokogiri_html collection of nodes, each responding to #text
  def extract_subcategory_rankings(nokogiri_html)
    nokogiri_html
      .map { |node| node.text.partition("in").map(&:strip) - ["in"] }
      .map do |parts|
        parts.map do |part|
          part.gsub("#", "")       # remove '#' from '#2'
              .gsub("\u00A0", "")  # remove No-Break Space (U+00A0), which String#strip does not remove
        end
      end
      .each do |parts|
        product_hash[:best_sellers_rank] << { rank: parts.first.to_i, ladder: parts.last }
      end
  end

  private

  # Returns the stripped text from up to 20 characters before the first
  # "inches" in +str+ through 8 characters after the start of that word.
  # Returns nil when "inches" is absent. The start is clamped to 0 so that a
  # match near the beginning does not turn into a slice from the end.
  def dimensions_near_inches(str)
    inches_at = str.index("inches")
    return nil if inches_at.nil?

    window_start = [inches_at - 20, 0].max
    str[window_start..inches_at + 8].strip
  end
end
@@ -0,0 +1,3 @@
1
module Amazoned
  # Gem version string, read by the gemspec. Frozen so the constant cannot be
  # mutated in place.
  VERSION = "0.1.0".freeze
end
metadata ADDED
@@ -0,0 +1,141 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: amazoned
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - kelseydh
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2018-07-08 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.16'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.16'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '10.0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '10.0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '3.0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '3.0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: mechanize
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '2.7'
62
+ - - ">="
63
+ - !ruby/object:Gem::Version
64
+ version: 2.7.6
65
+ type: :development
66
+ prerelease: false
67
+ version_requirements: !ruby/object:Gem::Requirement
68
+ requirements:
69
+ - - "~>"
70
+ - !ruby/object:Gem::Version
71
+ version: '2.7'
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ version: 2.7.6
75
+ - !ruby/object:Gem::Dependency
76
+ name: activesupport
77
+ requirement: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - "~>"
80
+ - !ruby/object:Gem::Version
81
+ version: '5.2'
82
+ - - ">="
83
+ - !ruby/object:Gem::Version
84
+ version: 4.2.0
85
+ type: :development
86
+ prerelease: false
87
+ version_requirements: !ruby/object:Gem::Requirement
88
+ requirements:
89
+ - - "~>"
90
+ - !ruby/object:Gem::Version
91
+ version: '5.2'
92
+ - - ">="
93
+ - !ruby/object:Gem::Version
94
+ version: 4.2.0
95
+ description: This gem allows you to scrap product information from Amazon without
96
+ the need to register for Amazon's API
97
+ email:
98
+ - kelseyh@gmail.com
99
+ executables: []
100
+ extensions: []
101
+ extra_rdoc_files: []
102
+ files:
103
+ - ".gitignore"
104
+ - ".rspec"
105
+ - ".travis.yml"
106
+ - Gemfile
107
+ - LICENSE.txt
108
+ - README.md
109
+ - Rakefile
110
+ - amazoned.gemspec
111
+ - bin/console
112
+ - bin/setup
113
+ - lib/amazoned.rb
114
+ - lib/amazoned/client.rb
115
+ - lib/amazoned/parser.rb
116
+ - lib/amazoned/version.rb
117
+ homepage: http://twitter.com/kelsoh
118
+ licenses:
119
+ - MIT
120
+ metadata: {}
121
+ post_install_message:
122
+ rdoc_options: []
123
+ require_paths:
124
+ - lib
125
+ required_ruby_version: !ruby/object:Gem::Requirement
126
+ requirements:
127
+ - - ">="
128
+ - !ruby/object:Gem::Version
129
+ version: '0'
130
+ required_rubygems_version: !ruby/object:Gem::Requirement
131
+ requirements:
132
+ - - ">="
133
+ - !ruby/object:Gem::Version
134
+ version: '0'
135
+ requirements: []
136
+ rubyforge_project:
137
+ rubygems_version: 2.6.13
138
+ signing_key:
139
+ specification_version: 4
140
+ summary: A manual scraper for Amazon ASIN product data
141
+ test_files: []