amazon-search 1.4.2 → 1.4.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 906821671967d660123351f7ab6b0fb00a33fdf6
4
- data.tar.gz: 0c737cf462e32ea69d9bfbd4b5f914db3b7b6487
3
+ metadata.gz: 9e25e25215f49b4726b1e06db9901e738f866993
4
+ data.tar.gz: fffdba88068487fb177d04b1a3fa66a67a435ea1
5
5
  SHA512:
6
- metadata.gz: 03955719426cdda4f0cbf0a406dd6933ccdd4caba6d724c6c42182f531841dbfbfc96a71a252f6b27cc1db6e8b0f792d6dfafa3e822d101c0e90e02807d6fc7b
7
- data.tar.gz: acb67b2d654f4f4b8e4fbd67e2ce75ad9593d2afc07f309e045fbf83d2780efb62ddd1816fab842dafcfa25305e5274c0c3daa0979ad744b81798555e240ab0a
6
+ metadata.gz: 61d0e4ce208691c9cc62929bd3bd91dc62ca74bee9b0523a580d80b8ad874bbebb4cf9ec11de7407d6929fbc8173f0943dcfe914dfd295c783a20f30915d2374
7
+ data.tar.gz: 9bc77c89ddc0e9843e128649925538c069acf75e3f951a7628c1be12ffe5e6401d7a24a079b8c44413bcc608e6e8cefeed58210525e55e434e4da448c1e7b94f
@@ -1,12 +1,12 @@
1
- == amazon-search
1
+ # amazon-search
2
2
 
3
3
  Amazon Search is a simple Ruby tool to search for Amazon products.
4
4
 
5
5
  This tool screenscrapes an Amazon search and returns a hash of the product results. Configuration of Amazon's API is not needed.
6
6
 
7
- The functionality is centered around mechanize pagination for the screen scraping of nokogiri elements. XML and CSS selectors are currently being used. In the event that Amazon updates their site, the selectors will need to be updated.
7
+ The functionality is centered around mechanize pagination for the screen scraping of nokogiri elements. XPath and CSS selectors are currently being used. In the event that Amazon updates their site, the selectors will need to be updated.
8
8
 
9
- == DATA COLLECTED
9
+ ## DATA COLLECTED
10
10
  * title
11
11
  * price
12
12
  * stars
@@ -16,13 +16,15 @@ The functionality is centered around mechanize pagination for the screen scrapin
16
16
  * seller
17
17
 
18
18
 
19
- == INSTALLATION
19
+ ## INSTALLATION
20
20
 
21
+ ```
21
22
  $ gem install amazon-search
23
+ ```
22
24
 
25
+ ## EXAMPLE
23
26
 
24
- == EXAMPLE
25
-
27
+ ```ruby
26
28
  require 'amazon-search'
27
29
 
28
30
  # search for products by string
@@ -37,12 +39,12 @@ The functionality is centered around mechanize pagination for the screen scrapin
37
39
 
38
40
  # reference any product by the order it appeared in search results
39
41
 
42
+
40
43
  $products[0] # => references the first product found in search
41
44
  $products[30] # => references the 31st product found in search
42
45
 
43
46
 
44
- # reference any product by the order it appeared in search results
45
- # and display attributes of that product
47
+ # display attributes of a specific product
46
48
  # all available attributes are:
47
49
 
48
50
  $products[0][:title] # => the first product's title
@@ -71,10 +73,11 @@ The functionality is centered around mechanize pagination for the screen scrapin
71
73
  puts product[:title]
72
74
  puts product[:stars]
73
75
  # etc ...
74
-
76
+ end
77
+ ```
75
78
 
76
79
 
77
- == MIT LICENSE
80
+ ## MIT LICENSE
78
81
 
79
82
  Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
80
83
 
@@ -2,7 +2,7 @@
2
2
 
3
3
  Gem::Specification.new do |gem|
4
4
  gem.name = %q{amazon-search}
5
- gem.version = '1.4.2'
5
+ gem.version = '1.4.4'
6
6
  gem.date = '2015-09-19'
7
7
  gem.platform = Gem::Platform::RUBY
8
8
  gem.required_ruby_version = '>= 1.8'
@@ -14,7 +14,7 @@ Gem::Specification.new do |gem|
14
14
  gem.description = "Simple screenscraper to search Amazon and return product titles, urls, image href, etc."
15
15
  gem.authors = ["John Mason"]
16
16
  gem.email = 'mace2345@gmail.com'
17
- gem.homepage = 'https://github.com/m8ss/amazon-search'
17
+ gem.homepage = 'https://github.com/m8ss/amazon-search'
18
18
  gem.license = 'MIT'
19
19
 
20
20
  gem.add_runtime_dependency('mechanize', '~> 2.7')
@@ -1,25 +1,166 @@
1
- #!/usr/bin/env ruby
2
-
3
1
  require 'mechanize'
4
- require 'amazon-search/form'
5
- require 'amazon-search/scan'
6
- require 'amazon-search/products'
7
2
 
3
+ # actions of Amazon search
8
4
  module Amazon
9
- class << self
10
- # main method: process Amazon search
11
- def search(keywords)
12
- $keywords = keywords
5
+ class << self
6
+ attr_accessor :products, :title, :price, :stars, :reviews, :seller,
7
+ :image_url, :product_url, :product_num
13
8
 
14
- set_agent
15
- find_form
16
- submit_form
17
- scan
9
+ # main method: process Amazon search
10
+ def search(keywords)
11
+ @keywords = keywords
12
+ set_initial_values
13
+ set_agent
14
+ find_form
15
+ submit_form
16
+ scan
17
+ $products
18
+ end
18
19
 
19
- $products
20
- end
20
+ def set_initial_values
21
+ $products = {}
22
+ @product_num = 0
21
23
  end
22
- end
23
24
 
25
+ # prepares Mechanize
26
+ def set_agent
27
+ @agent = Mechanize.new { |a| a.user_agent_alias = 'Mac Safari' }
28
+ end
29
+
30
+ # finds Amazon search box
31
+ def find_form
32
+ @main_page = @agent.get('http://amazon.com')
33
+ @search_form = @main_page.form_with :name => 'site-search'
34
+ end
35
+
36
+ # submits Amazon search box
37
+ def submit_form
38
+ @search_form.field_with(:name => 'field-keywords').value = @keywords
39
+ @current_page = @agent.submit @search_form # submits form
40
+ end
41
+
42
+ # examine current_pagenum
43
+ def examine_current_pagenum
44
+ @current_pagenum =
45
+ @current_page.search '//*[contains(concat( " ", @class, " " ),
46
+ concat( " ", "pagnCur", " " ))]'
47
+
48
+ @current_pagenum = @current_pagenum.text.to_i # need integer for checks
49
+ end
50
+
51
+ # find last page number
52
+ def find_last_pagenum
53
+ @last_pagenum =
54
+ @current_page.search '//*[contains(concat( " ", @class, " " ),
55
+ concat( " ", "pagnDisabled", " " ))]'
56
+
57
+ @last_pagenum = @last_pagenum.text.to_i # need integer for checks
58
+ end
59
+
60
+ # load next page
61
+ def load_next_page
62
+ examine_current_pagenum # does this need to be here?
63
+
64
+ # find next page link
65
+ @next_page_link = @current_page.link_with :text => /Next Page/
66
+ @next_page = @next_page_link.click unless @current_pagenum == @last_pagenum
67
+ @current_page = @agent.get(@next_page.uri)
68
+ end
69
+
70
+ # cycle through search result pages and store product html
71
+ def scan
72
+ @pages = {}
73
+
74
+ find_last_pagenum
24
75
 
76
+ @last_pagenum.times do # paginate until on last page.
77
+ examine_current_pagenum
25
78
 
79
+ @current_divs = @current_page.search('//li[starts-with(@id, "result")]')
80
+ @pages[@page_num] = @current_divs # store page results
81
+
82
+ extract_product_data
83
+ load_next_page
84
+ end
85
+ puts "\n(scan complete.)"
86
+ end
87
+
88
+ # used for checking strings
89
+ def numeric?(s)
90
+ !!Float(s) rescue false
91
+ end
92
+
93
+ # puts product details to console
94
+ def display_product
95
+ STDOUT.puts '--' * 50
96
+ STDOUT.puts "title: \t\t#{@title}"
97
+ STDOUT.puts "seller: \t#{@seller}"
98
+ STDOUT.puts "price: \t\t#{@price}"
99
+ STDOUT.puts "stars: \t\t#{@stars}"
100
+ STDOUT.puts "reviews: \t#{@reviews}"
101
+ STDOUT.puts "image url: \t#{@image_href}"
102
+ STDOUT.puts "product url: \t#{@url}"
103
+ end
104
+
105
+ # extract product data
106
+ def extract_product_data
107
+ # TODO: fix this global variable...
108
+
109
+ # nokogiri syntax is needed when iterating...not mechanize!
110
+ # extract useful stuff from product html
111
+ @current_divs.each do |html|
112
+ # first select raw html
113
+ title = html.at_css('.s-access-title')
114
+ seller = html.at_css('.a-row > .a-spacing-none')
115
+ price = html.at_css('.s-price')
116
+ stars = html.at_css('.a-icon-star')
117
+ reviews = html.at_css('span+ .a-text-normal')
118
+ image_href = html.at_css('.s-access-image')
119
+ url = html.at_css('.a-row > a')
120
+
121
+ break if title.nil? == true # if it's nil it's prob an ad
122
+ break if price.nil? == true # no price? prob not worthy item
123
+ break if stars.nil? == true # no stars? not worth it
124
+
125
+ # extract text and set variables for puts
126
+ @title = title.text
127
+ @price = price.text
128
+ @stars = stars.text
129
+ @image_href = image_href['src']
130
+ @url = url['href']
131
+
132
+ # movies sometimes have text in review class
133
+ if numeric?(reviews.text)
134
+ @reviews = reviews.text
135
+ else
136
+ @reviews = 'Unknown'
137
+ end
138
+
139
+ if seller.nil? == true # sometimes seller is nil on movies, etc.
140
+ @seller = 'Unknown'
141
+ else
142
+ @seller = seller.text
143
+ end
144
+
145
+ # don't overload the server
146
+ sleep(0.05)
147
+
148
+ display_product
149
+
150
+ # store extracted text in products hash
151
+ # key is product count
152
+ $products[@product_num] = {
153
+ :title => @title,
154
+ :price => @price,
155
+ :stars => @stars,
156
+ :reviews => @reviews,
157
+ :image_href => @image_href,
158
+ :url => @url,
159
+ :seller => @seller
160
+ }
161
+
162
+ @product_num += 1 # ready for next product
163
+ end
164
+ end
165
+ end
166
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: amazon-search
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.4.2
4
+ version: 1.4.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - John Mason
@@ -31,16 +31,9 @@ executables: []
31
31
  extensions: []
32
32
  extra_rdoc_files: []
33
33
  files:
34
- - Readme.rdoc
34
+ - README.md
35
35
  - amazon-search.gemspec
36
36
  - lib/amazon-search.rb
37
- - lib/amazon-search/form.rb
38
- - lib/amazon-search/products.rb
39
- - lib/amazon-search/scan.rb
40
- - test/lib/amazon-search.rb
41
- - test/lib/amazon-search/form.rb
42
- - test/lib/amazon-search/products.rb
43
- - test/lib/amazon-search/scan.rb
44
37
  homepage: https://github.com/m8ss/amazon-search
45
38
  licenses:
46
39
  - MIT
@@ -61,12 +54,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
61
54
  version: '0'
62
55
  requirements: []
63
56
  rubyforge_project:
64
- rubygems_version: 2.4.6
57
+ rubygems_version: 2.4.5
65
58
  signing_key:
66
59
  specification_version: 4
67
60
  summary: A simple screenscraper to search Amazon
68
- test_files:
69
- - test/lib/amazon-search.rb
70
- - test/lib/amazon-search/form.rb
71
- - test/lib/amazon-search/products.rb
72
- - test/lib/amazon-search/scan.rb
61
+ test_files: []
@@ -1,26 +0,0 @@
1
- require 'mechanize'
2
- require_relative './scan'
3
- require_relative './products'
4
-
5
- module Amazon
6
- class << self
7
- # prepares Mechanize
8
- def set_agent
9
- $agent = Mechanize.new{ |a| a.user_agent_alias = "Mac Safari"}
10
- end
11
-
12
- # finds Amazon search box
13
- def find_form
14
- $main_page = $agent.get("http://amazon.com")
15
- $search_form = $main_page.form_with :name => "site-search"
16
- end
17
-
18
- # submits Amazon search box
19
- def submit_form
20
- $search_form.field_with(:name => "field-keywords").value = $keywords # sets value of search box
21
- $current_page = $agent.submit $search_form # submits form
22
-
23
- end
24
- end
25
-
26
- end
@@ -1,87 +0,0 @@
1
- require 'mechanize'
2
- require_relative './scan'
3
- require_relative './form'
4
-
5
- module Amazon
6
- class << self
7
- $products = {}
8
- $product_num = 0
9
-
10
- # used for checking strings
11
- def is_numeric?(s)
12
- !!Float(s) rescue false
13
- end
14
-
15
- # puts product details to console
16
- def display_product
17
- STDOUT.puts "--"*50
18
- STDOUT.puts "title: \t\t#{$title}"
19
- STDOUT.puts "seller: \t#{$seller}"
20
- STDOUT.puts "price: \t\t#{$price}"
21
- STDOUT.puts "stars: \t\t#{$stars}"
22
- STDOUT.puts "reviews: \t#{$reviews}"
23
- STDOUT.puts "image url: \t#{$image_href}"
24
- STDOUT.puts "product url: \t#{$url}"
25
- end
26
-
27
- # extract product data
28
- def extract_product_data
29
-
30
- # nokogiri syntax is needed when iterating...not mechanize!
31
- # extract useful stuff from product html
32
- $current_divs.each do |html|
33
- # first select raw html
34
- title = html.at_css(".s-access-title")
35
- seller = html.at_css(".a-row > .a-spacing-none")
36
- price = html.at_css(".s-price")
37
- stars = html.at_css(".a-icon-star")
38
- reviews = html.at_css("span+ .a-text-normal")
39
- image_href = html.at_css(".s-access-image")
40
- url = html.at_css(".a-row > a")
41
-
42
- break if title == nil # if it's nil it's prob an ad
43
- break if price == nil # no price? prob not worthy item
44
- break if stars == nil # no stars? not worth it
45
-
46
- # extract text and set variables for puts
47
- $title = title.text
48
- $price = price.text
49
- $stars = stars.text
50
- $image_href = image_href['src']
51
- $url = url['href']
52
-
53
- # movies sometimes have text in review class
54
- if is_numeric?(reviews.text)
55
- $reviews = reviews.text
56
- else
57
- $reviews = "Unknown"
58
- end
59
-
60
- if seller == nil # sometimes seller is nil on movies, etc.
61
- $seller = "Unknown"
62
- else
63
- $seller = seller.text
64
- end
65
-
66
- # don't overload the server
67
- sleep(0.05)
68
-
69
- display_product
70
-
71
- # store extracted text in products hash
72
- # key is product count
73
- $products[$product_num] = {
74
- title: $title,
75
- price: $price,
76
- stars: $stars,
77
- reviews: $reviews,
78
- image_href: $image_href,
79
- url: $url,
80
- seller: $seller,
81
- }
82
-
83
- $product_num +=1 # ready for next product
84
- end
85
- end
86
- end
87
- end
@@ -1,55 +0,0 @@
1
- require 'mechanize'
2
- require_relative './products'
3
- require_relative './form'
4
-
5
- module Amazon
6
- class << self
7
- # examine current_pagenum
8
- def examine_current_pagenum
9
- $current_pagenum = $current_page.search '//*[contains(concat( " ", @class, " " ), concat( " ", "pagnCur", " " ))]'
10
- $current_pagenum = $current_pagenum.text.to_i # need integer for checks
11
- end
12
-
13
- # find last page number
14
- def find_last_pagenum
15
- $last_pagenum = $current_page.search '//*[contains(concat( " ", @class, " " ), concat( " ", "pagnDisabled", " " ))]'
16
- $last_pagenum = $last_pagenum.text.to_i # need integer for checks
17
- end
18
-
19
-
20
- # load next page
21
- def load_next_page
22
-
23
- examine_current_pagenum # does this need to be here?
24
-
25
- $next_page_link = $current_page.link_with text: /Next Page/ # find next page link
26
- $next_page = $next_page_link.click unless $current_pagenum == $last_pagenum # click to next page unless on last page
27
-
28
- $current_page = $agent.get($next_page.uri)
29
-
30
- end
31
-
32
-
33
- # cycle through search result pages and store product html
34
- def scan
35
- $pages = {}
36
-
37
- find_last_pagenum
38
-
39
- $last_pagenum.times do # paginate until on last page.
40
-
41
- examine_current_pagenum
42
-
43
- $current_divs = $current_page.search('//li[starts-with(@id, "result")]')
44
- $pages[$page_num] = $current_divs # store page results
45
-
46
- extract_product_data
47
- load_next_page
48
-
49
- end
50
-
51
- puts "\n(scan complete.)"
52
- end
53
- end
54
-
55
- end
@@ -1,24 +0,0 @@
1
- #!/usr/bin/env ruby
2
-
3
- require 'mechanize'
4
- require './amazon-search/form'
5
- require './amazon-search/scan'
6
- require './amazon-search/products'
7
-
8
- module Amazon
9
- class << self
10
- def search(keywords)
11
- $keywords = keywords
12
-
13
- set_agent
14
- find_form
15
- submit_form
16
- scan
17
-
18
- $products
19
- end
20
- end
21
- end
22
-
23
-
24
-
@@ -1,26 +0,0 @@
1
- require 'mechanize'
2
- require_relative './scan'
3
- require_relative './products'
4
-
5
- module Amazon
6
- class << self
7
- # prepares Mechanize
8
- def set_agent
9
- $agent = Mechanize.new{ |a| a.user_agent_alias = "Mac Safari"}
10
- end
11
-
12
- # finds Amazon search box
13
- def find_form
14
- $main_page = $agent.get("http://amazon.com")
15
- $search_form = $main_page.form_with :name => "site-search"
16
- end
17
-
18
- # submits Amazon search box
19
- def submit_form
20
- $search_form.field_with(:name => "field-keywords").value = $keywords # sets value of search box
21
- $current_page = $agent.submit $search_form # submits form
22
-
23
- end
24
- end
25
-
26
- end
@@ -1,87 +0,0 @@
1
- require 'mechanize'
2
- require_relative './scan'
3
- require_relative './form'
4
-
5
- module Amazon
6
- class << self
7
- $products = {}
8
- $product_num = 0
9
-
10
- # used for checking strings
11
- def is_numeric?(s)
12
- !!Float(s) rescue false
13
- end
14
-
15
- # currently not being used and needs adjusting
16
- def display_product
17
- STDOUT.puts "--"*50
18
- STDOUT.puts "title: \t\t#{$title}"
19
- STDOUT.puts "seller: \t#{$seller}"
20
- STDOUT.puts "price: \t\t#{$price}"
21
- STDOUT.puts "stars: \t\t#{$stars}"
22
- STDOUT.puts "reviews: \t#{$reviews}"
23
- STDOUT.puts "image url: \t#{$image_href}"
24
- STDOUT.puts "product url: \t#{$url}"
25
- end
26
-
27
- # extract product data
28
- def extract_product_data
29
-
30
- # nokogiri syntax is needed when iterating...not mechanize!
31
- # extract useful stuff from product html
32
- $current_divs.each do |html|
33
- # first select raw html
34
- title = html.at_css(".s-access-title")
35
- seller = html.at_css(".a-row > .a-spacing-none")
36
- price = html.at_css(".s-price")
37
- stars = html.at_css(".a-icon-star")
38
- reviews = html.at_css("span+ .a-text-normal")
39
- image_href = html.at_css(".s-access-image")
40
- url = html.at_css(".a-row > a")
41
-
42
- break if title == nil # if it's nil it's prob an ad
43
- break if price == nil # no price? prob not worthy item
44
- break if stars == nil # no stars? not worth it
45
-
46
- # extract text and set variables for puts
47
- $title = title.text
48
- $price = price.text
49
- $stars = stars.text
50
- $image_href = image_href['src']
51
- $url = url['href']
52
-
53
- # movies sometimes have text in review class
54
- if is_numeric?(reviews.text)
55
- $reviews = reviews.text
56
- else
57
- $reviews = "Unknown"
58
- end
59
-
60
- if seller == nil # sometimes seller is nil on movies, etc.
61
- $seller = "Unknown"
62
- else
63
- $seller = seller.text
64
- end
65
-
66
- # don't overload the server
67
- sleep(0.05)
68
-
69
- display_product
70
-
71
- # store extracted text in products hash
72
- # key is product count
73
- $products[$product_num] = {
74
- title: $title,
75
- price: $price,
76
- stars: $stars,
77
- reviews: $reviews,
78
- image_href: $image_href,
79
- url: $url,
80
- seller: $seller,
81
- }
82
-
83
- $product_num +=1 # ready for next product
84
- end
85
- end
86
- end
87
- end
@@ -1,57 +0,0 @@
1
- require 'mechanize'
2
- require_relative './products'
3
- require_relative './form'
4
-
5
- module Amazon
6
- class << self
7
- # examine current_pagenum
8
- def examine_current_pagenum
9
- $current_pagenum = $current_page.search '//*[contains(concat( " ", @class, " " ), concat( " ", "pagnCur", " " ))]'
10
- $current_pagenum = $current_pagenum.text.to_i # need integer for checks
11
- end
12
-
13
- # find last page number
14
- def find_last_pagenum
15
- $last_pagenum = $current_page.search '//*[contains(concat( " ", @class, " " ), concat( " ", "pagnDisabled", " " ))]'
16
- $last_pagenum = $last_pagenum.text.to_i # need integer for checks
17
- end
18
-
19
-
20
- # load next page
21
- def load_next_page
22
-
23
- examine_current_pagenum # does this need to be here?
24
-
25
- $next_page_link = $current_page.link_with text: /Next Page/ # find next page link
26
- $next_page = $next_page_link.click unless $current_pagenum == $last_pagenum # click to next page unless on last page
27
-
28
- $current_page = $agent.get($next_page.uri)
29
-
30
- end
31
-
32
-
33
- # cycle through search result pages and store product html
34
- def scan
35
- $pages = {}
36
-
37
- find_last_pagenum
38
-
39
- $last_pagenum.times do # paginate until on last page.
40
-
41
- examine_current_pagenum
42
- puts "\nscanning page #{$current_pagenum} of #{$last_pagenum} @ #{$main_page.uri+$current_page.uri}"
43
-
44
- $current_divs = $current_page.search('//li[starts-with(@id, "result")]')
45
- $pages[$page_num] = $current_divs # store page results
46
-
47
- extract_product_data
48
-
49
- load_next_page
50
-
51
- end
52
-
53
- puts "\n(scan complete.)"
54
- end
55
- end
56
-
57
- end