amazon-search 1.3.0 → 1.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Readme.rdoc +61 -3
- data/amazon-search.gemspec +1 -1
- data/lib/amazon-search.rb +2 -0
- data/lib/amazon-search/products.rb +56 -28
- data/lib/amazon-search/scan.rb +1 -2
- data/test/lib/amazon-search.rb +3 -1
- data/test/lib/amazon-search/form.rb +1 -1
- data/test/lib/amazon-search/products.rb +57 -29
- data/test/lib/amazon-search/scan.rb +57 -0
- metadata +3 -3
- data/test/lib/amazon-search/pages.rb +0 -113
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1f477c66bb11ecc5baef8af2c657320cc405e330
|
4
|
+
data.tar.gz: dd2ce5d6b22c3907bbe8bdfc41ff87ce4fd5e349
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8506a8cc975113e00d2e329a084a59c62fe7650a19d4c32c4d35f310be1bed884e337e0e4b79824b856bafdaa20fcbea1569eebc699b78e7bb56843ce05b3bc4
|
7
|
+
data.tar.gz: def6ff11fb46613b14464188a5bc7de794bf36161b46b1ac92ca96cbcfeb371ae89cced41462b24c6411db405eb5ee5ae08a78d431e09c4dad892f7bf4d4a888
|
data/Readme.rdoc
CHANGED
@@ -2,19 +2,77 @@
|
|
2
2
|
|
3
3
|
Amazon Search is a simple Ruby tool to search for Amazon products.
|
4
4
|
|
5
|
-
This
|
5
|
+
This tool screenscrapes an Amazon search and returns a hash of the product results. Configuration of Amazon's API is not needed.
|
6
|
+
|
7
|
+
The functionality is centered around Mechanize pagination and the screen scraping of Nokogiri elements. XPath and CSS selectors are currently being used. In the event that Amazon updates their site, the selectors will need to be updated.
|
8
|
+
|
9
|
+
== DATA COLLECTED
|
10
|
+
* title
|
11
|
+
* price
|
12
|
+
* stars
|
13
|
+
* reviews
|
14
|
+
* image_href
|
15
|
+
* url
|
16
|
+
* seller
|
17
|
+
|
6
18
|
|
7
19
|
== INSTALLATION
|
8
20
|
|
9
21
|
$ gem install amazon-search
|
10
22
|
|
23
|
+
|
11
24
|
== EXAMPLE
|
12
25
|
|
13
26
|
require 'amazon-search'
|
14
27
|
|
15
|
-
#
|
16
|
-
|
28
|
+
# search for products by string
|
29
|
+
|
17
30
|
Amazon::search "ruby"
|
31
|
+
|
32
|
+
|
33
|
+
# search results are stored in global variable:
|
34
|
+
|
35
|
+
$products # => returns entire hash of products found in search
|
36
|
+
|
37
|
+
|
38
|
+
# reference any product by the order it appeared in search results
|
39
|
+
|
40
|
+
$products[0] # => references the first product found in search
|
41
|
+
$products[30] # => references the 31st product found in search (keys start at 0)
|
42
|
+
|
43
|
+
|
44
|
+
# reference any product by the order it appeared in search results
|
45
|
+
# and display attributes of that product
|
46
|
+
# all available attributes are:
|
47
|
+
|
48
|
+
$products[0][:title] # => the first product's title
|
49
|
+
$products[0][:price] # => etc...
|
50
|
+
$products[0][:stars]
|
51
|
+
$products[0][:reviews]
|
52
|
+
$products[0][:image_href]
|
53
|
+
$products[0][:url]
|
54
|
+
$products[0][:seller]
|
55
|
+
|
56
|
+
|
57
|
+
# Save search results in order to execute another search
|
58
|
+
### method 1)
|
59
|
+
|
60
|
+
example_search = Amazon::search "ruby"
|
61
|
+
|
62
|
+
### method 2)
|
63
|
+
|
64
|
+
example_search = $products # => only works after search has been done
|
65
|
+
|
66
|
+
|
67
|
+
# Iterate over all search results and return specific attributes
|
68
|
+
|
69
|
+
$products.each do |x|
|
70
|
+
product = x[1] # => each yields [key, value] pairs; x[1] is the product's attribute hash
|
71
|
+
puts product[:title]
|
72
|
+
puts product[:stars]
|
73
|
+
# etc ...
|
74
|
+
|
75
|
+
|
18
76
|
|
19
77
|
== MIT LICENSE
|
20
78
|
|
data/amazon-search.gemspec
CHANGED
data/lib/amazon-search.rb
CHANGED
@@ -4,10 +4,33 @@ require_relative './form'
|
|
4
4
|
|
5
5
|
module Amazon
|
6
6
|
class << self
|
7
|
+
$products = {}
|
8
|
+
$product_num = 0
|
9
|
+
|
10
|
+
# used for checking strings
|
11
|
+
def is_numeric?(s)
|
12
|
+
!!Float(s) rescue false
|
13
|
+
end
|
14
|
+
|
15
|
+
# prints the globals ($title, $seller, $price, ...) set by extract_product_data for the current product
|
16
|
+
def display_product
|
17
|
+
STDOUT.puts "--"*50
|
18
|
+
STDOUT.puts "title: \t\t#{$title}"
|
19
|
+
STDOUT.puts "seller: \t#{$seller}"
|
20
|
+
STDOUT.puts "price: \t\t#{$price}"
|
21
|
+
STDOUT.puts "stars: \t\t#{$stars}"
|
22
|
+
STDOUT.puts "reviews: \t#{$reviews}"
|
23
|
+
STDOUT.puts "image url: \t#{$image_href}"
|
24
|
+
STDOUT.puts "product url: \t#{$url}"
|
25
|
+
end
|
26
|
+
|
7
27
|
# extract product data
|
8
28
|
def extract_product_data
|
29
|
+
|
9
30
|
# nokogiri syntax is needed when iterating...not mechanize!
|
10
|
-
|
31
|
+
# extract useful stuff from product html
|
32
|
+
$current_divs.each do |html|
|
33
|
+
# first select raw html
|
11
34
|
title = html.at_css(".s-access-title")
|
12
35
|
seller = html.at_css(".a-row > .a-spacing-none")
|
13
36
|
price = html.at_css(".s-price")
|
@@ -19,41 +42,46 @@ module Amazon
|
|
19
42
|
break if title == nil # if it's nil it's prob an ad
|
20
43
|
break if price == nil # no price? prob not worthy item
|
21
44
|
break if stars == nil # no stars? not worth it
|
45
|
+
|
46
|
+
# extract text and set variables for puts
|
47
|
+
$title = title.text
|
48
|
+
$price = price.text
|
49
|
+
$stars = stars.text
|
50
|
+
$image_href = image_href['src']
|
51
|
+
$url = url['href']
|
52
|
+
|
53
|
+
# movies sometimes have text in review class
|
54
|
+
if is_numeric?(reviews.text)
|
55
|
+
$reviews = reviews.text
|
56
|
+
else
|
57
|
+
$reviews = "Unknown"
|
58
|
+
end
|
22
59
|
|
23
60
|
if seller == nil # sometimes seller is nil on movies, etc.
|
24
|
-
seller = "Unknown"
|
61
|
+
$seller = "Unknown"
|
25
62
|
else
|
26
|
-
seller = seller.text
|
63
|
+
$seller = seller.text
|
27
64
|
end
|
28
|
-
|
29
|
-
# extract text and set variables for puts
|
30
|
-
title = title.text
|
31
|
-
price = price.text
|
32
|
-
stars = stars.text
|
33
|
-
reviews = reviews.text
|
34
|
-
image_href = image_href['src']
|
35
|
-
url = url['href']
|
36
65
|
|
37
|
-
|
66
|
+
# don't overload the server
|
67
|
+
sleep(0.05)
|
38
68
|
|
39
|
-
|
40
|
-
end
|
69
|
+
display_product
|
41
70
|
|
71
|
+
# store extracted text in products hash
|
72
|
+
# key is product count
|
73
|
+
$products[$product_num] = {
|
74
|
+
title: $title,
|
75
|
+
price: $price,
|
76
|
+
stars: $stars,
|
77
|
+
reviews: $reviews,
|
78
|
+
image_href: $image_href,
|
79
|
+
url: $url,
|
80
|
+
seller: $seller,
|
81
|
+
}
|
42
82
|
|
43
|
-
|
44
|
-
|
45
|
-
STDOUT.puts "--"*50
|
46
|
-
STDOUT.puts "title: \t\t#{title}"
|
47
|
-
STDOUT.puts "seller: \t#{seller}"
|
48
|
-
STDOUT.puts "price: \t\t#{price}"
|
49
|
-
STDOUT.puts "stars: \t\t#{stars}"
|
50
|
-
STDOUT.puts "reviews: \t#{reviews}"
|
51
|
-
STDOUT.puts "image url: \t#{image}"
|
52
|
-
STDOUT.puts "product url: \t#{url}"
|
83
|
+
$product_num +=1 # ready for next product
|
84
|
+
end
|
53
85
|
end
|
54
|
-
|
55
|
-
|
56
|
-
|
57
86
|
end
|
58
|
-
|
59
87
|
end
|
data/lib/amazon-search/scan.rb
CHANGED
@@ -39,17 +39,16 @@ module Amazon
|
|
39
39
|
$last_pagenum.times do # paginate until on last page.
|
40
40
|
|
41
41
|
examine_current_pagenum
|
42
|
-
puts "\nscanning page #{$current_pagenum} of #{$last_pagenum}..."
|
43
42
|
|
44
43
|
$current_divs = $current_page.search('//li[starts-with(@id, "result")]')
|
45
44
|
$pages[$page_num] = $current_divs # store page results
|
46
45
|
|
46
|
+
extract_product_data
|
47
47
|
load_next_page
|
48
48
|
|
49
49
|
end
|
50
50
|
|
51
51
|
puts "\n(scan complete.)"
|
52
|
-
$pages
|
53
52
|
end
|
54
53
|
end
|
55
54
|
|
data/test/lib/amazon-search.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
require 'mechanize'
|
4
4
|
require './amazon-search/form'
|
5
|
-
require './amazon-search/
|
5
|
+
require './amazon-search/scan'
|
6
6
|
require './amazon-search/products'
|
7
7
|
|
8
8
|
module Amazon
|
@@ -14,6 +14,8 @@ module Amazon
|
|
14
14
|
find_form
|
15
15
|
submit_form
|
16
16
|
scan
|
17
|
+
|
18
|
+
$products
|
17
19
|
end
|
18
20
|
end
|
19
21
|
end
|
@@ -1,13 +1,36 @@
|
|
1
1
|
require 'mechanize'
|
2
|
-
require_relative './
|
2
|
+
require_relative './scan'
|
3
3
|
require_relative './form'
|
4
4
|
|
5
5
|
module Amazon
|
6
6
|
class << self
|
7
|
+
$products = {}
|
8
|
+
$product_num = 0
|
9
|
+
|
10
|
+
# used for checking strings
|
11
|
+
def is_numeric?(s)
|
12
|
+
!!Float(s) rescue false
|
13
|
+
end
|
14
|
+
|
15
|
+
# prints the globals ($title, $seller, $price, ...) set by extract_product_data for the current product
|
16
|
+
def display_product
|
17
|
+
STDOUT.puts "--"*50
|
18
|
+
STDOUT.puts "title: \t\t#{$title}"
|
19
|
+
STDOUT.puts "seller: \t#{$seller}"
|
20
|
+
STDOUT.puts "price: \t\t#{$price}"
|
21
|
+
STDOUT.puts "stars: \t\t#{$stars}"
|
22
|
+
STDOUT.puts "reviews: \t#{$reviews}"
|
23
|
+
STDOUT.puts "image url: \t#{$image_href}"
|
24
|
+
STDOUT.puts "product url: \t#{$url}"
|
25
|
+
end
|
26
|
+
|
7
27
|
# extract product data
|
8
28
|
def extract_product_data
|
29
|
+
|
9
30
|
# nokogiri syntax is needed when iterating...not mechanize!
|
10
|
-
|
31
|
+
# extract useful stuff from product html
|
32
|
+
$current_divs.each do |html|
|
33
|
+
# first select raw html
|
11
34
|
title = html.at_css(".s-access-title")
|
12
35
|
seller = html.at_css(".a-row > .a-spacing-none")
|
13
36
|
price = html.at_css(".s-price")
|
@@ -19,41 +42,46 @@ module Amazon
|
|
19
42
|
break if title == nil # if it's nil it's prob an ad
|
20
43
|
break if price == nil # no price? prob not worthy item
|
21
44
|
break if stars == nil # no stars? not worth it
|
45
|
+
|
46
|
+
# extract text and set variables for puts
|
47
|
+
$title = title.text
|
48
|
+
$price = price.text
|
49
|
+
$stars = stars.text
|
50
|
+
$image_href = image_href['src']
|
51
|
+
$url = url['href']
|
52
|
+
|
53
|
+
# movies sometimes have text in review class
|
54
|
+
if is_numeric?(reviews.text)
|
55
|
+
$reviews = reviews.text
|
56
|
+
else
|
57
|
+
$reviews = "Unknown"
|
58
|
+
end
|
22
59
|
|
23
60
|
if seller == nil # sometimes seller is nil on movies, etc.
|
24
|
-
seller = "Unknown"
|
61
|
+
$seller = "Unknown"
|
25
62
|
else
|
26
|
-
seller = seller.text
|
63
|
+
$seller = seller.text
|
27
64
|
end
|
28
|
-
|
29
|
-
# extract text and set variables for puts
|
30
|
-
title = title.text
|
31
|
-
price = price.text
|
32
|
-
stars = stars.text
|
33
|
-
reviews = reviews.text
|
34
|
-
image_href = image_href['src']
|
35
|
-
url = url['href']
|
36
65
|
|
37
|
-
|
66
|
+
# don't overload the server
|
67
|
+
sleep(0.05)
|
38
68
|
|
39
|
-
|
40
|
-
end
|
69
|
+
display_product
|
41
70
|
|
71
|
+
# store extracted text in products hash
|
72
|
+
# key is product count
|
73
|
+
$products[$product_num] = {
|
74
|
+
title: $title,
|
75
|
+
price: $price,
|
76
|
+
stars: $stars,
|
77
|
+
reviews: $reviews,
|
78
|
+
image_href: $image_href,
|
79
|
+
url: $url,
|
80
|
+
seller: $seller,
|
81
|
+
}
|
42
82
|
|
43
|
-
|
44
|
-
|
45
|
-
STDOUT.puts "--"*50
|
46
|
-
STDOUT.puts "title: \t\t#{title}"
|
47
|
-
STDOUT.puts "seller: \t#{seller}"
|
48
|
-
STDOUT.puts "price: \t\t#{price}"
|
49
|
-
STDOUT.puts "stars: \t\t#{stars}"
|
50
|
-
STDOUT.puts "reviews: \t#{reviews}"
|
51
|
-
STDOUT.puts "image url: \t#{image}"
|
52
|
-
STDOUT.puts "product url: \t#{url}"
|
83
|
+
$product_num +=1 # ready for next product
|
84
|
+
end
|
53
85
|
end
|
54
|
-
|
55
|
-
|
56
|
-
|
57
86
|
end
|
58
|
-
|
59
87
|
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
require 'mechanize'
|
2
|
+
require_relative './products'
|
3
|
+
require_relative './form'
|
4
|
+
|
5
|
+
module Amazon
|
6
|
+
class << self
|
7
|
+
# examine current_pagenum
|
8
|
+
def examine_current_pagenum
|
9
|
+
$current_pagenum = $current_page.search '//*[contains(concat( " ", @class, " " ), concat( " ", "pagnCur", " " ))]'
|
10
|
+
$current_pagenum = $current_pagenum.text.to_i # need integer for checks
|
11
|
+
end
|
12
|
+
|
13
|
+
# find last page number
|
14
|
+
def find_last_pagenum
|
15
|
+
$last_pagenum = $current_page.search '//*[contains(concat( " ", @class, " " ), concat( " ", "pagnDisabled", " " ))]'
|
16
|
+
$last_pagenum = $last_pagenum.text.to_i # need integer for checks
|
17
|
+
end
|
18
|
+
|
19
|
+
|
20
|
+
# load next page
|
21
|
+
def load_next_page
|
22
|
+
|
23
|
+
examine_current_pagenum # does this need to be here?
|
24
|
+
|
25
|
+
$next_page_link = $current_page.link_with text: /Next Page/ # find next page link
|
26
|
+
$next_page = $next_page_link.click unless $current_pagenum == $last_pagenum # click to next page unless on last page
|
27
|
+
|
28
|
+
$current_page = $agent.get($next_page.uri)
|
29
|
+
|
30
|
+
end
|
31
|
+
|
32
|
+
|
33
|
+
# cycle through search result pages and store product html
|
34
|
+
def scan
|
35
|
+
$pages = {}
|
36
|
+
|
37
|
+
find_last_pagenum
|
38
|
+
|
39
|
+
$last_pagenum.times do # paginate until on last page.
|
40
|
+
|
41
|
+
examine_current_pagenum
|
42
|
+
puts "\nscanning page #{$current_pagenum} of #{$last_pagenum} @ #{$main_page.uri+$current_page.uri}"
|
43
|
+
|
44
|
+
$current_divs = $current_page.search('//li[starts-with(@id, "result")]')
|
45
|
+
$pages[$page_num] = $current_divs # store page results
|
46
|
+
|
47
|
+
extract_product_data
|
48
|
+
|
49
|
+
load_next_page
|
50
|
+
|
51
|
+
end
|
52
|
+
|
53
|
+
puts "\n(scan complete.)"
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: amazon-search
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- John Mason
|
@@ -39,8 +39,8 @@ files:
|
|
39
39
|
- lib/amazon-search/scan.rb
|
40
40
|
- test/lib/amazon-search.rb
|
41
41
|
- test/lib/amazon-search/form.rb
|
42
|
-
- test/lib/amazon-search/pages.rb
|
43
42
|
- test/lib/amazon-search/products.rb
|
43
|
+
- test/lib/amazon-search/scan.rb
|
44
44
|
homepage: https://github.com/m8ss/amazon-search
|
45
45
|
licenses:
|
46
46
|
- MIT
|
@@ -68,5 +68,5 @@ summary: A simple screenscraper to search Amazon
|
|
68
68
|
test_files:
|
69
69
|
- test/lib/amazon-search.rb
|
70
70
|
- test/lib/amazon-search/form.rb
|
71
|
-
- test/lib/amazon-search/pages.rb
|
72
71
|
- test/lib/amazon-search/products.rb
|
72
|
+
- test/lib/amazon-search/scan.rb
|
@@ -1,113 +0,0 @@
|
|
1
|
-
require 'mechanize'
|
2
|
-
require_relative './products'
|
3
|
-
require_relative './form'
|
4
|
-
|
5
|
-
module Amazon
|
6
|
-
class << self
|
7
|
-
# examine current_pagenum
|
8
|
-
def examine_current_pagenum
|
9
|
-
$current_pagenum = $current_page.search '//*[contains(concat( " ", @class, " " ), concat( " ", "pagnCur", " " ))]'
|
10
|
-
$current_pagenum = $current_pagenum.text.to_i # need integer for checks
|
11
|
-
end
|
12
|
-
|
13
|
-
# find last page number
|
14
|
-
def find_last_pagenum
|
15
|
-
$last_pagenum = $current_page.search '//*[contains(concat( " ", @class, " " ), concat( " ", "pagnDisabled", " " ))]'
|
16
|
-
$last_pagenum = $last_pagenum.text.to_i # need integer for checks
|
17
|
-
end
|
18
|
-
|
19
|
-
|
20
|
-
# load next page
|
21
|
-
def load_next_page
|
22
|
-
puts "***started load_next_page method***"
|
23
|
-
puts "ready to examine the page number?"
|
24
|
-
gets
|
25
|
-
|
26
|
-
examine_current_pagenum
|
27
|
-
|
28
|
-
puts "page number is..."
|
29
|
-
puts $current_pagenum
|
30
|
-
puts "continue?"
|
31
|
-
gets
|
32
|
-
|
33
|
-
$next_page_link = $current_page.link_with text: /Next Page/ # find next page link
|
34
|
-
|
35
|
-
puts "found next page..."
|
36
|
-
puts "this is link:"
|
37
|
-
puts $main_page.uri+$next_page_link.uri
|
38
|
-
puts "continue?"
|
39
|
-
gets
|
40
|
-
|
41
|
-
$next_page = $next_page_link.click unless $current_pagenum == $last_pagenum # click to next page unless on last page
|
42
|
-
|
43
|
-
puts "next step is to load the next page..."
|
44
|
-
puts "page will load to:"
|
45
|
-
puts $agent.get($next_page.uri).uri
|
46
|
-
puts "continue?"
|
47
|
-
gets
|
48
|
-
|
49
|
-
$current_page = $agent.get($next_page.uri)
|
50
|
-
examine_current_pagenum
|
51
|
-
|
52
|
-
puts "====current_page has changed===="
|
53
|
-
puts "this is uri:"
|
54
|
-
puts $current_page.uri
|
55
|
-
puts "this is page_num"
|
56
|
-
puts $current_pagenum
|
57
|
-
|
58
|
-
puts "\ncontinue and exit loading method?"
|
59
|
-
gets
|
60
|
-
|
61
|
-
puts "***ending load_next_page method***"
|
62
|
-
end
|
63
|
-
|
64
|
-
|
65
|
-
# cycle through search result pages and store product html
|
66
|
-
def scan
|
67
|
-
puts "***started scan method***"
|
68
|
-
$pages = {}
|
69
|
-
|
70
|
-
find_last_pagenum
|
71
|
-
|
72
|
-
$last_pagenum.times do # paginate until on last page.
|
73
|
-
puts "***started pagination block***"
|
74
|
-
|
75
|
-
puts "Enter 'html' if you want to puts pages array, other press RETURN to continue"
|
76
|
-
answer = gets.chomp
|
77
|
-
|
78
|
-
if answer == "html"
|
79
|
-
if $pages.empty?
|
80
|
-
puts "pages array is empty"
|
81
|
-
else
|
82
|
-
$pages.each {|x| puts x}
|
83
|
-
end
|
84
|
-
end
|
85
|
-
|
86
|
-
examine_current_pagenum
|
87
|
-
|
88
|
-
$current_divs = $current_page.search('//li[starts-with(@id, "result")]')
|
89
|
-
$pages[$page_num] = $current_divs # store page results
|
90
|
-
|
91
|
-
|
92
|
-
puts "--"*50
|
93
|
-
puts "\nlast page number is #{$last_pagenum}"
|
94
|
-
puts "we're on #{$current_pagenum}"
|
95
|
-
puts "this is current hyperlink:"
|
96
|
-
puts $current_page.uri
|
97
|
-
|
98
|
-
|
99
|
-
puts "ready to go to #{$current_pagenum+1}?"
|
100
|
-
gets
|
101
|
-
|
102
|
-
load_next_page
|
103
|
-
puts "scanning is ready to restart loop."
|
104
|
-
puts "continue?"
|
105
|
-
gets
|
106
|
-
puts "***ending pagination block***"
|
107
|
-
end
|
108
|
-
|
109
|
-
puts "***ending scan method***"
|
110
|
-
end
|
111
|
-
end
|
112
|
-
|
113
|
-
end
|