amazon-search 1.4.2 → 1.4.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/{Readme.rdoc → README.md} +13 -10
- data/amazon-search.gemspec +2 -2
- data/lib/amazon-search.rb +157 -16
- metadata +4 -15
- data/lib/amazon-search/form.rb +0 -26
- data/lib/amazon-search/products.rb +0 -87
- data/lib/amazon-search/scan.rb +0 -55
- data/test/lib/amazon-search.rb +0 -24
- data/test/lib/amazon-search/form.rb +0 -26
- data/test/lib/amazon-search/products.rb +0 -87
- data/test/lib/amazon-search/scan.rb +0 -57
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9e25e25215f49b4726b1e06db9901e738f866993
|
4
|
+
data.tar.gz: fffdba88068487fb177d04b1a3fa66a67a435ea1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 61d0e4ce208691c9cc62929bd3bd91dc62ca74bee9b0523a580d80b8ad874bbebb4cf9ec11de7407d6929fbc8173f0943dcfe914dfd295c783a20f30915d2374
|
7
|
+
data.tar.gz: 9bc77c89ddc0e9843e128649925538c069acf75e3f951a7628c1be12ffe5e6401d7a24a079b8c44413bcc608e6e8cefeed58210525e55e434e4da448c1e7b94f
|
data/{Readme.rdoc → README.md}
RENAMED
@@ -1,12 +1,12 @@
|
|
1
|
-
|
1
|
+
# amazon-search
|
2
2
|
|
3
3
|
Amazon Search is a simple Ruby tool to search for Amazon products.
|
4
4
|
|
5
5
|
This tool screenscrapes an Amazon search and returns a hash of the product results. Configuration of Amazon's API is not needed.
|
6
6
|
|
7
|
-
The functionality is centered around mechanize pagination for the screen scraping of nokogiri elements.
|
7
|
+
The functionality is centered around mechanize pagination for the screen scraping of nokogiri elements. XPath and CSS selectors are currently being used. In the event that Amazon updates their site, the selectors will need to be updated.
|
8
8
|
|
9
|
-
|
9
|
+
## DATA COLLECTED
|
10
10
|
* title
|
11
11
|
* price
|
12
12
|
* stars
|
@@ -16,13 +16,15 @@ The functionality is centered around mechanize pagination for the screen scrapin
|
|
16
16
|
* seller
|
17
17
|
|
18
18
|
|
19
|
-
|
19
|
+
## INSTALLATION
|
20
20
|
|
21
|
+
```
|
21
22
|
$ gem install amazon-search
|
23
|
+
```
|
22
24
|
|
25
|
+
## EXAMPLE
|
23
26
|
|
24
|
-
|
25
|
-
|
27
|
+
```ruby
|
26
28
|
require 'amazon-search'
|
27
29
|
|
28
30
|
# search for products by string
|
@@ -37,12 +39,12 @@ The functionality is centered around mechanize pagination for the screen scrapin
|
|
37
39
|
|
38
40
|
# reference any product by the order it appeared in search results
|
39
41
|
|
42
|
+
|
40
43
|
$products[0] # => references the first product found in search
|
41
44
|
$products[30] # => references the 31st product found in search
|
42
45
|
|
43
46
|
|
44
|
-
#
|
45
|
-
# and display attributes of that product
|
47
|
+
# display attributes of a specific product
|
46
48
|
# all available attributes are:
|
47
49
|
|
48
50
|
$products[0][:title] # => the first product's title
|
@@ -71,10 +73,11 @@ The functionality is centered around mechanize pagination for the screen scrapin
|
|
71
73
|
puts product[:title]
|
72
74
|
puts product[:stars]
|
73
75
|
# etc ...
|
74
|
-
|
76
|
+
end
|
77
|
+
```
|
75
78
|
|
76
79
|
|
77
|
-
|
80
|
+
## MIT LICENSE
|
78
81
|
|
79
82
|
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
80
83
|
|
data/amazon-search.gemspec
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
Gem::Specification.new do |gem|
|
4
4
|
gem.name = %q{amazon-search}
|
5
|
-
gem.version = '1.4.2'
|
5
|
+
gem.version = '1.4.4'
|
6
6
|
gem.date = '2015-09-19'
|
7
7
|
gem.platform = Gem::Platform::RUBY
|
8
8
|
gem.required_ruby_version = '>= 1.8'
|
@@ -14,7 +14,7 @@ Gem::Specification.new do |gem|
|
|
14
14
|
gem.description = "Simple screenscraper to search Amazon and return product titles, urls, image href, etc."
|
15
15
|
gem.authors = ["John Mason"]
|
16
16
|
gem.email = 'mace2345@gmail.com'
|
17
|
-
gem.homepage = 'https://github.com/m8ss/amazon-search'
|
17
|
+
gem.homepage = 'https://github.com/m8ss/amazon-search'
|
18
18
|
gem.license = 'MIT'
|
19
19
|
|
20
20
|
gem.add_runtime_dependency('mechanize', '~> 2.7')
|
data/lib/amazon-search.rb
CHANGED
@@ -1,25 +1,166 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
1
|
require 'mechanize'
|
4
|
-
require 'amazon-search/form'
|
5
|
-
require 'amazon-search/scan'
|
6
|
-
require 'amazon-search/products'
|
7
2
|
|
3
|
+
# actions of Amazon search
|
8
4
|
module Amazon
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
$keywords = keywords
|
5
|
+
class << self
|
6
|
+
attr_accessor :products, :title, :price, :stars, :reviews, :seller,
|
7
|
+
:image_url, :product_url, :product_num
|
13
8
|
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
9
|
+
# main method: process Amazon search
|
10
|
+
def search(keywords)
|
11
|
+
@keywords = keywords
|
12
|
+
set_initial_values
|
13
|
+
set_agent
|
14
|
+
find_form
|
15
|
+
submit_form
|
16
|
+
scan
|
17
|
+
$products
|
18
|
+
end
|
18
19
|
|
19
|
-
|
20
|
-
|
20
|
+
def set_initial_values
|
21
|
+
$products = {}
|
22
|
+
@product_num = 0
|
21
23
|
end
|
22
|
-
end
|
23
24
|
|
25
|
+
# prepares Mechanize
|
26
|
+
def set_agent
|
27
|
+
@agent = Mechanize.new { |a| a.user_agent_alias = 'Mac Safari' }
|
28
|
+
end
|
29
|
+
|
30
|
+
# finds Amazon search box
|
31
|
+
def find_form
|
32
|
+
@main_page = @agent.get('http://amazon.com')
|
33
|
+
@search_form = @main_page.form_with :name => 'site-search'
|
34
|
+
end
|
35
|
+
|
36
|
+
# submits Amazon search box
|
37
|
+
def submit_form
|
38
|
+
@search_form.field_with(:name => 'field-keywords').value = @keywords
|
39
|
+
@current_page = @agent.submit @search_form # submits form
|
40
|
+
end
|
41
|
+
|
42
|
+
# examine current_pagenum
|
43
|
+
def examine_current_pagenum
|
44
|
+
@current_pagenum =
|
45
|
+
@current_page.search '//*[contains(concat( " ", @class, " " ),
|
46
|
+
concat( " ", "pagnCur", " " ))]'
|
47
|
+
|
48
|
+
@current_pagenum = @current_pagenum.text.to_i # need integer for checks
|
49
|
+
end
|
50
|
+
|
51
|
+
# find last page number
|
52
|
+
def find_last_pagenum
|
53
|
+
@last_pagenum =
|
54
|
+
@current_page.search '//*[contains(concat( " ", @class, " " ),
|
55
|
+
concat( " ", "pagnDisabled", " " ))]'
|
56
|
+
|
57
|
+
@last_pagenum = @last_pagenum.text.to_i # need integer for checks
|
58
|
+
end
|
59
|
+
|
60
|
+
# load next page
|
61
|
+
def load_next_page
|
62
|
+
examine_current_pagenum # does this need to be here?
|
63
|
+
|
64
|
+
# find next page link
|
65
|
+
@next_page_link = @current_page.link_with :text => /Next Page/
|
66
|
+
@next_page = @next_page_link.click unless @current_pagenum == @last_pagenum
|
67
|
+
@current_page = @agent.get(@next_page.uri)
|
68
|
+
end
|
69
|
+
|
70
|
+
# cycle through search result pages and store product html
|
71
|
+
def scan
|
72
|
+
@pages = {}
|
73
|
+
|
74
|
+
find_last_pagenum
|
24
75
|
|
76
|
+
@last_pagenum.times do # paginate until on last page.
|
77
|
+
examine_current_pagenum
|
25
78
|
|
79
|
+
@current_divs = @current_page.search('//li[starts-with(@id, "result")]')
|
80
|
+
@pages[@page_num] = @current_divs # store page results
|
81
|
+
|
82
|
+
extract_product_data
|
83
|
+
load_next_page
|
84
|
+
end
|
85
|
+
puts "\n(scan complete.)"
|
86
|
+
end
|
87
|
+
|
88
|
+
# used for checking strings
|
89
|
+
def numeric?(s)
|
90
|
+
!!Float(s) rescue false
|
91
|
+
end
|
92
|
+
|
93
|
+
# puts product details to console
|
94
|
+
def display_product
|
95
|
+
STDOUT.puts '--' * 50
|
96
|
+
STDOUT.puts "title: \t\t#{@title}"
|
97
|
+
STDOUT.puts "seller: \t#{@seller}"
|
98
|
+
STDOUT.puts "price: \t\t#{@price}"
|
99
|
+
STDOUT.puts "stars: \t\t#{@stars}"
|
100
|
+
STDOUT.puts "reviews: \t#{@reviews}"
|
101
|
+
STDOUT.puts "image url: \t#{@image_href}"
|
102
|
+
STDOUT.puts "product url: \t#{@url}"
|
103
|
+
end
|
104
|
+
|
105
|
+
# extract product data
|
106
|
+
def extract_product_data
|
107
|
+
# TODO: fix this global variable...
|
108
|
+
|
109
|
+
# nokogiri syntax is needed when iterating...not mechanize!
|
110
|
+
# extract useful stuff from product html
|
111
|
+
@current_divs.each do |html|
|
112
|
+
# first select raw html
|
113
|
+
title = html.at_css('.s-access-title')
|
114
|
+
seller = html.at_css('.a-row > .a-spacing-none')
|
115
|
+
price = html.at_css('.s-price')
|
116
|
+
stars = html.at_css('.a-icon-star')
|
117
|
+
reviews = html.at_css('span+ .a-text-normal')
|
118
|
+
image_href = html.at_css('.s-access-image')
|
119
|
+
url = html.at_css('.a-row > a')
|
120
|
+
|
121
|
+
break if title.nil? == true # if it's nil it's prob an ad
|
122
|
+
break if price.nil? == true # no price? prob not worthy item
|
123
|
+
break if stars.nil? == true # no stars? not worth it
|
124
|
+
|
125
|
+
# extract text and set variables for puts
|
126
|
+
@title = title.text
|
127
|
+
@price = price.text
|
128
|
+
@stars = stars.text
|
129
|
+
@image_href = image_href['src']
|
130
|
+
@url = url['href']
|
131
|
+
|
132
|
+
# movies sometimes have text in review class
|
133
|
+
if numeric?(reviews.text)
|
134
|
+
@reviews = reviews.text
|
135
|
+
else
|
136
|
+
@reviews = 'Unknown'
|
137
|
+
end
|
138
|
+
|
139
|
+
if seller.nil? == true # sometimes seller is nil on movies, etc.
|
140
|
+
@seller = 'Unknown'
|
141
|
+
else
|
142
|
+
@seller = seller.text
|
143
|
+
end
|
144
|
+
|
145
|
+
# don't overload the server
|
146
|
+
sleep(0.05)
|
147
|
+
|
148
|
+
display_product
|
149
|
+
|
150
|
+
# store extracted text in products hash
|
151
|
+
# key is product count
|
152
|
+
$products[@product_num] = {
|
153
|
+
:title => @title,
|
154
|
+
:price => @price,
|
155
|
+
:stars => @stars,
|
156
|
+
:reviews => @reviews,
|
157
|
+
:image_href => @image_href,
|
158
|
+
:url => @url,
|
159
|
+
:seller => @seller
|
160
|
+
}
|
161
|
+
|
162
|
+
@product_num += 1 # ready for next product
|
163
|
+
end
|
164
|
+
end
|
165
|
+
end
|
166
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: amazon-search
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.4.2
|
4
|
+
version: 1.4.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- John Mason
|
@@ -31,16 +31,9 @@ executables: []
|
|
31
31
|
extensions: []
|
32
32
|
extra_rdoc_files: []
|
33
33
|
files:
|
34
|
-
-
|
34
|
+
- README.md
|
35
35
|
- amazon-search.gemspec
|
36
36
|
- lib/amazon-search.rb
|
37
|
-
- lib/amazon-search/form.rb
|
38
|
-
- lib/amazon-search/products.rb
|
39
|
-
- lib/amazon-search/scan.rb
|
40
|
-
- test/lib/amazon-search.rb
|
41
|
-
- test/lib/amazon-search/form.rb
|
42
|
-
- test/lib/amazon-search/products.rb
|
43
|
-
- test/lib/amazon-search/scan.rb
|
44
37
|
homepage: https://github.com/m8ss/amazon-search
|
45
38
|
licenses:
|
46
39
|
- MIT
|
@@ -61,12 +54,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
61
54
|
version: '0'
|
62
55
|
requirements: []
|
63
56
|
rubyforge_project:
|
64
|
-
rubygems_version: 2.4.
|
57
|
+
rubygems_version: 2.4.5
|
65
58
|
signing_key:
|
66
59
|
specification_version: 4
|
67
60
|
summary: A simple screenscraper to search Amazon
|
68
|
-
test_files:
|
69
|
-
- test/lib/amazon-search.rb
|
70
|
-
- test/lib/amazon-search/form.rb
|
71
|
-
- test/lib/amazon-search/products.rb
|
72
|
-
- test/lib/amazon-search/scan.rb
|
61
|
+
test_files: []
|
data/lib/amazon-search/form.rb
DELETED
@@ -1,26 +0,0 @@
|
|
1
|
-
require 'mechanize'
|
2
|
-
require_relative './scan'
|
3
|
-
require_relative './products'
|
4
|
-
|
5
|
-
module Amazon
|
6
|
-
class << self
|
7
|
-
# prepares Mechanize
|
8
|
-
def set_agent
|
9
|
-
$agent = Mechanize.new{ |a| a.user_agent_alias = "Mac Safari"}
|
10
|
-
end
|
11
|
-
|
12
|
-
# finds Amazon search box
|
13
|
-
def find_form
|
14
|
-
$main_page = $agent.get("http://amazon.com")
|
15
|
-
$search_form = $main_page.form_with :name => "site-search"
|
16
|
-
end
|
17
|
-
|
18
|
-
# submits Amazon search box
|
19
|
-
def submit_form
|
20
|
-
$search_form.field_with(:name => "field-keywords").value = $keywords # sets value of search box
|
21
|
-
$current_page = $agent.submit $search_form # submits form
|
22
|
-
|
23
|
-
end
|
24
|
-
end
|
25
|
-
|
26
|
-
end
|
@@ -1,87 +0,0 @@
|
|
1
|
-
require 'mechanize'
|
2
|
-
require_relative './scan'
|
3
|
-
require_relative './form'
|
4
|
-
|
5
|
-
module Amazon
|
6
|
-
class << self
|
7
|
-
$products = {}
|
8
|
-
$product_num = 0
|
9
|
-
|
10
|
-
# used for checking strings
|
11
|
-
def is_numeric?(s)
|
12
|
-
!!Float(s) rescue false
|
13
|
-
end
|
14
|
-
|
15
|
-
# puts product details to console
|
16
|
-
def display_product
|
17
|
-
STDOUT.puts "--"*50
|
18
|
-
STDOUT.puts "title: \t\t#{$title}"
|
19
|
-
STDOUT.puts "seller: \t#{$seller}"
|
20
|
-
STDOUT.puts "price: \t\t#{$price}"
|
21
|
-
STDOUT.puts "stars: \t\t#{$stars}"
|
22
|
-
STDOUT.puts "reviews: \t#{$reviews}"
|
23
|
-
STDOUT.puts "image url: \t#{$image_href}"
|
24
|
-
STDOUT.puts "product url: \t#{$url}"
|
25
|
-
end
|
26
|
-
|
27
|
-
# extract product data
|
28
|
-
def extract_product_data
|
29
|
-
|
30
|
-
# nokogiri syntax is needed when iterating...not mechanize!
|
31
|
-
# extract useful stuff from product html
|
32
|
-
$current_divs.each do |html|
|
33
|
-
# first select raw html
|
34
|
-
title = html.at_css(".s-access-title")
|
35
|
-
seller = html.at_css(".a-row > .a-spacing-none")
|
36
|
-
price = html.at_css(".s-price")
|
37
|
-
stars = html.at_css(".a-icon-star")
|
38
|
-
reviews = html.at_css("span+ .a-text-normal")
|
39
|
-
image_href = html.at_css(".s-access-image")
|
40
|
-
url = html.at_css(".a-row > a")
|
41
|
-
|
42
|
-
break if title == nil # if it's nil it's prob an ad
|
43
|
-
break if price == nil # no price? prob not worthy item
|
44
|
-
break if stars == nil # no stars? not worth it
|
45
|
-
|
46
|
-
# extract text and set variables for puts
|
47
|
-
$title = title.text
|
48
|
-
$price = price.text
|
49
|
-
$stars = stars.text
|
50
|
-
$image_href = image_href['src']
|
51
|
-
$url = url['href']
|
52
|
-
|
53
|
-
# movies sometimes have text in review class
|
54
|
-
if is_numeric?(reviews.text)
|
55
|
-
$reviews = reviews.text
|
56
|
-
else
|
57
|
-
$reviews = "Unknown"
|
58
|
-
end
|
59
|
-
|
60
|
-
if seller == nil # sometimes seller is nil on movies, etc.
|
61
|
-
$seller = "Unknown"
|
62
|
-
else
|
63
|
-
$seller = seller.text
|
64
|
-
end
|
65
|
-
|
66
|
-
# don't overload the server
|
67
|
-
sleep(0.05)
|
68
|
-
|
69
|
-
display_product
|
70
|
-
|
71
|
-
# store extracted text in products hash
|
72
|
-
# key is product count
|
73
|
-
$products[$product_num] = {
|
74
|
-
title: $title,
|
75
|
-
price: $price,
|
76
|
-
stars: $stars,
|
77
|
-
reviews: $reviews,
|
78
|
-
image_href: $image_href,
|
79
|
-
url: $url,
|
80
|
-
seller: $seller,
|
81
|
-
}
|
82
|
-
|
83
|
-
$product_num +=1 # ready for next product
|
84
|
-
end
|
85
|
-
end
|
86
|
-
end
|
87
|
-
end
|
data/lib/amazon-search/scan.rb
DELETED
@@ -1,55 +0,0 @@
|
|
1
|
-
require 'mechanize'
|
2
|
-
require_relative './products'
|
3
|
-
require_relative './form'
|
4
|
-
|
5
|
-
module Amazon
|
6
|
-
class << self
|
7
|
-
# examine current_pagenum
|
8
|
-
def examine_current_pagenum
|
9
|
-
$current_pagenum = $current_page.search '//*[contains(concat( " ", @class, " " ), concat( " ", "pagnCur", " " ))]'
|
10
|
-
$current_pagenum = $current_pagenum.text.to_i # need integer for checks
|
11
|
-
end
|
12
|
-
|
13
|
-
# find last page number
|
14
|
-
def find_last_pagenum
|
15
|
-
$last_pagenum = $current_page.search '//*[contains(concat( " ", @class, " " ), concat( " ", "pagnDisabled", " " ))]'
|
16
|
-
$last_pagenum = $last_pagenum.text.to_i # need integer for checks
|
17
|
-
end
|
18
|
-
|
19
|
-
|
20
|
-
# load next page
|
21
|
-
def load_next_page
|
22
|
-
|
23
|
-
examine_current_pagenum # does this need to be here?
|
24
|
-
|
25
|
-
$next_page_link = $current_page.link_with text: /Next Page/ # find next page link
|
26
|
-
$next_page = $next_page_link.click unless $current_pagenum == $last_pagenum # click to next page unless on last page
|
27
|
-
|
28
|
-
$current_page = $agent.get($next_page.uri)
|
29
|
-
|
30
|
-
end
|
31
|
-
|
32
|
-
|
33
|
-
# cycle through search result pages and store product html
|
34
|
-
def scan
|
35
|
-
$pages = {}
|
36
|
-
|
37
|
-
find_last_pagenum
|
38
|
-
|
39
|
-
$last_pagenum.times do # paginate until on last page.
|
40
|
-
|
41
|
-
examine_current_pagenum
|
42
|
-
|
43
|
-
$current_divs = $current_page.search('//li[starts-with(@id, "result")]')
|
44
|
-
$pages[$page_num] = $current_divs # store page results
|
45
|
-
|
46
|
-
extract_product_data
|
47
|
-
load_next_page
|
48
|
-
|
49
|
-
end
|
50
|
-
|
51
|
-
puts "\n(scan complete.)"
|
52
|
-
end
|
53
|
-
end
|
54
|
-
|
55
|
-
end
|
data/test/lib/amazon-search.rb
DELETED
@@ -1,24 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
require 'mechanize'
|
4
|
-
require './amazon-search/form'
|
5
|
-
require './amazon-search/scan'
|
6
|
-
require './amazon-search/products'
|
7
|
-
|
8
|
-
module Amazon
|
9
|
-
class << self
|
10
|
-
def search(keywords)
|
11
|
-
$keywords = keywords
|
12
|
-
|
13
|
-
set_agent
|
14
|
-
find_form
|
15
|
-
submit_form
|
16
|
-
scan
|
17
|
-
|
18
|
-
$products
|
19
|
-
end
|
20
|
-
end
|
21
|
-
end
|
22
|
-
|
23
|
-
|
24
|
-
|
@@ -1,26 +0,0 @@
|
|
1
|
-
require 'mechanize'
|
2
|
-
require_relative './scan'
|
3
|
-
require_relative './products'
|
4
|
-
|
5
|
-
module Amazon
|
6
|
-
class << self
|
7
|
-
# prepares Mechanize
|
8
|
-
def set_agent
|
9
|
-
$agent = Mechanize.new{ |a| a.user_agent_alias = "Mac Safari"}
|
10
|
-
end
|
11
|
-
|
12
|
-
# finds Amazon search box
|
13
|
-
def find_form
|
14
|
-
$main_page = $agent.get("http://amazon.com")
|
15
|
-
$search_form = $main_page.form_with :name => "site-search"
|
16
|
-
end
|
17
|
-
|
18
|
-
# submits Amazon search box
|
19
|
-
def submit_form
|
20
|
-
$search_form.field_with(:name => "field-keywords").value = $keywords # sets value of search box
|
21
|
-
$current_page = $agent.submit $search_form # submits form
|
22
|
-
|
23
|
-
end
|
24
|
-
end
|
25
|
-
|
26
|
-
end
|
@@ -1,87 +0,0 @@
|
|
1
|
-
require 'mechanize'
|
2
|
-
require_relative './scan'
|
3
|
-
require_relative './form'
|
4
|
-
|
5
|
-
module Amazon
|
6
|
-
class << self
|
7
|
-
$products = {}
|
8
|
-
$product_num = 0
|
9
|
-
|
10
|
-
# used for checking strings
|
11
|
-
def is_numeric?(s)
|
12
|
-
!!Float(s) rescue false
|
13
|
-
end
|
14
|
-
|
15
|
-
# currently not being used and needs adjusting
|
16
|
-
def display_product
|
17
|
-
STDOUT.puts "--"*50
|
18
|
-
STDOUT.puts "title: \t\t#{$title}"
|
19
|
-
STDOUT.puts "seller: \t#{$seller}"
|
20
|
-
STDOUT.puts "price: \t\t#{$price}"
|
21
|
-
STDOUT.puts "stars: \t\t#{$stars}"
|
22
|
-
STDOUT.puts "reviews: \t#{$reviews}"
|
23
|
-
STDOUT.puts "image url: \t#{$image_href}"
|
24
|
-
STDOUT.puts "product url: \t#{$url}"
|
25
|
-
end
|
26
|
-
|
27
|
-
# extract product data
|
28
|
-
def extract_product_data
|
29
|
-
|
30
|
-
# nokogiri syntax is needed when iterating...not mechanize!
|
31
|
-
# extract useful stuff from product html
|
32
|
-
$current_divs.each do |html|
|
33
|
-
# first select raw html
|
34
|
-
title = html.at_css(".s-access-title")
|
35
|
-
seller = html.at_css(".a-row > .a-spacing-none")
|
36
|
-
price = html.at_css(".s-price")
|
37
|
-
stars = html.at_css(".a-icon-star")
|
38
|
-
reviews = html.at_css("span+ .a-text-normal")
|
39
|
-
image_href = html.at_css(".s-access-image")
|
40
|
-
url = html.at_css(".a-row > a")
|
41
|
-
|
42
|
-
break if title == nil # if it's nil it's prob an ad
|
43
|
-
break if price == nil # no price? prob not worthy item
|
44
|
-
break if stars == nil # no stars? not worth it
|
45
|
-
|
46
|
-
# extract text and set variables for puts
|
47
|
-
$title = title.text
|
48
|
-
$price = price.text
|
49
|
-
$stars = stars.text
|
50
|
-
$image_href = image_href['src']
|
51
|
-
$url = url['href']
|
52
|
-
|
53
|
-
# movies sometimes have text in review class
|
54
|
-
if is_numeric?(reviews.text)
|
55
|
-
$reviews = reviews.text
|
56
|
-
else
|
57
|
-
$reviews = "Unknown"
|
58
|
-
end
|
59
|
-
|
60
|
-
if seller == nil # sometimes seller is nil on movies, etc.
|
61
|
-
$seller = "Unknown"
|
62
|
-
else
|
63
|
-
$seller = seller.text
|
64
|
-
end
|
65
|
-
|
66
|
-
# don't overload the server
|
67
|
-
sleep(0.05)
|
68
|
-
|
69
|
-
display_product
|
70
|
-
|
71
|
-
# store extracted text in products hash
|
72
|
-
# key is product count
|
73
|
-
$products[$product_num] = {
|
74
|
-
title: $title,
|
75
|
-
price: $price,
|
76
|
-
stars: $stars,
|
77
|
-
reviews: $reviews,
|
78
|
-
image_href: $image_href,
|
79
|
-
url: $url,
|
80
|
-
seller: $seller,
|
81
|
-
}
|
82
|
-
|
83
|
-
$product_num +=1 # ready for next product
|
84
|
-
end
|
85
|
-
end
|
86
|
-
end
|
87
|
-
end
|
@@ -1,57 +0,0 @@
|
|
1
|
-
require 'mechanize'
|
2
|
-
require_relative './products'
|
3
|
-
require_relative './form'
|
4
|
-
|
5
|
-
module Amazon
|
6
|
-
class << self
|
7
|
-
# examine current_pagenum
|
8
|
-
def examine_current_pagenum
|
9
|
-
$current_pagenum = $current_page.search '//*[contains(concat( " ", @class, " " ), concat( " ", "pagnCur", " " ))]'
|
10
|
-
$current_pagenum = $current_pagenum.text.to_i # need integer for checks
|
11
|
-
end
|
12
|
-
|
13
|
-
# find last page number
|
14
|
-
def find_last_pagenum
|
15
|
-
$last_pagenum = $current_page.search '//*[contains(concat( " ", @class, " " ), concat( " ", "pagnDisabled", " " ))]'
|
16
|
-
$last_pagenum = $last_pagenum.text.to_i # need integer for checks
|
17
|
-
end
|
18
|
-
|
19
|
-
|
20
|
-
# load next page
|
21
|
-
def load_next_page
|
22
|
-
|
23
|
-
examine_current_pagenum # does this need to be here?
|
24
|
-
|
25
|
-
$next_page_link = $current_page.link_with text: /Next Page/ # find next page link
|
26
|
-
$next_page = $next_page_link.click unless $current_pagenum == $last_pagenum # click to next page unless on last page
|
27
|
-
|
28
|
-
$current_page = $agent.get($next_page.uri)
|
29
|
-
|
30
|
-
end
|
31
|
-
|
32
|
-
|
33
|
-
# cycle through search result pages and store product html
|
34
|
-
def scan
|
35
|
-
$pages = {}
|
36
|
-
|
37
|
-
find_last_pagenum
|
38
|
-
|
39
|
-
$last_pagenum.times do # paginate until on last page.
|
40
|
-
|
41
|
-
examine_current_pagenum
|
42
|
-
puts "\nscanning page #{$current_pagenum} of #{$last_pagenum} @ #{$main_page.uri+$current_page.uri}"
|
43
|
-
|
44
|
-
$current_divs = $current_page.search('//li[starts-with(@id, "result")]')
|
45
|
-
$pages[$page_num] = $current_divs # store page results
|
46
|
-
|
47
|
-
extract_product_data
|
48
|
-
|
49
|
-
load_next_page
|
50
|
-
|
51
|
-
end
|
52
|
-
|
53
|
-
puts "\n(scan complete.)"
|
54
|
-
end
|
55
|
-
end
|
56
|
-
|
57
|
-
end
|