amazon-search 1.2.1 → 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9d34643d6bb26f18ac331af2887673000fce1d8e
4
- data.tar.gz: aff9e145005c2d5362f4c8465bc16bddefa418a9
3
+ metadata.gz: d3bd73fc88e578c6c9572913876b6ccb4fca5f5c
4
+ data.tar.gz: 51d26171b63240b005b54494eab74abc78fdf484
5
5
  SHA512:
6
- metadata.gz: 8d80a1fb04a7bfe50baa173255942713b84c160c3e0d73a094b8d635ec2ef2017cfebc72514a49a3b6e8187a4fff9dd1487747eb86018383119bad012fe26491
7
- data.tar.gz: bd413ecd5459d464c551fb621d073550c45002c67f01d2be50ce1c2136ff1767cf79ca9d9a8a9d9c318a2647602cc5bb79f68cd55eeda5e21de702c6052beb18
6
+ metadata.gz: b6886d46b6347fab8dc5f0671716d03892f69f85aac71762257a428ecaf79ed0d456683f2e5667d1f5ce507d9612a55b47025a9766bfeef572ffcf14135266ea
7
+ data.tar.gz: d7ffb27e2956701acdb4ec441c01943b835a241aa765a31a7f706b879f55a3b47912c272c8100f117a6430bba45d6cacb762c9607325adb73e8d773e5227ee4f
data/Readme.rdoc CHANGED
@@ -12,17 +12,9 @@ This is a tool that does not require configuration of Amazon's API. The functio
12
12
 
13
13
  require 'amazon-search'
14
14
 
15
- # Search for products by keyword string and store results
16
- Amazon::Search.find_products "ruby"
17
- Amazon::Search.find_products "books"
18
- Amazon::Search.find_products "games"
19
-
20
- # After a search is complete, run this command
21
- Amazon::Search.display_results
22
-
23
- == GOTCHAS
24
-
25
- Sometimes a 503 error occurs due to the pagination scanning too quickly and overloading the server. An option will come out in the future to customize how long to sleep before requesting next page of search results. In the meantime, a simple retry will often work.
15
+ # Search for products by string
16
+
17
+ Amazon::search "ruby"
26
18
 
27
19
  == MIT LICENSE
28
20
 
@@ -2,8 +2,8 @@
2
2
 
3
3
  Gem::Specification.new do |gem|
4
4
  gem.name = %q{amazon-search}
5
- gem.version = '1.2.1'
6
- gem.date = '2015-09-18'
5
+ gem.version = '1.3.0'
6
+ gem.date = '2015-09-19'
7
7
  gem.platform = Gem::Platform::RUBY
8
8
  gem.required_ruby_version = '>= 1.8'
9
9
 
@@ -0,0 +1,26 @@
1
+ require 'mechanize'
2
+ require_relative './scan'
3
+ require_relative './products'
4
+
5
+ module Amazon
6
+ class << self
7
+ # prepares Mechanize
8
+ def set_agent
9
+ $agent = Mechanize.new{ |a| a.user_agent_alias = "Mac Safari"}
10
+ end
11
+
12
+ # finds Amazon search box
13
+ def find_form
14
+ $main_page = $agent.get("http://amazon.com")
15
+ $search_form = $main_page.form_with :name => "site-search"
16
+ end
17
+
18
+ # submits Amazon search box
19
+ def submit_form
20
+ $search_form.field_with(:name => "field-keywords").value = $keywords # sets value of search box
21
+ $current_page = $agent.submit $search_form # submits form
22
+
23
+ end
24
+ end
25
+
26
+ end
@@ -0,0 +1,59 @@
1
+ require 'mechanize'
2
+ require_relative './scan'
3
+ require_relative './form'
4
+
5
+ module Amazon
6
+ class << self
7
+ # extract product data
8
+ def extract_product_data
9
+ # nokogiri syntax is needed when iterating...not mechanize!
10
+ $current_divs.each do |html|
11
+ title = html.at_css(".s-access-title")
12
+ seller = html.at_css(".a-row > .a-spacing-none")
13
+ price = html.at_css(".s-price")
14
+ stars = html.at_css(".a-icon-star")
15
+ reviews = html.at_css("span+ .a-text-normal")
16
+ image_href = html.at_css(".s-access-image")
17
+ url = html.at_css(".a-row > a")
18
+
19
+ break if title == nil # if it's nil it's prob an ad
20
+ break if price == nil # no price? prob not worthy item
21
+ break if stars == nil # no stars? not worth it
22
+
23
+ if seller == nil # sometimes seller is nil on movies, etc.
24
+ seller = "Unknown"
25
+ else
26
+ seller = seller.text
27
+ end
28
+
29
+ # extract text and set variables for puts
30
+ title = title.text
31
+ price = price.text
32
+ stars = stars.text
33
+ reviews = reviews.text
34
+ image_href = image_href['src']
35
+ url = url['href']
36
+
37
+ Product.new(title, price, stars, reviews, image_href, url, html)
38
+
39
+ end
40
+ end
41
+
42
+
43
+ # currently not being used and needs adjusting
44
+ def display_product
45
+ STDOUT.puts "--"*50
46
+ STDOUT.puts "title: \t\t#{title}"
47
+ STDOUT.puts "seller: \t#{seller}"
48
+ STDOUT.puts "price: \t\t#{price}"
49
+ STDOUT.puts "stars: \t\t#{stars}"
50
+ STDOUT.puts "reviews: \t#{reviews}"
51
+ STDOUT.puts "image url: \t#{image}"
52
+ STDOUT.puts "product url: \t#{url}"
53
+ end
54
+
55
+
56
+
57
+ end
58
+
59
+ end
@@ -0,0 +1,56 @@
1
+ require 'mechanize'
2
+ require_relative './products'
3
+ require_relative './form'
4
+
5
+ module Amazon
6
+ class << self
7
+ # examine current_pagenum
8
+ def examine_current_pagenum
9
+ $current_pagenum = $current_page.search '//*[contains(concat( " ", @class, " " ), concat( " ", "pagnCur", " " ))]'
10
+ $current_pagenum = $current_pagenum.text.to_i # need integer for checks
11
+ end
12
+
13
+ # find last page number
14
+ def find_last_pagenum
15
+ $last_pagenum = $current_page.search '//*[contains(concat( " ", @class, " " ), concat( " ", "pagnDisabled", " " ))]'
16
+ $last_pagenum = $last_pagenum.text.to_i # need integer for checks
17
+ end
18
+
19
+
20
+ # load next page
21
+ def load_next_page
22
+
23
+ examine_current_pagenum # does this need to be here?
24
+
25
+ $next_page_link = $current_page.link_with text: /Next Page/ # find next page link
26
+ $next_page = $next_page_link.click unless $current_pagenum == $last_pagenum # click to next page unless on last page
27
+
28
+ $current_page = $agent.get($next_page.uri)
29
+
30
+ end
31
+
32
+
33
+ # cycle through search result pages and store product html
34
+ def scan
35
+ $pages = {}
36
+
37
+ find_last_pagenum
38
+
39
+ $last_pagenum.times do # paginate until on last page.
40
+
41
+ examine_current_pagenum
42
+ puts "\nscanning page #{$current_pagenum} of #{$last_pagenum}..."
43
+
44
+ $current_divs = $current_page.search('//li[starts-with(@id, "result")]')
45
+ $pages[$page_num] = $current_divs # store page results
46
+
47
+ load_next_page
48
+
49
+ end
50
+
51
+ puts "\n(scan complete.)"
52
+ $pages
53
+ end
54
+ end
55
+
56
+ end
data/lib/amazon-search.rb CHANGED
@@ -1,95 +1,22 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
3
  require 'mechanize'
4
+ require './amazon-search/form'
5
+ require './amazon-search/scan'
6
+ require './amazon-search/products'
4
7
 
5
8
  module Amazon
6
-
7
- class Search
8
- def self.find_products(keywords)
9
- #--------- submit the search form with keywords ---------------------
10
- agent = Mechanize.new{ |a| a.user_agent_alias = "Mac Safari"} # set browser
11
-
12
- main_page = agent.get("http://amazon.com")
13
- search_form = main_page.form_with :name => "site-search" # find the search form in Amazon
14
-
15
- search_form.field_with(:name => "field-keywords").value = keywords # sets value of search box
16
- search_results = agent.submit search_form # submits form
17
-
18
- #--------- scan each page and store the results ---------------------
19
- $product_divs = []
20
- page_num = 0
21
- next_page = agent.get(search_results.uri) # initial search results are the first page
22
-
23
- last_page_num = search_results.search '//*[contains(concat( " ", @class, " " ), concat( " ", "pagnDisabled", " " ))]'
24
- last_page_num = last_page_num.text.to_i # change to int for upcoming iteration instructions
25
-
26
- last_page_num.times do # cycle all pages and stop on last page
27
- page_num += 1
28
- page = agent.get(next_page.uri) # load the next page
29
-
30
- $product_divs << page.search('//li[starts-with(@id, "result")]') # store the div of each product
31
-
32
- next_page_link = page.link_with text: /Next Page/ # find the next page link
33
- next_page = next_page_link.click unless page_num == last_page_num # click to next page unless on last page
34
- end # ends pagination loop
35
-
36
- puts "\n\n(end of search results)"
37
- sleep(1) # don't overload the servers
9
+ class << self
10
+ def search(keywords)
11
+ $keywords = keywords
12
+
13
+ set_agent
14
+ find_form
15
+ submit_form
16
+ scan
17
+ end
38
18
  end
19
+ end
39
20
 
40
21
 
41
- def self.display_results
42
- # nokogiri syntax is needed when iterating...not mechanize!
43
- product_divs.each do |product|
44
-
45
- #--------- nokogiri select html sections from css ---------------------
46
- title = product.at_css(".s-access-title")
47
- seller = product.at_css(".a-row > .a-spacing-none") #".a-spacing-small .a-spacing-none"
48
- price = product.at_css(".s-price")
49
- stars = product.at_css(".a-icon-star")
50
- reviews = product.at_css("span+ .a-text-normal") # ".a-span-last .a-spacing-mini > span+ .a-text-normal"
51
- image = product.at_css(".s-access-image")
52
- url = product.at_css(".a-row > a")
53
-
54
- #--------- avoid the related items gotchas ---------------------
55
- if title == nil # if it's nil it's prob an ad
56
- break
57
- else
58
- title = title.text
59
-
60
- if seller == nil # if seller is nil put unknown
61
- seller = "Unknown"
62
- else
63
- seller = seller.text
64
- if price == nil # no price? prob not worthy item
65
- break
66
-
67
- else
68
- price = price.text
69
- if stars == nil
70
- break
71
-
72
- else
73
- stars = stars.text
74
- reviews = reviews.text
75
- image = image['src']
76
- url = url['href']
77
22
 
78
- # errors properly avoided, now puts the results
79
- STDOUT.puts "--"*50
80
- STDOUT.puts "title: \t\t#{title}"
81
- STDOUT.puts "seller: \t#{seller}"
82
- STDOUT.puts "price: \t\t#{price}"
83
- STDOUT.puts "stars: \t\t#{stars}"
84
- STDOUT.puts "reviews: \t#{reviews}"
85
- STDOUT.puts "image url: \t#{image}"
86
- STDOUT.puts "product url: \t#{url}"
87
-
88
- end # ends nil price
89
- end # ends nil stars
90
- end # ends nil seller
91
- end # ends nil product
92
- end # ends each product div iteration (page is finished)
93
- end # ends display_results
94
- end # ends Search Class
95
- end # ends Amazon Module
@@ -0,0 +1,26 @@
1
+ require 'mechanize'
2
+ require_relative './pages'
3
+ require_relative './products'
4
+
5
+ module Amazon
6
+ class << self
7
+ # prepares Mechanize
8
+ def set_agent
9
+ $agent = Mechanize.new{ |a| a.user_agent_alias = "Mac Safari"}
10
+ end
11
+
12
+ # finds Amazon search box
13
+ def find_form
14
+ $main_page = $agent.get("http://amazon.com")
15
+ $search_form = $main_page.form_with :name => "site-search"
16
+ end
17
+
18
+ # submits Amazon search box
19
+ def submit_form
20
+ $search_form.field_with(:name => "field-keywords").value = $keywords # sets value of search box
21
+ $current_page = $agent.submit $search_form # submits form
22
+
23
+ end
24
+ end
25
+
26
+ end
@@ -0,0 +1,113 @@
1
+ require 'mechanize'
2
+ require_relative './products'
3
+ require_relative './form'
4
+
5
+ module Amazon
6
+ class << self
7
+ # examine current_pagenum
8
+ def examine_current_pagenum
9
+ $current_pagenum = $current_page.search '//*[contains(concat( " ", @class, " " ), concat( " ", "pagnCur", " " ))]'
10
+ $current_pagenum = $current_pagenum.text.to_i # need integer for checks
11
+ end
12
+
13
+ # find last page number
14
+ def find_last_pagenum
15
+ $last_pagenum = $current_page.search '//*[contains(concat( " ", @class, " " ), concat( " ", "pagnDisabled", " " ))]'
16
+ $last_pagenum = $last_pagenum.text.to_i # need integer for checks
17
+ end
18
+
19
+
20
+ # load next page
21
+ def load_next_page
22
+ puts "***started load_next_page method***"
23
+ puts "ready to examine the page number?"
24
+ gets
25
+
26
+ examine_current_pagenum
27
+
28
+ puts "page number is..."
29
+ puts $current_pagenum
30
+ puts "continue?"
31
+ gets
32
+
33
+ $next_page_link = $current_page.link_with text: /Next Page/ # find next page link
34
+
35
+ puts "found next page..."
36
+ puts "this is link:"
37
+ puts $main_page.uri+$next_page_link.uri
38
+ puts "continue?"
39
+ gets
40
+
41
+ $next_page = $next_page_link.click unless $current_pagenum == $last_pagenum # click to next page unless on last page
42
+
43
+ puts "next step is to load the next page..."
44
+ puts "page will load to:"
45
+ puts $agent.get($next_page.uri).uri
46
+ puts "continue?"
47
+ gets
48
+
49
+ $current_page = $agent.get($next_page.uri)
50
+ examine_current_pagenum
51
+
52
+ puts "====current_page has changed===="
53
+ puts "this is uri:"
54
+ puts $current_page.uri
55
+ puts "this is page_num"
56
+ puts $current_pagenum
57
+
58
+ puts "\ncontinue and exit loading method?"
59
+ gets
60
+
61
+ puts "***ending load_next_page method***"
62
+ end
63
+
64
+
65
+ # cycle through search result pages and store product html
66
+ def scan
67
+ puts "***started scan method***"
68
+ $pages = {}
69
+
70
+ find_last_pagenum
71
+
72
+ $last_pagenum.times do # paginate until on last page.
73
+ puts "***started pagination block***"
74
+
75
+ puts "Enter 'html' if you want to puts pages array, other press RETURN to continue"
76
+ answer = gets.chomp
77
+
78
+ if answer == "html"
79
+ if $pages.empty?
80
+ puts "pages array is empty"
81
+ else
82
+ $pages.each {|x| puts x}
83
+ end
84
+ end
85
+
86
+ examine_current_pagenum
87
+
88
+ $current_divs = $current_page.search('//li[starts-with(@id, "result")]')
89
+ $pages[$page_num] = $current_divs # store page results
90
+
91
+
92
+ puts "--"*50
93
+ puts "\nlast page number is #{$last_pagenum}"
94
+ puts "we're on #{$current_pagenum}"
95
+ puts "this is current hyperlink:"
96
+ puts $current_page.uri
97
+
98
+
99
+ puts "ready to go to #{$current_pagenum+1}?"
100
+ gets
101
+
102
+ load_next_page
103
+ puts "scanning is ready to restart loop."
104
+ puts "continue?"
105
+ gets
106
+ puts "***ending pagination block***"
107
+ end
108
+
109
+ puts "***ending scan method***"
110
+ end
111
+ end
112
+
113
+ end
@@ -0,0 +1,59 @@
1
+ require 'mechanize'
2
+ require_relative './pages'
3
+ require_relative './form'
4
+
5
+ module Amazon
6
+ class << self
7
+ # extract product data
8
+ def extract_product_data
9
+ # nokogiri syntax is needed when iterating...not mechanize!
10
+ $current_divs.each do |html|
11
+ title = html.at_css(".s-access-title")
12
+ seller = html.at_css(".a-row > .a-spacing-none")
13
+ price = html.at_css(".s-price")
14
+ stars = html.at_css(".a-icon-star")
15
+ reviews = html.at_css("span+ .a-text-normal")
16
+ image_href = html.at_css(".s-access-image")
17
+ url = html.at_css(".a-row > a")
18
+
19
+ break if title == nil # if it's nil it's prob an ad
20
+ break if price == nil # no price? prob not worthy item
21
+ break if stars == nil # no stars? not worth it
22
+
23
+ if seller == nil # sometimes seller is nil on movies, etc.
24
+ seller = "Unknown"
25
+ else
26
+ seller = seller.text
27
+ end
28
+
29
+ # extract text and set variables for puts
30
+ title = title.text
31
+ price = price.text
32
+ stars = stars.text
33
+ reviews = reviews.text
34
+ image_href = image_href['src']
35
+ url = url['href']
36
+
37
+ Product.new(title, price, stars, reviews, image_href, url, html)
38
+
39
+ end
40
+ end
41
+
42
+
43
+ # currently not being used and needs adjusting
44
+ def display_product
45
+ STDOUT.puts "--"*50
46
+ STDOUT.puts "title: \t\t#{title}"
47
+ STDOUT.puts "seller: \t#{seller}"
48
+ STDOUT.puts "price: \t\t#{price}"
49
+ STDOUT.puts "stars: \t\t#{stars}"
50
+ STDOUT.puts "reviews: \t#{reviews}"
51
+ STDOUT.puts "image url: \t#{image}"
52
+ STDOUT.puts "product url: \t#{url}"
53
+ end
54
+
55
+
56
+
57
+ end
58
+
59
+ end
@@ -0,0 +1,22 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'mechanize'
4
+ require './amazon-search/form'
5
+ require './amazon-search/pages'
6
+ require './amazon-search/products'
7
+
8
+ module Amazon
9
+ class << self
10
+ def search(keywords)
11
+ $keywords = keywords
12
+
13
+ set_agent
14
+ find_form
15
+ submit_form
16
+ scan
17
+ end
18
+ end
19
+ end
20
+
21
+
22
+
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: amazon-search
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.1
4
+ version: 1.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - John Mason
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-09-18 00:00:00.000000000 Z
11
+ date: 2015-09-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize
@@ -34,6 +34,13 @@ files:
34
34
  - Readme.rdoc
35
35
  - amazon-search.gemspec
36
36
  - lib/amazon-search.rb
37
+ - lib/amazon-search/form.rb
38
+ - lib/amazon-search/products.rb
39
+ - lib/amazon-search/scan.rb
40
+ - test/lib/amazon-search.rb
41
+ - test/lib/amazon-search/form.rb
42
+ - test/lib/amazon-search/pages.rb
43
+ - test/lib/amazon-search/products.rb
37
44
  homepage: https://github.com/m8ss/amazon-search
38
45
  licenses:
39
46
  - MIT
@@ -58,4 +65,8 @@ rubygems_version: 2.4.6
58
65
  signing_key:
59
66
  specification_version: 4
60
67
  summary: A simple screenscraper to search Amazon
61
- test_files: []
68
+ test_files:
69
+ - test/lib/amazon-search.rb
70
+ - test/lib/amazon-search/form.rb
71
+ - test/lib/amazon-search/pages.rb
72
+ - test/lib/amazon-search/products.rb