amazon-search 1.3.0 → 1.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d3bd73fc88e578c6c9572913876b6ccb4fca5f5c
4
- data.tar.gz: 51d26171b63240b005b54494eab74abc78fdf484
3
+ metadata.gz: 1f477c66bb11ecc5baef8af2c657320cc405e330
4
+ data.tar.gz: dd2ce5d6b22c3907bbe8bdfc41ff87ce4fd5e349
5
5
  SHA512:
6
- metadata.gz: b6886d46b6347fab8dc5f0671716d03892f69f85aac71762257a428ecaf79ed0d456683f2e5667d1f5ce507d9612a55b47025a9766bfeef572ffcf14135266ea
7
- data.tar.gz: d7ffb27e2956701acdb4ec441c01943b835a241aa765a31a7f706b879f55a3b47912c272c8100f117a6430bba45d6cacb762c9607325adb73e8d773e5227ee4f
6
+ metadata.gz: 8506a8cc975113e00d2e329a084a59c62fe7650a19d4c32c4d35f310be1bed884e337e0e4b79824b856bafdaa20fcbea1569eebc699b78e7bb56843ce05b3bc4
7
+ data.tar.gz: def6ff11fb46613b14464188a5bc7de794bf36161b46b1ac92ca96cbcfeb371ae89cced41462b24c6411db405eb5ee5ae08a78d431e09c4dad892f7bf4d4a888
@@ -2,19 +2,77 @@
2
2
 
3
3
  Amazon Search is a simple Ruby tool to search for Amazon products.
4
4
 
5
- This is a tool that does not require configuration of Amazon's API. The functionality is centered around mechanize pagination for the screen scraping of nokogiri elements. XML and CSS selectors are currently being used. In the event that Amazon updates their site, the selectors will need to be updated.
5
+ This tool screen-scrapes an Amazon search and returns a hash of the product results. No configuration of Amazon's API is needed.
6
+
7
+ The functionality is centered around mechanize pagination for the screen scraping of nokogiri elements. XML and CSS selectors are currently being used. In the event that Amazon updates their site, the selectors will need to be updated.
8
+
9
+ == DATA COLLECTED
10
+ * title
11
+ * price
12
+ * stars
13
+ * reviews
14
+ * image_href
15
+ * url
16
+ * seller
17
+
6
18
 
7
19
  == INSTALLATION
8
20
 
9
21
  $ gem install amazon-search
10
22
 
23
+
11
24
  == EXAMPLE
12
25
 
13
26
  require 'amazon-search'
14
27
 
15
- # Search for products by string
16
-
28
+ # search for products by string
29
+
17
30
  Amazon::search "ruby"
31
+
32
+
33
+ # search results are stored in global variable:
34
+
35
+ $products # => returns entire hash of products found in search
36
+
37
+
38
+ # reference any product by the order it appeared in search results
39
+
40
+ $products[0] # => references the first product found in search
41
+ $products[30] # => references the 31st product found in search
42
+
43
+
44
+ # reference any product by the order it appeared in search results
45
+ # and display attributes of that product
46
+ # all available attributes are:
47
+
48
+ $products[0][:title] # => the first product's title
49
+ $products[0][:price] # => etc...
50
+ $products[0][:stars]
51
+ $products[0][:reviews]
52
+ $products[0][:image_href]
53
+ $products[0][:url]
54
+ $products[0][:seller]
55
+
56
+
57
+ # Save search results in order to execute another search
58
+ ### method 1)
59
+
60
+ example_search = Amazon::search "ruby"
61
+
62
+ ### method 2)
63
+
64
+ example_search = $products # => only works after search has been done
65
+
66
+
67
+ # Iterate over all search results and return specific attributes
68
+
69
+ $products.each do |x|
70
+ product = x[1] # => index into array before keying hash
71
+ puts product[:title]
72
+ puts product[:stars]
73
+ # etc ...
74
+ end
75
+
18
76
 
19
77
  == MIT LICENSE
20
78
 
@@ -2,7 +2,7 @@
2
2
 
3
3
  Gem::Specification.new do |gem|
4
4
  gem.name = %q{amazon-search}
5
- gem.version = '1.3.0'
5
+ gem.version = '1.4.0'
6
6
  gem.date = '2015-09-19'
7
7
  gem.platform = Gem::Platform::RUBY
8
8
  gem.required_ruby_version = '>= 1.8'
@@ -14,6 +14,8 @@ module Amazon
14
14
  find_form
15
15
  submit_form
16
16
  scan
17
+
18
+ $products
17
19
  end
18
20
  end
19
21
  end
@@ -4,10 +4,33 @@ require_relative './form'
4
4
 
5
5
  module Amazon
6
6
  class << self
7
+ $products = {}
8
+ $product_num = 0
9
+
10
+ # used for checking strings
11
+ def is_numeric?(s)
12
+ !!Float(s) rescue false
13
+ end
14
+
15
+ # prints the most recently extracted product's attributes to STDOUT
16
+ def display_product
17
+ STDOUT.puts "--"*50
18
+ STDOUT.puts "title: \t\t#{$title}"
19
+ STDOUT.puts "seller: \t#{$seller}"
20
+ STDOUT.puts "price: \t\t#{$price}"
21
+ STDOUT.puts "stars: \t\t#{$stars}"
22
+ STDOUT.puts "reviews: \t#{$reviews}"
23
+ STDOUT.puts "image url: \t#{$image_href}"
24
+ STDOUT.puts "product url: \t#{$url}"
25
+ end
26
+
7
27
  # extract product data
8
28
  def extract_product_data
29
+
9
30
  # nokogiri syntax is needed when iterating...not mechanize!
10
- $current_divs.each do |html|
31
+ # extract useful stuff from product html
32
+ $current_divs.each do |html|
33
+ # first select raw html
11
34
  title = html.at_css(".s-access-title")
12
35
  seller = html.at_css(".a-row > .a-spacing-none")
13
36
  price = html.at_css(".s-price")
@@ -19,41 +42,46 @@ module Amazon
19
42
  break if title == nil # if it's nil it's prob an ad
20
43
  break if price == nil # no price? prob not worthy item
21
44
  break if stars == nil # no stars? not worth it
45
+
46
+ # extract text and set variables for puts
47
+ $title = title.text
48
+ $price = price.text
49
+ $stars = stars.text
50
+ $image_href = image_href['src']
51
+ $url = url['href']
52
+
53
+ # movies sometimes have text in review class
54
+ if is_numeric?(reviews.text)
55
+ $reviews = reviews.text
56
+ else
57
+ $reviews = "Unknown"
58
+ end
22
59
 
23
60
  if seller == nil # sometimes seller is nil on movies, etc.
24
- seller = "Unknown"
61
+ $seller = "Unknown"
25
62
  else
26
- seller = seller.text
63
+ $seller = seller.text
27
64
  end
28
-
29
- # extract text and set variables for puts
30
- title = title.text
31
- price = price.text
32
- stars = stars.text
33
- reviews = reviews.text
34
- image_href = image_href['src']
35
- url = url['href']
36
65
 
37
- Product.new(title, price, stars, reviews, image_href, url, html)
66
+ # don't overload the server
67
+ sleep(0.05)
38
68
 
39
- end
40
- end
69
+ display_product
41
70
 
71
+ # store extracted text in products hash
72
+ # key is product count
73
+ $products[$product_num] = {
74
+ title: $title,
75
+ price: $price,
76
+ stars: $stars,
77
+ reviews: $reviews,
78
+ image_href: $image_href,
79
+ url: $url,
80
+ seller: $seller,
81
+ }
42
82
 
43
- # currently not being used and needs adjusting
44
- def display_product
45
- STDOUT.puts "--"*50
46
- STDOUT.puts "title: \t\t#{title}"
47
- STDOUT.puts "seller: \t#{seller}"
48
- STDOUT.puts "price: \t\t#{price}"
49
- STDOUT.puts "stars: \t\t#{stars}"
50
- STDOUT.puts "reviews: \t#{reviews}"
51
- STDOUT.puts "image url: \t#{image}"
52
- STDOUT.puts "product url: \t#{url}"
83
+ $product_num +=1 # ready for next product
84
+ end
53
85
  end
54
-
55
-
56
-
57
86
  end
58
-
59
87
  end
@@ -39,17 +39,16 @@ module Amazon
39
39
  $last_pagenum.times do # paginate until on last page.
40
40
 
41
41
  examine_current_pagenum
42
- puts "\nscanning page #{$current_pagenum} of #{$last_pagenum}..."
43
42
 
44
43
  $current_divs = $current_page.search('//li[starts-with(@id, "result")]')
45
44
  $pages[$page_num] = $current_divs # store page results
46
45
 
46
+ extract_product_data
47
47
  load_next_page
48
48
 
49
49
  end
50
50
 
51
51
  puts "\n(scan complete.)"
52
- $pages
53
52
  end
54
53
  end
55
54
 
@@ -2,7 +2,7 @@
2
2
 
3
3
  require 'mechanize'
4
4
  require './amazon-search/form'
5
- require './amazon-search/pages'
5
+ require './amazon-search/scan'
6
6
  require './amazon-search/products'
7
7
 
8
8
  module Amazon
@@ -14,6 +14,8 @@ module Amazon
14
14
  find_form
15
15
  submit_form
16
16
  scan
17
+
18
+ $products
17
19
  end
18
20
  end
19
21
  end
@@ -1,5 +1,5 @@
1
1
  require 'mechanize'
2
- require_relative './pages'
2
+ require_relative './scan'
3
3
  require_relative './products'
4
4
 
5
5
  module Amazon
@@ -1,13 +1,36 @@
1
1
  require 'mechanize'
2
- require_relative './pages'
2
+ require_relative './scan'
3
3
  require_relative './form'
4
4
 
5
5
  module Amazon
6
6
  class << self
7
+ $products = {}
8
+ $product_num = 0
9
+
10
+ # used for checking strings
11
+ def is_numeric?(s)
12
+ !!Float(s) rescue false
13
+ end
14
+
15
+ # prints the most recently extracted product's attributes to STDOUT
16
+ def display_product
17
+ STDOUT.puts "--"*50
18
+ STDOUT.puts "title: \t\t#{$title}"
19
+ STDOUT.puts "seller: \t#{$seller}"
20
+ STDOUT.puts "price: \t\t#{$price}"
21
+ STDOUT.puts "stars: \t\t#{$stars}"
22
+ STDOUT.puts "reviews: \t#{$reviews}"
23
+ STDOUT.puts "image url: \t#{$image_href}"
24
+ STDOUT.puts "product url: \t#{$url}"
25
+ end
26
+
7
27
  # extract product data
8
28
  def extract_product_data
29
+
9
30
  # nokogiri syntax is needed when iterating...not mechanize!
10
- $current_divs.each do |html|
31
+ # extract useful stuff from product html
32
+ $current_divs.each do |html|
33
+ # first select raw html
11
34
  title = html.at_css(".s-access-title")
12
35
  seller = html.at_css(".a-row > .a-spacing-none")
13
36
  price = html.at_css(".s-price")
@@ -19,41 +42,46 @@ module Amazon
19
42
  break if title == nil # if it's nil it's prob an ad
20
43
  break if price == nil # no price? prob not worthy item
21
44
  break if stars == nil # no stars? not worth it
45
+
46
+ # extract text and set variables for puts
47
+ $title = title.text
48
+ $price = price.text
49
+ $stars = stars.text
50
+ $image_href = image_href['src']
51
+ $url = url['href']
52
+
53
+ # movies sometimes have text in review class
54
+ if is_numeric?(reviews.text)
55
+ $reviews = reviews.text
56
+ else
57
+ $reviews = "Unknown"
58
+ end
22
59
 
23
60
  if seller == nil # sometimes seller is nil on movies, etc.
24
- seller = "Unknown"
61
+ $seller = "Unknown"
25
62
  else
26
- seller = seller.text
63
+ $seller = seller.text
27
64
  end
28
-
29
- # extract text and set variables for puts
30
- title = title.text
31
- price = price.text
32
- stars = stars.text
33
- reviews = reviews.text
34
- image_href = image_href['src']
35
- url = url['href']
36
65
 
37
- Product.new(title, price, stars, reviews, image_href, url, html)
66
+ # don't overload the server
67
+ sleep(0.05)
38
68
 
39
- end
40
- end
69
+ display_product
41
70
 
71
+ # store extracted text in products hash
72
+ # key is product count
73
+ $products[$product_num] = {
74
+ title: $title,
75
+ price: $price,
76
+ stars: $stars,
77
+ reviews: $reviews,
78
+ image_href: $image_href,
79
+ url: $url,
80
+ seller: $seller,
81
+ }
42
82
 
43
- # currently not being used and needs adjusting
44
- def display_product
45
- STDOUT.puts "--"*50
46
- STDOUT.puts "title: \t\t#{title}"
47
- STDOUT.puts "seller: \t#{seller}"
48
- STDOUT.puts "price: \t\t#{price}"
49
- STDOUT.puts "stars: \t\t#{stars}"
50
- STDOUT.puts "reviews: \t#{reviews}"
51
- STDOUT.puts "image url: \t#{image}"
52
- STDOUT.puts "product url: \t#{url}"
83
+ $product_num +=1 # ready for next product
84
+ end
53
85
  end
54
-
55
-
56
-
57
86
  end
58
-
59
87
  end
@@ -0,0 +1,57 @@
1
+ require 'mechanize'
2
+ require_relative './products'
3
+ require_relative './form'
4
+
5
+ module Amazon
6
+ class << self
7
+ # examine current_pagenum
8
+ def examine_current_pagenum
9
+ $current_pagenum = $current_page.search '//*[contains(concat( " ", @class, " " ), concat( " ", "pagnCur", " " ))]'
10
+ $current_pagenum = $current_pagenum.text.to_i # need integer for checks
11
+ end
12
+
13
+ # find last page number
14
+ def find_last_pagenum
15
+ $last_pagenum = $current_page.search '//*[contains(concat( " ", @class, " " ), concat( " ", "pagnDisabled", " " ))]'
16
+ $last_pagenum = $last_pagenum.text.to_i # need integer for checks
17
+ end
18
+
19
+
20
+ # load next page
21
+ def load_next_page
22
+
23
+ examine_current_pagenum # does this need to be here?
24
+
25
+ $next_page_link = $current_page.link_with text: /Next Page/ # find next page link
26
+ $next_page = $next_page_link.click unless $current_pagenum == $last_pagenum # click to next page unless on last page
27
+
28
+ $current_page = $agent.get($next_page.uri)
29
+
30
+ end
31
+
32
+
33
+ # cycle through search result pages and store product html
34
+ def scan
35
+ $pages = {}
36
+
37
+ find_last_pagenum
38
+
39
+ $last_pagenum.times do # paginate until on last page.
40
+
41
+ examine_current_pagenum
42
+ puts "\nscanning page #{$current_pagenum} of #{$last_pagenum} @ #{$main_page.uri+$current_page.uri}"
43
+
44
+ $current_divs = $current_page.search('//li[starts-with(@id, "result")]')
45
+ $pages[$page_num] = $current_divs # store page results
46
+
47
+ extract_product_data
48
+
49
+ load_next_page
50
+
51
+ end
52
+
53
+ puts "\n(scan complete.)"
54
+ end
55
+ end
56
+
57
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: amazon-search
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.3.0
4
+ version: 1.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - John Mason
@@ -39,8 +39,8 @@ files:
39
39
  - lib/amazon-search/scan.rb
40
40
  - test/lib/amazon-search.rb
41
41
  - test/lib/amazon-search/form.rb
42
- - test/lib/amazon-search/pages.rb
43
42
  - test/lib/amazon-search/products.rb
43
+ - test/lib/amazon-search/scan.rb
44
44
  homepage: https://github.com/m8ss/amazon-search
45
45
  licenses:
46
46
  - MIT
@@ -68,5 +68,5 @@ summary: A simple screenscraper to search Amazon
68
68
  test_files:
69
69
  - test/lib/amazon-search.rb
70
70
  - test/lib/amazon-search/form.rb
71
- - test/lib/amazon-search/pages.rb
72
71
  - test/lib/amazon-search/products.rb
72
+ - test/lib/amazon-search/scan.rb
@@ -1,113 +0,0 @@
1
- require 'mechanize'
2
- require_relative './products'
3
- require_relative './form'
4
-
5
- module Amazon
6
- class << self
7
- # examine current_pagenum
8
- def examine_current_pagenum
9
- $current_pagenum = $current_page.search '//*[contains(concat( " ", @class, " " ), concat( " ", "pagnCur", " " ))]'
10
- $current_pagenum = $current_pagenum.text.to_i # need integer for checks
11
- end
12
-
13
- # find last page number
14
- def find_last_pagenum
15
- $last_pagenum = $current_page.search '//*[contains(concat( " ", @class, " " ), concat( " ", "pagnDisabled", " " ))]'
16
- $last_pagenum = $last_pagenum.text.to_i # need integer for checks
17
- end
18
-
19
-
20
- # load next page
21
- def load_next_page
22
- puts "***started load_next_page method***"
23
- puts "ready to examine the page number?"
24
- gets
25
-
26
- examine_current_pagenum
27
-
28
- puts "page number is..."
29
- puts $current_pagenum
30
- puts "continue?"
31
- gets
32
-
33
- $next_page_link = $current_page.link_with text: /Next Page/ # find next page link
34
-
35
- puts "found next page..."
36
- puts "this is link:"
37
- puts $main_page.uri+$next_page_link.uri
38
- puts "continue?"
39
- gets
40
-
41
- $next_page = $next_page_link.click unless $current_pagenum == $last_pagenum # click to next page unless on last page
42
-
43
- puts "next step is to load the next page..."
44
- puts "page will load to:"
45
- puts $agent.get($next_page.uri).uri
46
- puts "continue?"
47
- gets
48
-
49
- $current_page = $agent.get($next_page.uri)
50
- examine_current_pagenum
51
-
52
- puts "====current_page has changed===="
53
- puts "this is uri:"
54
- puts $current_page.uri
55
- puts "this is page_num"
56
- puts $current_pagenum
57
-
58
- puts "\ncontinue and exit loading method?"
59
- gets
60
-
61
- puts "***ending load_next_page method***"
62
- end
63
-
64
-
65
- # cycle through search result pages and store product html
66
- def scan
67
- puts "***started scan method***"
68
- $pages = {}
69
-
70
- find_last_pagenum
71
-
72
- $last_pagenum.times do # paginate until on last page.
73
- puts "***started pagination block***"
74
-
75
- puts "Enter 'html' if you want to puts pages array, other press RETURN to continue"
76
- answer = gets.chomp
77
-
78
- if answer == "html"
79
- if $pages.empty?
80
- puts "pages array is empty"
81
- else
82
- $pages.each {|x| puts x}
83
- end
84
- end
85
-
86
- examine_current_pagenum
87
-
88
- $current_divs = $current_page.search('//li[starts-with(@id, "result")]')
89
- $pages[$page_num] = $current_divs # store page results
90
-
91
-
92
- puts "--"*50
93
- puts "\nlast page number is #{$last_pagenum}"
94
- puts "we're on #{$current_pagenum}"
95
- puts "this is current hyperlink:"
96
- puts $current_page.uri
97
-
98
-
99
- puts "ready to go to #{$current_pagenum+1}?"
100
- gets
101
-
102
- load_next_page
103
- puts "scanning is ready to restart loop."
104
- puts "continue?"
105
- gets
106
- puts "***ending pagination block***"
107
- end
108
-
109
- puts "***ending scan method***"
110
- end
111
- end
112
-
113
- end