RubyGems - amazon-search - Versions diffs - 1.3.0 → 1.4.0 - Mend

amazon-search 1.3.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

checksums.yaml +4 -4
data/Readme.rdoc +61 -3
data/amazon-search.gemspec +1 -1
data/lib/amazon-search.rb +2 -0
data/lib/amazon-search/products.rb +56 -28
data/lib/amazon-search/scan.rb +1 -2
data/test/lib/amazon-search.rb +3 -1
data/test/lib/amazon-search/form.rb +1 -1
data/test/lib/amazon-search/products.rb +57 -29
data/test/lib/amazon-search/scan.rb +57 -0
metadata +3 -3
data/test/lib/amazon-search/pages.rb +0 -113

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: d3bd73fc88e578c6c9572913876b6ccb4fca5f5c
-  data.tar.gz: 51d26171b63240b005b54494eab74abc78fdf484
+  metadata.gz: 1f477c66bb11ecc5baef8af2c657320cc405e330
+  data.tar.gz: dd2ce5d6b22c3907bbe8bdfc41ff87ce4fd5e349
 SHA512:
-  metadata.gz: b6886d46b6347fab8dc5f0671716d03892f69f85aac71762257a428ecaf79ed0d456683f2e5667d1f5ce507d9612a55b47025a9766bfeef572ffcf14135266ea
-  data.tar.gz: d7ffb27e2956701acdb4ec441c01943b835a241aa765a31a7f706b879f55a3b47912c272c8100f117a6430bba45d6cacb762c9607325adb73e8d773e5227ee4f
+  metadata.gz: 8506a8cc975113e00d2e329a084a59c62fe7650a19d4c32c4d35f310be1bed884e337e0e4b79824b856bafdaa20fcbea1569eebc699b78e7bb56843ce05b3bc4
+  data.tar.gz: def6ff11fb46613b14464188a5bc7de794bf36161b46b1ac92ca96cbcfeb371ae89cced41462b24c6411db405eb5ee5ae08a78d431e09c4dad892f7bf4d4a888

data/Readme.rdoc CHANGED

@@ -2,19 +2,77 @@
 Amazon Search is a simple Ruby tool to search for Amazon products.
-This is a tool that does not require configuration of Amazon's API.  The functionality is centered around mechanize pagination for the screen scraping of nokogiri elements.  XML and CSS selectors are currently being used.  In the event that Amazon updates their site, the selectors will need to be updated.
+This tool screenscrapes an Amazon search and returns a hash of the product results. Configuration of Amazon's API is not needed.
+The functionality is centered around mechanize pagination for the screen scraping of nokogiri elements.  XML and CSS selectors are currently being used.  In the event that Amazon updates their site, the selectors will need to be updated.
+== DATA COLLECTED
+* title
+* price
+* stars
+* reviews
+* image_href
+* url
+* seller
 == INSTALLATION
   $ gem install amazon-search
 == EXAMPLE
     require 'amazon-search'
-    # Search for products by string
+    # search for products by string
     Amazon::search "ruby"
+    # search results are stored in global variable:
+    $products # => returns entire hash of products found in search
+    # reference any product by the order it appeared in search results
+    $products[0] # => references the first product found in search
+    $products[30] # => references the 31st product found in search
+    # reference any product by the order it appeared in search results
+    # and display attributes of that product
+    # all available attributes are:
+    $products[0][:title] # => the first product's title
+    $products[0][:price] # => etc...
+    $products[0][:stars]
+    $products[0][:reviews]
+    $products[0][:image_href]
+    $products[0][:url]
+    $products[0][:seller]
+    # Save search results in order to execute another search
+    ### method 1)
+    example_search = Amazon::search "ruby"
+    ### method 2)
+    example_search = $products # => only works after search has been done
+    # Iterate over all search results and return specific attributes
+    $products.each do |x|
+    	product = x[1] # => each yields [key, value] pairs; x[1] is the product's attribute hash
+    	puts product[:title]
+    	puts product[:stars]
+    	# etc ...
 == MIT LICENSE

data/amazon-search.gemspec CHANGED

@@ -2,7 +2,7 @@
 Gem::Specification.new do |gem|
   gem.name        = %q{amazon-search}
-  gem.version     = '1.3.0'
+  gem.version     = '1.4.0'
   gem.date        = '2015-09-19'
   gem.platform = Gem::Platform::RUBY
   gem.required_ruby_version = '>= 1.8'

data/lib/amazon-search.rb CHANGED

@@ -14,6 +14,8 @@ module Amazon
             find_form
             submit_form
             scan
+            $products
         end
     end
 end

data/lib/amazon-search/products.rb CHANGED

@@ -4,10 +4,33 @@ require_relative './form'
 module Amazon
     class << self
+        $products = {}
+        $product_num = 0
+        # used for checking strings
+        def is_numeric?(s)
+         !!Float(s) rescue false
+        end
+        # print the current product's attributes (read from the $title, $price, ... globals)
+        def display_product
+            STDOUT.puts "--"*50
+            STDOUT.puts "title: \t\t#{$title}"
+            STDOUT.puts "seller: \t#{$seller}"
+            STDOUT.puts "price: \t\t#{$price}"
+            STDOUT.puts "stars: \t\t#{$stars}"
+            STDOUT.puts "reviews: \t#{$reviews}"
+            STDOUT.puts "image url: \t#{$image_href}"
+            STDOUT.puts "product url: \t#{$url}"
+        end
         # extract product data
         def extract_product_data
             # nokogiri syntax is needed when iterating...not mechanize!
-            $current_divs.each do |html|
+            # extract useful stuff from product html
+            $current_divs.each do |html|
+                # first select raw html
                 title = html.at_css(".s-access-title")
                 seller = html.at_css(".a-row > .a-spacing-none")
                 price = html.at_css(".s-price")
@@ -19,41 +42,46 @@ module Amazon
                 break if title == nil # if it's nil it's prob an ad
                 break if price == nil # no price? prob not worthy item
                 break if stars == nil # no stars? not worth it
+                # extract text and set variables for puts
+                $title = title.text
+                $price = price.text
+                $stars = stars.text
+                $image_href = image_href['src']
+                $url = url['href']
+                # movies sometimes have text in review class
+                if is_numeric?(reviews.text)
+                    $reviews = reviews.text
+                else
+                    $reviews = "Unknown"
+                end
                 if seller == nil # sometimes seller is nil on movies, etc.
-                    seller = "Unknown"
+                    $seller = "Unknown"
                 else
-                    seller = seller.text
+                    $seller = seller.text
                 end
-                # extract text and set variables for puts
-                title = title.text
-                price = price.text
-                stars = stars.text
-                reviews = reviews.text
-                image_href = image_href['src']
-                url = url['href']
-                Product.new(title, price, stars, reviews, image_href, url, html)
+                # don't overload the server
+                sleep(0.05)
-            end
-        end
+                display_product
+                # store extracted text in products hash
+                # key is product count
+                $products[$product_num] = {
+                    title: $title,
+                    price: $price,
+                    stars: $stars,
+                    reviews: $reviews,
+                    image_href: $image_href,
+                    url: $url,
+                    seller: $seller,
+                }
-        # currently not being used and needs adjusting
-        def display_product
-            STDOUT.puts "--"*50
-            STDOUT.puts "title: \t\t#{title}"
-            STDOUT.puts "seller: \t#{seller}"
-            STDOUT.puts "price: \t\t#{price}"
-            STDOUT.puts "stars: \t\t#{stars}"
-            STDOUT.puts "reviews: \t#{reviews}"
-            STDOUT.puts "image url: \t#{image}"
-            STDOUT.puts "product url: \t#{url}"
+                $product_num +=1 # ready for next product
+            end
         end
     end
 end

data/lib/amazon-search/scan.rb CHANGED

@@ -39,17 +39,16 @@ module Amazon
             $last_pagenum.times do # paginate until on last page.
                 examine_current_pagenum
-                puts "\nscanning page #{$current_pagenum} of #{$last_pagenum}..."
                 $current_divs = $current_page.search('//li[starts-with(@id, "result")]')
                 $pages[$page_num] = $current_divs # store page results
+                extract_product_data
                 load_next_page
             end
             puts "\n(scan complete.)"
-            $pages
         end
     end

data/test/lib/amazon-search.rb CHANGED

@@ -2,7 +2,7 @@
 require 'mechanize'
 require './amazon-search/form'
-require './amazon-search/pages'
+require './amazon-search/scan'
 require './amazon-search/products'
 module Amazon
@@ -14,6 +14,8 @@ module Amazon
             find_form
             submit_form
             scan
+            $products
         end
     end
 end

data/test/lib/amazon-search/form.rb CHANGED

@@ -1,5 +1,5 @@
 require 'mechanize'
-require_relative './pages'
+require_relative './scan'
 require_relative './products'
 module Amazon

data/test/lib/amazon-search/products.rb CHANGED

@@ -1,13 +1,36 @@
 require 'mechanize'
-require_relative './pages'
+require_relative './scan'
 require_relative './form'
 module Amazon
     class << self
+        $products = {}
+        $product_num = 0
+        # used for checking strings
+        def is_numeric?(s)
+         !!Float(s) rescue false
+        end
+        # print the current product's attributes (read from the $title, $price, ... globals)
+        def display_product
+            STDOUT.puts "--"*50
+            STDOUT.puts "title: \t\t#{$title}"
+            STDOUT.puts "seller: \t#{$seller}"
+            STDOUT.puts "price: \t\t#{$price}"
+            STDOUT.puts "stars: \t\t#{$stars}"
+            STDOUT.puts "reviews: \t#{$reviews}"
+            STDOUT.puts "image url: \t#{$image_href}"
+            STDOUT.puts "product url: \t#{$url}"
+        end
         # extract product data
         def extract_product_data
             # nokogiri syntax is needed when iterating...not mechanize!
-            $current_divs.each do |html|
+            # extract useful stuff from product html
+            $current_divs.each do |html|
+                # first select raw html
                 title = html.at_css(".s-access-title")
                 seller = html.at_css(".a-row > .a-spacing-none")
                 price = html.at_css(".s-price")
@@ -19,41 +42,46 @@ module Amazon
                 break if title == nil # if it's nil it's prob an ad
                 break if price == nil # no price? prob not worthy item
                 break if stars == nil # no stars? not worth it
+                # extract text and set variables for puts
+                $title = title.text
+                $price = price.text
+                $stars = stars.text
+                $image_href = image_href['src']
+                $url = url['href']
+                # movies sometimes have text in review class
+                if is_numeric?(reviews.text)
+                    $reviews = reviews.text
+                else
+                    $reviews = "Unknown"
+                end
                 if seller == nil # sometimes seller is nil on movies, etc.
-                    seller = "Unknown"
+                    $seller = "Unknown"
                 else
-                    seller = seller.text
+                    $seller = seller.text
                 end
-                # extract text and set variables for puts
-                title = title.text
-                price = price.text
-                stars = stars.text
-                reviews = reviews.text
-                image_href = image_href['src']
-                url = url['href']
-                Product.new(title, price, stars, reviews, image_href, url, html)
+                # don't overload the server
+                sleep(0.05)
-            end
-        end
+                display_product
+                # store extracted text in products hash
+                # key is product count
+                $products[$product_num] = {
+                    title: $title,
+                    price: $price,
+                    stars: $stars,
+                    reviews: $reviews,
+                    image_href: $image_href,
+                    url: $url,
+                    seller: $seller,
+                }
-        # currently not being used and needs adjusting
-        def display_product
-            STDOUT.puts "--"*50
-            STDOUT.puts "title: \t\t#{title}"
-            STDOUT.puts "seller: \t#{seller}"
-            STDOUT.puts "price: \t\t#{price}"
-            STDOUT.puts "stars: \t\t#{stars}"
-            STDOUT.puts "reviews: \t#{reviews}"
-            STDOUT.puts "image url: \t#{image}"
-            STDOUT.puts "product url: \t#{url}"
+                $product_num +=1 # ready for next product
+            end
         end
     end
 end

data/test/lib/amazon-search/scan.rb ADDED

@@ -0,0 +1,57 @@
+require 'mechanize'
+require_relative './products'
+require_relative './form'
+module Amazon
+    class << self
+        # examine current_pagenum
+        def examine_current_pagenum
+            $current_pagenum = $current_page.search '//*[contains(concat( " ", @class, " " ), concat( " ", "pagnCur", " " ))]'
+            $current_pagenum = $current_pagenum.text.to_i # need integer for checks
+        end
+        # find last page number
+        def find_last_pagenum
+            $last_pagenum = $current_page.search '//*[contains(concat( " ", @class, " " ), concat( " ", "pagnDisabled", " " ))]'
+            $last_pagenum = $last_pagenum.text.to_i # need integer for checks
+        end
+        # load next page
+        def load_next_page
+            examine_current_pagenum # does this need to be here?
+            $next_page_link = $current_page.link_with text: /Next Page/ # find next page link
+            $next_page = $next_page_link.click unless $current_pagenum == $last_pagenum # click to next page unless on last page
+            $current_page = $agent.get($next_page.uri)
+        end
+        # cycle through search result pages and store product html
+        def scan
+            $pages = {}
+            find_last_pagenum
+            $last_pagenum.times do # paginate until on last page.
+                examine_current_pagenum
+                puts "\nscanning page #{$current_pagenum} of #{$last_pagenum} @ #{$main_page.uri+$current_page.uri}"
+                $current_divs = $current_page.search('//li[starts-with(@id, "result")]')
+                $pages[$page_num] = $current_divs # store page results
+                extract_product_data
+                load_next_page
+            end
+            puts "\n(scan complete.)"
+        end
+    end
+end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: amazon-search
 version: !ruby/object:Gem::Version
-  version: 1.3.0
+  version: 1.4.0
 platform: ruby
 authors:
 - John Mason
@@ -39,8 +39,8 @@ files:
 - lib/amazon-search/scan.rb
 - test/lib/amazon-search.rb
 - test/lib/amazon-search/form.rb
-- test/lib/amazon-search/pages.rb
 - test/lib/amazon-search/products.rb
+- test/lib/amazon-search/scan.rb
 homepage: https://github.com/m8ss/amazon-search
 licenses:
 - MIT
@@ -68,5 +68,5 @@ summary: A simple screenscraper to search Amazon
 test_files:
 - test/lib/amazon-search.rb
 - test/lib/amazon-search/form.rb
-- test/lib/amazon-search/pages.rb
 - test/lib/amazon-search/products.rb
+- test/lib/amazon-search/scan.rb

data/test/lib/amazon-search/pages.rb DELETED

@@ -1,113 +0,0 @@
-require 'mechanize'
-require_relative './products'
-require_relative './form'
-module Amazon
-    class << self
-        # examine current_pagenum
-        def examine_current_pagenum
-            $current_pagenum = $current_page.search '//*[contains(concat( " ", @class, " " ), concat( " ", "pagnCur", " " ))]'
-            $current_pagenum = $current_pagenum.text.to_i # need integer for checks
-        end
-        # find last page number
-        def find_last_pagenum
-            $last_pagenum = $current_page.search '//*[contains(concat( " ", @class, " " ), concat( " ", "pagnDisabled", " " ))]'
-            $last_pagenum = $last_pagenum.text.to_i # need integer for checks
-        end
-        # load next page
-        def load_next_page
-            puts "***started load_next_page method***"
-            puts "ready to examine the page number?"
-            gets
-            examine_current_pagenum
-            puts "page number is..."
-            puts $current_pagenum
-            puts "continue?"
-            gets
-            $next_page_link = $current_page.link_with text: /Next Page/ # find next page link
-            puts "found next page..."
-            puts "this is link:"
-            puts $main_page.uri+$next_page_link.uri
-            puts "continue?"
-            gets
-            $next_page = $next_page_link.click unless $current_pagenum == $last_pagenum # click to next page unless on last page
-            puts "next step is to load the next page..."
-            puts "page will load to:"
-            puts $agent.get($next_page.uri).uri
-            puts "continue?"
-            gets
-            $current_page = $agent.get($next_page.uri)
-            examine_current_pagenum
-            puts "====current_page has changed===="
-            puts "this is uri:"
-            puts $current_page.uri
-            puts "this is page_num"
-            puts $current_pagenum
-            puts "\ncontinue and exit loading method?"
-            gets
-            puts "***ending load_next_page method***"
-        end
-        # cycle through search result pages and store product html
-        def scan
-            puts "***started scan method***"
-            $pages = {}
-            find_last_pagenum
-            $last_pagenum.times do # paginate until on last page.
-                puts "***started pagination block***"
-                puts "Enter 'html' if you want to puts pages array, other press RETURN to continue"
-                answer = gets.chomp
-                if answer == "html"
-                    if $pages.empty?
-                        puts "pages array is empty"
-                    else
-                        $pages.each {|x| puts x}
-                    end
-                end
-                examine_current_pagenum
-                $current_divs = $current_page.search('//li[starts-with(@id, "result")]')
-                $pages[$page_num] = $current_divs # store page results
-                puts "--"*50
-                puts "\nlast page number is #{$last_pagenum}"
-                puts "we're on #{$current_pagenum}"
-                puts "this is current hyperlink:"
-                puts $current_page.uri
-                puts "ready to go to #{$current_pagenum+1}?"
-                gets
-                load_next_page
-                puts "scanning is ready to restart loop."
-                puts "continue?"
-                gets
-                puts "***ending pagination block***"
-            end
-            puts "***ending scan method***"
-        end
-    end
-end