amazon_deets 0.0.3 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/amazon_deets.rb +1 -119
- data/lib/amazon_deets/core.rb +85 -0
- data/lib/amazon_deets/factories.rb +18 -0
- data/lib/amazon_deets/general_merchandise.rb +95 -0
- data/lib/amazon_deets/kindle.rb +90 -0
- metadata +6 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f4e516ca707d2e7250a0b203395e778984ac40bc
|
4
|
+
data.tar.gz: da8ad327292bdd9fee2c5dcd1a0135ea1a1c970d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 96d5ce3798cefd24d568dbea661d92c77cbabc68bef6272e3b9e54406fce8d4cf4f4faaf8283e85a259fe9303aea263176e9c92b66c3b23db5245ff5339a5498
|
7
|
+
data.tar.gz: 35996047d38fcb28a35a8e1029f7889d10d8a1b0723694ce5902ab9f90c297f7f6b68026b997b361f8a175d7d77684496573b97437edc4cfa98e203931bd214a
|
data/lib/amazon_deets.rb
CHANGED
@@ -1,120 +1,2 @@
|
|
1
|
-
require 'logbert'
|
2
|
-
require 'mechanize'
|
3
|
-
|
4
|
-
module AmazonDeets
|
5
|
-
|
6
|
-
class Grabber
|
7
|
-
LOG = Logbert[self]
|
8
|
-
|
9
|
-
RatingRegex = /(.+)\s+out\sof/
|
10
|
-
ReviewsRegex = /(\d+)/
|
11
|
-
|
12
|
-
attr_accessor :agent
|
13
|
-
|
14
|
-
def initialize(agent: Mechanize.new)
|
15
|
-
@agent = agent
|
16
|
-
end
|
17
|
-
|
18
|
-
def title
|
19
|
-
result = agent.page.search("//h1[@id='title']").first
|
20
|
-
if result
|
21
|
-
return result.text.strip
|
22
|
-
end
|
23
|
-
|
24
|
-
result = agent.page.search("span#btAsinTitle").first
|
25
|
-
if result
|
26
|
-
return result.text.strip
|
27
|
-
end
|
28
|
-
|
29
|
-
return nil
|
30
|
-
end
|
31
|
-
|
32
|
-
|
33
|
-
def url
|
34
|
-
agent.page.uri.to_s
|
35
|
-
end
|
36
|
-
|
37
|
-
|
38
|
-
def list_price
|
39
|
-
lp_element = agent.page.search("//span[@id='priceblock_ourprice']").first
|
40
|
-
if lp_element.nil?
|
41
|
-
lp_element = agent.page.search("//td[text()='Price:']/following-sibling::td")
|
42
|
-
end
|
43
|
-
|
44
|
-
if lp_element
|
45
|
-
return lp_element.text.gsub(/[^.\d]/, "")
|
46
|
-
else
|
47
|
-
return nil
|
48
|
-
end
|
49
|
-
|
50
|
-
end
|
51
|
-
|
52
|
-
def current_price
|
53
|
-
current_price_element = agent.page.search("//span[@id='priceblock_saleprice']").first
|
54
|
-
if current_price_element
|
55
|
-
return current_price_element.text
|
56
|
-
else
|
57
|
-
LOG.debug "Looks like no sale is going on. Returning list price"
|
58
|
-
return list_price
|
59
|
-
end
|
60
|
-
end
|
61
|
-
|
62
|
-
|
63
|
-
def rating_text
|
64
|
-
result = agent.page.search("//div[@id='averageCustomerReviews']//span[@title]").first
|
65
|
-
if result
|
66
|
-
return result[:title]
|
67
|
-
end
|
68
|
-
|
69
|
-
result = agent.page.search("div.acrRating").first
|
70
|
-
if result
|
71
|
-
return result.text
|
72
|
-
end
|
73
|
-
|
74
|
-
return nil
|
75
|
-
end
|
76
|
-
|
77
|
-
def rating
|
78
|
-
text = rating_text
|
79
|
-
if text
|
80
|
-
m = RatingRegex.match(text)
|
81
|
-
if m and m[1]
|
82
|
-
return m[1].to_f
|
83
|
-
end
|
84
|
-
end
|
85
|
-
|
86
|
-
return nil
|
87
|
-
end
|
88
|
-
|
89
|
-
def reviews
|
90
|
-
reviews_element = agent.page.search("//div[@id='summaryStars']/a")
|
91
|
-
if reviews_element
|
92
|
-
text = reviews_element.text.gsub(/[^\d]/, "")
|
93
|
-
|
94
|
-
return text.to_i unless text.empty?
|
95
|
-
end
|
96
|
-
return nil
|
97
|
-
end
|
98
|
-
|
99
|
-
|
100
|
-
def details_hash
|
101
|
-
return {
|
102
|
-
title: title,
|
103
|
-
url: url,
|
104
|
-
list_price: list_price,
|
105
|
-
current_price: current_price,
|
106
|
-
rating: rating,
|
107
|
-
reviews: reviews
|
108
|
-
}
|
109
|
-
end
|
110
|
-
|
111
|
-
|
112
|
-
def grab(url)
|
113
|
-
agent.get(url)
|
114
|
-
details_hash
|
115
|
-
end
|
116
|
-
|
117
|
-
end
|
118
|
-
|
119
|
-
end
|
120
1
|
|
2
|
+
require 'amazon_deets/factories'
|
@@ -0,0 +1,85 @@
|
|
1
|
+
|
2
|
+
require 'logbert'
|
3
|
+
require 'mechanize'
|
4
|
+
|
5
|
+
|
6
|
+
module AmazonDeets
|
7
|
+
|
8
|
+
# Basic interface for the scrapers. Point it to
|
9
|
+
# a URL, and it does the scrape. BOOM!
|
10
|
+
class AbstractScraper
|
11
|
+
|
12
|
+
def scrape(url)
|
13
|
+
raise NotImplementedError
|
14
|
+
end
|
15
|
+
|
16
|
+
end
|
17
|
+
|
18
|
+
|
19
|
+
|
20
|
+
class MechanizedScraper < AbstractScraper
|
21
|
+
|
22
|
+
attr_accessor :agent
|
23
|
+
attr_accessor :fragments
|
24
|
+
|
25
|
+
def initialize(agent: Mechanize.new, fragments: Array.new)
|
26
|
+
@agent = agent
|
27
|
+
@fragments = fragments
|
28
|
+
end
|
29
|
+
|
30
|
+
def scrape(url)
|
31
|
+
agent.get(url)
|
32
|
+
fragments.each do |f|
|
33
|
+
if f.applicable?(agent)
|
34
|
+
return f.scrape(agent)
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
end
|
40
|
+
|
41
|
+
|
42
|
+
# Amazon renders different HTML depending upon
|
43
|
+
# the type of product that you are viewing. This
|
44
|
+
# means that the scraper queries need to change
|
45
|
+
# depending upon whether you want the data for a
|
46
|
+
# Kindle book or some general merchandise. Rather
|
47
|
+
# than building one super-complicated scraper, we'll
|
48
|
+
# break the code into multiple simple scrapers that
|
49
|
+
# focus on solving specific problems.
|
50
|
+
#
|
51
|
+
class MechanizedFragment
|
52
|
+
|
53
|
+
# Decides whether or not this MechanizedFragment
|
54
|
+
# is applicable
|
55
|
+
def applicable?(agent)
|
56
|
+
raise NotImplementedError
|
57
|
+
end
|
58
|
+
|
59
|
+
def scrape(agent)
|
60
|
+
raise NotImplementedError
|
61
|
+
end
|
62
|
+
|
63
|
+
end
|
64
|
+
|
65
|
+
|
66
|
+
# A MechanizedContext is similar to a scraper, but it
|
67
|
+
# assumes that the @agent has already navigated to
|
68
|
+
# the URL that is going to be scraped.
|
69
|
+
class MechanizedContext
|
70
|
+
|
71
|
+
attr_accessor :agent
|
72
|
+
|
73
|
+
def initialize(agent: Mechanize.new)
|
74
|
+
@agent = agent
|
75
|
+
end
|
76
|
+
|
77
|
+
|
78
|
+
def scrape
|
79
|
+
raise NotImplementedError
|
80
|
+
end
|
81
|
+
|
82
|
+
end
|
83
|
+
|
84
|
+
end
|
85
|
+
|
@@ -0,0 +1,18 @@
|
|
1
|
+
|
2
|
+
require 'amazon_deets/general_merchandise'
|
3
|
+
require 'amazon_deets/kindle'
|
4
|
+
|
5
|
+
module AmazonDeets
|
6
|
+
|
7
|
+
def self.create_scraper(agent: Mechanize.new)
|
8
|
+
MechanizedScraper.new(
|
9
|
+
agent: agent,
|
10
|
+
fragments: [
|
11
|
+
KindleFragment.new,
|
12
|
+
GeneralMerchandiseFragment.new
|
13
|
+
]
|
14
|
+
)
|
15
|
+
end
|
16
|
+
|
17
|
+
end
|
18
|
+
|
@@ -0,0 +1,95 @@
|
|
1
|
+
|
2
|
+
require 'logbert'
|
3
|
+
require 'mechanize'
|
4
|
+
|
5
|
+
require 'amazon_deets/core'
|
6
|
+
|
7
|
+
module AmazonDeets
|
8
|
+
|
9
|
+
class GeneralMerchandiseFragment < MechanizedFragment
|
10
|
+
|
11
|
+
def applicable?(agent)
|
12
|
+
agent.page.search("h1#title").any?
|
13
|
+
end
|
14
|
+
|
15
|
+
def scrape(agent)
|
16
|
+
context = Context.new(agent: agent)
|
17
|
+
return context.scrape
|
18
|
+
end
|
19
|
+
|
20
|
+
|
21
|
+
class Context < MechanizedContext
|
22
|
+
LOG = Logbert[self]
|
23
|
+
|
24
|
+
RatingRegex = /(.+)\s+out\sof/
|
25
|
+
|
26
|
+
def title
|
27
|
+
result = agent.page.search("//h1[@id='title']").first
|
28
|
+
if result
|
29
|
+
return result.text.strip
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
def url
|
34
|
+
agent.page.uri.to_s
|
35
|
+
end
|
36
|
+
|
37
|
+
def list_price
|
38
|
+
lp_element = agent.page.search("//span[@id='priceblock_ourprice']").first
|
39
|
+
if lp_element.nil?
|
40
|
+
lp_element = agent.page.search("//td[text()='Price:']/following-sibling::td")
|
41
|
+
end
|
42
|
+
|
43
|
+
if lp_element
|
44
|
+
return lp_element.text.gsub(/[^.\d]/, "")
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
def current_price
|
49
|
+
cp_element = agent.page.search("//span[@id='priceblock_saleprice']").first
|
50
|
+
if cp_element
|
51
|
+
return cp_element.text
|
52
|
+
else
|
53
|
+
LOG.debug "Looks like no sale is going on. Returning list price"
|
54
|
+
return list_price
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
def rating
|
59
|
+
result = agent.page.search("//div[@id='averageCustomerReviews']//span[@title]").first
|
60
|
+
if result
|
61
|
+
m = RatingRegex.match result[:title]
|
62
|
+
if m and m[1]
|
63
|
+
return m[1]
|
64
|
+
end
|
65
|
+
end
|
66
|
+
end
|
67
|
+
|
68
|
+
def reviews
|
69
|
+
reviews_element = agent.page.search("//div[@id='averageCustomerReviews']//a[contains(text(), 'reviews')]")
|
70
|
+
if reviews_element
|
71
|
+
text = reviews_element.text.gsub(/[^\d]/, "")
|
72
|
+
return text.to_i unless text.empty?
|
73
|
+
else
|
74
|
+
LOG.warning "Reviews element could not be found"
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
|
79
|
+
def scrape
|
80
|
+
return {
|
81
|
+
title: title,
|
82
|
+
url: url,
|
83
|
+
list_price: list_price,
|
84
|
+
current_price: current_price,
|
85
|
+
rating: rating,
|
86
|
+
reviews: reviews
|
87
|
+
}
|
88
|
+
end
|
89
|
+
|
90
|
+
end
|
91
|
+
|
92
|
+
end
|
93
|
+
|
94
|
+
end
|
95
|
+
|
@@ -0,0 +1,90 @@
|
|
1
|
+
|
2
|
+
require 'logbert'
|
3
|
+
require 'mechanize'
|
4
|
+
|
5
|
+
require 'amazon_deets/core'
|
6
|
+
|
7
|
+
module AmazonDeets
|
8
|
+
|
9
|
+
class KindleFragment < MechanizedFragment
|
10
|
+
|
11
|
+
def applicable?(agent)
|
12
|
+
agent.page.search("div.kindleBanner").any?
|
13
|
+
end
|
14
|
+
|
15
|
+
def scrape(agent)
|
16
|
+
context = Context.new(agent: agent)
|
17
|
+
return context.scrape
|
18
|
+
end
|
19
|
+
|
20
|
+
|
21
|
+
class Context < MechanizedContext
|
22
|
+
LOG = Logbert[self]
|
23
|
+
|
24
|
+
RatingRegex = /(.+)\s+out\sof/
|
25
|
+
|
26
|
+
def title
|
27
|
+
result = agent.page.search("span#btAsinTitle").first
|
28
|
+
if result
|
29
|
+
return result.text.strip
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
def url
|
34
|
+
agent.page.uri.to_s
|
35
|
+
end
|
36
|
+
|
37
|
+
def list_price
|
38
|
+
lp_element = agent.page.search("td.listPrice").first
|
39
|
+
if lp_element
|
40
|
+
return lp_element.text.gsub(/[^.\d]/, "")
|
41
|
+
end
|
42
|
+
end
|
43
|
+
|
44
|
+
def current_price
|
45
|
+
cp_element = agent.page.search("td b.priceLarge").first
|
46
|
+
if cp_element
|
47
|
+
return cp_element.text.gsub(/[^.\d]/, "")
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
def rating
|
52
|
+
result = agent.page.search("span.crAvgStars span[title$='5 stars']").first
|
53
|
+
if result
|
54
|
+
m = RatingRegex.match result[:title]
|
55
|
+
LOG.info result[:title]
|
56
|
+
if m and m[1]
|
57
|
+
return m[1]
|
58
|
+
end
|
59
|
+
else
|
60
|
+
LOG.warning "Unable to locate rating element"
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
def reviews
|
65
|
+
reviews_element = agent.page.search("//span[@class='crAvgStars']/a[contains(text(), 'reviews')]")
|
66
|
+
if reviews_element
|
67
|
+
text = reviews_element.text.gsub(/[^\d]/, "")
|
68
|
+
return text.to_i unless text.empty?
|
69
|
+
else
|
70
|
+
LOG.warning "Reviews element could not be found"
|
71
|
+
end
|
72
|
+
end
|
73
|
+
|
74
|
+
def scrape
|
75
|
+
return {
|
76
|
+
title: title,
|
77
|
+
url: url,
|
78
|
+
list_price: list_price,
|
79
|
+
current_price: current_price,
|
80
|
+
rating: rating,
|
81
|
+
reviews: reviews
|
82
|
+
}
|
83
|
+
end
|
84
|
+
|
85
|
+
end
|
86
|
+
|
87
|
+
end
|
88
|
+
|
89
|
+
end
|
90
|
+
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: amazon_deets
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Brian Lauber
|
@@ -44,8 +44,12 @@ executables: []
|
|
44
44
|
extensions: []
|
45
45
|
extra_rdoc_files: []
|
46
46
|
files:
|
47
|
+
- lib/amazon_deets/core.rb
|
48
|
+
- lib/amazon_deets/factories.rb
|
49
|
+
- lib/amazon_deets/general_merchandise.rb
|
50
|
+
- lib/amazon_deets/kindle.rb
|
47
51
|
- lib/amazon_deets.rb
|
48
|
-
homepage:
|
52
|
+
homepage: https://github.com/briandamaged/amazon_deets
|
49
53
|
licenses:
|
50
54
|
- MIT
|
51
55
|
metadata: {}
|