scrapers 2.1.0 → 3.0.0

Files changed (43)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/ChangeLog +7 -0
  4. data/Gemfile +0 -8
  5. data/Guardfile +1 -1
  6. data/bin/rubytapas +2 -75
  7. data/lib/scrapers.rb +1 -3
  8. data/lib/scrapers/manning_books.rb +37 -27
  9. data/lib/scrapers/rubytapas.rb +6 -81
  10. data/lib/scrapers/rubytapas/cli.rb +39 -0
  11. data/lib/scrapers/rubytapas/config.rb +11 -0
  12. data/lib/scrapers/rubytapas/dpdcart.rb +115 -0
  13. data/lib/scrapers/rubytapas/episode.rb +86 -0
  14. data/lib/scrapers/rubytapas/scraper.rb +142 -0
  15. data/lib/scrapers/version.rb +2 -2
  16. data/scrapers.gemspec +4 -1
  17. data/spec/lib/scrapers/rubytapas/dpdcart_spec.rb +68 -0
  18. data/spec/lib/scrapers/rubytapas/episode_spec.rb +140 -0
  19. data/spec/lib/scrapers/rubytapas/rubytapas_spec.rb +87 -0
  20. data/spec/lib/scrapers/rubytapas/scraper_spec.rb +83 -0
  21. data/spec/lib/scrapers/rubytapas/test_data/feed.xml +7038 -0
  22. data/spec/lib/scrapers/{wunderground_spec.rb → wunderground_spec.rb.no} +0 -0
  23. data/spec/scrapers/allrecipes_spec.rb +2 -2
  24. data/spec/scrapers/discoverynews_spec.rb +3 -14
  25. data/spec/scrapers/download_spec.rb +6 -16
  26. data/spec/scrapers/gocomics_spec.rb +3 -3
  27. data/spec/scrapers/imgur_spec.rb +10 -22
  28. data/spec/scrapers/manning_books_spec.rb +9 -6
  29. data/spec/scrapers/nasa_apod_spec.rb +12 -14
  30. data/spec/scrapers/sinfest_spec.rb +3 -3
  31. data/spec/scrapers/xkcd_spec.rb +1 -0
  32. data/spec/scrapers_spec.rb +2 -1
  33. data/spec/spec_helper.rb +1 -8
  34. data/spec/support/dir_helpers.rb +13 -0
  35. data/spec/support/use_vcr.rb +9 -0
  36. data/vcr_cassettes/nasa-apod.yml +348 -0
  37. data/vcr_cassettes/rubytapas-download-1.yml +6726 -0
  38. data/vcr_cassettes/rubytapas-download-all.yml +6726 -0
  39. data/vcr_cassettes/rubytapas_download.yml +982 -0
  40. data/vcr_cassettes/rubytapas_download_twice.yml +1064 -0
  41. data/vcr_cassettes/rubytapas_feed.yml +5880 -0
  42. data/vcr_cassettes/rubytapas_login.yml +849 -0
  43. metadata +74 -6
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 7a122441c7a4d715eded98e92a58d31d6b00f21c
-  data.tar.gz: 74e3ad669f233d43542155819d2224499c062d5e
+  metadata.gz: 6a3e0b5e2fa58fb8686e8c350b1619dce0e85e0f
+  data.tar.gz: 9b2f65ddff548a914a2b5e9d8155f725568c5708
 SHA512:
-  metadata.gz: c7fe23236b2a325eec855f865aa687b329d0cc3b470cad66f56623df7b4833f11fc6167871cfa6b10351da1d0c3747b40d44b64bc0057bb563735b234cf15a56
-  data.tar.gz: a22303c8d58b65795a5811c6f997b47ff3a4bc64d60849e83e43bc5794650222718ee106f60891c344acd210524e00c15a2074958732d14b50a54bd8b2d3e57c
+  metadata.gz: e67a79dccb7e436e9b629a63157ed44c0bdf76e83f99bbe8bc2d33f4edab9c0b68401d1c1da554077871b7f0affde40bb4a2f33508db20b6726c1c2f7781d549
+  data.tar.gz: d8f62b7d23d09253ecaba8f37f26b0852019e0b4e8403feafaf109791896dd89d491edb99eeffec87b8fdf1946a0e4b8a40f3038f6706686b79c44e61554c2a1
data/.gitignore CHANGED
@@ -18,3 +18,4 @@ tmp
 .rspec
 .tapas
 tmp/
+TAGS
data/ChangeLog ADDED
@@ -0,0 +1,7 @@
+2014-12-27 Tamara Temple <tamouse@gmail.com>
+
+* Replace lib/scrapers/rubytapas/downloader.rb with
+  lib/scrapers/rubytapas/dpdcart.rb and restructure
+  lib/scrapers/rubytapas/scraper.rb to use it as a service gateway
+  object.
+
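The restructuring described above means the scraper no longer touches dpdcart pages directly; it goes through the DpdCart gateway object. A minimal sketch of driving the restructured scraper from Ruby, mirroring what the new CLI does (the episode number and options hash are illustrative; the constructor and scrape!/list! calls are taken from cli.rb below):

    require 'scrapers/rubytapas'

    # Download one episode into the current directory (what
    # `rubytapas download 177` does under the hood).
    Scrapers::RubyTapas::Scraper.new("177", "destination" => ".").scrape!

    # List available episodes (what `rubytapas list` does).
    Scrapers::RubyTapas::Scraper.new(nil, {}).list!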
data/Gemfile CHANGED
@@ -2,11 +2,3 @@ source 'https://rubygems.org'
 
 # Specify your gem's dependencies in scrapers.gemspec
 gemspec
-
-group :development, :test do
-  gem "pry"
-  gem "pry-byebug"
-  gem "pry-nav"
-  gem "pry-rescue"
-  gem "pry-stack_explorer"
-end
data/Guardfile CHANGED
@@ -1,4 +1,4 @@
-guard :rspec do
+guard :rspec, cmd: 'bundle exec rspec -f d --fail-fast' do
   watch(%r{^spec/.+_spec\.rb$})
   watch(%r{^lib/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
   watch('spec/spec_helper.rb') { "spec" }
data/bin/rubytapas CHANGED
@@ -1,77 +1,4 @@
 #!/usr/bin/env ruby
-require 'thor'
-require 'netrc'
-require 'scrapers/rubytapas'
+require 'scrapers/rubytapas/cli'
 
-################################################################################
-#
-# Scraper for RubyTapas episodes.
-#
-# (Note: you need to subscribe to RubyTapas to download episodes.)
-#
-################################################################################
-
-class RubyTapasDownload < Thor
-
-  RUBYTAPAS="rubytapas.dpdcart.com"
-  RUBYTAPAS_EPISODE_URL = "https://#{RUBYTAPAS}/subscriber/post?id=\#{episode}"
-
-  desc "download", "Downloads the listed episode's files into a new directory with the episode tag in the given directory"
-  method_option :destination, :aliases => %w{-d --dest}, :desc => "Destination to store the downloads", :default => "."
-  method_option :url, :desc => "url to episode downloads (overrides episode)"
-  method_option :episode, :aliases => %w{-e --ep}, :desc => "Episode number"
-  method_option :user, :aliases => %w{-u -U}, :desc => "dpdcart user. Default is read from $HOME/.netrc"
-  method_option :password, :aliases => %w{-p -pw}, :desc => "dpdcart password. Default is read from $HOME/.netrc"
-
-  def download
-    netrc = Netrc.read
-    user, pw = netrc[RUBYTAPAS]
-    user = options.fetch("user", user)
-    pw = options.fetch("password", pw)
-    url = options.fetch("url", nil)
-    episode = options.fetch("episode", nil)
-    destination = options.fetch("destination", nil)
-    STDERR.puts "destination: #{destination}, episode: #{episode}, url: #{url}, user: #{user}, pw: #{pw.length}"
-    unless url
-      raise "Must give episode or full url" unless episode
-      url = RUBYTAPAS_EPISODE_URL.sub(%r[\#{episode}], episode)
-    end
-    Scrapers::RubyTapas.scrape url, user, pw, destination
-  end
-
-  desc "all", "Download all rubytapas episodes"
-  method_option :destination, :aliases => %w{-d --dest}, :desc => "Destination of download", :default => '.'
-  method_option :url, :desc => "url of showlist", :default => 'https://rubytapas.dpdcart.com/subscriber/content'
-  method_option :user, :aliases => %w{-u -U}, :desc => "dpdcart user. Default is read from $HOME/.netrc"
-  method_option :password, :aliases => %w{-p -pw}, :desc => "dpdcart password. Default is read from $HOME/.netrc"
-
-  def all
-    STDERR.puts options.inspect
-
-    netrc = Netrc.read
-    user, pw = netrc[RUBYTAPAS]
-    user = options.fetch("user", user)
-    pw = options.fetch("password", pw)
-    url = options.fetch("url", nil)
-    destination = options.fetch("destination", nil)
-    STDERR.puts "destination: #{destination}, url: #{url}, user: #{user}, pw: #{pw.length}"
-    raise "Must give url" unless url
-
-    showlist_urls = Scrapers::RubyTapas.showlist(url, user, pw)
-
-    showlist_urls.each do |url|
-      begin
-        Scrapers::RubyTapas.scrape url, user, pw, destination
-      rescue Errno::EEXIST
-        puts "episode exists, skipping"
-      end
-      print "pausing..."
-      sleep 5
-      puts "."
-    end
-
-  end
-
-end
-
-RubyTapasDownload.start
+Scrapers::RubyTapas::CLI.start(ARGV)
data/lib/scrapers.rb CHANGED
@@ -1,16 +1,14 @@
 require 'mechanize'
 require 'uri'
-Dir[File.join(File.expand_path('../', __FILE__),'**','*.rb')].each {|file| require file}
 
 module Scrapers
   def self.agent()
     @agent ||= Mechanize.new
   end
 
-  def self.base(url)
+  def self.base_url(url)
     u = URI.parse(url)
     u.path=''
     u.to_s
   end
-
 end
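The rename from Scrapers.base to Scrapers.base_url is behavior-preserving: the helper strips the path from a URL, leaving scheme and host. A quick sketch of the expected result (note that only the path is cleared, so a query string would survive; callers pass path-only URLs):

    require 'scrapers'

    Scrapers.base_url("https://rubytapas.dpdcart.com/subscriber/content")
    # => "https://rubytapas.dpdcart.com"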
data/lib/scrapers/manning_books.rb CHANGED
@@ -22,18 +22,20 @@ module Scrapers
       end
 
       def scrape
-        Mechanize.start do |m|
-          login(m) do |m|
-            book_downloads = m.current_page.links_with(:href => %r{/account/bookProduct/download})
-            Dir.chdir(destination) do |dir|
-              @results = download_books(m, book_downloads)
+        @results = nil
+        Dir.chdir(destination) do |dir|
+
+          Mechanize.start do |m|
+            login(m) do |m|
+              books = build_book_list(m.current_page)
+              @results = download_books(m, books)
             end
           end
-        end
 
-        Hash[@results]
+        end
+        @results
       end
-
+
       def login(agent, &block)
         raise "Must provide a block to execute after logged in to site" unless block_given?
 
@@ -49,6 +51,33 @@ module Scrapers
         yield agent
       end
 
+      def build_book_list(page)
+        page.search('.book').map do |book|
+          {
+            title: book.at('[data-type=title]').children.first.text,
+            downloads: book.at('.book_downloads').search('a').map do |link|
+              [link.children.first.text.downcase.to_sym, link.attr(:href)]
+            end.to_h
+          }
+        end
+      end
+
+      def download_books(agent, books)
+        books.map do |book|
+          puts "Retrieving #{book[:title]}"
+          downloads = book[:downloads].map do |type, href|
+            next unless %i[pdf epub kindle].include?(type)
+            print "  downloading #{type} ..."
+            agent.get href unless dry_run
+            agent.current_page.save! unless dry_run
+            puts "saved #{agent.current_page.filename}"
+            [agent.current_page.filename, href]
+          end.compact.to_h
+          wait_a_bit delay_time
+          [book[:title], downloads]
+        end.to_h
+      end
+
       def wait_a_bit(delay)
         puts "delaying for #{delay} second(s)"
         %w[- * | +].cycle do |c|
@@ -60,25 +89,6 @@ module Scrapers
         print "\r"
       end
 
-
-      def download_books(agent, books)
-        books.map do |book|
-          bookname = book.node.parent.parent.parent.parent.at_css('h1').text
-          puts "Downloading #{bookname} from #{book.href}"
-          if dry_run
-            warn "dry run, not saving"
-          else
-            agent.get book.href
-            puts "Saving #{agent.current_page.filename}"
-            agent.current_page.save! # overwrite!
-          end
-
-          wait_a_bit delay_time
-          [agent.current_page.filename, agent.current_page.uri.to_s]
-        end
-      end
-
     end
   end
 end
-
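The new build_book_list isolates page parsing from downloading. Its selectors (.book, [data-type=title], .book_downloads) imply markup roughly like the hypothetical fixture below; this sketch runs the same parsing logic against it with plain Nokogiri (Mechanize pages respond to search/at the same way):

    require 'nokogiri'

    # Hypothetical fixture approximating the Manning account page;
    # the real page supplies one .book node per purchased title.
    html = Nokogiri::HTML(<<~HTML)
      <div class="book">
        <h1 data-type="title">Example Book</h1>
        <div class="book_downloads">
          <a href="/account/bookProduct/download?id=1&format=pdf">PDF</a>
          <a href="/account/bookProduct/download?id=1&format=epub">EPUB</a>
        </div>
      </div>
    HTML

    books = html.search('.book').map do |book|
      {
        title: book.at('[data-type=title]').children.first.text,
        downloads: book.at('.book_downloads').search('a').map { |link|
          [link.children.first.text.downcase.to_sym, link.attr('href')]
        }.to_h
      }
    end
    # => [{:title=>"Example Book",
    #      :downloads=>{:pdf=>"/account/bookProduct/download?id=1&format=pdf",
    #                   :epub=>"/account/bookProduct/download?id=1&format=epub"}}]

download_books then keeps only the :pdf, :epub, and :kindle entries (the `next unless` plus `.compact`) and fetches each one, pausing between books via wait_a_bit.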
data/lib/scrapers/rubytapas.rb CHANGED
@@ -1,88 +1,13 @@
-require 'fileutils'
-require 'ostruct'
-require 'mechanize'
-require 'uri'
+require 'scrapers/rubytapas/config'
+require 'scrapers/rubytapas/cli'
+require 'scrapers/rubytapas/scraper'
+require 'scrapers/rubytapas/episode'
+require 'scrapers/rubytapas/dpdcart'
 
 module Scrapers
-
   module RubyTapas
 
-    module_function
-
-    # Save the post and attachments from an episode of RubyTapas
-    # in a directory determined from the episode title.
-    #
-    # Example:
-    #   episode url: "https://rubytapas.dpdcart.com/subscriber/post?id=443"
-    #   title: "177 Aliasing | RubyTapas"
-    #   subdirectory: /177-aliasing
-    #
-    # Parameters:
-    #
-    # * *url* - url of the episode to download
-    # * *user* - username used to log into dpdcart
-    # * *pw* - password used with username
-    # * *dest* - destination directory to put episode subdirectory
-    #
-    def scrape(url=nil, user=nil, pw=nil, dest=".")
-      raise "Must give user and password for RubyTapas downloads" if user.to_s.empty? or pw.to_s.empty?
-      dest = File.realdirpath(dest)
-      raise "Destination #{dest} must be a writeable directory" unless File.directory?(dest) and File.writable?(dest)
-
-      Mechanize.start do |m|
-
-        tapas = OpenStruct.new
-
-        m = self.login(m, url, user, pw)
-
-        m.current_page.tap do |page|
-          tapas.title = page.title.strip
-          tapas.episode_dir = File.join(dest,tapas.title.split("|").first.strip.downcase.gsub(%r{[^[:alnum:][:space:]]},' ').gsub(%r{[[:space:]]+},'-'))
-          tapas.attachments = page.links_with(:href => %r{\bdownload\b})
-          puts "Fetching and saving #{tapas.title} into #{tapas.episode_dir}"
-          FileUtils.mkdir(tapas.episode_dir)
-          Dir.chdir(tapas.episode_dir) do |dir|
-            tapas.attachments.each do |att|
-              puts "fetching #{att.text}"
-              file = att.click
-              puts "saving #{file.filename}"
-              file.save
-            end
-          end
-        end
-
-        tapas
-
-      end
-    end
-
-    # retrieve a list of URLs for shows from the showlist
-    def self.showlist(showlist_url, user=nil, pw=nil)
-      raise "Must give showlist url, user, and password" if showlist_url.to_s.empty? || user.to_s.empty? || pw.to_s.empty?
-
-      Mechanize.start do |m|
-        m = self.login(m, showlist_url, user, pw)
-        links = m.current_page.links_with(:text => "Read More")
-        s = URI.parse(showlist_url)
-        s.path = ''
-        links.map{|l| "#{s}#{l.href}" }
-      end
-
-
-    end
-
-    def self.login(m, url, user, pw)
-      # First time, we will get redirected to the login page
-      m.get url
-      m.current_page.form.field_with(:name => "username").value = user
-      m.current_page.form.field_with(:name => "password").value = pw
-      m.current_page.form.submit
-
-      # Second time, we should land on episode page
-      m.get url
-      raise "Not where I expected. #{m.current_page.uri} is not #{url}" unless m.current_page.uri != url
-      m
-    end
+    VERSION = "3.0.0"
 
   end
 end
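For reference, the removed scrape method derived each episode's directory from the page title, as its doc comment shows. Tracing the retired gsub chain on the documented example (presumably the new Episode class carries this logic forward; its diff is not shown here):

    title = "177 Aliasing | RubyTapas"
    title.split("|").first.strip.downcase
         .gsub(%r{[^[:alnum:][:space:]]}, ' ')
         .gsub(%r{[[:space:]]+}, '-')
    # => "177-aliasing"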
data/lib/scrapers/rubytapas/cli.rb ADDED
@@ -0,0 +1,39 @@
+require 'thor'
+require 'scrapers/version'
+require 'scrapers/rubytapas'
+
+module Scrapers
+  module RubyTapas
+
+    # Thor script that handles things with Avdi Grimm's RubyTapas
+    class CLI < Thor
+
+      # Download an episode, or all episodes.
+      desc "download EPISODE", "Downloads the listed episode's files into a new directory with the episode tag in the given directory. Specifying ALL for the episode number downloads all episodes."
+      method_option :debug, type: :boolean
+      method_option :dry_run, type: :boolean
+      method_option :destination, :aliases => %w{-d}, :desc => "Destination to store the downloads. Default is the current working directory.", :default => "."
+      method_option :user, :aliases => %w{-u}, :desc => "dpdcart user. Default is read from $HOME/.netrc"
+      method_option :pw, :aliases => %w{-p}, :desc => "dpdcart password. Default is read from $HOME/.netrc"
+      def download(episode)
+        Scrapers::RubyTapas::Scraper.new(episode, options).scrape!
+      end
+
+      # Get a list of available episodes
+      desc "list", "Show a list of the available episodes"
+      method_option :user, :aliases => %w{-u}, :desc => "dpdcart user. Default is read from $HOME/.netrc"
+      method_option :pw, :aliases => %w{-p}, :desc => "dpdcart password. Default is read from $HOME/.netrc"
+      def list()
+        Scrapers::RubyTapas::Scraper.new(nil, options).list!
+      end
+
+      # Version Info
+      desc "version", "Show the rubytapas and scrapers library version info"
+      def version
+        say "rubytapas version: #{Scrapers::RubyTapas::VERSION}. scrapers version: #{Scrapers::VERSION}."
+      end
+
+    end
+
+  end
+end
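Because Thor's .start takes an argv array, the new commands can be exercised from Ruby exactly as from the rubytapas binstub (the episode number and destination below are illustrative):

    require 'scrapers/rubytapas/cli'

    # Equivalent to: rubytapas download 177 -d /tmp/tapas --dry-run
    Scrapers::RubyTapas::CLI.start(%w[download 177 -d /tmp/tapas --dry-run])

    # Equivalent to: rubytapas version
    Scrapers::RubyTapas::CLI.start(%w[version])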
data/lib/scrapers/rubytapas/config.rb ADDED
@@ -0,0 +1,11 @@
+module Scrapers
+  module RubyTapas
+
+    RUBYTAPAS_HOST="rubytapas.dpdcart.com"
+    CONTENTS_PATH = "/subscriber/content"
+    POST_PATH = "/post"
+    FEED_PATH = "/feed"
+    POST_QUERY = "id"
+
+  end
+end
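These constants centralize the dpdcart endpoint pieces. A small sketch of how they compose (FEED_PATH yields the same feed URL that DpdCart below derives from its own constants; the exact joins used for post URLs live in the scraper code, which is not shown in this hunk):

    require 'scrapers/rubytapas/config'

    host = Scrapers::RubyTapas::RUBYTAPAS_HOST
    "https://#{host}#{Scrapers::RubyTapas::FEED_PATH}"
    # => "https://rubytapas.dpdcart.com/feed"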
data/lib/scrapers/rubytapas/dpdcart.rb ADDED
@@ -0,0 +1,115 @@
+require 'netrc'
+require 'mechanize'
+
+module Scrapers
+  module RubyTapas
+
+    # DpdCart is a remote service gateway object (Gateway Pattern)
+    # that provides a connection to rubytapas.dpdcart.com where the
+    # RubyTapas episodes and download files are available, as well as
+    # the episode feed.
+    class DpdCart
+
+      RUBYTAPAS_HOST = 'rubytapas.dpdcart.com'
+      ENV_RUBYTAPAS_USER = 'RUBYTAPAS_USER'
+      ENV_RUBYTAPAS_PASSWORD = 'RUBYTAPAS_PASSWORD'
+      LOGIN_PATH = '/subscriber/login'
+      LOGIN_URL = "https://#{RUBYTAPAS_HOST}#{LOGIN_PATH}"
+      FEED_PATH = '/feed'
+      FEED_URL = "https://#{RUBYTAPAS_HOST}#{FEED_PATH}"
+      CONTENT_PATH = "/subscriber/content"
+      CONTENT_URL = "https://#{RUBYTAPAS_HOST}#{CONTENT_PATH}"
+
+      # User name for dpdcart account
+      attr_accessor :user
+
+      # Password for dpdcart account
+      attr_accessor :password
+
+      attr_accessor :dry_run, :debug
+
+      # Create a new instance of the DpdCart gateway.
+      #
+      # @param user [String] - the DpdCart account name, typically an
+      #    email address.
+      # @param password [String] - password associated with the
+      #    account.
+      #
+      # If the user and password are empty, the information will be
+      # obtained in the following order:
+      #
+      # - reading the environment variables `RUBYTAPAS_USER` and
+      #   `RUBYTAPAS_PASSWORD`
+      #
+      # - reading the user's `$HOME/.netrc` file and pulling the
+      #   credentials that match the host name for the rubytapas
+      #   account.
+      #
+      # If no credentials can be found, it will raise an error:
+      # `NoCredentialsError`.
+      def initialize(user=nil, password=nil, options={})
+        self.dry_run = options[:dry_run]
+        self.debug = options[:debug]
+        if user && password
+          @user = user
+          @password = password
+        else
+          @user, @password = get_credentials_from_environment
+          unless user && password
+            @user, @password = get_credentials_from_netrc
+          end
+        end
+        self.agent = Mechanize.new
+      end
+
+      # Return the episode feed from dpdcart
+      def feed!
+        uri = URI(FEED_URL)
+        request = Net::HTTP::Get.new(uri)
+        request.basic_auth user, password
+        Net::HTTP.start(uri.host, uri.port, {:use_ssl => true}) {|http| http.request(request)}.body
+      end
+
+      # Login to dpdcart before downloading
+      def login!
+        page = agent.get LOGIN_URL
+        page.form.field_with(name: "username").value = user
+        page.form.field_with(name: "password").value = password
+        page.form.submit
+        unless agent.page.title.match(/Subscription Content/)
+          raise "Could not log in"
+        end
+        agent
+      end
+
+      # Download the file from dpdcart
+      def download!(file)
+        warn "DEBUG: downloading #{file}" if debug
+        if dry_run
+          warn "DEBUG: download skipped for dry run" if dry_run
+          filename = file
+          body = "no body"
+        else
+          page = agent.get(file) unless dry_run
+          filename = page.filename
+          body = page.body
+        end
+        [ filename, body ]
+      end
+
+      private
+
+      attr_accessor :options, :agent
+
+      def get_credentials_from_environment
+        [ ENV[ENV_RUBYTAPAS_USER], ENV[ENV_RUBYTAPAS_PASSWORD] ]
+      end
+
+      def get_credentials_from_netrc
+        creds = Netrc.read[RUBYTAPAS_HOST]
+        [ creds.login, creds.password ]
+      end
+
+    end
+  end
+end
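An end-to-end sketch of the gateway (credentials resolved from RUBYTAPAS_USER/RUBYTAPAS_PASSWORD or ~/.netrc as documented above; the download path is hypothetical, and parsing the feed as RSS is an assumption suggested by the feed.xml fixture in the specs):

    require 'rss'
    require 'scrapers/rubytapas/dpdcart'

    cart = Scrapers::RubyTapas::DpdCart.new   # no explicit credentials

    # Fetch the feed over HTTP basic auth and list episode titles.
    feed = RSS::Parser.parse(cart.feed!, false)
    feed.items.each { |item| puts item.title }

    # Log in with Mechanize, then fetch one attachment.
    cart.login!
    filename, body = cart.download!("/subscriber/download?file_id=123")  # hypothetical path
    File.binwrite(filename, body)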