scrapers 2.1.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/ChangeLog +7 -0
  4. data/Gemfile +0 -8
  5. data/Guardfile +1 -1
  6. data/bin/rubytapas +2 -75
  7. data/lib/scrapers.rb +1 -3
  8. data/lib/scrapers/manning_books.rb +37 -27
  9. data/lib/scrapers/rubytapas.rb +6 -81
  10. data/lib/scrapers/rubytapas/cli.rb +39 -0
  11. data/lib/scrapers/rubytapas/config.rb +11 -0
  12. data/lib/scrapers/rubytapas/dpdcart.rb +115 -0
  13. data/lib/scrapers/rubytapas/episode.rb +86 -0
  14. data/lib/scrapers/rubytapas/scraper.rb +142 -0
  15. data/lib/scrapers/version.rb +2 -2
  16. data/scrapers.gemspec +4 -1
  17. data/spec/lib/scrapers/rubytapas/dpdcart_spec.rb +68 -0
  18. data/spec/lib/scrapers/rubytapas/episode_spec.rb +140 -0
  19. data/spec/lib/scrapers/rubytapas/rubytapas_spec.rb +87 -0
  20. data/spec/lib/scrapers/rubytapas/scraper_spec.rb +83 -0
  21. data/spec/lib/scrapers/rubytapas/test_data/feed.xml +7038 -0
  22. data/spec/lib/scrapers/{wunderground_spec.rb → wunderground_spec.rb.no} +0 -0
  23. data/spec/scrapers/allrecipes_spec.rb +2 -2
  24. data/spec/scrapers/discoverynews_spec.rb +3 -14
  25. data/spec/scrapers/download_spec.rb +6 -16
  26. data/spec/scrapers/gocomics_spec.rb +3 -3
  27. data/spec/scrapers/imgur_spec.rb +10 -22
  28. data/spec/scrapers/manning_books_spec.rb +9 -6
  29. data/spec/scrapers/nasa_apod_spec.rb +12 -14
  30. data/spec/scrapers/sinfest_spec.rb +3 -3
  31. data/spec/scrapers/xkcd_spec.rb +1 -0
  32. data/spec/scrapers_spec.rb +2 -1
  33. data/spec/spec_helper.rb +1 -8
  34. data/spec/support/dir_helpers.rb +13 -0
  35. data/spec/support/use_vcr.rb +9 -0
  36. data/vcr_cassettes/nasa-apod.yml +348 -0
  37. data/vcr_cassettes/rubytapas-download-1.yml +6726 -0
  38. data/vcr_cassettes/rubytapas-download-all.yml +6726 -0
  39. data/vcr_cassettes/rubytapas_download.yml +982 -0
  40. data/vcr_cassettes/rubytapas_download_twice.yml +1064 -0
  41. data/vcr_cassettes/rubytapas_feed.yml +5880 -0
  42. data/vcr_cassettes/rubytapas_login.yml +849 -0
  43. metadata +74 -6
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
-   metadata.gz: 7a122441c7a4d715eded98e92a58d31d6b00f21c
-   data.tar.gz: 74e3ad669f233d43542155819d2224499c062d5e
+   metadata.gz: 6a3e0b5e2fa58fb8686e8c350b1619dce0e85e0f
+   data.tar.gz: 9b2f65ddff548a914a2b5e9d8155f725568c5708
  SHA512:
-   metadata.gz: c7fe23236b2a325eec855f865aa687b329d0cc3b470cad66f56623df7b4833f11fc6167871cfa6b10351da1d0c3747b40d44b64bc0057bb563735b234cf15a56
-   data.tar.gz: a22303c8d58b65795a5811c6f997b47ff3a4bc64d60849e83e43bc5794650222718ee106f60891c344acd210524e00c15a2074958732d14b50a54bd8b2d3e57c
+   metadata.gz: e67a79dccb7e436e9b629a63157ed44c0bdf76e83f99bbe8bc2d33f4edab9c0b68401d1c1da554077871b7f0affde40bb4a2f33508db20b6726c1c2f7781d549
+   data.tar.gz: d8f62b7d23d09253ecaba8f37f26b0852019e0b4e8403feafaf109791896dd89d491edb99eeffec87b8fdf1946a0e4b8a40f3038f6706686b79c44e61554c2a1
data/.gitignore CHANGED
@@ -18,3 +18,4 @@ tmp
  .rspec
  .tapas
  tmp/
+ TAGS
data/ChangeLog ADDED
@@ -0,0 +1,7 @@
+ 2014-12-27 Tamara Temple <tamouse@gmail.com>
+
+ * Replace lib/scrapers/rubytapas/downloader.rb with
+   lib/scrapers/rubytapas/dpdcart.rb and restructure
+   lib/scrapers/rubytapas/scraper.rb to use it as a service gateway
+   object.
+
data/Gemfile CHANGED
@@ -2,11 +2,3 @@ source 'https://rubygems.org'

  # Specify your gem's dependencies in scrapers.gemspec
  gemspec
-
- group :development, :test do
-   gem "pry"
-   gem "pry-byebug"
-   gem "pry-nav"
-   gem "pry-rescue"
-   gem "pry-stack_explorer"
- end
data/Guardfile CHANGED
@@ -1,4 +1,4 @@
- guard :rspec do
+ guard :rspec, cmd: 'bundle exec rspec -f d --fail-fast' do
    watch(%r{^spec/.+_spec\.rb$})
    watch(%r{^lib/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
    watch('spec/spec_helper.rb') { "spec" }
data/bin/rubytapas CHANGED
@@ -1,77 +1,4 @@
  #!/usr/bin/env ruby
- require 'thor'
- require 'netrc'
- require 'scrapers/rubytapas'
+ require 'scrapers/rubytapas/cli'

- ################################################################################
- #
- # Scraper for RubyTapas episodes.
- #
- # (Note: you need to subscribe to RupyTapas to download episodes.)
- #
- ################################################################################
-
- class RubyTapasDownload < Thor
-
-   RUBYTAPAS="rubytapas.dpdcart.com"
-   RUBYTAPAS_EPISODE_URL = "https://#{RUBYTAPAS}/subscriber/post?id=\#{episode}"
-
-   desc "download", "Downloads the listed episode's files into a new directory with the episode tag in the given directory"
-   method_option :destination, :aliases => %w{-d --dest}, :desc => "Destination to store the downloads", :default => "."
-   method_option :url, :desc => "url to episode downloads (overrides episode)"
-   method_option :episode, :aliases => %w{-e --ep}, :desc => "Episode number"
-   method_option :user, :aliases => %w{-u -U}, :desc => "dpdcart user. Default is read from $HOME/.netrc"
-   method_option :password, :aliases => %w{-p -pw}, :desc => "dpdcart password. Default is read from $HOME/.netrc"
-
-   def download
-     netrc = Netrc.read
-     user, pw = netrc[RUBYTAPAS]
-     user = options.fetch("user", user)
-     pw = options.fetch("password", pw)
-     url = options.fetch("url", nil)
-     episode = options.fetch("episode", nil)
-     destination = options.fetch("destination", nil)
-     STDERR.puts "destination: #{destination}, episode: #{episode}, url: #{url}, user: #{user}, pw: #{pw.length}"
-     unless url
-       raise "Must give episode or full url" unless episode
-       url = RUBYTAPAS_EPISODE_URL.sub(%r[\#{episode}], episode)
-     end
-     Scrapers::RubyTapas.scrape url, user, pw, destination
-   end
-
-   desc "all", "Download all rubytapas episodes"
-   method_option :destination, :aliases => %w{-d --dest}, :desc => "Destination of dowload", :default => '.'
-   method_option :url, :desc => "url of showlist", :default => 'https://rubytapas.dpdcart.com/subscriber/content'
-   method_option :user, :aliases => %w{-u -U}, :desc => "dpdcart user. Default is read from $HOME/.netrc"
-   method_option :password, :aliases => %w{-p -pw}, :desc => "dpdcart password. Default is read from $HOME/.netrc"
-
-   def all
-     STDERR.puts options.inspect
-
-     netrc = Netrc.read
-     user, pw = netrc[RUBYTAPAS]
-     user = options.fetch("user", user)
-     pw = options.fetch("password", pw)
-     url = options.fetch("url", nil)
-     destination = options.fetch("destination", nil)
-     STDERR.puts "destination: #{destination}, url: #{url}, user: #{user}, pw: #{pw.length}"
-     raise "Must give url" unless url
-
-     showlist_urls = Scrapers::RubyTapas.showlist(url, user, pw)
-
-     showlist_urls.each do |url|
-       begin
-         Scrapers::RubyTapas.scrape url, user, pw, destination
-       rescue Errno::EEXIST
-         puts "episode exists, skipping"
-       end
-       print "pausing..."
-       sleep 5
-       puts "."
-     end
-
-   end
-
- end
-
- RubyTapasDownload.start
+ Scrapers::RubyTapas::CLI.start(ARGV)
data/lib/scrapers.rb CHANGED
@@ -1,16 +1,14 @@
  require 'mechanize'
  require 'uri'
- Dir[File.join(File.expand_path('../', __FILE__),'**','*.rb')].each {|file| require file}

  module Scrapers
    def self.agent()
      @agent ||= Mechanize.new
    end

-   def self.base(url)
+   def self.base_url(url)
      u = URI.parse(url)
      u.path=''
      u.to_s
    end
-
  end
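
The rename of Scrapers.base to Scrapers.base_url is cosmetic; the helper still parses a URL and strips its path. A minimal sketch of how a caller might use it (the URL is illustrative):

    require 'scrapers'

    # base_url drops the path portion and returns the scheme + host.
    Scrapers.base_url("https://rubytapas.dpdcart.com/subscriber/content")
    # => "https://rubytapas.dpdcart.com"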
data/lib/scrapers/manning_books.rb CHANGED
@@ -22,18 +22,20 @@ module Scrapers
        end

        def scrape
-         Mechanize.start do |m|
-           login(m) do |m|
-             book_downloads = m.current_page.links_with(:href => %r{/account/bookProduct/download})
-             Dir.chdir(destination) do |dir|
-               @results = download_books(m, book_downloads)
+         @results = nil
+         Dir.chdir(destination) do |dir|
+
+           Mechanize.start do |m|
+             login(m) do |m|
+               books = build_book_list(m.current_page)
+               @results = download_books(m, books)
              end
            end
-         end

-         Hash[@results]
+         end
+         @results
        end
-
+
        def login(agent, &block)
          raise "Must provide a block to execute after logged in to site" unless block_given?

@@ -49,6 +51,33 @@ module Scrapers
          yield agent
        end

+       def build_book_list(page)
+         page.search('.book').map do |book|
+           {
+             title: book.at('[data-type=title]').children.first.text,
+             downloads: book.at('.book_downloads').search('a').map do |link|
+               [link.children.first.text.downcase.to_sym, link.attr(:href)]
+             end.to_h
+           }
+         end
+       end
+
+       def download_books(agent, books)
+         books.map do |book|
+           puts "Retrieving #{book[:title]}"
+           downloads = book[:downloads].map do |type, href|
+             next unless %i[pdf epub kindle].include?(type)
+             print " downloading #{type} ..."
+             agent.get href unless dry_run
+             agent.current_page.save! unless dry_run
+             puts "saved #{agent.current_page.filename}"
+             [agent.current_page.filename, href]
+           end.compact.to_h
+           wait_a_bit delay_time
+           [book[:title], downloads]
+         end.to_h
+       end
+
        def wait_a_bit(delay)
          puts "delaying for #{delay} second(s)"
          %w[- * | +].cycle do |c|
@@ -60,25 +89,6 @@ module Scrapers
          print "\r"
        end

-
-       def download_books(agent, books)
-         books.map do |book|
-           bookname = book.node.parent.parent.parent.parent.at_css('h1').text
-           puts "Downloading #{bookname} from #{book.href}"
-           if dry_run
-             warn "dry run, not saving"
-           else
-             agent.get book.href
-             puts "Saving #{agent.current_page.filename}"
-             agent.current_page.save! # overwrite!
-           end
-
-           wait_a_bit delay_time
-           [agent.current_page.filename, agent.current_page.uri.to_s]
-         end
-       end
-
      end
    end
  end
-
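
The Manning scraper now builds a plain data structure from the account page before downloading, instead of walking Mechanize link objects inside the download loop. Roughly, build_book_list hands download_books an array shaped like this (titles and hrefs are illustrative, not taken from a real account page):

    # Illustrative shape of build_book_list(page)'s return value.
    books = [
      {
        title: "Some Manning Title",
        downloads: {
          pdf:    "/account/bookProduct/download?format=pdf&id=123",
          epub:   "/account/bookProduct/download?format=epub&id=123",
          kindle: "/account/bookProduct/download?format=kindle&id=123"
        }
      }
    ]

    # download_books(agent, books) fetches only the :pdf/:epub/:kindle entries
    # and returns a hash of { title => { saved_filename => href } }.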
data/lib/scrapers/rubytapas.rb CHANGED
@@ -1,88 +1,13 @@
- require 'fileutils'
- require 'ostruct'
- require 'mechanize'
- require 'uri'
+ require 'scrapers/rubytapas/config'
+ require 'scrapers/rubytapas/cli'
+ require 'scrapers/rubytapas/scraper'
+ require 'scrapers/rubytapas/episode'
+ require 'scrapers/rubytapas/dpdcart'

  module Scrapers
-
    module RubyTapas

-     module_function
-
-     # Save the post and attachments from an episode of RubyTapas
-     # in a directory determined from the episode title.
-     #
-     # Example:
-     # episode url: "https://rubytapas.dpdcart.com/subscriber/post?id=443"
-     # title: "177 Aliasing | RubyTapas"
-     # subdirectory: /177-aliasing
-     #
-     # Parameters:
-     #
-     # * *url* - url of the episode to download
-     # * *user* - username used to log into dpdcart
-     # * *pw* - password used with username
-     # * *dest* - destination directory to put episode subdirectory
-     #
-     def scrape(url=nil, user=nil, pw=nil, dest=".")
-       raise "Must give user and password for RubyTapas downloads" if user.to_s.empty? or pw.to_s.empty?
-       dest = File.realdirpath(dest)
-       raise "Destination #{dest} must be a writeable directory" unless File.directory?(dest) and File.writable?(dest)
-
-       Mechanize.start do |m|
-
-         tapas = OpenStruct.new
-
-         m = self.login(m, url, user, pw)
-
-         m.current_page.tap do |page|
-           tapas.title = page.title.strip
-           tapas.episode_dir = File.join(dest,tapas.title.split("|").first.strip.downcase.gsub(%r{[^[:alnum:][:space:]]},' ').gsub(%r{[[:space:]]+},'-'))
-           tapas.attachments = page.links_with(:href => %r{\bdownload\b})
-           puts "Fetching and saving #{tapas.title} into #{tapas.episode_dir}"
-           FileUtils.mkdir(tapas.episode_dir)
-           Dir.chdir(tapas.episode_dir) do |dir|
-             tapas.attachments.each do |att|
-               puts "fetching #{att.text}"
-               file = att.click
-               puts "saving #{file.filename}"
-               file.save
-             end
-           end
-         end
-
-         tapas
-
-       end
-     end
-
-     # retrieve a list of URLs for shows from the showlist
-     def self.showlist(showlist_url, user=nil, pw=nil)
-       raise "Must give showlist url, user, and password" if showlist_url.to_s.empty? || user.to_s.empty? || pw.to_s.empty?
-
-       Mechanize.start do |m|
-         m = self.login(m, showlist_url, user, pw)
-         links = m.current_page.links_with(:text => "Read More")
-         s = URI.parse(showlist_url)
-         s.path = ''
-         links.map{|l| "#{s}#{l.href}" }
-       end
-
-
-     end
-
-     def self.login(m, url, user, pw)
-       # First time, we will get redirected to the login page
-       m.get url
-       m.current_page.form.field_with(:name => "username").value = user
-       m.current_page.form.field_with(:name => "password").value = pw
-       m.current_page.form.submit
-
-       # Second time, we should land on episode page
-       m.get url
-       raise "Not where I expected. #{m.current_page.uri} is not #{url}" unless m.current_page.uri != url
-       m
-     end
+     VERSION = "3.0.0"

    end
  end
data/lib/scrapers/rubytapas/cli.rb ADDED
@@ -0,0 +1,39 @@
+ require 'thor'
+ require 'scrapers/version'
+ require 'scrapers/rubytapas'
+
+ module Scrapers
+   module RubyTapas
+
+     # Thor script that handles things with Avdi Grimm's RubyTapas
+     class CLI < Thor
+
+       # Download an episode, or all episodes.
+       desc "download EPISODE", "Downloads the listed episode's files into a new directory with the episode tag in the given directory. Specifying ALL for the episode number downloads all episodes."
+       method_option :debug, type: :boolean
+       method_option :dry_run, type: :boolean
+       method_option :destination, :aliases => %w{-d}, :desc => "Destination to store the downloads. Default is the current working directory.", :default => "."
+       method_option :user, :aliases => %w{-u}, :desc => "dpdcart user. Default is read from $HOME/.netrc"
+       method_option :pw, :aliases => %w{-p}, :desc => "dpdcart password. Default is read from $HOME/.netrc"
+       def download(episode)
+         Scrapers::RubyTapas::Scraper.new(episode, options).scrape!
+       end
+
+       # Get a list of available episodes
+       desc "list", "Show a list of the available episodes"
+       method_option :user, :aliases => %w{-u}, :desc => "dpdcart user. Default is read from $HOME/.netrc"
+       method_option :pw, :aliases => %w{-p}, :desc => "dpdcart password. Default is read from $HOME/.netrc"
+       def list()
+         Scrapers::RubyTapas::Scraper.new(nil, options).list!
+       end
+
+       # Version Info
+       desc "version", "Show the rubytapas and scrapers library version info"
+       def version
+         say "rubytapas version: #{Scrapers::RubyTapas::VERSION}. scrapers version: #{Scrapers::VERSION}."
+       end
+
+     end
+
+   end
+ end
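
Since the slimmed-down bin/rubytapas above just forwards ARGV to this class, the same commands can be exercised directly from Ruby; a small sketch (the episode number and destination are illustrative):

    require 'scrapers/rubytapas/cli'

    # Equivalent to running `rubytapas download 177 -d /tmp/tapas` from a shell;
    # Thor parses the array exactly as it would parse ARGV.
    Scrapers::RubyTapas::CLI.start(%w[download 177 -d /tmp/tapas])

    # Equivalent to `rubytapas list`.
    Scrapers::RubyTapas::CLI.start(%w[list])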
data/lib/scrapers/rubytapas/config.rb ADDED
@@ -0,0 +1,11 @@
+ module Scrapers
+   module RubyTapas
+
+     RUBYTAPAS_HOST="rubytapas.dpdcart.com"
+     CONTENTS_PATH = "/subscriber/content"
+     POST_PATH = "/post"
+     FEED_PATH = "/feed"
+     POST_QUERY = "id"
+
+   end
+ end
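
These constants centralize the dpdcart host and paths shared by the new classes. As one hedged example, the feed endpoint can be composed from them like this (DpdCart below builds the same URL from its own copies of the values; the actual scraper may assemble URLs differently):

    require 'scrapers/rubytapas/config'

    feed_url = "https://#{Scrapers::RubyTapas::RUBYTAPAS_HOST}#{Scrapers::RubyTapas::FEED_PATH}"
    # => "https://rubytapas.dpdcart.com/feed"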
data/lib/scrapers/rubytapas/dpdcart.rb ADDED
@@ -0,0 +1,115 @@
+ require 'netrc'
+ require 'mechanize'
+
+ module Scrapers
+   module RubyTapas
+
+     # DpdCart is a remote service gateway object (Gateway Pattern)
+     # that provides a connection to rubytapas.dpdcart.com where the
+     # RubyTapas episodes and download files are available, as well as
+     # the episode feed.
+     class DpdCart
+
+       RUBYTAPAS_HOST = 'rubytapas.dpdcart.com'
+       ENV_RUBYTAPAS_USER = 'RUBYTAPAS_USER'
+       ENV_RUBYTAPAS_PASSWORD = 'RUBYTAPAS_PASSWORD'
+       LOGIN_PATH = '/subscriber/login'
+       LOGIN_URL = "https://#{RUBYTAPAS_HOST}#{LOGIN_PATH}"
+       FEED_PATH = '/feed'
+       FEED_URL = "https://#{RUBYTAPAS_HOST}#{FEED_PATH}"
+       CONTENT_PATH = "/subscriber/content"
+       CONTENT_URL = "https://#{RUBYTAPAS_HOST}#{CONTENT_PATH}"
+
+       # User name for dpdcart account
+       attr_accessor :user
+
+       # Password for dpdcart acount
+       attr_accessor :password
+
+       attr_accessor :dry_run, :debug
+
+       # Create a new instance of the DpdCart gateway.
+       #
+       # @param user [String] - the DpdCart account name, typically an
+       #   email address.
+       # @param password [String] - password associated with the
+       #   account.
+       #
+       # If the user and password are empty, the information will be
+       # obtained in the following order:
+       #
+       # - reading the environment variables `RUBYTAPAS_USER` and
+       #   `RUBYTAPAS_PASSWORD`
+       #
+       # - reading the user's `$HOME/.netrc` file and pulling the
+       #   credentials that match the host name for the rubytapas
+       #   account.
+       #
+       # If no credentials can be found, it will raise and error:
+       # `NoCredentialsError`.
+       def initialize(user=nil, password=nil, options={})
+         self.dry_run = options[:dry_run]
+         self.debug = options[:debug]
+         if user && password
+           @user = user
+           @password = password
+         else
+           @user, @password = get_credentials_from_environment
+           unless user && password
+             @user, @password = get_credentials_from_netrc
+           end
+         end
+         self.agent = Mechanize.new
+       end
+
+       # Return the episode feed from dpdcart
+       def feed!
+         uri = URI(FEED_URL)
+         request = Net::HTTP::Get.new(uri)
+         request.basic_auth user, password
+         Net::HTTP.start(uri.host, uri.port, {:use_ssl => true}) {|http| http.request(request)}.body
+       end
+
+       # Login to dpdcart before downloading
+       def login!
+         page = agent.get LOGIN_URL
+         page.form.field_with(name: "username").value = user
+         page.form.field_with(name: "password").value = password
+         page.form.submit
+         unless agent.page.title.match(/Subscription Content/)
+           raise "Could not log in"
+         end
+         agent
+       end
+
+       # Download the file from dpdcart
+       def download!(file)
+         warn "DEBUG: downloading #{file}" if debug
+         if dry_run
+           warn "DEBUG: download skipped for dry run" if dry_run
+           filename = file
+           body = "no body"
+         else
+           page = agent.get(file) unless dry_run
+           filename = page.filename
+           body = page.body
+         end
+         [ filename, body ]
+       end
+
+       private
+
+       attr_accessor :options, :agent
+
+       def get_credentials_from_environment
+         [ ENV[ENV_RUBYTAPAS_USER], ENV[ENV_RUBYTAPAS_PASSWORD] ]
+       end
+
+       def get_credentials_from_netrc
+         creds = Netrc.read[RUBYTAPAS_HOST]
+         [ creds.login, creds.password ]
+       end
+
+     end
+   end
+ end
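
Taken together, the gateway is meant to be used along these lines; a minimal sketch assuming credentials are available via the environment or ~/.netrc, with an illustrative attachment path (the real paths come from the feed and post pages):

    require 'scrapers/rubytapas/dpdcart'

    # With nil user/password the gateway falls back to RUBYTAPAS_USER /
    # RUBYTAPAS_PASSWORD, then to the rubytapas.dpdcart.com entry in ~/.netrc.
    cart = Scrapers::RubyTapas::DpdCart.new(nil, nil, dry_run: false, debug: true)

    feed_xml = cart.feed!      # raw RSS feed of episodes, fetched over basic auth
    cart.login!                # logs the Mechanize agent into the subscriber area

    # download! returns the attachment's filename and body.
    filename, body = cart.download!("/subscriber/download/12345")
    File.write(filename, body)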