scrapers 2.1.0 → 3.0.0

Files changed (43)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/ChangeLog +7 -0
  4. data/Gemfile +0 -8
  5. data/Guardfile +1 -1
  6. data/bin/rubytapas +2 -75
  7. data/lib/scrapers.rb +1 -3
  8. data/lib/scrapers/manning_books.rb +37 -27
  9. data/lib/scrapers/rubytapas.rb +6 -81
  10. data/lib/scrapers/rubytapas/cli.rb +39 -0
  11. data/lib/scrapers/rubytapas/config.rb +11 -0
  12. data/lib/scrapers/rubytapas/dpdcart.rb +115 -0
  13. data/lib/scrapers/rubytapas/episode.rb +86 -0
  14. data/lib/scrapers/rubytapas/scraper.rb +142 -0
  15. data/lib/scrapers/version.rb +2 -2
  16. data/scrapers.gemspec +4 -1
  17. data/spec/lib/scrapers/rubytapas/dpdcart_spec.rb +68 -0
  18. data/spec/lib/scrapers/rubytapas/episode_spec.rb +140 -0
  19. data/spec/lib/scrapers/rubytapas/rubytapas_spec.rb +87 -0
  20. data/spec/lib/scrapers/rubytapas/scraper_spec.rb +83 -0
  21. data/spec/lib/scrapers/rubytapas/test_data/feed.xml +7038 -0
  22. data/spec/lib/scrapers/{wunderground_spec.rb → wunderground_spec.rb.no} +0 -0
  23. data/spec/scrapers/allrecipes_spec.rb +2 -2
  24. data/spec/scrapers/discoverynews_spec.rb +3 -14
  25. data/spec/scrapers/download_spec.rb +6 -16
  26. data/spec/scrapers/gocomics_spec.rb +3 -3
  27. data/spec/scrapers/imgur_spec.rb +10 -22
  28. data/spec/scrapers/manning_books_spec.rb +9 -6
  29. data/spec/scrapers/nasa_apod_spec.rb +12 -14
  30. data/spec/scrapers/sinfest_spec.rb +3 -3
  31. data/spec/scrapers/xkcd_spec.rb +1 -0
  32. data/spec/scrapers_spec.rb +2 -1
  33. data/spec/spec_helper.rb +1 -8
  34. data/spec/support/dir_helpers.rb +13 -0
  35. data/spec/support/use_vcr.rb +9 -0
  36. data/vcr_cassettes/nasa-apod.yml +348 -0
  37. data/vcr_cassettes/rubytapas-download-1.yml +6726 -0
  38. data/vcr_cassettes/rubytapas-download-all.yml +6726 -0
  39. data/vcr_cassettes/rubytapas_download.yml +982 -0
  40. data/vcr_cassettes/rubytapas_download_twice.yml +1064 -0
  41. data/vcr_cassettes/rubytapas_feed.yml +5880 -0
  42. data/vcr_cassettes/rubytapas_login.yml +849 -0
  43. metadata +74 -6
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 7a122441c7a4d715eded98e92a58d31d6b00f21c
-  data.tar.gz: 74e3ad669f233d43542155819d2224499c062d5e
+  metadata.gz: 6a3e0b5e2fa58fb8686e8c350b1619dce0e85e0f
+  data.tar.gz: 9b2f65ddff548a914a2b5e9d8155f725568c5708
 SHA512:
-  metadata.gz: c7fe23236b2a325eec855f865aa687b329d0cc3b470cad66f56623df7b4833f11fc6167871cfa6b10351da1d0c3747b40d44b64bc0057bb563735b234cf15a56
-  data.tar.gz: a22303c8d58b65795a5811c6f997b47ff3a4bc64d60849e83e43bc5794650222718ee106f60891c344acd210524e00c15a2074958732d14b50a54bd8b2d3e57c
+  metadata.gz: e67a79dccb7e436e9b629a63157ed44c0bdf76e83f99bbe8bc2d33f4edab9c0b68401d1c1da554077871b7f0affde40bb4a2f33508db20b6726c1c2f7781d549
+  data.tar.gz: d8f62b7d23d09253ecaba8f37f26b0852019e0b4e8403feafaf109791896dd89d491edb99eeffec87b8fdf1946a0e4b8a40f3038f6706686b79c44e61554c2a1
data/.gitignore CHANGED
@@ -18,3 +18,4 @@ tmp
 .rspec
 .tapas
 tmp/
+TAGS
data/ChangeLog ADDED
@@ -0,0 +1,7 @@
+2014-12-27 Tamara Temple <tamouse@gmail.com>
+
+* Replace lib/scrapers/rubytapas/downloader.rb with
+  lib/scrapers/rubytapas/dpdcart.rb and restructure
+  lib/scrapers/rubytapas/scraper.rb to use it as a service gateway
+  object.
+
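The restructuring described above means the scraper no longer touches dpdcart pages directly; it goes through the DpdCart gateway object. A minimal sketch of driving the restructured scraper from Ruby, mirroring what the new CLI does (the episode number and options hash are illustrative; the constructor and scrape!/list! calls are taken from cli.rb below):

    require 'scrapers/rubytapas'

    # Download one episode into the current directory (what
    # `rubytapas download 177` does under the hood).
    Scrapers::RubyTapas::Scraper.new("177", "destination" => ".").scrape!

    # List available episodes (what `rubytapas list` does).
    Scrapers::RubyTapas::Scraper.new(nil, {}).list!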
data/Gemfile CHANGED
@@ -2,11 +2,3 @@ source 'https://rubygems.org'
 
 # Specify your gem's dependencies in scrapers.gemspec
 gemspec
-
-group :development, :test do
-  gem "pry"
-  gem "pry-byebug"
-  gem "pry-nav"
-  gem "pry-rescue"
-  gem "pry-stack_explorer"
-end
data/Guardfile CHANGED
@@ -1,4 +1,4 @@
-guard :rspec do
+guard :rspec, cmd: 'bundle exec rspec -f d --fail-fast' do
   watch(%r{^spec/.+_spec\.rb$})
   watch(%r{^lib/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
   watch('spec/spec_helper.rb') { "spec" }
data/bin/rubytapas CHANGED
@@ -1,77 +1,4 @@
 #!/usr/bin/env ruby
-require 'thor'
-require 'netrc'
-require 'scrapers/rubytapas'
+require 'scrapers/rubytapas/cli'
 
-################################################################################
-#
-# Scraper for RubyTapas episodes.
-#
-# (Note: you need to subscribe to RubyTapas to download episodes.)
-#
-################################################################################
-
-class RubyTapasDownload < Thor
-
-  RUBYTAPAS="rubytapas.dpdcart.com"
-  RUBYTAPAS_EPISODE_URL = "https://#{RUBYTAPAS}/subscriber/post?id=\#{episode}"
-
-  desc "download", "Downloads the listed episode's files into a new directory with the episode tag in the given directory"
-  method_option :destination, :aliases => %w{-d --dest}, :desc => "Destination to store the downloads", :default => "."
-  method_option :url, :desc => "url to episode downloads (overrides episode)"
-  method_option :episode, :aliases => %w{-e --ep}, :desc => "Episode number"
-  method_option :user, :aliases => %w{-u -U}, :desc => "dpdcart user. Default is read from $HOME/.netrc"
-  method_option :password, :aliases => %w{-p -pw}, :desc => "dpdcart password. Default is read from $HOME/.netrc"
-
-  def download
-    netrc = Netrc.read
-    user, pw = netrc[RUBYTAPAS]
-    user = options.fetch("user", user)
-    pw = options.fetch("password", pw)
-    url = options.fetch("url", nil)
-    episode = options.fetch("episode", nil)
-    destination = options.fetch("destination", nil)
-    STDERR.puts "destination: #{destination}, episode: #{episode}, url: #{url}, user: #{user}, pw: #{pw.length}"
-    unless url
-      raise "Must give episode or full url" unless episode
-      url = RUBYTAPAS_EPISODE_URL.sub(%r[\#{episode}], episode)
-    end
-    Scrapers::RubyTapas.scrape url, user, pw, destination
-  end
-
-  desc "all", "Download all rubytapas episodes"
-  method_option :destination, :aliases => %w{-d --dest}, :desc => "Destination of download", :default => '.'
-  method_option :url, :desc => "url of showlist", :default => 'https://rubytapas.dpdcart.com/subscriber/content'
-  method_option :user, :aliases => %w{-u -U}, :desc => "dpdcart user. Default is read from $HOME/.netrc"
-  method_option :password, :aliases => %w{-p -pw}, :desc => "dpdcart password. Default is read from $HOME/.netrc"
-
-  def all
-    STDERR.puts options.inspect
-
-    netrc = Netrc.read
-    user, pw = netrc[RUBYTAPAS]
-    user = options.fetch("user", user)
-    pw = options.fetch("password", pw)
-    url = options.fetch("url", nil)
-    destination = options.fetch("destination", nil)
-    STDERR.puts "destination: #{destination}, url: #{url}, user: #{user}, pw: #{pw.length}"
-    raise "Must give url" unless url
-
-    showlist_urls = Scrapers::RubyTapas.showlist(url, user, pw)
-
-    showlist_urls.each do |url|
-      begin
-        Scrapers::RubyTapas.scrape url, user, pw, destination
-      rescue Errno::EEXIST
-        puts "episode exists, skipping"
-      end
-      print "pausing..."
-      sleep 5
-      puts "."
-    end
-
-  end
-
-end
-
-RubyTapasDownload.start
+Scrapers::RubyTapas::CLI.start(ARGV)
data/lib/scrapers.rb CHANGED
@@ -1,16 +1,14 @@
 require 'mechanize'
 require 'uri'
-Dir[File.join(File.expand_path('../', __FILE__),'**','*.rb')].each {|file| require file}
 
 module Scrapers
   def self.agent()
     @agent ||= Mechanize.new
   end
 
-  def self.base(url)
+  def self.base_url(url)
     u = URI.parse(url)
     u.path=''
     u.to_s
   end
-
 end
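The rename from Scrapers.base to Scrapers.base_url is behavior-preserving: the helper strips the path from a URL, leaving scheme and host. A quick sketch of the expected result (note that only the path is cleared, so a query string would survive; callers pass path-only URLs):

    require 'scrapers'

    Scrapers.base_url("https://rubytapas.dpdcart.com/subscriber/content")
    # => "https://rubytapas.dpdcart.com"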
data/lib/scrapers/manning_books.rb CHANGED
@@ -22,18 +22,20 @@ module Scrapers
       end
 
       def scrape
-        Mechanize.start do |m|
-          login(m) do |m|
-            book_downloads = m.current_page.links_with(:href => %r{/account/bookProduct/download})
-            Dir.chdir(destination) do |dir|
-              @results = download_books(m, book_downloads)
+        @results = nil
+        Dir.chdir(destination) do |dir|
+
+          Mechanize.start do |m|
+            login(m) do |m|
+              books = build_book_list(m.current_page)
+              @results = download_books(m, books)
             end
           end
-        end
 
-        Hash[@results]
+        end
+        @results
       end
-
+
       def login(agent, &block)
         raise "Must provide a block to execute after logged in to site" unless block_given?
 
@@ -49,6 +51,33 @@ module Scrapers
         yield agent
       end
 
+      def build_book_list(page)
+        page.search('.book').map do |book|
+          {
+            title: book.at('[data-type=title]').children.first.text,
+            downloads: book.at('.book_downloads').search('a').map do |link|
+              [link.children.first.text.downcase.to_sym, link.attr(:href)]
+            end.to_h
+          }
+        end
+      end
+
+      def download_books(agent, books)
+        books.map do |book|
+          puts "Retrieving #{book[:title]}"
+          downloads = book[:downloads].map do |type, href|
+            next unless %i[pdf epub kindle].include?(type)
+            print "  downloading #{type} ..."
+            agent.get href unless dry_run
+            agent.current_page.save! unless dry_run
+            puts "saved #{agent.current_page.filename}"
+            [agent.current_page.filename, href]
+          end.compact.to_h
+          wait_a_bit delay_time
+          [book[:title], downloads]
+        end.to_h
+      end
+
       def wait_a_bit(delay)
         puts "delaying for #{delay} second(s)"
         %w[- * | +].cycle do |c|
@@ -60,25 +89,6 @@ module Scrapers
         print "\r"
       end
 
-
-      def download_books(agent, books)
-        books.map do |book|
-          bookname = book.node.parent.parent.parent.parent.at_css('h1').text
-          puts "Downloading #{bookname} from #{book.href}"
-          if dry_run
-            warn "dry run, not saving"
-          else
-            agent.get book.href
-            puts "Saving #{agent.current_page.filename}"
-            agent.current_page.save! # overwrite!
-          end
-
-          wait_a_bit delay_time
-          [agent.current_page.filename, agent.current_page.uri.to_s]
-        end
-      end
-
     end
   end
 end
-
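The new build_book_list isolates page parsing from downloading. Its selectors (.book, [data-type=title], .book_downloads) imply markup roughly like the hypothetical fixture below; this sketch runs the same parsing logic against it with plain Nokogiri (Mechanize pages respond to search/at the same way):

    require 'nokogiri'

    # Hypothetical fixture approximating the Manning account page;
    # the real page supplies one .book node per purchased title.
    html = Nokogiri::HTML(<<~HTML)
      <div class="book">
        <h1 data-type="title">Example Book</h1>
        <div class="book_downloads">
          <a href="/account/bookProduct/download?id=1&format=pdf">PDF</a>
          <a href="/account/bookProduct/download?id=1&format=epub">EPUB</a>
        </div>
      </div>
    HTML

    books = html.search('.book').map do |book|
      {
        title: book.at('[data-type=title]').children.first.text,
        downloads: book.at('.book_downloads').search('a').map { |link|
          [link.children.first.text.downcase.to_sym, link.attr('href')]
        }.to_h
      }
    end
    # => [{:title=>"Example Book",
    #      :downloads=>{:pdf=>"/account/bookProduct/download?id=1&format=pdf",
    #                   :epub=>"/account/bookProduct/download?id=1&format=epub"}}]

download_books then keeps only the :pdf, :epub, and :kindle entries (the `next unless` plus `.compact`) and fetches each one, pausing between books via wait_a_bit.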
data/lib/scrapers/rubytapas.rb CHANGED
@@ -1,88 +1,13 @@
-require 'fileutils'
-require 'ostruct'
-require 'mechanize'
-require 'uri'
+require 'scrapers/rubytapas/config'
+require 'scrapers/rubytapas/cli'
+require 'scrapers/rubytapas/scraper'
+require 'scrapers/rubytapas/episode'
+require 'scrapers/rubytapas/dpdcart'
 
 module Scrapers
-
   module RubyTapas
 
-    module_function
-
-    # Save the post and attachments from an episode of RubyTapas
-    # in a directory determined from the episode title.
-    #
-    # Example:
-    #   episode url: "https://rubytapas.dpdcart.com/subscriber/post?id=443"
-    #   title: "177 Aliasing | RubyTapas"
-    #   subdirectory: /177-aliasing
-    #
-    # Parameters:
-    #
-    # * *url* - url of the episode to download
-    # * *user* - username used to log into dpdcart
-    # * *pw* - password used with username
-    # * *dest* - destination directory to put episode subdirectory
-    #
-    def scrape(url=nil, user=nil, pw=nil, dest=".")
-      raise "Must give user and password for RubyTapas downloads" if user.to_s.empty? or pw.to_s.empty?
-      dest = File.realdirpath(dest)
-      raise "Destination #{dest} must be a writeable directory" unless File.directory?(dest) and File.writable?(dest)
-
-      Mechanize.start do |m|
-
-        tapas = OpenStruct.new
-
-        m = self.login(m, url, user, pw)
-
-        m.current_page.tap do |page|
-          tapas.title = page.title.strip
-          tapas.episode_dir = File.join(dest,tapas.title.split("|").first.strip.downcase.gsub(%r{[^[:alnum:][:space:]]},' ').gsub(%r{[[:space:]]+},'-'))
-          tapas.attachments = page.links_with(:href => %r{\bdownload\b})
-          puts "Fetching and saving #{tapas.title} into #{tapas.episode_dir}"
-          FileUtils.mkdir(tapas.episode_dir)
-          Dir.chdir(tapas.episode_dir) do |dir|
-            tapas.attachments.each do |att|
-              puts "fetching #{att.text}"
-              file = att.click
-              puts "saving #{file.filename}"
-              file.save
-            end
-          end
-        end
-
-        tapas
-
-      end
-    end
-
-    # retrieve a list of URLs for shows from the showlist
-    def self.showlist(showlist_url, user=nil, pw=nil)
-      raise "Must give showlist url, user, and password" if showlist_url.to_s.empty? || user.to_s.empty? || pw.to_s.empty?
-
-      Mechanize.start do |m|
-        m = self.login(m, showlist_url, user, pw)
-        links = m.current_page.links_with(:text => "Read More")
-        s = URI.parse(showlist_url)
-        s.path = ''
-        links.map{|l| "#{s}#{l.href}" }
-      end
-
-
-    end
-
-    def self.login(m, url, user, pw)
-      # First time, we will get redirected to the login page
-      m.get url
-      m.current_page.form.field_with(:name => "username").value = user
-      m.current_page.form.field_with(:name => "password").value = pw
-      m.current_page.form.submit
-
-      # Second time, we should land on episode page
-      m.get url
-      raise "Not where I expected. #{m.current_page.uri} is not #{url}" unless m.current_page.uri != url
-      m
-    end
+    VERSION = "3.0.0"
 
   end
 end
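For reference, the removed scrape method derived each episode's directory from the page title, as its doc comment shows. Tracing the retired gsub chain on the documented example (presumably the new Episode class carries this logic forward; its diff is not shown here):

    title = "177 Aliasing | RubyTapas"
    title.split("|").first.strip.downcase
         .gsub(%r{[^[:alnum:][:space:]]}, ' ')
         .gsub(%r{[[:space:]]+}, '-')
    # => "177-aliasing"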
data/lib/scrapers/rubytapas/cli.rb ADDED
@@ -0,0 +1,39 @@
+require 'thor'
+require 'scrapers/version'
+require 'scrapers/rubytapas'
+
+module Scrapers
+  module RubyTapas
+
+    # Thor script that handles things with Avdi Grimm's RubyTapas
+    class CLI < Thor
+
+      # Download an episode, or all episodes.
+      desc "download EPISODE", "Downloads the listed episode's files into a new directory with the episode tag in the given directory. Specifying ALL for the episode number downloads all episodes."
+      method_option :debug, type: :boolean
+      method_option :dry_run, type: :boolean
+      method_option :destination, :aliases => %w{-d}, :desc => "Destination to store the downloads. Default is the current working directory.", :default => "."
+      method_option :user, :aliases => %w{-u}, :desc => "dpdcart user. Default is read from $HOME/.netrc"
+      method_option :pw, :aliases => %w{-p}, :desc => "dpdcart password. Default is read from $HOME/.netrc"
+      def download(episode)
+        Scrapers::RubyTapas::Scraper.new(episode, options).scrape!
+      end
+
+      # Get a list of available episodes
+      desc "list", "Show a list of the available episodes"
+      method_option :user, :aliases => %w{-u}, :desc => "dpdcart user. Default is read from $HOME/.netrc"
+      method_option :pw, :aliases => %w{-p}, :desc => "dpdcart password. Default is read from $HOME/.netrc"
+      def list()
+        Scrapers::RubyTapas::Scraper.new(nil, options).list!
+      end
+
+      # Version Info
+      desc "version", "Show the rubytapas and scrapers library version info"
+      def version
+        say "rubytapas version: #{Scrapers::RubyTapas::VERSION}. scrapers version: #{Scrapers::VERSION}."
+      end
+
+    end
+
+  end
+end
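Because Thor's .start takes an argv array, the new commands can be exercised from Ruby exactly as from the rubytapas binstub (the episode number and destination below are illustrative):

    require 'scrapers/rubytapas/cli'

    # Equivalent to: rubytapas download 177 -d /tmp/tapas --dry-run
    Scrapers::RubyTapas::CLI.start(%w[download 177 -d /tmp/tapas --dry-run])

    # Equivalent to: rubytapas version
    Scrapers::RubyTapas::CLI.start(%w[version])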
data/lib/scrapers/rubytapas/config.rb ADDED
@@ -0,0 +1,11 @@
+module Scrapers
+  module RubyTapas
+
+    RUBYTAPAS_HOST="rubytapas.dpdcart.com"
+    CONTENTS_PATH = "/subscriber/content"
+    POST_PATH = "/post"
+    FEED_PATH = "/feed"
+    POST_QUERY = "id"
+
+  end
+end
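These constants centralize the dpdcart endpoint pieces. A small sketch of how they compose (FEED_PATH yields the same feed URL that DpdCart below derives from its own constants; the exact joins used for post URLs live in the scraper code, which is not shown in this hunk):

    require 'scrapers/rubytapas/config'

    host = Scrapers::RubyTapas::RUBYTAPAS_HOST
    "https://#{host}#{Scrapers::RubyTapas::FEED_PATH}"
    # => "https://rubytapas.dpdcart.com/feed"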
data/lib/scrapers/rubytapas/dpdcart.rb ADDED
@@ -0,0 +1,115 @@
+require 'netrc'
+require 'mechanize'
+
+module Scrapers
+  module RubyTapas
+
+    # DpdCart is a remote service gateway object (Gateway Pattern)
+    # that provides a connection to rubytapas.dpdcart.com where the
+    # RubyTapas episodes and download files are available, as well as
+    # the episode feed.
+    class DpdCart
+
+      RUBYTAPAS_HOST = 'rubytapas.dpdcart.com'
+      ENV_RUBYTAPAS_USER = 'RUBYTAPAS_USER'
+      ENV_RUBYTAPAS_PASSWORD = 'RUBYTAPAS_PASSWORD'
+      LOGIN_PATH = '/subscriber/login'
+      LOGIN_URL = "https://#{RUBYTAPAS_HOST}#{LOGIN_PATH}"
+      FEED_PATH = '/feed'
+      FEED_URL = "https://#{RUBYTAPAS_HOST}#{FEED_PATH}"
+      CONTENT_PATH = "/subscriber/content"
+      CONTENT_URL = "https://#{RUBYTAPAS_HOST}#{CONTENT_PATH}"
+
+      # User name for dpdcart account
+      attr_accessor :user
+
+      # Password for dpdcart account
+      attr_accessor :password
+
+      attr_accessor :dry_run, :debug
+
+      # Create a new instance of the DpdCart gateway.
+      #
+      # @param user [String] - the DpdCart account name, typically an
+      #    email address.
+      # @param password [String] - password associated with the
+      #    account.
+      #
+      # If the user and password are empty, the information will be
+      # obtained in the following order:
+      #
+      # - reading the environment variables `RUBYTAPAS_USER` and
+      #   `RUBYTAPAS_PASSWORD`
+      #
+      # - reading the user's `$HOME/.netrc` file and pulling the
+      #   credentials that match the host name for the rubytapas
+      #   account.
+      #
+      # If no credentials can be found, it will raise an error:
+      # `NoCredentialsError`.
+      def initialize(user=nil, password=nil, options={})
+        self.dry_run = options[:dry_run]
+        self.debug = options[:debug]
+        if user && password
+          @user = user
+          @password = password
+        else
+          @user, @password = get_credentials_from_environment
+          unless user && password
+            @user, @password = get_credentials_from_netrc
+          end
+        end
+        self.agent = Mechanize.new
+      end
+
+      # Return the episode feed from dpdcart
+      def feed!
+        uri = URI(FEED_URL)
+        request = Net::HTTP::Get.new(uri)
+        request.basic_auth user, password
+        Net::HTTP.start(uri.host, uri.port, {:use_ssl => true}) {|http| http.request(request)}.body
+      end
+
+      # Login to dpdcart before downloading
+      def login!
+        page = agent.get LOGIN_URL
+        page.form.field_with(name: "username").value = user
+        page.form.field_with(name: "password").value = password
+        page.form.submit
+        unless agent.page.title.match(/Subscription Content/)
+          raise "Could not log in"
+        end
+        agent
+      end
+
+      # Download the file from dpdcart
+      def download!(file)
+        warn "DEBUG: downloading #{file}" if debug
+        if dry_run
+          warn "DEBUG: download skipped for dry run" if dry_run
+          filename = file
+          body = "no body"
+        else
+          page = agent.get(file) unless dry_run
+          filename = page.filename
+          body = page.body
+        end
+        [ filename, body ]
+      end
+
+      private
+
+      attr_accessor :options, :agent
+
+      def get_credentials_from_environment
+        [ ENV[ENV_RUBYTAPAS_USER], ENV[ENV_RUBYTAPAS_PASSWORD] ]
+      end
+
+      def get_credentials_from_netrc
+        creds = Netrc.read[RUBYTAPAS_HOST]
+        [ creds.login, creds.password ]
+      end
+
+    end
+  end
+end
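An end-to-end sketch of the gateway (credentials resolved from RUBYTAPAS_USER/RUBYTAPAS_PASSWORD or ~/.netrc as documented above; the download path is hypothetical, and parsing the feed as RSS is an assumption suggested by the feed.xml fixture in the specs):

    require 'rss'
    require 'scrapers/rubytapas/dpdcart'

    cart = Scrapers::RubyTapas::DpdCart.new   # no explicit credentials

    # Fetch the feed over HTTP basic auth and list episode titles.
    feed = RSS::Parser.parse(cart.feed!, false)
    feed.items.each { |item| puts item.title }

    # Log in with Mechanize, then fetch one attachment.
    cart.login!
    filename, body = cart.download!("/subscriber/download?file_id=123")  # hypothetical path
    File.binwrite(filename, body)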