scrapers 3.0.0 → 3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/netrc_reader.rb +4 -2
- data/lib/scrapers/rubytapas/cli.rb +3 -1
- data/lib/scrapers/rubytapas/config.rb +0 -6
- data/lib/scrapers/rubytapas/dpdcart.rb +84 -48
- data/lib/scrapers/rubytapas/scraper.rb +6 -10
- data/lib/scrapers/version.rb +2 -2
- data/spec/lib/scrapers/rubytapas/dpdcart_spec.rb +0 -14
- data/spec/lib/scrapers/rubytapas/rubytapas_spec.rb +6 -7
- data/spec/lib/scrapers/rubytapas/scraper_spec.rb +1 -5
- data/vcr_cassettes/rubytapas_download_twice.yml +1053 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b09c6be6c6c50e75b02dcc6e889d577f1f3c4d1c
|
4
|
+
data.tar.gz: c35d853aad8bfab4880d445f7db11e7532537c76
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 945df36bd1df9f8d91dcf86ee3e7e51796e50c604da0c9f135339c9be5bc54aa0170c6f8af83aa40ad7f23a4ee4d7c5c840b7f2585cd58170b7a936206543b51
|
7
|
+
data.tar.gz: 245fc91742ca186713c28f6ccfb5b591604fdf0e4573a7bf50c32e723a5697f4e8f24f61ce57421830dbff58f6677bbba5f293a810526d66261c393833fcec34
|
data/lib/netrc_reader.rb
CHANGED
@@ -1,12 +1,14 @@
|
|
1
1
|
require 'netrc'
|
2
2
|
|
3
|
-
module Scrapers
|
3
|
+
module Scrapers
|
4
4
|
class NetrcReader
|
5
5
|
attr_accessor :user, :pw
|
6
6
|
|
7
7
|
def initialize(section)
|
8
8
|
netrc = Netrc.read
|
9
|
-
@user, @pw = netrc[section]
|
9
|
+
@user, @pw = netrc[section]
|
10
|
+
rescue NoMethodError => e
|
11
|
+
fail "Could not find credentials for #{section}"
|
10
12
|
end
|
11
13
|
end
|
12
14
|
end
|
@@ -4,7 +4,7 @@ require 'scrapers/rubytapas'
|
|
4
4
|
|
5
5
|
module Scrapers
|
6
6
|
module RubyTapas
|
7
|
-
|
7
|
+
|
8
8
|
# Thor script that handles things with Avdi Grimm's RubyTapas
|
9
9
|
class CLI < Thor
|
10
10
|
|
@@ -15,6 +15,7 @@ module Scrapers
|
|
15
15
|
method_option :destination, :aliases => %w{-d}, :desc => "Destination to store the downloads. Default is the current working directory.", :default => "."
|
16
16
|
method_option :user, :aliases => %w{-u}, :desc => "dpdcart user. Default is read from $HOME/.netrc"
|
17
17
|
method_option :pw, :aliases => %w{-p}, :desc => "dpdcart password. Default is read from $HOME/.netrc"
|
18
|
+
method_option :subscription, default: 'rubytapas'
|
18
19
|
def download(episode)
|
19
20
|
Scrapers::RubyTapas::Scraper.new(episode, options).scrape!
|
20
21
|
end
|
@@ -23,6 +24,7 @@ module Scrapers
|
|
23
24
|
desc "list", "Show a list of the available episodes"
|
24
25
|
method_option :user, :aliases => %w{-u}, :desc => "dpdcart user. Default is read from $HOME/.netrc"
|
25
26
|
method_option :pw, :aliases => %w{-p}, :desc => "dpdcart password. Default is read from $HOME/.netrc"
|
27
|
+
method_option :subscription, default: 'rubytapas'
|
26
28
|
def list()
|
27
29
|
Scrapers::RubyTapas::Scraper.new(nil, options).list!
|
28
30
|
end
|
@@ -1,5 +1,5 @@
|
|
1
|
-
require
|
2
|
-
require
|
1
|
+
require "mechanize"
|
2
|
+
require "netrc"
|
3
3
|
|
4
4
|
module Scrapers
|
5
5
|
module RubyTapas
|
@@ -8,17 +8,29 @@ module Scrapers
|
|
8
8
|
# that provides a connection to rubytapas.dpdcart.com where the
|
9
9
|
# RubyTapas episodes and download files are available, as well as
|
10
10
|
# the episode feed.
|
11
|
+
|
11
12
|
class DpdCart
|
12
13
|
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
14
|
+
# NOTE: Updating this since now I have *two* subscriptions that
|
15
|
+
# use DPD Cart, rubytapas and elixirsips. Generalizing this
|
16
|
+
# accordingly. :)
|
17
|
+
|
18
|
+
# The subscription name will be filled in depending on which
|
19
|
+
# subscription I'm downloading from. This is a stock sprintf-type
|
20
|
+
# fill in where you pass in the subscription parameter with a
|
21
|
+
# value, thusly:
|
22
|
+
#
|
23
|
+
#   DPDCART_HOST_FORMAT % {subscription: "rubytapas"}
|
24
|
+
#
|
25
|
+
DPDCART_HOST_FORMAT = "%{subscription}.dpdcart.com"
|
26
|
+
ENV_DPDCART_USER_FORMAT = "%{subscription}_USER"
|
27
|
+
# Name template for the environment variable that holds the password,
# e.g. "rubytapas_PASSWORD" for the "rubytapas" subscription.
ENV_DPDCART_PASSWORD_FORMAT = "%{subscription}_PASSWORD"
|
28
|
+
LOGIN_PATH = '/subscriber/login'
|
29
|
+
FEED_PATH = '/feed'
|
30
|
+
CONTENT_PATH = "/subscriber/content"
|
31
|
+
|
32
|
+
# Subscription at dpdcart
|
33
|
+
attr_accessor :subscription
|
22
34
|
|
23
35
|
# User name for dpdcart account
|
24
36
|
attr_accessor :user
|
@@ -26,7 +38,13 @@ module Scrapers
|
|
26
38
|
# Password for dpdcart account
|
27
39
|
attr_accessor :password
|
28
40
|
|
29
|
-
|
41
|
+
# Host name of the subscription's DPD Cart site, produced by filling the
# subscription into DPDCART_HOST_FORMAT; cached after the first call.
def dpdcart_host
  @dpdcart_host ||= format(DPDCART_HOST_FORMAT, subscription: subscription)
end
|
42
|
+
# Name of the environment variable consulted for the user name, built from
# ENV_DPDCART_USER_FORMAT (not the password template); cached after the
# first call.
def env_dpdcart_user
  @env_dpdcart_user ||= ENV_DPDCART_USER_FORMAT % {subscription: subscription}
end
|
43
|
+
# Name of the environment variable consulted for the password, produced by
# filling the subscription into ENV_DPDCART_PASSWORD_FORMAT; cached after
# the first call.
def env_dpdcart_password
  @env_dpdcart_password ||= format(ENV_DPDCART_PASSWORD_FORMAT, subscription: subscription)
end
|
44
|
+
# Debug flag read from options[:debug]; once a truthy value has been seen
# it is kept, otherwise the options hash is consulted again on each call.
def debug
  @debug = options[:debug] unless @debug
  @debug
end
|
45
|
+
# Dry-run flag read from options[:dry_run]; once a truthy value has been
# seen it is kept, otherwise the options hash is consulted again on each call.
def dry_run
  @dry_run = options[:dry_run] unless @dry_run
  @dry_run
end
|
46
|
+
# HTTPS URI of the feed on the subscription's host (host + FEED_PATH);
# built once and cached.
def feed_url
  @feed_url ||= URI(format("https://%s%s", dpdcart_host, FEED_PATH))
end
|
47
|
+
# HTTPS URI of the login page on the subscription's host (host +
# LOGIN_PATH); built once and cached.
def login_url
  @login_url ||= URI(format("https://%s%s", dpdcart_host, LOGIN_PATH))
end
|
30
48
|
|
31
49
|
# Create a new instance of the DpdCart gateway.
|
32
50
|
#
|
@@ -34,63 +52,47 @@ module Scrapers
|
|
34
52
|
# email address.
|
35
53
|
# @param password [String] - password associated with the
|
36
54
|
# account.
|
55
|
+
# @param subscription [String] - subscription name at DPD Cart
|
56
|
+
# (e.g. 'rubytapas' or 'elixirsips')
|
37
57
|
#
|
38
58
|
# If the user and password are empty, the information will be
|
39
59
|
# obtained in the following order:
|
40
60
|
#
|
41
|
-
# - reading the environment variables `
|
42
|
-
# `
|
61
|
+
# - reading the environment variables `<subscription>_USER` and
|
62
|
+
# `<subscription>_PASSWORD`
|
63
|
+
#
|
64
|
+
# Note that <subscription> will be the subscription passed in
|
65
|
+
# above.
|
43
66
|
#
|
44
67
|
# - reading the user's `$HOME/.netrc` file and pulling the
|
45
|
-
# credentials that match the host name for the
|
68
|
+
# credentials that match the host name for the subscription
|
46
69
|
# account.
|
47
70
|
#
|
48
|
-
# If no credentials can be found, it will raise
|
71
|
+
# If no credentials can be found, it will raise an error:
|
49
72
|
# `NoCredentialsError`.
|
50
|
-
|
51
|
-
|
52
|
-
self.
|
53
|
-
|
54
|
-
|
55
|
-
@password = password
|
56
|
-
else
|
57
|
-
@user, @password = get_credentials_from_environment
|
58
|
-
unless user && password
|
59
|
-
@user, @password = get_credentials_from_netrc
|
60
|
-
end
|
61
|
-
end
|
73
|
+
#
|
74
|
+
def initialize(user=nil, password=nil, subscription='rubytapas', options={})
|
75
|
+
self.options = options
|
76
|
+
self.subscription = subscription
|
77
|
+
set_user_and_password(user, password)
|
62
78
|
self.agent = Mechanize.new
|
63
79
|
end
|
64
80
|
|
65
|
-
#
|
81
|
+
# Retrieve the episode feed from dpdcart
|
66
82
|
def feed!
|
67
|
-
|
68
|
-
request = Net::HTTP::Get.new(uri)
|
69
|
-
request.basic_auth user, password
|
70
|
-
Net::HTTP.start(uri.host, uri.port, {:use_ssl => true}) {|http| http.request(request)}.body
|
71
|
-
end
|
72
|
-
|
73
|
-
# Login to dpdcart before downloading
|
74
|
-
def login!
|
75
|
-
page = agent.get LOGIN_URL
|
76
|
-
page.form.field_with(name: "username").value = user
|
77
|
-
page.form.field_with(name: "password").value = password
|
78
|
-
page.form.submit
|
79
|
-
unless agent.page.title.match(/Subscription Content/)
|
80
|
-
raise "Could not log in"
|
81
|
-
end
|
82
|
-
agent
|
83
|
+
http_fetch(feed_url)
|
83
84
|
end
|
84
85
|
|
85
86
|
# Download the file from dpdcart
|
86
87
|
def download!(file)
|
88
|
+
login
|
87
89
|
warn "DEBUG: downloading #{file}" if debug
|
88
90
|
if dry_run
|
89
91
|
warn "DEBUG: download skipped for dry run" if dry_run
|
90
92
|
filename = file
|
91
93
|
body = "no body"
|
92
94
|
else
|
93
|
-
page = agent.get(file)
|
95
|
+
page = agent.get(file)
|
94
96
|
filename = page.filename
|
95
97
|
body = page.body
|
96
98
|
end
|
@@ -101,15 +103,49 @@ module Scrapers
|
|
101
103
|
|
102
104
|
attr_accessor :options, :agent
|
103
105
|
|
106
|
+
# Resolve the credentials stored in @user and @password.
#
# When both +user+ and +password+ are supplied they are used as given.
# Otherwise the credentials are taken from the environment, and the netrc
# lookup is used only when the environment did not yield both values.
def set_user_and_password(user, password)
  if user && password
    @user = user
    @password = password
  else
    @user, @password = get_credentials_from_environment
    # Check the resolved instance variables, not the parameters: the
    # parameters are never both set in this branch, so testing them would
    # always fall through to netrc and discard the environment values.
    unless @user && @password
      @user, @password = get_credentials_from_netrc
    end
  end
end
|
117
|
+
|
104
118
|
def get_credentials_from_environment
|
105
|
-
[ ENV[
|
119
|
+
[ ENV[env_dpdcart_user], ENV[env_dpdcart_password] ]
|
106
120
|
end
|
107
121
|
|
108
122
|
def get_credentials_from_netrc
|
109
|
-
creds = Netrc.read[
|
123
|
+
creds = Netrc.read[dpdcart_host]
|
124
|
+
if creds.nil?
|
125
|
+
warn "Could not find credentials for #{dpdcart_host}"
|
126
|
+
exit -1
|
127
|
+
end
|
110
128
|
[ creds.login, creds.password ]
|
111
129
|
end
|
112
130
|
|
131
|
+
# Login to dpdcart before downloading
|
132
|
+
def login
|
133
|
+
page = agent.get login_url
|
134
|
+
page.form.field_with(name: "username").value = user
|
135
|
+
page.form.field_with(name: "password").value = password
|
136
|
+
page.form.submit
|
137
|
+
unless agent.page.title.match(/Subscription Content/)
|
138
|
+
raise "Could not log in"
|
139
|
+
end
|
140
|
+
agent
|
141
|
+
end
|
142
|
+
|
143
|
+
def http_fetch(uri)
|
144
|
+
request = Net::HTTP::Get.new(uri)
|
145
|
+
request.basic_auth user, password
|
146
|
+
Net::HTTP.start(uri.host, uri.port, {:use_ssl => true}) {|http| http.request(request)}.body
|
147
|
+
end
|
148
|
+
|
113
149
|
end
|
114
150
|
end
|
115
151
|
end
|
@@ -8,11 +8,12 @@ module Scrapers
|
|
8
8
|
module RubyTapas
|
9
9
|
|
10
10
|
# Scraper provides the methods to download, extract and build a collection
|
11
|
-
# of
|
11
|
+
# of DPD cart subscription episodes from the RubyTapas RSS feed.
|
12
12
|
class Scraper
|
13
13
|
|
14
|
-
attr_accessor :user, :pw, :destination, :episode_number, :netrc_reader, :dry_run, :debug
|
14
|
+
attr_accessor :subscription, :user, :pw, :destination, :episode_number, :netrc_reader, :dry_run, :debug
|
15
15
|
attr_reader :dpdcart
|
16
|
+
attr_reader :episodes
|
16
17
|
|
17
18
|
# *episode_number* is the RubyTapas episode number (note! not the post id!) of the
|
18
19
|
# episode to download. If the episode number is the symbol :all, then all episodes
|
@@ -26,6 +27,7 @@ module Scrapers
|
|
26
27
|
# - "pw": the password of the RubyTapas account
|
27
28
|
# - "destination": the root destination of the episode downloads
|
28
29
|
def initialize(episode_number, options)
|
30
|
+
self.subscription = options.fetch("subscription") # let this fail if no subscription given.
|
29
31
|
self.episode_number = episode_number
|
30
32
|
self.user = options["user"]
|
31
33
|
self.pw = options["pw"]
|
@@ -33,13 +35,12 @@ module Scrapers
|
|
33
35
|
self.dry_run = options["dry_run"]
|
34
36
|
self.debug = options["debug"]
|
35
37
|
@dpdcart = Scrapers::RubyTapas::DpdCart.
|
36
|
-
|
37
|
-
|
38
|
+
new(user, pw, subscription, {dry_run: dry_run, debug: debug})
|
39
|
+
@episodes ||= fetch_episodes
|
38
40
|
end
|
39
41
|
|
40
42
|
# Perform the scraping operation
|
41
43
|
def scrape!
|
42
|
-
dpdcart.login!
|
43
44
|
if all_episodes?
|
44
45
|
episodes.each do |episode|
|
45
46
|
|
@@ -70,11 +71,6 @@ module Scrapers
|
|
70
71
|
end
|
71
72
|
end
|
72
73
|
|
73
|
-
# Returns the collection of episodes.
|
74
|
-
def episodes
|
75
|
-
@episodes ||= fetch_episodes
|
76
|
-
end
|
77
|
-
|
78
74
|
# Retrieves the episode associated with *episode number*.
|
79
75
|
def find_by_episode(episode_number)
|
80
76
|
episodes.detect {|e| e.number == episode_number}
|
data/lib/scrapers/version.rb
CHANGED
@@ -2,11 +2,11 @@ module Scrapers
|
|
2
2
|
module Version
|
3
3
|
|
4
4
|
MAJOR = 3
|
5
|
-
MINOR =
|
5
|
+
MINOR = 1
|
6
6
|
BUILD = 0
|
7
7
|
|
8
8
|
end
|
9
|
-
|
9
|
+
|
10
10
|
VERSION = [Version::MAJOR,Version::MINOR,Version::BUILD].map(&:to_s).join(".")
|
11
11
|
|
12
12
|
DESCRIPTION = "A library of web site scrapers utilizing mechanize and other goodies. Helpful in gathering images, moving things, saving things, etc."
|
@@ -8,7 +8,6 @@ describe Scrapers::RubyTapas::DpdCart do
|
|
8
8
|
|
9
9
|
describe "method signatures" do
|
10
10
|
it { is_expected.to respond_to(:feed!) }
|
11
|
-
it { is_expected.to respond_to(:login!)}
|
12
11
|
it { is_expected.to respond_to(:download!)}
|
13
12
|
end
|
14
13
|
|
@@ -22,17 +21,6 @@ describe Scrapers::RubyTapas::DpdCart do
|
|
22
21
|
end
|
23
22
|
end
|
24
23
|
|
25
|
-
describe "#login!" do
|
26
|
-
it "shows the subscriber content page" do
|
27
|
-
VCR.use_cassette('rubytapas_login', record: :new_episodes,
|
28
|
-
match_requests_on: [:method, :host, :path]
|
29
|
-
) do
|
30
|
-
expect(gateway.login!.page.title).to eq("Subscription Content | RubyTapas")
|
31
|
-
end
|
32
|
-
end
|
33
|
-
end
|
34
|
-
|
35
|
-
|
36
24
|
describe "#download!" do
|
37
25
|
let(:file) { "https://rubytapas.dpdcart.com/subscriber/download?file_id=26" }
|
38
26
|
let(:name) { "001-binary-literals.html" }
|
@@ -44,7 +32,6 @@ describe Scrapers::RubyTapas::DpdCart do
|
|
44
32
|
VCR.use_cassette('rubytapas_download', record: :new_episodes,
|
45
33
|
match_requests_on: [:method, :host, :path,
|
46
34
|
:query]) do
|
47
|
-
gateway.login!
|
48
35
|
filename, body = gateway.download! file
|
49
36
|
expect(filename).to eq(name)
|
50
37
|
expect(body.size).to eq(5744)
|
@@ -55,7 +42,6 @@ describe Scrapers::RubyTapas::DpdCart do
|
|
55
42
|
VCR.use_cassette('rubytapas_download_twice', record: :new_episodes,
|
56
43
|
match_requests_on: [:method, :host, :path,
|
57
44
|
:query]) do
|
58
|
-
gateway.login!
|
59
45
|
filename, body = gateway.download! file
|
60
46
|
expect(filename).to eq(name)
|
61
47
|
filename, body = gateway.download! file2
|
@@ -13,7 +13,6 @@ RSpec.describe "RubyTapas Thor Script", :type => :integration do
|
|
13
13
|
|
14
14
|
let(:cart) do
|
15
15
|
instance_spy("Scrapers::RubyTapas::DpdCart",
|
16
|
-
:login! => "Subscription Content | RubyTapas",
|
17
16
|
:feed! => feed,
|
18
17
|
:download! => download
|
19
18
|
)
|
@@ -35,12 +34,12 @@ RSpec.describe "RubyTapas Thor Script", :type => :integration do
|
|
35
34
|
it "retrieves one episode" do
|
36
35
|
expect_any_instance_of(Scrapers::RubyTapas::Scraper).to receive(:scrape!).once.and_call_original
|
37
36
|
expect(cart).to receive(:download!).exactly(3).times
|
38
|
-
|
37
|
+
|
39
38
|
VCR.use_cassette('rubytapas-download-1', :match_requests_on => [:method, :host, :path, :query]) do
|
40
39
|
Scrapers::RubyTapas::CLI.start(%w[download 001 --destination=. --user=joan@example.com --pw=password])
|
41
40
|
end
|
42
41
|
end
|
43
|
-
|
42
|
+
|
44
43
|
end
|
45
44
|
|
46
45
|
context "when scraping all episodes" do
|
@@ -48,7 +47,7 @@ RSpec.describe "RubyTapas Thor Script", :type => :integration do
|
|
48
47
|
it "retrieves all episodes" do
|
49
48
|
expect_any_instance_of(Scrapers::RubyTapas::Scraper).to receive(:scrape!).once.and_call_original
|
50
49
|
expect(cart).to receive(:download!).exactly(933).times
|
51
|
-
|
50
|
+
|
52
51
|
VCR.use_cassette('rubytapas-download-all', :match_requests_on => [:method, :host, :path, :query]) do
|
53
52
|
save_stdout = $stdout
|
54
53
|
# $stdout = output
|
@@ -56,9 +55,9 @@ RSpec.describe "RubyTapas Thor Script", :type => :integration do
|
|
56
55
|
$stdout = save_stdout
|
57
56
|
end
|
58
57
|
end
|
59
|
-
|
58
|
+
|
60
59
|
end
|
61
|
-
|
60
|
+
|
62
61
|
end
|
63
62
|
|
64
63
|
describe "list command" do
|
@@ -73,7 +72,7 @@ RSpec.describe "RubyTapas Thor Script", :type => :integration do
|
|
73
72
|
Scrapers::RubyTapas::CLI.start(%w[list --user=joan@example.com --pw=password])
|
74
73
|
end
|
75
74
|
end
|
76
|
-
|
75
|
+
|
77
76
|
describe "version command" do
|
78
77
|
it "prints the version numbers for rubytapas and scrapers" do
|
79
78
|
save_stdout = $stdout
|
@@ -13,12 +13,11 @@ describe Scrapers::RubyTapas::Scraper do
|
|
13
13
|
let(:options) do
|
14
14
|
{
|
15
15
|
"destination" => '.',
|
16
|
+
"subscription" => 'rubytapas'
|
16
17
|
}
|
17
18
|
end
|
18
19
|
let(:cart) {instance_spy("Scrapers::RubyTapas::DpdCart",
|
19
20
|
:feed! => feed,
|
20
|
-
:login! =>
|
21
|
-
double("Subscription Content | RubyTapas"),
|
22
21
|
:download! => [ 'filename',
|
23
22
|
'body' ]
|
24
23
|
)}
|
@@ -28,7 +27,6 @@ describe Scrapers::RubyTapas::Scraper do
|
|
28
27
|
|
29
28
|
describe "#episodes" do
|
30
29
|
it "gets a collection of episodes" do
|
31
|
-
expect(scraper).to receive(:fetch_episodes).and_call_original
|
32
30
|
expect(scraper.episodes.size).to eq(267)
|
33
31
|
end
|
34
32
|
end
|
@@ -53,7 +51,6 @@ describe Scrapers::RubyTapas::Scraper do
|
|
53
51
|
context "when scraping one episode" do
|
54
52
|
it "scrapes one episode" do
|
55
53
|
expect(scraper).to receive(:find_by_episode).with(episode_number).and_call_original
|
56
|
-
expect(scraper).to receive(:fetch_episodes).once.and_call_original
|
57
54
|
scraper.scrape!
|
58
55
|
end
|
59
56
|
|
@@ -63,7 +60,6 @@ describe Scrapers::RubyTapas::Scraper do
|
|
63
60
|
let(:scraper) { Scrapers::RubyTapas::Scraper.new(:all, options) }
|
64
61
|
|
65
62
|
it "scrapes all the episodes" do
|
66
|
-
expect(scraper).to receive(:fetch_episodes).once.and_call_original
|
67
63
|
scraper.scrape!
|
68
64
|
end
|
69
65
|
end
|