scrapers 3.0.0 → 3.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/netrc_reader.rb +4 -2
- data/lib/scrapers/rubytapas/cli.rb +3 -1
- data/lib/scrapers/rubytapas/config.rb +0 -6
- data/lib/scrapers/rubytapas/dpdcart.rb +84 -48
- data/lib/scrapers/rubytapas/scraper.rb +6 -10
- data/lib/scrapers/version.rb +2 -2
- data/spec/lib/scrapers/rubytapas/dpdcart_spec.rb +0 -14
- data/spec/lib/scrapers/rubytapas/rubytapas_spec.rb +6 -7
- data/spec/lib/scrapers/rubytapas/scraper_spec.rb +1 -5
- data/vcr_cassettes/rubytapas_download_twice.yml +1053 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b09c6be6c6c50e75b02dcc6e889d577f1f3c4d1c
|
4
|
+
data.tar.gz: c35d853aad8bfab4880d445f7db11e7532537c76
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 945df36bd1df9f8d91dcf86ee3e7e51796e50c604da0c9f135339c9be5bc54aa0170c6f8af83aa40ad7f23a4ee4d7c5c840b7f2585cd58170b7a936206543b51
|
7
|
+
data.tar.gz: 245fc91742ca186713c28f6ccfb5b591604fdf0e4573a7bf50c32e723a5697f4e8f24f61ce57421830dbff58f6677bbba5f293a810526d66261c393833fcec34
|
data/lib/netrc_reader.rb
CHANGED
@@ -1,12 +1,14 @@
|
|
1
1
|
require 'netrc'
|
2
2
|
|
3
|
-
module Scrapers
|
3
|
+
module Scrapers
|
4
4
|
class NetrcReader
|
5
5
|
attr_accessor :user, :pw
|
6
6
|
|
7
7
|
def initialize(section)
|
8
8
|
netrc = Netrc.read
|
9
|
-
@user, @pw = netrc[section]
|
9
|
+
@user, @pw = netrc[section]
|
10
|
+
rescue NoMethodError => e
|
11
|
+
fail "Could not find credentials for #{section}"
|
10
12
|
end
|
11
13
|
end
|
12
14
|
end
|
@@ -4,7 +4,7 @@ require 'scrapers/rubytapas'
|
|
4
4
|
|
5
5
|
module Scrapers
|
6
6
|
module RubyTapas
|
7
|
-
|
7
|
+
|
8
8
|
# Thor script that handles things with Avdi Grimm's RubyTapas
|
9
9
|
class CLI < Thor
|
10
10
|
|
@@ -15,6 +15,7 @@ module Scrapers
|
|
15
15
|
method_option :destination, :aliases => %w{-d}, :desc => "Destination to store the downloads. Default is the current working directory.", :default => "."
|
16
16
|
method_option :user, :aliases => %w{-u}, :desc => "dpdcart user. Default is read from $HOME/.netrc"
|
17
17
|
method_option :pw, :aliases => %w{-p}, :desc => "dpdcart password. Default is read from $HOME/.netrc"
|
18
|
+
method_option :subscription, default: 'rubytapas'
|
18
19
|
def download(episode)
|
19
20
|
Scrapers::RubyTapas::Scraper.new(episode, options).scrape!
|
20
21
|
end
|
@@ -23,6 +24,7 @@ module Scrapers
|
|
23
24
|
desc "list", "Show a list of the available episodes"
|
24
25
|
method_option :user, :aliases => %w{-u}, :desc => "dpdcart user. Default is read from $HOME/.netrc"
|
25
26
|
method_option :pw, :aliases => %w{-p}, :desc => "dpdcart password. Default is read from $HOME/.netrc"
|
27
|
+
method_option :subscription, default: 'rubytapas'
|
26
28
|
def list()
|
27
29
|
Scrapers::RubyTapas::Scraper.new(nil, options).list!
|
28
30
|
end
|
@@ -1,5 +1,5 @@
|
|
1
|
-
require
|
2
|
-
require
|
1
|
+
require "mechanize"
|
2
|
+
require "netrc"
|
3
3
|
|
4
4
|
module Scrapers
|
5
5
|
module RubyTapas
|
@@ -8,17 +8,29 @@ module Scrapers
|
|
8
8
|
# that provides a connection to rubytapas.dpdcart.com where the
|
9
9
|
# RubyTapas episodes and download files are available, as well as
|
10
10
|
# the episode feed.
|
11
|
+
|
11
12
|
class DpdCart
|
12
13
|
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
14
|
+
# NOTE: Updating this since now I have *two* subscriptions that
|
15
|
+
# use DPD Cart, rubytapas and elixirsips. Generalizing this
|
16
|
+
# accordingly. :)
|
17
|
+
|
18
|
+
# The subscription name will be filled in depending on which
|
19
|
+
# subscription I'm downloading from. This is a stock sprintf-type
|
20
|
+
# fill in where you pass in the subscription parameter with a
|
21
|
+
# value, thusly:
|
22
|
+
#
|
23
|
+
#   DPDCART_HOST_FORMAT % {subscription: "rubytapas"}
|
24
|
+
#
|
25
|
+
DPDCART_HOST_FORMAT = "%{subscription}.dpdcart.com"
|
26
|
+
ENV_DPDCART_USER_FORMAT = "%{subscription}_USER"
|
27
|
+
ENV_DPDCART_PASSWORD_FORMAT = "%{subscription}_PASSWORD" # name template of the env var holding the password (was mistakenly "_USER")
|
28
|
+
LOGIN_PATH = '/subscriber/login'
|
29
|
+
FEED_PATH = '/feed'
|
30
|
+
CONTENT_PATH = "/subscriber/content"
|
31
|
+
|
32
|
+
# Subscription at dpdcart
|
33
|
+
attr_accessor :subscription
|
22
34
|
|
23
35
|
# User name for dpdcart account
|
24
36
|
attr_accessor :user
|
@@ -26,7 +38,13 @@ module Scrapers
|
|
26
38
|
# Password for dpdcart account
|
27
39
|
attr_accessor :password
|
28
40
|
|
29
|
-
|
41
|
+
def dpdcart_host ; @dpdcart_host ||= format(DPDCART_HOST_FORMAT, subscription: subscription) ; end # memoized host name built from the subscription
|
42
|
+
def env_dpdcart_user ; @env_dpdcart_user ||= ENV_DPDCART_USER_FORMAT % {subscription: subscription} ; end # memoized name of the env var holding the user (uses the USER template, not the PASSWORD one)
|
43
|
+
def env_dpdcart_password ; @env_dpdcart_password ||= format(ENV_DPDCART_PASSWORD_FORMAT, subscription: subscription) ; end # memoized name of the env var holding the password
|
44
|
+
def debug ; @debug || (@debug = options[:debug]) ; end # debug flag, read from options[:debug] and cached once truthy
|
45
|
+
def dry_run ; @dry_run || (@dry_run = options[:dry_run]) ; end # dry-run flag, read from options[:dry_run] and cached once truthy
|
46
|
+
def feed_url ; @feed_url ||= URI(format("https://%s%s", dpdcart_host, FEED_PATH)) ; end # memoized https URI of the feed path on the subscription host
|
47
|
+
def login_url ; @login_url ||= URI(format("https://%s%s", dpdcart_host, LOGIN_PATH)) ; end # memoized https URI of the login path on the subscription host
|
30
48
|
|
31
49
|
# Create a new instance of the DpdCart gateway.
|
32
50
|
#
|
@@ -34,63 +52,47 @@ module Scrapers
|
|
34
52
|
# email address.
|
35
53
|
# @param password [String] - password associated with the
|
36
54
|
# account.
|
55
|
+
# @param subscription [String] - subscription name at DPD Cart
|
56
|
+
# (e.g. 'rubytapas' or 'elixirsips')
|
37
57
|
#
|
38
58
|
# If the user and password are empty, the information will be
|
39
59
|
# obtained in the following order:
|
40
60
|
#
|
41
|
-
# - reading the environment variables `
|
42
|
-
# `
|
61
|
+
# - reading the environment variables `<subscription>_USER` and
|
62
|
+
# `<subscription>_PASSWORD`
|
63
|
+
#
|
64
|
+
# Note that <subscription> will be the subscription passed in
|
65
|
+
# above.
|
43
66
|
#
|
44
67
|
# - reading the user's `$HOME/.netrc` file and pulling the
|
45
|
-
# credentials that match the host name for the
|
68
|
+
# credentials that match the host name for the subscription
|
46
69
|
# account.
|
47
70
|
#
|
48
|
-
# If no credentials can be found, it will raise
|
71
|
+
# If no credentials can be found, it will raise an error:
|
49
72
|
# `NoCredentialsError`.
|
50
|
-
|
51
|
-
|
52
|
-
self.
|
53
|
-
|
54
|
-
|
55
|
-
@password = password
|
56
|
-
else
|
57
|
-
@user, @password = get_credentials_from_environment
|
58
|
-
unless user && password
|
59
|
-
@user, @password = get_credentials_from_netrc
|
60
|
-
end
|
61
|
-
end
|
73
|
+
#
|
74
|
+
def initialize(user=nil, password=nil, subscription='rubytapas', options={})
|
75
|
+
self.options = options
|
76
|
+
self.subscription = subscription
|
77
|
+
set_user_and_password(user, password)
|
62
78
|
self.agent = Mechanize.new
|
63
79
|
end
|
64
80
|
|
65
|
-
#
|
81
|
+
# Retrieve the episode feed from dpdcart
|
66
82
|
def feed!
|
67
|
-
|
68
|
-
request = Net::HTTP::Get.new(uri)
|
69
|
-
request.basic_auth user, password
|
70
|
-
Net::HTTP.start(uri.host, uri.port, {:use_ssl => true}) {|http| http.request(request)}.body
|
71
|
-
end
|
72
|
-
|
73
|
-
# Login to dpdcart before downloading
|
74
|
-
def login!
|
75
|
-
page = agent.get LOGIN_URL
|
76
|
-
page.form.field_with(name: "username").value = user
|
77
|
-
page.form.field_with(name: "password").value = password
|
78
|
-
page.form.submit
|
79
|
-
unless agent.page.title.match(/Subscription Content/)
|
80
|
-
raise "Could not log in"
|
81
|
-
end
|
82
|
-
agent
|
83
|
+
http_fetch(feed_url)
|
83
84
|
end
|
84
85
|
|
85
86
|
# Download the file from dpdcart
|
86
87
|
def download!(file)
|
88
|
+
login
|
87
89
|
warn "DEBUG: downloading #{file}" if debug
|
88
90
|
if dry_run
|
89
91
|
warn "DEBUG: download skipped for dry run" if dry_run
|
90
92
|
filename = file
|
91
93
|
body = "no body"
|
92
94
|
else
|
93
|
-
page = agent.get(file)
|
95
|
+
page = agent.get(file)
|
94
96
|
filename = page.filename
|
95
97
|
body = page.body
|
96
98
|
end
|
@@ -101,15 +103,49 @@ module Scrapers
|
|
101
103
|
|
102
104
|
attr_accessor :options, :agent
|
103
105
|
|
106
|
+
def set_user_and_password(user, password)
|
107
|
+
if user && password
|
108
|
+
@user = user
|
109
|
+
@password = password
|
110
|
+
else
|
111
|
+
@user, @password = get_credentials_from_environment
|
112
|
+
unless user && password
|
113
|
+
@user, @password = get_credentials_from_netrc
|
114
|
+
end
|
115
|
+
end
|
116
|
+
end
|
117
|
+
|
104
118
|
def get_credentials_from_environment
|
105
|
-
[ ENV[
|
119
|
+
[ ENV[env_dpdcart_user], ENV[env_dpdcart_password] ]
|
106
120
|
end
|
107
121
|
|
108
122
|
def get_credentials_from_netrc
|
109
|
-
creds = Netrc.read[
|
123
|
+
creds = Netrc.read[dpdcart_host]
|
124
|
+
if creds.nil?
|
125
|
+
warn "Could not find credentials for #{dpdcart_host}"
|
126
|
+
exit -1
|
127
|
+
end
|
110
128
|
[ creds.login, creds.password ]
|
111
129
|
end
|
112
130
|
|
131
|
+
# Login to dpdcart before downloading
|
132
|
+
def login
|
133
|
+
page = agent.get login_url
|
134
|
+
page.form.field_with(name: "username").value = user
|
135
|
+
page.form.field_with(name: "password").value = password
|
136
|
+
page.form.submit
|
137
|
+
unless agent.page.title.match(/Subscription Content/)
|
138
|
+
raise "Could not log in"
|
139
|
+
end
|
140
|
+
agent
|
141
|
+
end
|
142
|
+
|
143
|
+
def http_fetch(uri)
|
144
|
+
request = Net::HTTP::Get.new(uri)
|
145
|
+
request.basic_auth user, password
|
146
|
+
Net::HTTP.start(uri.host, uri.port, {:use_ssl => true}) {|http| http.request(request)}.body
|
147
|
+
end
|
148
|
+
|
113
149
|
end
|
114
150
|
end
|
115
151
|
end
|
@@ -8,11 +8,12 @@ module Scrapers
|
|
8
8
|
module RubyTapas
|
9
9
|
|
10
10
|
# Scraper provides the methods to download, extract and build a collection
|
11
|
-
# of
|
11
|
+
# of DPD cart subscription episodes from the RubyTapas RSS feed.
|
12
12
|
class Scraper
|
13
13
|
|
14
|
-
attr_accessor :user, :pw, :destination, :episode_number, :netrc_reader, :dry_run, :debug
|
14
|
+
attr_accessor :subscription, :user, :pw, :destination, :episode_number, :netrc_reader, :dry_run, :debug
|
15
15
|
attr_reader :dpdcart
|
16
|
+
attr_reader :episodes
|
16
17
|
|
17
18
|
# *episode_number* is the RubyTapas episode number (note! not the post id!) of the
|
18
19
|
# episode to download. If the episode number is the symbol :all, then all episodes
|
@@ -26,6 +27,7 @@ module Scrapers
|
|
26
27
|
# - "pw": the password of the RubyTapas account
|
27
28
|
# - "destination": the root destination of the episode downloads
|
28
29
|
def initialize(episode_number, options)
|
30
|
+
self.subscription = options.fetch("subscription") # let this fail if no subscription given.
|
29
31
|
self.episode_number = episode_number
|
30
32
|
self.user = options["user"]
|
31
33
|
self.pw = options["pw"]
|
@@ -33,13 +35,12 @@ module Scrapers
|
|
33
35
|
self.dry_run = options["dry_run"]
|
34
36
|
self.debug = options["debug"]
|
35
37
|
@dpdcart = Scrapers::RubyTapas::DpdCart.
|
36
|
-
|
37
|
-
|
38
|
+
new(user, pw, subscription, {dry_run: dry_run, debug: debug})
|
39
|
+
@episodes ||= fetch_episodes
|
38
40
|
end
|
39
41
|
|
40
42
|
# Perform the scraping operation
|
41
43
|
def scrape!
|
42
|
-
dpdcart.login!
|
43
44
|
if all_episodes?
|
44
45
|
episodes.each do |episode|
|
45
46
|
|
@@ -70,11 +71,6 @@ module Scrapers
|
|
70
71
|
end
|
71
72
|
end
|
72
73
|
|
73
|
-
# Returns the collection of episodes.
|
74
|
-
def episodes
|
75
|
-
@episodes ||= fetch_episodes
|
76
|
-
end
|
77
|
-
|
78
74
|
# Retrieves the episode associated with *episode number*.
|
79
75
|
def find_by_episode(episode_number)
|
80
76
|
episodes.detect {|e| e.number == episode_number}
|
data/lib/scrapers/version.rb
CHANGED
@@ -2,11 +2,11 @@ module Scrapers
|
|
2
2
|
module Version
|
3
3
|
|
4
4
|
MAJOR = 3
|
5
|
-
MINOR =
|
5
|
+
MINOR = 1
|
6
6
|
BUILD = 0
|
7
7
|
|
8
8
|
end
|
9
|
-
|
9
|
+
|
10
10
|
VERSION = [Version::MAJOR,Version::MINOR,Version::BUILD].map(&:to_s).join(".")
|
11
11
|
|
12
12
|
DESCRIPTION = "A library of web site scrapers utilizing mechanize and other goodies. Helpful in gathering images, moving things, saving things, etc."
|
@@ -8,7 +8,6 @@ describe Scrapers::RubyTapas::DpdCart do
|
|
8
8
|
|
9
9
|
describe "method signatures" do
|
10
10
|
it { is_expected.to respond_to(:feed!) }
|
11
|
-
it { is_expected.to respond_to(:login!)}
|
12
11
|
it { is_expected.to respond_to(:download!)}
|
13
12
|
end
|
14
13
|
|
@@ -22,17 +21,6 @@ describe Scrapers::RubyTapas::DpdCart do
|
|
22
21
|
end
|
23
22
|
end
|
24
23
|
|
25
|
-
describe "#login!" do
|
26
|
-
it "shows the subscriber content page" do
|
27
|
-
VCR.use_cassette('rubytapas_login', record: :new_episodes,
|
28
|
-
match_requests_on: [:method, :host, :path]
|
29
|
-
) do
|
30
|
-
expect(gateway.login!.page.title).to eq("Subscription Content | RubyTapas")
|
31
|
-
end
|
32
|
-
end
|
33
|
-
end
|
34
|
-
|
35
|
-
|
36
24
|
describe "#download!" do
|
37
25
|
let(:file) { "https://rubytapas.dpdcart.com/subscriber/download?file_id=26" }
|
38
26
|
let(:name) { "001-binary-literals.html" }
|
@@ -44,7 +32,6 @@ describe Scrapers::RubyTapas::DpdCart do
|
|
44
32
|
VCR.use_cassette('rubytapas_download', record: :new_episodes,
|
45
33
|
match_requests_on: [:method, :host, :path,
|
46
34
|
:query]) do
|
47
|
-
gateway.login!
|
48
35
|
filename, body = gateway.download! file
|
49
36
|
expect(filename).to eq(name)
|
50
37
|
expect(body.size).to eq(5744)
|
@@ -55,7 +42,6 @@ describe Scrapers::RubyTapas::DpdCart do
|
|
55
42
|
VCR.use_cassette('rubytapas_download_twice', record: :new_episodes,
|
56
43
|
match_requests_on: [:method, :host, :path,
|
57
44
|
:query]) do
|
58
|
-
gateway.login!
|
59
45
|
filename, body = gateway.download! file
|
60
46
|
expect(filename).to eq(name)
|
61
47
|
filename, body = gateway.download! file2
|
@@ -13,7 +13,6 @@ RSpec.describe "RubyTapas Thor Script", :type => :integration do
|
|
13
13
|
|
14
14
|
let(:cart) do
|
15
15
|
instance_spy("Scrapers::RubyTapas::DpdCart",
|
16
|
-
:login! => "Subscription Content | RubyTapas",
|
17
16
|
:feed! => feed,
|
18
17
|
:download! => download
|
19
18
|
)
|
@@ -35,12 +34,12 @@ RSpec.describe "RubyTapas Thor Script", :type => :integration do
|
|
35
34
|
it "retrieves one episode" do
|
36
35
|
expect_any_instance_of(Scrapers::RubyTapas::Scraper).to receive(:scrape!).once.and_call_original
|
37
36
|
expect(cart).to receive(:download!).exactly(3).times
|
38
|
-
|
37
|
+
|
39
38
|
VCR.use_cassette('rubytapas-download-1', :match_requests_on => [:method, :host, :path, :query]) do
|
40
39
|
Scrapers::RubyTapas::CLI.start(%w[download 001 --destination=. --user=joan@example.com --pw=password])
|
41
40
|
end
|
42
41
|
end
|
43
|
-
|
42
|
+
|
44
43
|
end
|
45
44
|
|
46
45
|
context "when scraping all episodes" do
|
@@ -48,7 +47,7 @@ RSpec.describe "RubyTapas Thor Script", :type => :integration do
|
|
48
47
|
it "retrieves all episodes" do
|
49
48
|
expect_any_instance_of(Scrapers::RubyTapas::Scraper).to receive(:scrape!).once.and_call_original
|
50
49
|
expect(cart).to receive(:download!).exactly(933).times
|
51
|
-
|
50
|
+
|
52
51
|
VCR.use_cassette('rubytapas-download-all', :match_requests_on => [:method, :host, :path, :query]) do
|
53
52
|
save_stdout = $stdout
|
54
53
|
# $stdout = output
|
@@ -56,9 +55,9 @@ RSpec.describe "RubyTapas Thor Script", :type => :integration do
|
|
56
55
|
$stdout = save_stdout
|
57
56
|
end
|
58
57
|
end
|
59
|
-
|
58
|
+
|
60
59
|
end
|
61
|
-
|
60
|
+
|
62
61
|
end
|
63
62
|
|
64
63
|
describe "list command" do
|
@@ -73,7 +72,7 @@ RSpec.describe "RubyTapas Thor Script", :type => :integration do
|
|
73
72
|
Scrapers::RubyTapas::CLI.start(%w[list --user=joan@example.com --pw=password])
|
74
73
|
end
|
75
74
|
end
|
76
|
-
|
75
|
+
|
77
76
|
describe "version command" do
|
78
77
|
it "prints the version numbers for rubytapas and scrapers" do
|
79
78
|
save_stdout = $stdout
|
@@ -13,12 +13,11 @@ describe Scrapers::RubyTapas::Scraper do
|
|
13
13
|
let(:options) do
|
14
14
|
{
|
15
15
|
"destination" => '.',
|
16
|
+
"subscription" => 'rubytapas'
|
16
17
|
}
|
17
18
|
end
|
18
19
|
let(:cart) {instance_spy("Scrapers::RubyTapas::DpdCart",
|
19
20
|
:feed! => feed,
|
20
|
-
:login! =>
|
21
|
-
double("Subscription Content | RubyTapas"),
|
22
21
|
:download! => [ 'filename',
|
23
22
|
'body' ]
|
24
23
|
)}
|
@@ -28,7 +27,6 @@ describe Scrapers::RubyTapas::Scraper do
|
|
28
27
|
|
29
28
|
describe "#episodes" do
|
30
29
|
it "gets a collection of episodes" do
|
31
|
-
expect(scraper).to receive(:fetch_episodes).and_call_original
|
32
30
|
expect(scraper.episodes.size).to eq(267)
|
33
31
|
end
|
34
32
|
end
|
@@ -53,7 +51,6 @@ describe Scrapers::RubyTapas::Scraper do
|
|
53
51
|
context "when scraping one episode" do
|
54
52
|
it "scrapes one episode" do
|
55
53
|
expect(scraper).to receive(:find_by_episode).with(episode_number).and_call_original
|
56
|
-
expect(scraper).to receive(:fetch_episodes).once.and_call_original
|
57
54
|
scraper.scrape!
|
58
55
|
end
|
59
56
|
|
@@ -63,7 +60,6 @@ describe Scrapers::RubyTapas::Scraper do
|
|
63
60
|
let(:scraper) { Scrapers::RubyTapas::Scraper.new(:all, options) }
|
64
61
|
|
65
62
|
it "scrapes all the episodes" do
|
66
|
-
expect(scraper).to receive(:fetch_episodes).once.and_call_original
|
67
63
|
scraper.scrape!
|
68
64
|
end
|
69
65
|
end
|