scrapers 3.0.0 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6a3e0b5e2fa58fb8686e8c350b1619dce0e85e0f
4
- data.tar.gz: 9b2f65ddff548a914a2b5e9d8155f725568c5708
3
+ metadata.gz: b09c6be6c6c50e75b02dcc6e889d577f1f3c4d1c
4
+ data.tar.gz: c35d853aad8bfab4880d445f7db11e7532537c76
5
5
  SHA512:
6
- metadata.gz: e67a79dccb7e436e9b629a63157ed44c0bdf76e83f99bbe8bc2d33f4edab9c0b68401d1c1da554077871b7f0affde40bb4a2f33508db20b6726c1c2f7781d549
7
- data.tar.gz: d8f62b7d23d09253ecaba8f37f26b0852019e0b4e8403feafaf109791896dd89d491edb99eeffec87b8fdf1946a0e4b8a40f3038f6706686b79c44e61554c2a1
6
+ metadata.gz: 945df36bd1df9f8d91dcf86ee3e7e51796e50c604da0c9f135339c9be5bc54aa0170c6f8af83aa40ad7f23a4ee4d7c5c840b7f2585cd58170b7a936206543b51
7
+ data.tar.gz: 245fc91742ca186713c28f6ccfb5b591604fdf0e4573a7bf50c32e723a5697f4e8f24f61ce57421830dbff58f6677bbba5f293a810526d66261c393833fcec34
data/lib/netrc_reader.rb CHANGED
@@ -1,12 +1,14 @@
1
1
  require 'netrc'
2
2
 
3
- module Scrapers
3
+ module Scrapers
4
4
  class NetrcReader
5
5
  attr_accessor :user, :pw
6
6
 
7
7
  def initialize(section)
8
8
  netrc = Netrc.read
9
- @user, @pw = netrc[section]
9
+ @user, @pw = netrc[section]
10
+ rescue NoMethodError => e
11
+ fail "Could not find credentials for #{section}"
10
12
  end
11
13
  end
12
14
  end
@@ -4,7 +4,7 @@ require 'scrapers/rubytapas'
4
4
 
5
5
  module Scrapers
6
6
  module RubyTapas
7
-
7
+
8
8
  # Thor script that handles things with Avdi Grimm's RubyTapas
9
9
  class CLI < Thor
10
10
 
@@ -15,6 +15,7 @@ module Scrapers
15
15
  method_option :destination, :aliases => %w{-d}, :desc => "Destination to store the downloads. Default is the current working directory.", :default => "."
16
16
  method_option :user, :aliases => %w{-u}, :desc => "dpdcart user. Default is read from $HOME/.netrc"
17
17
  method_option :pw, :aliases => %w{-p}, :desc => "dpdcart password. Default is read from $HOME/.netrc"
18
+ method_option :subscription, default: 'rubytapas'
18
19
  def download(episode)
19
20
  Scrapers::RubyTapas::Scraper.new(episode, options).scrape!
20
21
  end
@@ -23,6 +24,7 @@ module Scrapers
23
24
  desc "list", "Show a list of the available episodes"
24
25
  method_option :user, :aliases => %w{-u}, :desc => "dpdcart user. Default is read from $HOME/.netrc"
25
26
  method_option :pw, :aliases => %w{-p}, :desc => "dpdcart password. Default is read from $HOME/.netrc"
27
+ method_option :subscription, default: 'rubytapas'
26
28
  def list()
27
29
  Scrapers::RubyTapas::Scraper.new(nil, options).list!
28
30
  end
@@ -1,11 +1,5 @@
1
1
  module Scrapers
2
2
  module RubyTapas
3
3
 
4
- RUBYTAPAS_HOST="rubytapas.dpdcart.com"
5
- CONTENTS_PATH = "/subscriber/content"
6
- POST_PATH = "/post"
7
- FEED_PATH = "/feed"
8
- POST_QUERY = "id"
9
-
10
4
  end
11
5
  end
@@ -1,5 +1,5 @@
1
- require 'netrc'
2
- require 'mechanize'
1
+ require "mechanize"
2
+ require "netrc"
3
3
 
4
4
  module Scrapers
5
5
  module RubyTapas
@@ -8,17 +8,29 @@ module Scrapers
8
8
  # that provides a connection to rubytapas.dpdcart.com where the
9
9
  # RubyTapas episodes and download files are available, as well as
10
10
  # the episode feed.
11
+
11
12
  class DpdCart
12
13
 
13
- RUBYTAPAS_HOST = 'rubytapas.dpdcart.com'
14
- ENV_RUBYTAPAS_USER = 'RUBYTAPAS_USER'
15
- ENV_RUBYTAPAS_PASSWORD = 'RUBYTAPAS_PASSWORD'
16
- LOGIN_PATH = '/subscriber/login'
17
- LOGIN_URL = "https://#{RUBYTAPAS_HOST}#{LOGIN_PATH}"
18
- FEED_PATH = '/feed'
19
- FEED_URL = "https://#{RUBYTAPAS_HOST}#{FEED_PATH}"
20
- CONTENT_PATH = "/subscriber/content"
21
- CONTENT_URL = "https://#{RUBYTAPAS_HOST}#{CONTENT_PATH}"
14
+ # NOTE: Updating this since now I have *two* subscriptions that
15
+ # use DPD Cart, rubytapas and elixirsips. Generalizing this
16
+ # accordingly. :)
17
+
18
+ # The subscription name will be filled in depending on which
19
+ # subscription I'm downloading from. This is a stock sprintf-type
20
+ # fill in where you pass in the subscription parameter with a
21
+ # value, thusly:
22
+ #
23
+ # DPDCART_HOST_FORMAT % {subscription: "rubytapas"}
24
+ #
25
+ DPDCART_HOST_FORMAT = "%{subscription}.dpdcart.com"
26
+ ENV_DPDCART_USER_FORMAT = "%{subscription}_USER"
27
+ ENV_DPDCART_PASSWORD_FORMAT = "%{subscription}_USER"
28
+ LOGIN_PATH = '/subscriber/login'
29
+ FEED_PATH = '/feed'
30
+ CONTENT_PATH = "/subscriber/content"
31
+
32
+ # Subscription at dbdcart
33
+ attr_accessor :subscription
22
34
 
23
35
  # User name for dpdcart account
24
36
  attr_accessor :user
@@ -26,7 +38,13 @@ module Scrapers
26
38
  # Password for dpdcart acount
27
39
  attr_accessor :password
28
40
 
29
- attr_accessor :dry_run, :debug
41
+ def dpdcart_host ; @dpdcart_host ||= DPDCART_HOST_FORMAT % {subscription: subscription} ; end
42
+ def env_dpdcart_user ; @env_dpdcart_user ||= ENV_DPDCART_PASSWORD_FORMAT % {subscription: subscription} ; end
43
+ def env_dpdcart_password ; @env_dpdcart_password ||= ENV_DPDCART_PASSWORD_FORMAT % {subscription: subscription} ; end
44
+ def debug ; @debug ||= options[:debug] ; end
45
+ def dry_run ; @dry_run ||= options[:dry_run] ; end
46
+ def feed_url ; @feed_url ||= URI("https://#{dpdcart_host}#{FEED_PATH}") ; end
47
+ def login_url ; @login_url ||= URI("https://#{dpdcart_host}#{LOGIN_PATH}") ; end
30
48
 
31
49
  # Create a new instance of the DpdCart gateway.
32
50
  #
@@ -34,63 +52,47 @@ module Scrapers
34
52
  # email address.
35
53
  # @param password [String] - password associated with the
36
54
  # account.
55
+ # @param subscription [String] - subscription name at DPD Cart
56
+ # (e.g. 'rubytapas' or 'elixirsips')
37
57
  #
38
58
  # If the user and password are empty, the information will be
39
59
  # obtained in the following order:
40
60
  #
41
- # - reading the environment variables `RUBYTAPAS_USER` and
42
- # `RUBYTAPAS_PASSWORD`
61
+ # - reading the environment variables `<subscription>_USER` and
62
+ # `<subscription>_PASSWORD`
63
+ #
64
+ # Note that <subscription> will be the subscription passed in
65
+ # above.
43
66
  #
44
67
  # - reading the user's `$HOME/.netrc` file and pulling the
45
- # credentials that match the host name for the rubytapas
68
+ # credentials that match the host name for the subscription
46
69
  # account.
47
70
  #
48
- # If no credentials can be found, it will raise and error:
71
+ # If no credentials can be found, it will raise an error:
49
72
  # `NoCredentialsError`.
50
- def initialize(user=nil, password=nil, options={})
51
- self.dry_run = options[:dry_run]
52
- self.debug = options[:debug]
53
- if user && password
54
- @user = user
55
- @password = password
56
- else
57
- @user, @password = get_credentials_from_environment
58
- unless user && password
59
- @user, @password = get_credentials_from_netrc
60
- end
61
- end
73
+ #
74
+ def initialize(user=nil, password=nil, subscription='rubytapas', options={})
75
+ self.options = options
76
+ self.subscription = subscription
77
+ set_user_and_password(user, password)
62
78
  self.agent = Mechanize.new
63
79
  end
64
80
 
65
- # Return the episode feed from dpdcart
81
+ # Retrieve the episode feed from dpdcart
66
82
  def feed!
67
- uri = URI(FEED_URL)
68
- request = Net::HTTP::Get.new(uri)
69
- request.basic_auth user, password
70
- Net::HTTP.start(uri.host, uri.port, {:use_ssl => true}) {|http| http.request(request)}.body
71
- end
72
-
73
- # Login to dpdcart before downloading
74
- def login!
75
- page = agent.get LOGIN_URL
76
- page.form.field_with(name: "username").value = user
77
- page.form.field_with(name: "password").value = password
78
- page.form.submit
79
- unless agent.page.title.match(/Subscription Content/)
80
- raise "Could not log in"
81
- end
82
- agent
83
+ http_fetch(feed_url)
83
84
  end
84
85
 
85
86
  # Download the file from dpdcart
86
87
  def download!(file)
88
+ login
87
89
  warn "DEBUG: downloading #{file}" if debug
88
90
  if dry_run
89
91
  warn "DEBUG: download skipped for dry run" if dry_run
90
92
  filename = file
91
93
  body = "no body"
92
94
  else
93
- page = agent.get(file) unless dry_run
95
+ page = agent.get(file)
94
96
  filename = page.filename
95
97
  body = page.body
96
98
  end
@@ -101,15 +103,49 @@ module Scrapers
101
103
 
102
104
  attr_accessor :options, :agent
103
105
 
106
+ def set_user_and_password(user, password)
107
+ if user && password
108
+ @user = user
109
+ @password = password
110
+ else
111
+ @user, @password = get_credentials_from_environment
112
+ unless user && password
113
+ @user, @password = get_credentials_from_netrc
114
+ end
115
+ end
116
+ end
117
+
104
118
  def get_credentials_from_environment
105
- [ ENV[ENV_RUBYTAPAS_USER], ENV[ENV_RUBYTAPAS_PASSWORD] ]
119
+ [ ENV[env_dpdcart_user], ENV[env_dpdcart_password] ]
106
120
  end
107
121
 
108
122
  def get_credentials_from_netrc
109
- creds = Netrc.read[RUBYTAPAS_HOST]
123
+ creds = Netrc.read[dpdcart_host]
124
+ if creds.nil?
125
+ warn "Could not find credentials for #{dpdcart_host}"
126
+ exit -1
127
+ end
110
128
  [ creds.login, creds.password ]
111
129
  end
112
130
 
131
+ # Login to dpdcart before downloading
132
+ def login
133
+ page = agent.get login_url
134
+ page.form.field_with(name: "username").value = user
135
+ page.form.field_with(name: "password").value = password
136
+ page.form.submit
137
+ unless agent.page.title.match(/Subscription Content/)
138
+ raise "Could not log in"
139
+ end
140
+ agent
141
+ end
142
+
143
+ def http_fetch(uri)
144
+ request = Net::HTTP::Get.new(uri)
145
+ request.basic_auth user, password
146
+ Net::HTTP.start(uri.host, uri.port, {:use_ssl => true}) {|http| http.request(request)}.body
147
+ end
148
+
113
149
  end
114
150
  end
115
151
  end
@@ -8,11 +8,12 @@ module Scrapers
8
8
  module RubyTapas
9
9
 
10
10
  # Scraper provides the methods to download, extract and build a collection
11
- # of RubyTapas episodes from the RubyTapas RSS feed.
11
+ # of DPD Cart subscription episodes from the subscription's RSS feed.
12
12
  class Scraper
13
13
 
14
- attr_accessor :user, :pw, :destination, :episode_number, :netrc_reader, :dry_run, :debug
14
+ attr_accessor :subscription, :user, :pw, :destination, :episode_number, :netrc_reader, :dry_run, :debug
15
15
  attr_reader :dpdcart
16
+ attr_reader :episodes
16
17
 
17
18
  # *episode_number* is the RubyTapas episode number (note! not the post id!) of the
18
19
  # episode to download. If the episode number is the symbol :all, then all episodes
@@ -26,6 +27,7 @@ module Scrapers
26
27
  # - "pw": the password of the RubyTapas account
27
28
  # - "destination": the root destination of the episode downloads
28
29
  def initialize(episode_number, options)
30
+ self.subscription = options.fetch("subscription") # let this fail if no subscription given.
29
31
  self.episode_number = episode_number
30
32
  self.user = options["user"]
31
33
  self.pw = options["pw"]
@@ -33,13 +35,12 @@ module Scrapers
33
35
  self.dry_run = options["dry_run"]
34
36
  self.debug = options["debug"]
35
37
  @dpdcart = Scrapers::RubyTapas::DpdCart.
36
- new(user, pw, {dry_run: dry_run, debug: debug})
37
- warn "DEBUG: episode_number: #{episode_number}, options: #{options.inspect}" if debug
38
+ new(user, pw, subscription, {dry_run: dry_run, debug: debug})
39
+ @episodes ||= fetch_episodes
38
40
  end
39
41
 
40
42
  # Perform the scraping operation
41
43
  def scrape!
42
- dpdcart.login!
43
44
  if all_episodes?
44
45
  episodes.each do |episode|
45
46
 
@@ -70,11 +71,6 @@ module Scrapers
70
71
  end
71
72
  end
72
73
 
73
- # Returns the collection of episodes.
74
- def episodes
75
- @episodes ||= fetch_episodes
76
- end
77
-
78
74
  # Retrieves the episode associated with *episode number*.
79
75
  def find_by_episode(episode_number)
80
76
  episodes.detect {|e| e.number == episode_number}
@@ -2,11 +2,11 @@ module Scrapers
2
2
  module Version
3
3
 
4
4
  MAJOR = 3
5
- MINOR = 0
5
+ MINOR = 1
6
6
  BUILD = 0
7
7
 
8
8
  end
9
-
9
+
10
10
  VERSION = [Version::MAJOR,Version::MINOR,Version::BUILD].map(&:to_s).join(".")
11
11
 
12
12
  DESCRIPTION = "A library of web site scrapers utilizing mechanize and other goodies. Helpful in gathering images, moving things, saving things, etc."
@@ -8,7 +8,6 @@ describe Scrapers::RubyTapas::DpdCart do
8
8
 
9
9
  describe "method signatures" do
10
10
  it { is_expected.to respond_to(:feed!) }
11
- it { is_expected.to respond_to(:login!)}
12
11
  it { is_expected.to respond_to(:download!)}
13
12
  end
14
13
 
@@ -22,17 +21,6 @@ describe Scrapers::RubyTapas::DpdCart do
22
21
  end
23
22
  end
24
23
 
25
- describe "#login!" do
26
- it "shows the subscriber content page" do
27
- VCR.use_cassette('rubytapas_login', record: :new_episodes,
28
- match_requests_on: [:method, :host, :path]
29
- ) do
30
- expect(gateway.login!.page.title).to eq("Subscription Content | RubyTapas")
31
- end
32
- end
33
- end
34
-
35
-
36
24
  describe "#download!" do
37
25
  let(:file) { "https://rubytapas.dpdcart.com/subscriber/download?file_id=26" }
38
26
  let(:name) { "001-binary-literals.html" }
@@ -44,7 +32,6 @@ describe Scrapers::RubyTapas::DpdCart do
44
32
  VCR.use_cassette('rubytapas_download', record: :new_episodes,
45
33
  match_requests_on: [:method, :host, :path,
46
34
  :query]) do
47
- gateway.login!
48
35
  filename, body = gateway.download! file
49
36
  expect(filename).to eq(name)
50
37
  expect(body.size).to eq(5744)
@@ -55,7 +42,6 @@ describe Scrapers::RubyTapas::DpdCart do
55
42
  VCR.use_cassette('rubytapas_download_twice', record: :new_episodes,
56
43
  match_requests_on: [:method, :host, :path,
57
44
  :query]) do
58
- gateway.login!
59
45
  filename, body = gateway.download! file
60
46
  expect(filename).to eq(name)
61
47
  filename, body = gateway.download! file2
@@ -13,7 +13,6 @@ RSpec.describe "RubyTapas Thor Script", :type => :integration do
13
13
 
14
14
  let(:cart) do
15
15
  instance_spy("Scrapers::RubyTapas::DpdCart",
16
- :login! => "Subscription Content | RubyTapas",
17
16
  :feed! => feed,
18
17
  :download! => download
19
18
  )
@@ -35,12 +34,12 @@ RSpec.describe "RubyTapas Thor Script", :type => :integration do
35
34
  it "retrieves one episode" do
36
35
  expect_any_instance_of(Scrapers::RubyTapas::Scraper).to receive(:scrape!).once.and_call_original
37
36
  expect(cart).to receive(:download!).exactly(3).times
38
-
37
+
39
38
  VCR.use_cassette('rubytapas-download-1', :match_requests_on => [:method, :host, :path, :query]) do
40
39
  Scrapers::RubyTapas::CLI.start(%w[download 001 --destination=. --user=joan@example.com --pw=password])
41
40
  end
42
41
  end
43
-
42
+
44
43
  end
45
44
 
46
45
  context "when scraping all episodes" do
@@ -48,7 +47,7 @@ RSpec.describe "RubyTapas Thor Script", :type => :integration do
48
47
  it "retrieves all episodes" do
49
48
  expect_any_instance_of(Scrapers::RubyTapas::Scraper).to receive(:scrape!).once.and_call_original
50
49
  expect(cart).to receive(:download!).exactly(933).times
51
-
50
+
52
51
  VCR.use_cassette('rubytapas-download-all', :match_requests_on => [:method, :host, :path, :query]) do
53
52
  save_stdout = $stdout
54
53
  # $stdout = output
@@ -56,9 +55,9 @@ RSpec.describe "RubyTapas Thor Script", :type => :integration do
56
55
  $stdout = save_stdout
57
56
  end
58
57
  end
59
-
58
+
60
59
  end
61
-
60
+
62
61
  end
63
62
 
64
63
  describe "list command" do
@@ -73,7 +72,7 @@ RSpec.describe "RubyTapas Thor Script", :type => :integration do
73
72
  Scrapers::RubyTapas::CLI.start(%w[list --user=joan@example.com --pw=password])
74
73
  end
75
74
  end
76
-
75
+
77
76
  describe "version command" do
78
77
  it "prints the version numbers for rubytapas and scrapers" do
79
78
  save_stdout = $stdout
@@ -13,12 +13,11 @@ describe Scrapers::RubyTapas::Scraper do
13
13
  let(:options) do
14
14
  {
15
15
  "destination" => '.',
16
+ "subscription" => 'rubytapas'
16
17
  }
17
18
  end
18
19
  let(:cart) {instance_spy("Scrapers::RubyTapas::DpdCart",
19
20
  :feed! => feed,
20
- :login! =>
21
- double("Subscription Content | RubyTapas"),
22
21
  :download! => [ 'filename',
23
22
  'body' ]
24
23
  )}
@@ -28,7 +27,6 @@ describe Scrapers::RubyTapas::Scraper do
28
27
 
29
28
  describe "#episodes" do
30
29
  it "gets a collection of episodes" do
31
- expect(scraper).to receive(:fetch_episodes).and_call_original
32
30
  expect(scraper.episodes.size).to eq(267)
33
31
  end
34
32
  end
@@ -53,7 +51,6 @@ describe Scrapers::RubyTapas::Scraper do
53
51
  context "when scraping one episode" do
54
52
  it "scrapes one episode" do
55
53
  expect(scraper).to receive(:find_by_episode).with(episode_number).and_call_original
56
- expect(scraper).to receive(:fetch_episodes).once.and_call_original
57
54
  scraper.scrape!
58
55
  end
59
56
 
@@ -63,7 +60,6 @@ describe Scrapers::RubyTapas::Scraper do
63
60
  let(:scraper) { Scrapers::RubyTapas::Scraper.new(:all, options) }
64
61
 
65
62
  it "scrapes all the episodes" do
66
- expect(scraper).to receive(:fetch_episodes).once.and_call_original
67
63
  scraper.scrape!
68
64
  end
69
65
  end