scrapers 3.0.0 → 3.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6a3e0b5e2fa58fb8686e8c350b1619dce0e85e0f
4
- data.tar.gz: 9b2f65ddff548a914a2b5e9d8155f725568c5708
3
+ metadata.gz: b09c6be6c6c50e75b02dcc6e889d577f1f3c4d1c
4
+ data.tar.gz: c35d853aad8bfab4880d445f7db11e7532537c76
5
5
  SHA512:
6
- metadata.gz: e67a79dccb7e436e9b629a63157ed44c0bdf76e83f99bbe8bc2d33f4edab9c0b68401d1c1da554077871b7f0affde40bb4a2f33508db20b6726c1c2f7781d549
7
- data.tar.gz: d8f62b7d23d09253ecaba8f37f26b0852019e0b4e8403feafaf109791896dd89d491edb99eeffec87b8fdf1946a0e4b8a40f3038f6706686b79c44e61554c2a1
6
+ metadata.gz: 945df36bd1df9f8d91dcf86ee3e7e51796e50c604da0c9f135339c9be5bc54aa0170c6f8af83aa40ad7f23a4ee4d7c5c840b7f2585cd58170b7a936206543b51
7
+ data.tar.gz: 245fc91742ca186713c28f6ccfb5b591604fdf0e4573a7bf50c32e723a5697f4e8f24f61ce57421830dbff58f6677bbba5f293a810526d66261c393833fcec34
data/lib/netrc_reader.rb CHANGED
@@ -1,12 +1,14 @@
1
1
  require 'netrc'
2
2
 
3
- module Scrapers
3
+ module Scrapers
4
4
  class NetrcReader
5
5
  attr_accessor :user, :pw
6
6
 
7
7
  def initialize(section)
8
8
  netrc = Netrc.read
9
- @user, @pw = netrc[section]
9
+ @user, @pw = netrc[section]
10
+ rescue NoMethodError => e
11
+ fail "Could not find credentials for #{section}"
10
12
  end
11
13
  end
12
14
  end
@@ -4,7 +4,7 @@ require 'scrapers/rubytapas'
4
4
 
5
5
  module Scrapers
6
6
  module RubyTapas
7
-
7
+
8
8
  # Thor script that handles things with Avdi Grimm's RubyTapas
9
9
  class CLI < Thor
10
10
 
@@ -15,6 +15,7 @@ module Scrapers
15
15
  method_option :destination, :aliases => %w{-d}, :desc => "Destination to store the downloads. Default is the current working directory.", :default => "."
16
16
  method_option :user, :aliases => %w{-u}, :desc => "dpdcart user. Default is read from $HOME/.netrc"
17
17
  method_option :pw, :aliases => %w{-p}, :desc => "dpdcart password. Default is read from $HOME/.netrc"
18
+ method_option :subscription, default: 'rubytapas'
18
19
  def download(episode)
19
20
  Scrapers::RubyTapas::Scraper.new(episode, options).scrape!
20
21
  end
@@ -23,6 +24,7 @@ module Scrapers
23
24
  desc "list", "Show a list of the available episodes"
24
25
  method_option :user, :aliases => %w{-u}, :desc => "dpdcart user. Default is read from $HOME/.netrc"
25
26
  method_option :pw, :aliases => %w{-p}, :desc => "dpdcart password. Default is read from $HOME/.netrc"
27
+ method_option :subscription, default: 'rubytapas'
26
28
  def list()
27
29
  Scrapers::RubyTapas::Scraper.new(nil, options).list!
28
30
  end
@@ -1,11 +1,5 @@
1
1
  module Scrapers
2
2
  module RubyTapas
3
3
 
4
- RUBYTAPAS_HOST="rubytapas.dpdcart.com"
5
- CONTENTS_PATH = "/subscriber/content"
6
- POST_PATH = "/post"
7
- FEED_PATH = "/feed"
8
- POST_QUERY = "id"
9
-
10
4
  end
11
5
  end
@@ -1,5 +1,5 @@
1
- require 'netrc'
2
- require 'mechanize'
1
+ require "mechanize"
2
+ require "netrc"
3
3
 
4
4
  module Scrapers
5
5
  module RubyTapas
@@ -8,17 +8,29 @@ module Scrapers
8
8
  # that provides a connection to rubytapas.dpdcart.com where the
9
9
  # RubyTapas episodes and download files are available, as well as
10
10
  # the episode feed.
11
+
11
12
  class DpdCart
12
13
 
13
- RUBYTAPAS_HOST = 'rubytapas.dpdcart.com'
14
- ENV_RUBYTAPAS_USER = 'RUBYTAPAS_USER'
15
- ENV_RUBYTAPAS_PASSWORD = 'RUBYTAPAS_PASSWORD'
16
- LOGIN_PATH = '/subscriber/login'
17
- LOGIN_URL = "https://#{RUBYTAPAS_HOST}#{LOGIN_PATH}"
18
- FEED_PATH = '/feed'
19
- FEED_URL = "https://#{RUBYTAPAS_HOST}#{FEED_PATH}"
20
- CONTENT_PATH = "/subscriber/content"
21
- CONTENT_URL = "https://#{RUBYTAPAS_HOST}#{CONTENT_PATH}"
14
+ # NOTE: Updating this since now I have *two* subscriptions that
15
+ # use DPD Cart, rubytapas and elixirsips. Generalizing this
16
+ # accordingly. :)
17
+
18
+ # The subscription name will be filled in depending on which
19
+ # subscription I'm downloading from. This is a stock sprintf-type
20
+ # fill in where you pass in the subscription parameter with a
21
+ # value, thusly:
22
+ #
23
+ # DPDCART_HOST_FORMAT % {subscription: "rubytapas"}
24
+ #
25
# Format strings; the subscription name is filled in with String#%.
DPDCART_HOST_FORMAT = "%{subscription}.dpdcart.com"
ENV_DPDCART_USER_FORMAT = "%{subscription}_USER"
# Previously "%{subscription}_USER", which made the password lookup read
# the user-name variable.
# NOTE(review): the name is built from the subscription as given
# (e.g. "rubytapas_PASSWORD"), not upcased -- confirm that is intended.
ENV_DPDCART_PASSWORD_FORMAT = "%{subscription}_PASSWORD"
LOGIN_PATH = '/subscriber/login'
FEED_PATH = '/feed'
CONTENT_PATH = "/subscriber/content"

# Subscription at dpdcart
attr_accessor :subscription

# User name for dpdcart account
attr_accessor :user

# Password for dpdcart account
attr_accessor :password

# Host name of the subscription's dpdcart site (memoized).
def dpdcart_host
  @dpdcart_host ||= DPDCART_HOST_FORMAT % {subscription: subscription}
end

# Name of the environment variable holding the user name (memoized).
def env_dpdcart_user
  @env_dpdcart_user ||= ENV_DPDCART_USER_FORMAT % {subscription: subscription}
end

# Name of the environment variable holding the password (memoized).
def env_dpdcart_password
  @env_dpdcart_password ||= ENV_DPDCART_PASSWORD_FORMAT % {subscription: subscription}
end

# Debug flag taken from the options hash.
def debug
  @debug ||= options[:debug]
end

# Dry-run flag taken from the options hash.
def dry_run
  @dry_run ||= options[:dry_run]
end

# URI of the subscription's episode feed (memoized).
def feed_url
  @feed_url ||= URI("https://#{dpdcart_host}#{FEED_PATH}")
end

# URI of the subscription's login page (memoized).
def login_url
  @login_url ||= URI("https://#{dpdcart_host}#{LOGIN_PATH}")
end
30
48
 
31
49
  # Create a new instance of the DpdCart gateway.
32
50
  #
@@ -34,63 +52,47 @@ module Scrapers
34
52
  # email address.
35
53
  # @param password [String] - password associated with the
36
54
  # account.
55
+ # @param subscription [String] - subscription name at DPD Cart
56
+ # (e.g. 'rubytapas' or 'elixirsips')
37
57
  #
38
58
  # If the user and password are empty, the information will be
39
59
  # obtained in the following order:
40
60
  #
41
- # - reading the environment variables `RUBYTAPAS_USER` and
42
- # `RUBYTAPAS_PASSWORD`
61
+ # - reading the environment variables `<subscription>_USER` and
62
+ # `<subscription>_PASSWORD`
63
+ #
64
+ # Note that <subscription> will be the subscription passed in
65
+ # above.
43
66
  #
44
67
  # - reading the user's `$HOME/.netrc` file and pulling the
45
- # credentials that match the host name for the rubytapas
68
+ # credentials that match the host name for the subscription
46
69
  # account.
47
70
  #
48
- # If no credentials can be found, it will raise and error:
71
+ # If no credentials can be found, it will raise an error:
49
72
  # `NoCredentialsError`.
50
- def initialize(user=nil, password=nil, options={})
51
- self.dry_run = options[:dry_run]
52
- self.debug = options[:debug]
53
- if user && password
54
- @user = user
55
- @password = password
56
- else
57
- @user, @password = get_credentials_from_environment
58
- unless user && password
59
- @user, @password = get_credentials_from_netrc
60
- end
61
- end
73
+ #
74
+ def initialize(user=nil, password=nil, subscription='rubytapas', options={})
75
+ self.options = options
76
+ self.subscription = subscription
77
+ set_user_and_password(user, password)
62
78
  self.agent = Mechanize.new
63
79
  end
64
80
 
65
- # Return the episode feed from dpdcart
81
+ # Retrieve the episode feed from dpdcart
66
82
  def feed!
67
- uri = URI(FEED_URL)
68
- request = Net::HTTP::Get.new(uri)
69
- request.basic_auth user, password
70
- Net::HTTP.start(uri.host, uri.port, {:use_ssl => true}) {|http| http.request(request)}.body
71
- end
72
-
73
- # Login to dpdcart before downloading
74
- def login!
75
- page = agent.get LOGIN_URL
76
- page.form.field_with(name: "username").value = user
77
- page.form.field_with(name: "password").value = password
78
- page.form.submit
79
- unless agent.page.title.match(/Subscription Content/)
80
- raise "Could not log in"
81
- end
82
- agent
83
+ http_fetch(feed_url)
83
84
  end
84
85
 
85
86
  # Download the file from dpdcart
86
87
  def download!(file)
88
+ login
87
89
  warn "DEBUG: downloading #{file}" if debug
88
90
  if dry_run
89
91
  warn "DEBUG: download skipped for dry run" if dry_run
90
92
  filename = file
91
93
  body = "no body"
92
94
  else
93
- page = agent.get(file) unless dry_run
95
+ page = agent.get(file)
94
96
  filename = page.filename
95
97
  body = page.body
96
98
  end
@@ -101,15 +103,49 @@ module Scrapers
101
103
 
102
104
  attr_accessor :options, :agent
103
105
 
106
# Resolve the account credentials and store them in @user / @password.
#
# Explicit arguments win when both are given. Otherwise the environment
# is consulted first, and the netrc lookup runs only when the
# environment did not supply both values.
#
# @param user [String, nil] explicit user name
# @param password [String, nil] explicit password
def set_user_and_password(user, password)
  @user, @password = user, password
  return if @user && @password

  @user, @password = get_credentials_from_environment
  # Check what was just read, not the arguments (already known to be
  # incomplete here); otherwise the environment result is always
  # thrown away in favour of netrc.
  return if @user && @password

  @user, @password = get_credentials_from_netrc
end
117
+
104
118
  def get_credentials_from_environment
105
- [ ENV[ENV_RUBYTAPAS_USER], ENV[ENV_RUBYTAPAS_PASSWORD] ]
119
+ [ ENV[env_dpdcart_user], ENV[env_dpdcart_password] ]
106
120
  end
107
121
 
108
122
  def get_credentials_from_netrc
109
- creds = Netrc.read[RUBYTAPAS_HOST]
123
+ creds = Netrc.read[dpdcart_host]
124
+ if creds.nil?
125
+ warn "Could not find credentials for #{dpdcart_host}"
126
+ exit -1
127
+ end
110
128
  [ creds.login, creds.password ]
111
129
  end
112
130
 
131
+ # Login to dpdcart before downloading
132
+ def login
133
+ page = agent.get login_url
134
+ page.form.field_with(name: "username").value = user
135
+ page.form.field_with(name: "password").value = password
136
+ page.form.submit
137
+ unless agent.page.title.match(/Subscription Content/)
138
+ raise "Could not log in"
139
+ end
140
+ agent
141
+ end
142
+
143
+ def http_fetch(uri)
144
+ request = Net::HTTP::Get.new(uri)
145
+ request.basic_auth user, password
146
+ Net::HTTP.start(uri.host, uri.port, {:use_ssl => true}) {|http| http.request(request)}.body
147
+ end
148
+
113
149
  end
114
150
  end
115
151
  end
@@ -8,11 +8,12 @@ module Scrapers
8
8
  module RubyTapas
9
9
 
10
10
  # Scraper provides the methods to download, extract and build a collection
11
- # of RubyTapas episodes from the RubyTapas RSS feed.
11
+ # of DPD Cart subscription episodes from the subscription's RSS feed.
12
12
  class Scraper
13
13
 
14
- attr_accessor :user, :pw, :destination, :episode_number, :netrc_reader, :dry_run, :debug
14
+ attr_accessor :subscription, :user, :pw, :destination, :episode_number, :netrc_reader, :dry_run, :debug
15
15
  attr_reader :dpdcart
16
+ attr_reader :episodes
16
17
 
17
18
  # *episode_number* is the RubyTapas episode number (note! not the post id!) of the
18
19
  # episode to download. If the episode number is the symbol :all, then all episodes
@@ -26,6 +27,7 @@ module Scrapers
26
27
  # - "pw": the password of the RubyTapas account
27
28
  # - "destination": the root destination of the episode downloads
28
29
  def initialize(episode_number, options)
30
+ self.subscription = options.fetch("subscription") # let this fail if no subscription given.
29
31
  self.episode_number = episode_number
30
32
  self.user = options["user"]
31
33
  self.pw = options["pw"]
@@ -33,13 +35,12 @@ module Scrapers
33
35
  self.dry_run = options["dry_run"]
34
36
  self.debug = options["debug"]
35
37
  @dpdcart = Scrapers::RubyTapas::DpdCart.
36
- new(user, pw, {dry_run: dry_run, debug: debug})
37
- warn "DEBUG: episode_number: #{episode_number}, options: #{options.inspect}" if debug
38
+ new(user, pw, subscription, {dry_run: dry_run, debug: debug})
39
+ @episodes ||= fetch_episodes
38
40
  end
39
41
 
40
42
  # Perform the scraping operation
41
43
  def scrape!
42
- dpdcart.login!
43
44
  if all_episodes?
44
45
  episodes.each do |episode|
45
46
 
@@ -70,11 +71,6 @@ module Scrapers
70
71
  end
71
72
  end
72
73
 
73
- # Returns the collection of episodes.
74
- def episodes
75
- @episodes ||= fetch_episodes
76
- end
77
-
78
74
  # Retrieves the episode associated with *episode number*.
79
75
  def find_by_episode(episode_number)
80
76
  episodes.detect {|e| e.number == episode_number}
@@ -2,11 +2,11 @@ module Scrapers
2
2
  module Version
3
3
 
4
4
  MAJOR = 3
5
- MINOR = 0
5
+ MINOR = 1
6
6
  BUILD = 0
7
7
 
8
8
  end
9
-
9
+
10
10
  VERSION = [Version::MAJOR,Version::MINOR,Version::BUILD].map(&:to_s).join(".")
11
11
 
12
12
  DESCRIPTION = "A library of web site scrapers utilizing mechanize and other goodies. Helpful in gathering images, moving things, saving things, etc."
@@ -8,7 +8,6 @@ describe Scrapers::RubyTapas::DpdCart do
8
8
 
9
9
  describe "method signatures" do
10
10
  it { is_expected.to respond_to(:feed!) }
11
- it { is_expected.to respond_to(:login!)}
12
11
  it { is_expected.to respond_to(:download!)}
13
12
  end
14
13
 
@@ -22,17 +21,6 @@ describe Scrapers::RubyTapas::DpdCart do
22
21
  end
23
22
  end
24
23
 
25
- describe "#login!" do
26
- it "shows the subscriber content page" do
27
- VCR.use_cassette('rubytapas_login', record: :new_episodes,
28
- match_requests_on: [:method, :host, :path]
29
- ) do
30
- expect(gateway.login!.page.title).to eq("Subscription Content | RubyTapas")
31
- end
32
- end
33
- end
34
-
35
-
36
24
  describe "#download!" do
37
25
  let(:file) { "https://rubytapas.dpdcart.com/subscriber/download?file_id=26" }
38
26
  let(:name) { "001-binary-literals.html" }
@@ -44,7 +32,6 @@ describe Scrapers::RubyTapas::DpdCart do
44
32
  VCR.use_cassette('rubytapas_download', record: :new_episodes,
45
33
  match_requests_on: [:method, :host, :path,
46
34
  :query]) do
47
- gateway.login!
48
35
  filename, body = gateway.download! file
49
36
  expect(filename).to eq(name)
50
37
  expect(body.size).to eq(5744)
@@ -55,7 +42,6 @@ describe Scrapers::RubyTapas::DpdCart do
55
42
  VCR.use_cassette('rubytapas_download_twice', record: :new_episodes,
56
43
  match_requests_on: [:method, :host, :path,
57
44
  :query]) do
58
- gateway.login!
59
45
  filename, body = gateway.download! file
60
46
  expect(filename).to eq(name)
61
47
  filename, body = gateway.download! file2
@@ -13,7 +13,6 @@ RSpec.describe "RubyTapas Thor Script", :type => :integration do
13
13
 
14
14
  let(:cart) do
15
15
  instance_spy("Scrapers::RubyTapas::DpdCart",
16
- :login! => "Subscription Content | RubyTapas",
17
16
  :feed! => feed,
18
17
  :download! => download
19
18
  )
@@ -35,12 +34,12 @@ RSpec.describe "RubyTapas Thor Script", :type => :integration do
35
34
  it "retrieves one episode" do
36
35
  expect_any_instance_of(Scrapers::RubyTapas::Scraper).to receive(:scrape!).once.and_call_original
37
36
  expect(cart).to receive(:download!).exactly(3).times
38
-
37
+
39
38
  VCR.use_cassette('rubytapas-download-1', :match_requests_on => [:method, :host, :path, :query]) do
40
39
  Scrapers::RubyTapas::CLI.start(%w[download 001 --destination=. --user=joan@example.com --pw=password])
41
40
  end
42
41
  end
43
-
42
+
44
43
  end
45
44
 
46
45
  context "when scraping all episodes" do
@@ -48,7 +47,7 @@ RSpec.describe "RubyTapas Thor Script", :type => :integration do
48
47
  it "retrieves all episodes" do
49
48
  expect_any_instance_of(Scrapers::RubyTapas::Scraper).to receive(:scrape!).once.and_call_original
50
49
  expect(cart).to receive(:download!).exactly(933).times
51
-
50
+
52
51
  VCR.use_cassette('rubytapas-download-all', :match_requests_on => [:method, :host, :path, :query]) do
53
52
  save_stdout = $stdout
54
53
  # $stdout = output
@@ -56,9 +55,9 @@ RSpec.describe "RubyTapas Thor Script", :type => :integration do
56
55
  $stdout = save_stdout
57
56
  end
58
57
  end
59
-
58
+
60
59
  end
61
-
60
+
62
61
  end
63
62
 
64
63
  describe "list command" do
@@ -73,7 +72,7 @@ RSpec.describe "RubyTapas Thor Script", :type => :integration do
73
72
  Scrapers::RubyTapas::CLI.start(%w[list --user=joan@example.com --pw=password])
74
73
  end
75
74
  end
76
-
75
+
77
76
  describe "version command" do
78
77
  it "prints the version numbers for rubytapas and scrapers" do
79
78
  save_stdout = $stdout
@@ -13,12 +13,11 @@ describe Scrapers::RubyTapas::Scraper do
13
13
  let(:options) do
14
14
  {
15
15
  "destination" => '.',
16
+ "subscription" => 'rubytapas'
16
17
  }
17
18
  end
18
19
  let(:cart) {instance_spy("Scrapers::RubyTapas::DpdCart",
19
20
  :feed! => feed,
20
- :login! =>
21
- double("Subscription Content | RubyTapas"),
22
21
  :download! => [ 'filename',
23
22
  'body' ]
24
23
  )}
@@ -28,7 +27,6 @@ describe Scrapers::RubyTapas::Scraper do
28
27
 
29
28
  describe "#episodes" do
30
29
  it "gets a collection of episodes" do
31
- expect(scraper).to receive(:fetch_episodes).and_call_original
32
30
  expect(scraper.episodes.size).to eq(267)
33
31
  end
34
32
  end
@@ -53,7 +51,6 @@ describe Scrapers::RubyTapas::Scraper do
53
51
  context "when scraping one episode" do
54
52
  it "scrapes one episode" do
55
53
  expect(scraper).to receive(:find_by_episode).with(episode_number).and_call_original
56
- expect(scraper).to receive(:fetch_episodes).once.and_call_original
57
54
  scraper.scrape!
58
55
  end
59
56
 
@@ -63,7 +60,6 @@ describe Scrapers::RubyTapas::Scraper do
63
60
  let(:scraper) { Scrapers::RubyTapas::Scraper.new(:all, options) }
64
61
 
65
62
  it "scrapes all the episodes" do
66
- expect(scraper).to receive(:fetch_episodes).once.and_call_original
67
63
  scraper.scrape!
68
64
  end
69
65
  end