scrapers 2.0.2 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 0fe614f84b4dd26d8bb7d314e7e13a08125fa022
- data.tar.gz: 06bf20ac2efe3bbf90a0fa5f1a35b4c2d92b0a5d
+ metadata.gz: 7a122441c7a4d715eded98e92a58d31d6b00f21c
+ data.tar.gz: 74e3ad669f233d43542155819d2224499c062d5e
  SHA512:
- metadata.gz: 138916948ac182b0a0fb5998b5b4929493df0e70aee9530cb978f989dfa36dc02b8577f524c9712470d83eec48abca5c5aca976f4272308086e4219417333a37
- data.tar.gz: e8dcfc8be2dd0a391364f7a3c182241467dddf79a2b77514f90616b6b5f5b964f58394ccc00ae87644e6a49c0a3d10c991ef229bd90e057b8ced9e1f7ca62c50
+ metadata.gz: c7fe23236b2a325eec855f865aa687b329d0cc3b470cad66f56623df7b4833f11fc6167871cfa6b10351da1d0c3747b40d44b64bc0057bb563735b234cf15a56
+ data.tar.gz: a22303c8d58b65795a5811c6f997b47ff3a4bc64d60849e83e43bc5794650222718ee106f60891c344acd210524e00c15a2074958732d14b50a54bd8b2d3e57c
bin/manning_books CHANGED
@@ -1,6 +1,5 @@
  #!/usr/bin/env ruby
  require 'thor'
- require 'netrc'
  require 'scrapers/manning_books'
  require 'awesome_print'
 
@@ -15,22 +14,16 @@ require 'awesome_print'
 
  class ManningBooks < Thor
 
- MANNING = 'manning'
-
  desc "download", "Downloads all the editions of all the books on your dashboard"
  method_option :destination, :aliases => %w{-d --dest}, :desc => "Destination to store the downloads", :default => "."
  method_option :user, :aliases => %w{-u -U}, :desc => "Manning user. Default is read from $HOME/.netrc"
  method_option :password, :aliases => %w{-p -pw}, :desc => "Manning password. Default is read from $HOME/.netrc"
+ method_option :delay, :aliases => %w{-t}, :type => :numeric, :desc => "delay time between requests", :default => Scrapers::ManningBooks::DELAY_TIME
+ method_option :dry_run, :aliases => %w[-n], :type => :boolean, :desc => "dry run, do not download and save books", :default => false
 
  def download
- netrc = Netrc.read
- user, pw = netrc[MANNING]
- user = options.fetch("user", user)
- pw = options.fetch("password", pw)
- destination = options.fetch("destination", nil)
- STDERR.puts "destination: #{destination}, user: #{user}, pw: #{pw.length}"
  Signal.trap('INT', proc { STDERR.puts "Download Interrupted"; exit(-1)})
- results = Scrapers::ManningBooks.scrape destination, user, pw
+ results = Scrapers::ManningBooks::Scraper.new(options).scrape
  ap results
  end
 
lib/netrc_reader.rb ADDED
@@ -0,0 +1,12 @@
+ require 'netrc'
+
+ module Scrapers
+ class NetrcReader
+ attr_accessor :user, :pw
+
+ def initialize(section)
+ netrc = Netrc.read
+ @user, @pw = netrc[section]
+ end
+ end
+ end
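Note: NetrcReader is a thin wrapper over the netrc gem. A usage sketch, assuming a matching entry exists in $HOME/.netrc (credentials are placeholders):

    require 'netrc_reader'

    # Expects $HOME/.netrc to contain an entry such as:
    #   machine manning
    #     login joe@example.com
    #     password secret
    reader = Scrapers::NetrcReader.new('manning')
    reader.user  # => "joe@example.com"
    reader.pw    # => "secret"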
lib/scrapers/manning_books.rb CHANGED
@@ -1,55 +1,84 @@
  # -*- ruby -*-
  require 'mechanize'
- # require 'pry'
+ require 'netrc_reader'
 
  module Scrapers
  module ManningBooks
 
+ NETRC_MANNING_ENTRY = 'manning'
  DASHBOARD_URL = "https://account.manning.com/dashboard"
-
- def self.scrape(dest=".", user=nil, pw=nil)
- results = Array.new
+ DELAY_TIME = 5 # seconds
 
- Mechanize.start do |m|
- m.get DASHBOARD_URL
- unless m.current_page.uri == DASHBOARD_URL
- # log in
- m.current_page.form.field_with(:type => 'email').value= user
- m.current_page.form.field_with(:type => 'password').value= pw
- m.current_page.form.submit
- sleep 2
- raise "could not log in" unless m.current_page.uri.to_s == DASHBOARD_URL
- end
+ class Scraper
+ attr_accessor :user, :pw, :delay_time, :destination, :dry_run
 
- book_downloads = m.current_page.links_with(:href => %r{/account/bookProduct/download})
-
- Dir.chdir(dest) do |dir|
- book_downloads.each do |book|
- puts "Downloading #{book.href}"
- m.get book.href
- results << [m.current_page.filename, m.current_page.uri.to_s]
- puts "Saving #{m.current_page.filename}"
- m.current_page.save! # overwrite!
+ def initialize(options={})
+ netrc_reader = ::Scrapers::NetrcReader.new(NETRC_MANNING_ENTRY)
+ @user = options.fetch("user", netrc_reader.user)
+ @pw = options.fetch("pw", netrc_reader.pw)
+ @delay_time = options.fetch("delay", DELAY_TIME)
+ @destination = options.fetch("destination", ".")
+ @dry_run = options.fetch("dry_run", false)
+ end
 
- wait_a_bit 5
+ def scrape
+ Mechanize.start do |m|
+ login(m) do |m|
+ book_downloads = m.current_page.links_with(:href => %r{/account/bookProduct/download})
+ Dir.chdir(destination) do |dir|
+ @results = download_books(m, book_downloads)
+ end
  end
-
  end
 
+ Hash[@results]
  end
+
+ def login(agent, &block)
+ raise "Must provide a block to execute after logged in to site" unless block_given?
 
- Hash[results]
- end
+ agent.get DASHBOARD_URL
+ unless agent.current_page.uri == DASHBOARD_URL
+ # log in
+ agent.current_page.form.field_with(:type => 'email').value= user
+ agent.current_page.form.field_with(:type => 'password').value= pw
+ agent.current_page.form.submit
+ sleep 2
+ raise "could not log in" unless agent.current_page.uri.to_s == DASHBOARD_URL
+ end
+ yield agent
+ end
 
- def self.wait_a_bit(delay)
- puts "delaying for #{delay} second(s)"
- %w[- \ | /].cycle(delay) do |c|
- print "\r#{c}"
- sleep 1
+ def wait_a_bit(delay)
+ puts "delaying for #{delay} second(s)"
+ %w[- * | +].cycle do |c|
+ print "\r#{c}"
+ sleep 1
+ delay -= 1
+ break if delay < 1
+ end
+ print "\r"
  end
- print "\r"
- end
 
 
+ def download_books(agent, books)
+ books.map do |book|
+ bookname = book.node.parent.parent.parent.parent.at_css('h1').text
+ puts "Downloading #{bookname} from #{book.href}"
+ if dry_run
+ warn "dry run, not saving"
+ else
+ agent.get book.href
+ puts "Saving #{agent.current_page.filename}"
+ agent.current_page.save! # overwrite!
+ end
+
+ wait_a_bit delay_time
+ [agent.current_page.filename, agent.current_page.uri.to_s]
+ end
+ end
+
+ end
  end
  end
+
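Note: download_books returns an array of [filename, url] pairs, which scrape folds into a Hash via Hash[@results], so a duplicate filename keeps only the last URL seen. A sketch of that step with made-up values:

    pairs = [["book.epub", "https://account.manning.com/dl/1"],
             ["book.pdf",  "https://account.manning.com/dl/2"]]
    Hash[pairs]
    # => {"book.epub"=>"https://account.manning.com/dl/1",
    #     "book.pdf"=>"https://account.manning.com/dl/2"}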
lib/scrapers/version.rb CHANGED
@@ -2,8 +2,8 @@ module Scrapers
  module Version
 
  MAJOR = 2
- MINOR = 0
- BUILD = 2
+ MINOR = 1
+ BUILD = 0
 
  end
 
spec/manning_books_spec.rb CHANGED
@@ -1,20 +1,113 @@
  # -*- ruby -*-
  require 'spec_helper'
  require 'scrapers/manning_books'
+ require 'ostruct'
 
- module Scrapers
+ RSpec.describe Scrapers::ManningBooks::Scraper do
+ describe "verify Class method signatures" do
+ it "responds to :new" do
+ expect(Scrapers::ManningBooks::Scraper).to respond_to(:new)
+ end
+ end
+ describe "verify instance method signatures" do
+ subject { Scrapers::ManningBooks::Scraper.new }
+ it { is_expected.to respond_to :scrape }
+ it { is_expected.to respond_to :login }
+ it { is_expected.to respond_to :wait_a_bit }
+ it { is_expected.to respond_to :download_books }
+ end
+ describe "#login" do
+ let(:scraper) { Scrapers::ManningBooks::Scraper.new }
+ let(:agent) { double('agent') }
 
- describe ManningBooks do
- it{should respond_to :scrape}
- context "scraping" do
- before(:all) do
- @comic = VCR.use_cassette('manning_books') do
- @result = Scrapers::ManningBooks.scrape
- end
+ before do
+ allow(Scrapers::NetrcReader).to receive(:new) do
+ OpenStruct.new(user: "joe@example.com", pw: "password")
+ end
+ end
+
+ it "verify user" do
+ expect(scraper.user).to eq("joe@example.com")
+ end
+ it "verify pw" do
+ expect(scraper.pw).to eq("password")
+ end
+
+ context "when login is passed a block" do
+ it "logs in and yields the block" do
+ expect(agent).to receive(:get).and_return(agent)
+ expect(agent).to receive(:current_page).at_least(5).times.and_return(agent)
+ expect(agent).to receive(:uri)
+ expect(agent).to receive(:form).exactly(3).times.and_return(agent)
+ expect(agent).to receive(:field_with).exactly(2).times.and_return(agent)
+ expect(agent).to receive(:value=).exactly(2).times.and_return(agent)
+ expect(agent).to receive(:submit).and_return(agent)
+ expect(agent).to receive(:uri).and_return(Scrapers::ManningBooks::DASHBOARD_URL)
+ scraper.login(agent) { |m| @result = "in yield" }
+ expect(@result).to eq("in yield")
  end
 
- it {expect(@result).to_not be_nil}
-
  end
+
+ context "when login is not passed a block" do
+ it "raises an exception" do
+ expect{ scraper.login(agent) }.to raise_error("Must provide a block to execute after logged in to site")
+ end
+ end
+
+ end
+
+ describe "#download_books" do
+ let(:scraper) {Scrapers::ManningBooks::Scraper.new}
+ let(:agent) {double('agent')}
+ let(:books) do
+ 3.times.map do |i|
+ OpenStruct.new(href: "http://#{Scrapers::ManningBooks::DASHBOARD_URL}/#{i}")
+ end
+ end
+
+
+ before do
+ allow(Scrapers::NetrcReader).to receive(:new) do
+ OpenStruct.new(user: "joe@example.com", pw: "password")
+ end
+
+ allow(scraper).to receive(:wait_a_bit).at_least(:once)
+ end
+
+ it "downloads the books" do
+ save_stdout = $stdout
+ $stdout = double('output').as_null_object
+ expect(agent).to receive(:get).exactly(3).times
+ expect(agent).to receive(:current_page).exactly(3*4).times.and_return(agent)
+ expect(agent).to receive(:filename).exactly(3*2).times.and_return("FILENAME")
+ expect(agent).to receive(:save!).exactly(3).times
+ expect(agent).to receive(:uri).exactly(3).times
+ results = scraper.download_books(agent, books)
+ $stdout = save_stdout
+ expect(results.size).to eq(3)
+ end
+
+ end
+
+ # Saving the best for last
+ describe "#scrape" do
+ let(:scraper) {Scrapers::ManningBooks::Scraper.new}
+ let(:agent) {double('agent').as_null_object}
+ let(:netrc_reader) {double('netrc_reader').as_null_object}
+ let(:book_list) {[['book1','url1'],['book2','url2']]}
+
+ before do
+ allow(Scrapers::NetrcReader).to receive(:new).and_return(netrc_reader)
+ allow(scraper).to receive(:wait_a_bit).at_least(:once)
+ allow(scraper).to receive(:login).and_yield(agent)
+ end
+
+ it "scrapes the dashboard" do
+ expect(Mechanize).to receive(:start).and_yield(agent)
+ expect(scraper).to receive(:download_books).and_return(book_list)
+ scraper.scrape
+ end
+
  end
  end
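Note: the new specs avoid the network and the real netrc file by substituting stand-ins: an OpenStruct wherever only the user/pw readers are needed, and null-object doubles for the Mechanize agent. The OpenStruct trick in isolation (placeholder credentials):

    require 'ostruct'

    # Same read interface as Scrapers::NetrcReader:
    fake_reader = OpenStruct.new(user: "joe@example.com", pw: "password")
    fake_reader.user  # => "joe@example.com"
    fake_reader.pw    # => "password"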
spec/spec_helper.rb CHANGED
@@ -1,4 +1,3 @@
- #require 'webmock/rspec'
  require 'vcr'
 
  # This file was generated by the `rspec --init` command. Conventionally, all
@@ -8,7 +7,6 @@ require 'vcr'
  #
  # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
  RSpec.configure do |config|
- config.treat_symbols_as_metadata_keys_with_true_values = true
  config.run_all_when_everything_filtered = true
  config.filter_run :focus
 
@@ -23,9 +21,3 @@ VCR.configure do |c|
  c.cassette_library_dir = 'vcr_cassettes'
  c.hook_into :webmock
  end
-
-
-
-
- require 'scrapers.rb'
-
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: scrapers
  version: !ruby/object:Gem::Version
- version: 2.0.2
+ version: 2.1.0
  platform: ruby
  authors:
  - Tamara Temple
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2014-10-09 00:00:00.000000000 Z
+ date: 2014-12-24 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: mechanize
@@ -215,6 +215,7 @@ files:
  - bin/new_scraper
  - bin/rubytapas
  - bin/wunderground
+ - lib/netrc_reader.rb
  - lib/scrapers.rb
  - lib/scrapers/allrecipes.rb
  - lib/scrapers/discoverynews.rb
@@ -297,3 +298,4 @@ test_files:
  - spec/scrapers_spec.rb
  - spec/spec_helper.rb
  - spec/wunderground_thor_spec.rb
+ has_rdoc: