scrapers 2.0.2 → 2.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 0fe614f84b4dd26d8bb7d314e7e13a08125fa022
4
- data.tar.gz: 06bf20ac2efe3bbf90a0fa5f1a35b4c2d92b0a5d
3
+ metadata.gz: 7a122441c7a4d715eded98e92a58d31d6b00f21c
4
+ data.tar.gz: 74e3ad669f233d43542155819d2224499c062d5e
5
5
  SHA512:
6
- metadata.gz: 138916948ac182b0a0fb5998b5b4929493df0e70aee9530cb978f989dfa36dc02b8577f524c9712470d83eec48abca5c5aca976f4272308086e4219417333a37
7
- data.tar.gz: e8dcfc8be2dd0a391364f7a3c182241467dddf79a2b77514f90616b6b5f5b964f58394ccc00ae87644e6a49c0a3d10c991ef229bd90e057b8ced9e1f7ca62c50
6
+ metadata.gz: c7fe23236b2a325eec855f865aa687b329d0cc3b470cad66f56623df7b4833f11fc6167871cfa6b10351da1d0c3747b40d44b64bc0057bb563735b234cf15a56
7
+ data.tar.gz: a22303c8d58b65795a5811c6f997b47ff3a4bc64d60849e83e43bc5794650222718ee106f60891c344acd210524e00c15a2074958732d14b50a54bd8b2d3e57c
@@ -1,6 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
2
  require 'thor'
3
- require 'netrc'
4
3
  require 'scrapers/manning_books'
5
4
  require 'awesome_print'
6
5
 
@@ -15,22 +14,16 @@ require 'awesome_print'
15
14
 
16
15
  class ManningBooks < Thor
17
16
 
18
- MANNING = 'manning'
19
-
20
17
  desc "download", "Downloads all the editions of all the books on your dashboard"
21
18
  method_option :destination, :aliases => %w{-d --dest}, :desc => "Destination to store the downloads", :default => "."
22
19
  method_option :user, :aliases => %w{-u -U}, :desc => "Manning user. Default is read from $HOME/.netrc"
23
20
  method_option :password, :aliases => %w{-p -pw}, :desc => "Manning password. Default is read from $HOME/.netrc"
21
+ method_option :delay, :aliases => %w{-t}, :type => :numeric, :desc => "delay time between requests", :default => Scrapers::ManningBooks::DELAY_TIME
22
+ method_option :dry_run, :aliases => %w[-n], :type => :boolean, :desc => "dry run, do not download and save books", :default => false
24
23
 
25
24
  def download
26
- netrc = Netrc.read
27
- user, pw = netrc[MANNING]
28
- user = options.fetch("user", user)
29
- pw = options.fetch("password", pw)
30
- destination = options.fetch("destination", nil)
31
- STDERR.puts "destination: #{destination}, user: #{user}, pw: #{pw.length}"
32
25
  Signal.trap('INT', proc { STDERR.puts "Download Interrupted"; exit(-1)})
33
- results = Scrapers::ManningBooks.scrape destination, user, pw
26
+ results = Scrapers::ManningBooks::Scraper.new(options).scrape
34
27
  ap results
35
28
  end
36
29
 
@@ -0,0 +1,12 @@
1
+ require 'netrc'
2
+
3
+ module Scrapers
4
+ class NetrcReader
5
+ attr_accessor :user, :pw
6
+
7
+ def initialize(section)
8
+ netrc = Netrc.read
9
+ @user, @pw = netrc[section]
10
+ end
11
+ end
12
+ end
@@ -1,55 +1,84 @@
1
1
  # -*- ruby -*-
2
2
  require 'mechanize'
3
- # require 'pry'
3
+ require 'netrc_reader'
4
4
 
5
5
  module Scrapers
6
6
  module ManningBooks
7
7
 
8
+ NETRC_MANNING_ENTRY = 'manning'
8
9
  DASHBOARD_URL = "https://account.manning.com/dashboard"
9
-
10
- def self.scrape(dest=".", user=nil, pw=nil)
11
- results = Array.new
10
+ DELAY_TIME = 5 # seconds
12
11
 
13
- Mechanize.start do |m|
14
- m.get DASHBOARD_URL
15
- unless m.current_page.uri == DASHBOARD_URL
16
- # log in
17
- m.current_page.form.field_with(:type => 'email').value= user
18
- m.current_page.form.field_with(:type => 'password').value= pw
19
- m.current_page.form.submit
20
- sleep 2
21
- raise "could not log in" unless m.current_page.uri.to_s == DASHBOARD_URL
22
- end
12
+ class Scraper
13
+ attr_accessor :user, :pw, :delay_time, :destination, :dry_run
23
14
 
24
- book_downloads = m.current_page.links_with(:href => %r{/account/bookProduct/download})
25
-
26
- Dir.chdir(dest) do |dir|
27
- book_downloads.each do |book|
28
- puts "Downloading #{book.href}"
29
- m.get book.href
30
- results << [m.current_page.filename, m.current_page.uri.to_s]
31
- puts "Saving #{m.current_page.filename}"
32
- m.current_page.save! # overwrite!
15
+ def initialize(options={})
16
+ netrc_reader = ::Scrapers::NetrcReader.new(NETRC_MANNING_ENTRY)
17
+ @user = options.fetch("user", netrc_reader.user)
18
+ @pw = options.fetch("pw", netrc_reader.pw)
19
+ @delay_time = options.fetch("delay", DELAY_TIME)
20
+ @destination = options.fetch("destination", ".")
21
+ @dry_run = options.fetch("dry_run", false)
22
+ end
33
23
 
34
- wait_a_bit 5
24
+ def scrape
25
+ Mechanize.start do |m|
26
+ login(m) do |m|
27
+ book_downloads = m.current_page.links_with(:href => %r{/account/bookProduct/download})
28
+ Dir.chdir(destination) do |dir|
29
+ @results = download_books(m, book_downloads)
30
+ end
35
31
  end
36
-
37
32
  end
38
33
 
34
+ Hash[@results]
39
35
  end
36
+
37
+ def login(agent, &block)
38
+ raise "Must provide a block to execute after logged in to site" unless block_given?
40
39
 
41
- Hash[results]
42
- end
40
+ agent.get DASHBOARD_URL
41
+ unless agent.current_page.uri == DASHBOARD_URL
42
+ # log in
43
+ agent.current_page.form.field_with(:type => 'email').value= user
44
+ agent.current_page.form.field_with(:type => 'password').value= pw
45
+ agent.current_page.form.submit
46
+ sleep 2
47
+ raise "could not log in" unless agent.current_page.uri.to_s == DASHBOARD_URL
48
+ end
49
+ yield agent
50
+ end
43
51
 
44
- def self.wait_a_bit(delay)
45
- puts "delaying for #{delay} second(s)"
46
- %w[- \ | /].cycle(delay) do |c|
47
- print "\r#{c}"
48
- sleep 1
52
+ def wait_a_bit(delay)
53
+ puts "delaying for #{delay} second(s)"
54
+ %w[- * | +].cycle do |c|
55
+ print "\r#{c}"
56
+ sleep 1
57
+ delay -= 1
58
+ break if delay < 1
59
+ end
60
+ print "\r"
49
61
  end
50
- print "\r"
51
- end
52
62
 
53
63
 
64
+ def download_books(agent, books)
65
+ books.map do |book|
66
+ bookname = book.node.parent.parent.parent.parent.at_css('h1').text
67
+ puts "Downloading #{bookname} from #{book.href}"
68
+ if dry_run
69
+ warn "dry run, not saving"
70
+ else
71
+ agent.get book.href
72
+ puts "Saving #{agent.current_page.filename}"
73
+ agent.current_page.save! # overwrite!
74
+ end
75
+
76
+ wait_a_bit delay_time
77
+ [agent.current_page.filename, agent.current_page.uri.to_s]
78
+ end
79
+ end
80
+
81
+ end
54
82
  end
55
83
  end
84
+
@@ -2,8 +2,8 @@ module Scrapers
2
2
  module Version
3
3
 
4
4
  MAJOR = 2
5
- MINOR = 0
6
- BUILD = 2
5
+ MINOR = 1
6
+ BUILD = 0
7
7
 
8
8
  end
9
9
 
@@ -1,20 +1,113 @@
1
1
  # -*- ruby -*-
2
2
  require 'spec_helper'
3
3
  require 'scrapers/manning_books'
4
+ require 'ostruct'
4
5
 
5
- module Scrapers
6
+ RSpec.describe Scrapers::ManningBooks::Scraper do
7
+ describe "verify Class method signatures" do
8
+ it "responds to :new" do
9
+ expect(Scrapers::ManningBooks::Scraper).to respond_to(:new)
10
+ end
11
+ end
12
+ describe "verify instance method signatures" do
13
+ subject { Scrapers::ManningBooks::Scraper.new }
14
+ it { is_expected.to respond_to :scrape }
15
+ it { is_expected.to respond_to :login }
16
+ it { is_expected.to respond_to :wait_a_bit }
17
+ it { is_expected.to respond_to :download_books }
18
+ end
19
+ describe "#login" do
20
+ let(:scraper) { Scrapers::ManningBooks::Scraper.new }
21
+ let(:agent) { double('agent') }
6
22
 
7
- describe ManningBooks do
8
- it{should respond_to :scrape}
9
- context "scraping" do
10
- before(:all) do
11
- @comic = VCR.use_cassette('manning_books') do
12
- @result = Scrapers::ManningBooks.scrape
13
- end
23
+ before do
24
+ allow(Scrapers::NetrcReader).to receive(:new) do
25
+ OpenStruct.new(user: "joe@example.com", pw: "password")
26
+ end
27
+ end
28
+
29
+ it "verify user" do
30
+ expect(scraper.user).to eq("joe@example.com")
31
+ end
32
+ it "verify pw" do
33
+ expect(scraper.pw).to eq("password")
34
+ end
35
+
36
+ context "when login is passed a block" do
37
+ it "logs in and yields the block" do
38
+ expect(agent).to receive(:get).and_return(agent)
39
+ expect(agent).to receive(:current_page).at_least(5).times.and_return(agent)
40
+ expect(agent).to receive(:uri)
41
+ expect(agent).to receive(:form).exactly(3).times.and_return(agent)
42
+ expect(agent).to receive(:field_with).exactly(2).times.and_return(agent)
43
+ expect(agent).to receive(:value=).exactly(2).times.and_return(agent)
44
+ expect(agent).to receive(:submit).and_return(agent)
45
+ expect(agent).to receive(:uri).and_return(Scrapers::ManningBooks::DASHBOARD_URL)
46
+ scraper.login(agent) { |m| @result = "in yield" }
47
+ expect(@result).to eq("in yield")
14
48
  end
15
49
 
16
- it {expect(@result).to_not be_nil}
17
-
18
50
  end
51
+
52
+ context "when login is not passed a block" do
53
+ it "raises an exception" do
54
+ expect{ scraper.login(agent) }.to raise_error("Must provide a block to execute after logged in to site")
55
+ end
56
+ end
57
+
58
+ end
59
+
60
+ describe "#download_books" do
61
+ let(:scraper) {Scrapers::ManningBooks::Scraper.new}
62
+ let(:agent) {double('agent')}
63
+ let(:books) do
64
+ 3.times.map do |i|
65
+ OpenStruct.new(href: "http://#{Scrapers::ManningBooks::DASHBOARD_URL}/#{i}")
66
+ end
67
+ end
68
+
69
+
70
+ before do
71
+ allow(Scrapers::NetrcReader).to receive(:new) do
72
+ OpenStruct.new(user: "joe@example.com", pw: "password")
73
+ end
74
+
75
+ allow(scraper).to receive(:wait_a_bit).at_least(:once)
76
+ end
77
+
78
+ it "downloads the books" do
79
+ save_stdout = $stdout
80
+ $stdout = double('output').as_null_object
81
+ expect(agent).to receive(:get).exactly(3).times
82
+ expect(agent).to receive(:current_page).exactly(3*4).times.and_return(agent)
83
+ expect(agent).to receive(:filename).exactly(3*2).times.and_return("FILENAME")
84
+ expect(agent).to receive(:save!).exactly(3).times
85
+ expect(agent).to receive(:uri).exactly(3).times
86
+ results = scraper.download_books(agent, books)
87
+ $stdout = save_stdout
88
+ expect(results.size).to eq(3)
89
+ end
90
+
91
+ end
92
+
93
+ # Saving the best for last
94
+ describe "#scrape" do
95
+ let(:scraper) {Scrapers::ManningBooks::Scraper.new}
96
+ let(:agent) {double('agent').as_null_object}
97
+ let(:netrc_reader) {double('netrc_reader').as_null_object}
98
+ let(:book_list) {[['book1','url1'],['book2','url2']]}
99
+
100
+ before do
101
+ allow(Scrapers::NetrcReader).to receive(:new).and_return(netrc_reader)
102
+ allow(scraper).to receive(:wait_a_bit).at_least(:once)
103
+ allow(scraper).to receive(:login).and_yield(agent)
104
+ end
105
+
106
+ it "scrapes the dashboard" do
107
+ expect(Mechanize).to receive(:start).and_yield(agent)
108
+ expect(scraper).to receive(:download_books).and_return(book_list)
109
+ scraper.scrape
110
+ end
111
+
19
112
  end
20
113
  end
@@ -1,4 +1,3 @@
1
- #require 'webmock/rspec'
2
1
  require 'vcr'
3
2
 
4
3
  # This file was generated by the `rspec --init` command. Conventionally, all
@@ -8,7 +7,6 @@ require 'vcr'
8
7
  #
9
8
  # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
10
9
  RSpec.configure do |config|
11
- config.treat_symbols_as_metadata_keys_with_true_values = true
12
10
  config.run_all_when_everything_filtered = true
13
11
  config.filter_run :focus
14
12
 
@@ -23,9 +21,3 @@ VCR.configure do |c|
23
21
  c.cassette_library_dir = 'vcr_cassettes'
24
22
  c.hook_into :webmock
25
23
  end
26
-
27
-
28
-
29
-
30
- require 'scrapers.rb'
31
-
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scrapers
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.2
4
+ version: 2.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tamara Temple
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-10-09 00:00:00.000000000 Z
11
+ date: 2014-12-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize
@@ -215,6 +215,7 @@ files:
215
215
  - bin/new_scraper
216
216
  - bin/rubytapas
217
217
  - bin/wunderground
218
+ - lib/netrc_reader.rb
218
219
  - lib/scrapers.rb
219
220
  - lib/scrapers/allrecipes.rb
220
221
  - lib/scrapers/discoverynews.rb
@@ -297,3 +298,4 @@ test_files:
297
298
  - spec/scrapers_spec.rb
298
299
  - spec/spec_helper.rb
299
300
  - spec/wunderground_thor_spec.rb
301
+ has_rdoc: