scrapers 2.0.2 → 2.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/manning_books +3 -10
- data/lib/netrc_reader.rb +12 -0
- data/lib/scrapers/manning_books.rb +63 -34
- data/lib/scrapers/version.rb +2 -2
- data/spec/scrapers/manning_books_spec.rb +103 -10
- data/spec/spec_helper.rb +0 -8
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7a122441c7a4d715eded98e92a58d31d6b00f21c
|
4
|
+
data.tar.gz: 74e3ad669f233d43542155819d2224499c062d5e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c7fe23236b2a325eec855f865aa687b329d0cc3b470cad66f56623df7b4833f11fc6167871cfa6b10351da1d0c3747b40d44b64bc0057bb563735b234cf15a56
|
7
|
+
data.tar.gz: a22303c8d58b65795a5811c6f997b47ff3a4bc64d60849e83e43bc5794650222718ee106f60891c344acd210524e00c15a2074958732d14b50a54bd8b2d3e57c
|
data/bin/manning_books
CHANGED
@@ -1,6 +1,5 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
require 'thor'
|
3
|
-
require 'netrc'
|
4
3
|
require 'scrapers/manning_books'
|
5
4
|
require 'awesome_print'
|
6
5
|
|
@@ -15,22 +14,16 @@ require 'awesome_print'
|
|
15
14
|
|
16
15
|
class ManningBooks < Thor
|
17
16
|
|
18
|
-
MANNING = 'manning'
|
19
|
-
|
20
17
|
desc "download", "Downloads all the editions of all the books on your dashboard"
|
21
18
|
method_option :destination, :aliases => %w{-d --dest}, :desc => "Destination to store the downloads", :default => "."
|
22
19
|
method_option :user, :aliases => %w{-u -U}, :desc => "Manning user. Default is read from $HOME/.netrc"
|
23
20
|
method_option :password, :aliases => %w{-p -pw}, :desc => "Manning password. Default is read from $HOME/.netrc"
|
21
|
+
method_option :delay, :aliases => %w{-t}, :type => :numeric, :desc => "delay time between requests", :default => Scrapers::ManningBooks::DELAY_TIME
|
22
|
+
method_option :dry_run, :aliases => %w[-n], :type => :boolean, :desc => "dry run, do not download and save books", :default => false
|
24
23
|
|
25
24
|
def download
|
26
|
-
netrc = Netrc.read
|
27
|
-
user, pw = netrc[MANNING]
|
28
|
-
user = options.fetch("user", user)
|
29
|
-
pw = options.fetch("password", pw)
|
30
|
-
destination = options.fetch("destination", nil)
|
31
|
-
STDERR.puts "destination: #{destination}, user: #{user}, pw: #{pw.length}"
|
32
25
|
Signal.trap('INT', proc { STDERR.puts "Download Interrupted"; exit(-1)})
|
33
|
-
results = Scrapers::ManningBooks.scrape
|
26
|
+
results = Scrapers::ManningBooks::Scraper.new(options).scrape
|
34
27
|
ap results
|
35
28
|
end
|
36
29
|
|
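data/lib/netrc_reader.rb
ADDED
[diff body for this file (+12 -0) is missing from this capture]
data/lib/scrapers/manning_books.rb
CHANGED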
@@ -1,55 +1,84 @@
|
|
1
1
|
# -*- ruby -*-
|
2
2
|
require 'mechanize'
|
3
|
-
|
3
|
+
require 'netrc_reader'
|
4
4
|
|
5
5
|
module Scrapers
|
6
6
|
module ManningBooks
|
7
7
|
|
8
|
+
NETRC_MANNING_ENTRY = 'manning'
|
8
9
|
DASHBOARD_URL = "https://account.manning.com/dashboard"
|
9
|
-
|
10
|
-
def self.scrape(dest=".", user=nil, pw=nil)
|
11
|
-
results = Array.new
|
10
|
+
DELAY_TIME = 5 # seconds
|
12
11
|
|
13
|
-
|
14
|
-
|
15
|
-
unless m.current_page.uri == DASHBOARD_URL
|
16
|
-
# log in
|
17
|
-
m.current_page.form.field_with(:type => 'email').value= user
|
18
|
-
m.current_page.form.field_with(:type => 'password').value= pw
|
19
|
-
m.current_page.form.submit
|
20
|
-
sleep 2
|
21
|
-
raise "could not log in" unless m.current_page.uri.to_s == DASHBOARD_URL
|
22
|
-
end
|
12
|
+
class Scraper
|
13
|
+
attr_accessor :user, :pw, :delay_time, :destination, :dry_run
|
23
14
|
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
m.current_page.save! # overwrite!
|
15
|
+
def initialize(options={})
|
16
|
+
netrc_reader = ::Scrapers::NetrcReader.new(NETRC_MANNING_ENTRY)
|
17
|
+
@user = options.fetch("user", netrc_reader.user)
|
18
|
+
@pw = options.fetch("pw", netrc_reader.pw)
|
19
|
+
@delay_time = options.fetch("delay", DELAY_TIME)
|
20
|
+
@destination = options.fetch("destination", ".")
|
21
|
+
@dry_run = options.fetch("dry_run", false)
|
22
|
+
end
|
33
23
|
|
34
|
-
|
24
|
+
def scrape
|
25
|
+
Mechanize.start do |m|
|
26
|
+
login(m) do |m|
|
27
|
+
book_downloads = m.current_page.links_with(:href => %r{/account/bookProduct/download})
|
28
|
+
Dir.chdir(destination) do |dir|
|
29
|
+
@results = download_books(m, book_downloads)
|
30
|
+
end
|
35
31
|
end
|
36
|
-
|
37
32
|
end
|
38
33
|
|
34
|
+
Hash[@results]
|
39
35
|
end
|
36
|
+
|
37
|
+
def login(agent, &block)
|
38
|
+
raise "Must provide a block to execute after logged in to site" unless block_given?
|
40
39
|
|
41
|
-
|
42
|
-
|
40
|
+
agent.get DASHBOARD_URL
|
41
|
+
unless agent.current_page.uri == DASHBOARD_URL
|
42
|
+
# log in
|
43
|
+
agent.current_page.form.field_with(:type => 'email').value= user
|
44
|
+
agent.current_page.form.field_with(:type => 'password').value= pw
|
45
|
+
agent.current_page.form.submit
|
46
|
+
sleep 2
|
47
|
+
raise "could not log in" unless agent.current_page.uri.to_s == DASHBOARD_URL
|
48
|
+
end
|
49
|
+
yield agent
|
50
|
+
end
|
43
51
|
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
52
|
+
def wait_a_bit(delay)
|
53
|
+
puts "delaying for #{delay} second(s)"
|
54
|
+
%w[- * | +].cycle do |c|
|
55
|
+
print "\r#{c}"
|
56
|
+
sleep 1
|
57
|
+
delay -= 1
|
58
|
+
break if delay < 1
|
59
|
+
end
|
60
|
+
print "\r"
|
49
61
|
end
|
50
|
-
print "\r"
|
51
|
-
end
|
52
62
|
|
53
63
|
|
64
|
+
def download_books(agent, books)
|
65
|
+
books.map do |book|
|
66
|
+
bookname = book.node.parent.parent.parent.parent.at_css('h1').text
|
67
|
+
puts "Downloading #{bookname} from #{book.href}"
|
68
|
+
if dry_run
|
69
|
+
warn "dry run, not saving"
|
70
|
+
else
|
71
|
+
agent.get book.href
|
72
|
+
puts "Saving #{agent.current_page.filename}"
|
73
|
+
agent.current_page.save! # overwrite!
|
74
|
+
end
|
75
|
+
|
76
|
+
wait_a_bit delay_time
|
77
|
+
[agent.current_page.filename, agent.current_page.uri.to_s]
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
81
|
+
end
|
54
82
|
end
|
55
83
|
end
|
84
|
+
|
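data/lib/scrapers/version.rb
CHANGED
[diff body for this file (+2 -2) is missing from this capture]
data/spec/scrapers/manning_books_spec.rb
CHANGED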
@@ -1,20 +1,113 @@
|
|
1
1
|
# -*- ruby -*-
|
2
2
|
require 'spec_helper'
|
3
3
|
require 'scrapers/manning_books'
|
4
|
+
require 'ostruct'
|
4
5
|
|
5
|
-
|
6
|
+
RSpec.describe Scrapers::ManningBooks::Scraper do
|
7
|
+
describe "verify Class method signatures" do
|
8
|
+
it "responds to :new" do
|
9
|
+
expect(Scrapers::ManningBooks::Scraper).to respond_to(:new)
|
10
|
+
end
|
11
|
+
end
|
12
|
+
describe "verify instance method signatures" do
|
13
|
+
subject { Scrapers::ManningBooks::Scraper.new }
|
14
|
+
it { is_expected.to respond_to :scrape }
|
15
|
+
it { is_expected.to respond_to :login }
|
16
|
+
it { is_expected.to respond_to :wait_a_bit }
|
17
|
+
it { is_expected.to respond_to :download_books }
|
18
|
+
end
|
19
|
+
describe "#login" do
|
20
|
+
let(:scraper) { Scrapers::ManningBooks::Scraper.new }
|
21
|
+
let(:agent) { double('agent') }
|
6
22
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
23
|
+
before do
|
24
|
+
allow(Scrapers::NetrcReader).to receive(:new) do
|
25
|
+
OpenStruct.new(user: "joe@example.com", pw: "password")
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
it "verify user" do
|
30
|
+
expect(scraper.user).to eq("joe@example.com")
|
31
|
+
end
|
32
|
+
it "verify pw" do
|
33
|
+
expect(scraper.pw).to eq("password")
|
34
|
+
end
|
35
|
+
|
36
|
+
context "when login is passed a block" do
|
37
|
+
it "logs in and yields the block" do
|
38
|
+
expect(agent).to receive(:get).and_return(agent)
|
39
|
+
expect(agent).to receive(:current_page).at_least(5).times.and_return(agent)
|
40
|
+
expect(agent).to receive(:uri)
|
41
|
+
expect(agent).to receive(:form).exactly(3).times.and_return(agent)
|
42
|
+
expect(agent).to receive(:field_with).exactly(2).times.and_return(agent)
|
43
|
+
expect(agent).to receive(:value=).exactly(2).times.and_return(agent)
|
44
|
+
expect(agent).to receive(:submit).and_return(agent)
|
45
|
+
expect(agent).to receive(:uri).and_return(Scrapers::ManningBooks::DASHBOARD_URL)
|
46
|
+
scraper.login(agent) { |m| @result = "in yield" }
|
47
|
+
expect(@result).to eq("in yield")
|
14
48
|
end
|
15
49
|
|
16
|
-
it {expect(@result).to_not be_nil}
|
17
|
-
|
18
50
|
end
|
51
|
+
|
52
|
+
context "when login is not passed a block" do
|
53
|
+
it "raises an exception" do
|
54
|
+
expect{ scraper.login(agent) }.to raise_error("Must provide a block to execute after logged in to site")
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
end
|
59
|
+
|
60
|
+
describe "#download_books" do
|
61
|
+
let(:scraper) {Scrapers::ManningBooks::Scraper.new}
|
62
|
+
let(:agent) {double('agent')}
|
63
|
+
let(:books) do
|
64
|
+
3.times.map do |i|
|
65
|
+
OpenStruct.new(href: "http://#{Scrapers::ManningBooks::DASHBOARD_URL}/#{i}")
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
|
70
|
+
before do
|
71
|
+
allow(Scrapers::NetrcReader).to receive(:new) do
|
72
|
+
OpenStruct.new(user: "joe@example.com", pw: "password")
|
73
|
+
end
|
74
|
+
|
75
|
+
allow(scraper).to receive(:wait_a_bit).at_least(:once)
|
76
|
+
end
|
77
|
+
|
78
|
+
it "downloads the books" do
|
79
|
+
save_stdout = $stdout
|
80
|
+
$stdout = double('output').as_null_object
|
81
|
+
expect(agent).to receive(:get).exactly(3).times
|
82
|
+
expect(agent).to receive(:current_page).exactly(3*4).times.and_return(agent)
|
83
|
+
expect(agent).to receive(:filename).exactly(3*2).times.and_return("FILENAME")
|
84
|
+
expect(agent).to receive(:save!).exactly(3).times
|
85
|
+
expect(agent).to receive(:uri).exactly(3).times
|
86
|
+
results = scraper.download_books(agent, books)
|
87
|
+
$stdout = save_stdout
|
88
|
+
expect(results.size).to eq(3)
|
89
|
+
end
|
90
|
+
|
91
|
+
end
|
92
|
+
|
93
|
+
# Saving the best for last
|
94
|
+
describe "#scrape" do
|
95
|
+
let(:scraper) {Scrapers::ManningBooks::Scraper.new}
|
96
|
+
let(:agent) {double('agent').as_null_object}
|
97
|
+
let(:netrc_reader) {double('netrc_reader').as_null_object}
|
98
|
+
let(:book_list) {[['book1','url1'],['book2','url2']]}
|
99
|
+
|
100
|
+
before do
|
101
|
+
allow(Scrapers::NetrcReader).to receive(:new).and_return(netrc_reader)
|
102
|
+
allow(scraper).to receive(:wait_a_bit).at_least(:once)
|
103
|
+
allow(scraper).to receive(:login).and_yield(agent)
|
104
|
+
end
|
105
|
+
|
106
|
+
it "scrapes the dashboard" do
|
107
|
+
expect(Mechanize).to receive(:start).and_yield(agent)
|
108
|
+
expect(scraper).to receive(:download_books).and_return(book_list)
|
109
|
+
scraper.scrape
|
110
|
+
end
|
111
|
+
|
19
112
|
end
|
20
113
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
#require 'webmock/rspec'
|
2
1
|
require 'vcr'
|
3
2
|
|
4
3
|
# This file was generated by the `rspec --init` command. Conventionally, all
|
@@ -8,7 +7,6 @@ require 'vcr'
|
|
8
7
|
#
|
9
8
|
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
10
9
|
RSpec.configure do |config|
|
11
|
-
config.treat_symbols_as_metadata_keys_with_true_values = true
|
12
10
|
config.run_all_when_everything_filtered = true
|
13
11
|
config.filter_run :focus
|
14
12
|
|
@@ -23,9 +21,3 @@ VCR.configure do |c|
|
|
23
21
|
c.cassette_library_dir = 'vcr_cassettes'
|
24
22
|
c.hook_into :webmock
|
25
23
|
end
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
require 'scrapers.rb'
|
31
|
-
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrapers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0.2
|
4
|
+
version: 2.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tamara Temple
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-12-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mechanize
|
@@ -215,6 +215,7 @@ files:
|
|
215
215
|
- bin/new_scraper
|
216
216
|
- bin/rubytapas
|
217
217
|
- bin/wunderground
|
218
|
+
- lib/netrc_reader.rb
|
218
219
|
- lib/scrapers.rb
|
219
220
|
- lib/scrapers/allrecipes.rb
|
220
221
|
- lib/scrapers/discoverynews.rb
|
@@ -297,3 +298,4 @@ test_files:
|
|
297
298
|
- spec/scrapers_spec.rb
|
298
299
|
- spec/spec_helper.rb
|
299
300
|
- spec/wunderground_thor_spec.rb
|
301
|
+
has_rdoc:
|