scrapers 2.0.2 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/manning_books +3 -10
- data/lib/netrc_reader.rb +12 -0
- data/lib/scrapers/manning_books.rb +63 -34
- data/lib/scrapers/version.rb +2 -2
- data/spec/scrapers/manning_books_spec.rb +103 -10
- data/spec/spec_helper.rb +0 -8
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7a122441c7a4d715eded98e92a58d31d6b00f21c
|
4
|
+
data.tar.gz: 74e3ad669f233d43542155819d2224499c062d5e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c7fe23236b2a325eec855f865aa687b329d0cc3b470cad66f56623df7b4833f11fc6167871cfa6b10351da1d0c3747b40d44b64bc0057bb563735b234cf15a56
|
7
|
+
data.tar.gz: a22303c8d58b65795a5811c6f997b47ff3a4bc64d60849e83e43bc5794650222718ee106f60891c344acd210524e00c15a2074958732d14b50a54bd8b2d3e57c
|
data/bin/manning_books
CHANGED
@@ -1,6 +1,5 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
require 'thor'
|
3
|
-
require 'netrc'
|
4
3
|
require 'scrapers/manning_books'
|
5
4
|
require 'awesome_print'
|
6
5
|
|
@@ -15,22 +14,16 @@ require 'awesome_print'
|
|
15
14
|
|
16
15
|
class ManningBooks < Thor
|
17
16
|
|
18
|
-
MANNING = 'manning'
|
19
|
-
|
20
17
|
desc "download", "Downloads all the editions of all the books on your dashboard"
|
21
18
|
method_option :destination, :aliases => %w{-d --dest}, :desc => "Destination to store the downloads", :default => "."
|
22
19
|
method_option :user, :aliases => %w{-u -U}, :desc => "Manning user. Default is read from $HOME/.netrc"
|
23
20
|
method_option :password, :aliases => %w{-p -pw}, :desc => "Manning password. Default is read from $HOME/.netrc"
|
21
|
+
method_option :delay, :aliases => %w{-t}, :type => :numeric, :desc => "delay time between requests", :default => Scrapers::ManningBooks::DELAY_TIME
|
22
|
+
method_option :dry_run, :aliases => %w[-n], :type => :boolean, :desc => "dry run, do not download and save books", :default => false
|
24
23
|
|
25
24
|
def download
|
26
|
-
netrc = Netrc.read
|
27
|
-
user, pw = netrc[MANNING]
|
28
|
-
user = options.fetch("user", user)
|
29
|
-
pw = options.fetch("password", pw)
|
30
|
-
destination = options.fetch("destination", nil)
|
31
|
-
STDERR.puts "destination: #{destination}, user: #{user}, pw: #{pw.length}"
|
32
25
|
Signal.trap('INT', proc { STDERR.puts "Download Interrupted"; exit(-1)})
|
33
|
-
results = Scrapers::ManningBooks.scrape
|
26
|
+
results = Scrapers::ManningBooks::Scraper.new(options).scrape
|
34
27
|
ap results
|
35
28
|
end
|
36
29
|
|
data/lib/netrc_reader.rb
ADDED
[+12 -0; file contents missing from this extract]
data/lib/scrapers/manning_books.rb
CHANGED
@@ -1,55 +1,84 @@
|
|
1
1
|
# -*- ruby -*-
|
2
2
|
require 'mechanize'
|
3
|
-
|
3
|
+
require 'netrc_reader'
|
4
4
|
|
5
5
|
module Scrapers
|
6
6
|
module ManningBooks
|
7
7
|
|
8
|
+
NETRC_MANNING_ENTRY = 'manning'
|
8
9
|
DASHBOARD_URL = "https://account.manning.com/dashboard"
|
9
|
-
|
10
|
-
def self.scrape(dest=".", user=nil, pw=nil)
|
11
|
-
results = Array.new
|
10
|
+
DELAY_TIME = 5 # seconds
|
12
11
|
|
13
|
-
|
14
|
-
|
15
|
-
unless m.current_page.uri == DASHBOARD_URL
|
16
|
-
# log in
|
17
|
-
m.current_page.form.field_with(:type => 'email').value= user
|
18
|
-
m.current_page.form.field_with(:type => 'password').value= pw
|
19
|
-
m.current_page.form.submit
|
20
|
-
sleep 2
|
21
|
-
raise "could not log in" unless m.current_page.uri.to_s == DASHBOARD_URL
|
22
|
-
end
|
12
|
+
class Scraper
|
13
|
+
attr_accessor :user, :pw, :delay_time, :destination, :dry_run
|
23
14
|
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
m.current_page.save! # overwrite!
|
15
|
+
def initialize(options={})
|
16
|
+
netrc_reader = ::Scrapers::NetrcReader.new(NETRC_MANNING_ENTRY)
|
17
|
+
@user = options.fetch("user", netrc_reader.user)
|
18
|
+
@pw = options.fetch("pw", netrc_reader.pw)
|
19
|
+
@delay_time = options.fetch("delay", DELAY_TIME)
|
20
|
+
@destination = options.fetch("destination", ".")
|
21
|
+
@dry_run = options.fetch("dry_run", false)
|
22
|
+
end
|
33
23
|
|
34
|
-
|
24
|
+
def scrape
|
25
|
+
Mechanize.start do |m|
|
26
|
+
login(m) do |m|
|
27
|
+
book_downloads = m.current_page.links_with(:href => %r{/account/bookProduct/download})
|
28
|
+
Dir.chdir(destination) do |dir|
|
29
|
+
@results = download_books(m, book_downloads)
|
30
|
+
end
|
35
31
|
end
|
36
|
-
|
37
32
|
end
|
38
33
|
|
34
|
+
Hash[@results]
|
39
35
|
end
|
36
|
+
|
37
|
+
def login(agent, &block)
|
38
|
+
raise "Must provide a block to execute after logged in to site" unless block_given?
|
40
39
|
|
41
|
-
|
42
|
-
|
40
|
+
agent.get DASHBOARD_URL
|
41
|
+
unless agent.current_page.uri == DASHBOARD_URL
|
42
|
+
# log in
|
43
|
+
agent.current_page.form.field_with(:type => 'email').value= user
|
44
|
+
agent.current_page.form.field_with(:type => 'password').value= pw
|
45
|
+
agent.current_page.form.submit
|
46
|
+
sleep 2
|
47
|
+
raise "could not log in" unless agent.current_page.uri.to_s == DASHBOARD_URL
|
48
|
+
end
|
49
|
+
yield agent
|
50
|
+
end
|
43
51
|
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
52
|
+
def wait_a_bit(delay)
|
53
|
+
puts "delaying for #{delay} second(s)"
|
54
|
+
%w[- * | +].cycle do |c|
|
55
|
+
print "\r#{c}"
|
56
|
+
sleep 1
|
57
|
+
delay -= 1
|
58
|
+
break if delay < 1
|
59
|
+
end
|
60
|
+
print "\r"
|
49
61
|
end
|
50
|
-
print "\r"
|
51
|
-
end
|
52
62
|
|
53
63
|
|
64
|
+
def download_books(agent, books)
|
65
|
+
books.map do |book|
|
66
|
+
bookname = book.node.parent.parent.parent.parent.at_css('h1').text
|
67
|
+
puts "Downloading #{bookname} from #{book.href}"
|
68
|
+
if dry_run
|
69
|
+
warn "dry run, not saving"
|
70
|
+
else
|
71
|
+
agent.get book.href
|
72
|
+
puts "Saving #{agent.current_page.filename}"
|
73
|
+
agent.current_page.save! # overwrite!
|
74
|
+
end
|
75
|
+
|
76
|
+
wait_a_bit delay_time
|
77
|
+
[agent.current_page.filename, agent.current_page.uri.to_s]
|
78
|
+
end
|
79
|
+
end
|
80
|
+
|
81
|
+
end
|
54
82
|
end
|
55
83
|
end
|
84
|
+
|
data/lib/scrapers/version.rb
CHANGED
[+2 -2; hunk missing from this extract]
data/spec/scrapers/manning_books_spec.rb
CHANGED
@@ -1,20 +1,113 @@
|
|
1
1
|
# -*- ruby -*-
|
2
2
|
require 'spec_helper'
|
3
3
|
require 'scrapers/manning_books'
|
4
|
+
require 'ostruct'
|
4
5
|
|
5
|
-
|
6
|
+
RSpec.describe Scrapers::ManningBooks::Scraper do
|
7
|
+
describe "verify Class method signatures" do
|
8
|
+
it "responds to :new" do
|
9
|
+
expect(Scrapers::ManningBooks::Scraper).to respond_to(:new)
|
10
|
+
end
|
11
|
+
end
|
12
|
+
describe "verify instance method signatures" do
|
13
|
+
subject { Scrapers::ManningBooks::Scraper.new }
|
14
|
+
it { is_expected.to respond_to :scrape }
|
15
|
+
it { is_expected.to respond_to :login }
|
16
|
+
it { is_expected.to respond_to :wait_a_bit }
|
17
|
+
it { is_expected.to respond_to :download_books }
|
18
|
+
end
|
19
|
+
describe "#login" do
|
20
|
+
let(:scraper) { Scrapers::ManningBooks::Scraper.new }
|
21
|
+
let(:agent) { double('agent') }
|
6
22
|
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
23
|
+
before do
|
24
|
+
allow(Scrapers::NetrcReader).to receive(:new) do
|
25
|
+
OpenStruct.new(user: "joe@example.com", pw: "password")
|
26
|
+
end
|
27
|
+
end
|
28
|
+
|
29
|
+
it "verify user" do
|
30
|
+
expect(scraper.user).to eq("joe@example.com")
|
31
|
+
end
|
32
|
+
it "verify pw" do
|
33
|
+
expect(scraper.pw).to eq("password")
|
34
|
+
end
|
35
|
+
|
36
|
+
context "when login is passed a block" do
|
37
|
+
it "logs in and yields the block" do
|
38
|
+
expect(agent).to receive(:get).and_return(agent)
|
39
|
+
expect(agent).to receive(:current_page).at_least(5).times.and_return(agent)
|
40
|
+
expect(agent).to receive(:uri)
|
41
|
+
expect(agent).to receive(:form).exactly(3).times.and_return(agent)
|
42
|
+
expect(agent).to receive(:field_with).exactly(2).times.and_return(agent)
|
43
|
+
expect(agent).to receive(:value=).exactly(2).times.and_return(agent)
|
44
|
+
expect(agent).to receive(:submit).and_return(agent)
|
45
|
+
expect(agent).to receive(:uri).and_return(Scrapers::ManningBooks::DASHBOARD_URL)
|
46
|
+
scraper.login(agent) { |m| @result = "in yield" }
|
47
|
+
expect(@result).to eq("in yield")
|
14
48
|
end
|
15
49
|
|
16
|
-
it {expect(@result).to_not be_nil}
|
17
|
-
|
18
50
|
end
|
51
|
+
|
52
|
+
context "when login is not passed a block" do
|
53
|
+
it "raises an exception" do
|
54
|
+
expect{ scraper.login(agent) }.to raise_error("Must provide a block to execute after logged in to site")
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
end
|
59
|
+
|
60
|
+
describe "#download_books" do
|
61
|
+
let(:scraper) {Scrapers::ManningBooks::Scraper.new}
|
62
|
+
let(:agent) {double('agent')}
|
63
|
+
let(:books) do
|
64
|
+
3.times.map do |i|
|
65
|
+
OpenStruct.new(href: "http://#{Scrapers::ManningBooks::DASHBOARD_URL}/#{i}")
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
|
70
|
+
before do
|
71
|
+
allow(Scrapers::NetrcReader).to receive(:new) do
|
72
|
+
OpenStruct.new(user: "joe@example.com", pw: "password")
|
73
|
+
end
|
74
|
+
|
75
|
+
allow(scraper).to receive(:wait_a_bit).at_least(:once)
|
76
|
+
end
|
77
|
+
|
78
|
+
it "downloads the books" do
|
79
|
+
save_stdout = $stdout
|
80
|
+
$stdout = double('output').as_null_object
|
81
|
+
expect(agent).to receive(:get).exactly(3).times
|
82
|
+
expect(agent).to receive(:current_page).exactly(3*4).times.and_return(agent)
|
83
|
+
expect(agent).to receive(:filename).exactly(3*2).times.and_return("FILENAME")
|
84
|
+
expect(agent).to receive(:save!).exactly(3).times
|
85
|
+
expect(agent).to receive(:uri).exactly(3).times
|
86
|
+
results = scraper.download_books(agent, books)
|
87
|
+
$stdout = save_stdout
|
88
|
+
expect(results.size).to eq(3)
|
89
|
+
end
|
90
|
+
|
91
|
+
end
|
92
|
+
|
93
|
+
# Saving the best for last
|
94
|
+
describe "#scrape" do
|
95
|
+
let(:scraper) {Scrapers::ManningBooks::Scraper.new}
|
96
|
+
let(:agent) {double('agent').as_null_object}
|
97
|
+
let(:netrc_reader) {double('netrc_reader').as_null_object}
|
98
|
+
let(:book_list) {[['book1','url1'],['book2','url2']]}
|
99
|
+
|
100
|
+
before do
|
101
|
+
allow(Scrapers::NetrcReader).to receive(:new).and_return(netrc_reader)
|
102
|
+
allow(scraper).to receive(:wait_a_bit).at_least(:once)
|
103
|
+
allow(scraper).to receive(:login).and_yield(agent)
|
104
|
+
end
|
105
|
+
|
106
|
+
it "scrapes the dashboard" do
|
107
|
+
expect(Mechanize).to receive(:start).and_yield(agent)
|
108
|
+
expect(scraper).to receive(:download_books).and_return(book_list)
|
109
|
+
scraper.scrape
|
110
|
+
end
|
111
|
+
|
19
112
|
end
|
20
113
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
#require 'webmock/rspec'
|
2
1
|
require 'vcr'
|
3
2
|
|
4
3
|
# This file was generated by the `rspec --init` command. Conventionally, all
|
@@ -8,7 +7,6 @@ require 'vcr'
|
|
8
7
|
#
|
9
8
|
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
10
9
|
RSpec.configure do |config|
|
11
|
-
config.treat_symbols_as_metadata_keys_with_true_values = true
|
12
10
|
config.run_all_when_everything_filtered = true
|
13
11
|
config.filter_run :focus
|
14
12
|
|
@@ -23,9 +21,3 @@ VCR.configure do |c|
|
|
23
21
|
c.cassette_library_dir = 'vcr_cassettes'
|
24
22
|
c.hook_into :webmock
|
25
23
|
end
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
require 'scrapers.rb'
|
31
|
-
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrapers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0.2
|
4
|
+
version: 2.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tamara Temple
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-12-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mechanize
|
@@ -215,6 +215,7 @@ files:
|
|
215
215
|
- bin/new_scraper
|
216
216
|
- bin/rubytapas
|
217
217
|
- bin/wunderground
|
218
|
+
- lib/netrc_reader.rb
|
218
219
|
- lib/scrapers.rb
|
219
220
|
- lib/scrapers/allrecipes.rb
|
220
221
|
- lib/scrapers/discoverynews.rb
|
@@ -297,3 +298,4 @@ test_files:
|
|
297
298
|
- spec/scrapers_spec.rb
|
298
299
|
- spec/spec_helper.rb
|
299
300
|
- spec/wunderground_thor_spec.rb
|
301
|
+
has_rdoc:
|