scrapers 2.1.0 → 3.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/ChangeLog +7 -0
- data/Gemfile +0 -8
- data/Guardfile +1 -1
- data/bin/rubytapas +2 -75
- data/lib/scrapers.rb +1 -3
- data/lib/scrapers/manning_books.rb +37 -27
- data/lib/scrapers/rubytapas.rb +6 -81
- data/lib/scrapers/rubytapas/cli.rb +39 -0
- data/lib/scrapers/rubytapas/config.rb +11 -0
- data/lib/scrapers/rubytapas/dpdcart.rb +115 -0
- data/lib/scrapers/rubytapas/episode.rb +86 -0
- data/lib/scrapers/rubytapas/scraper.rb +142 -0
- data/lib/scrapers/version.rb +2 -2
- data/scrapers.gemspec +4 -1
- data/spec/lib/scrapers/rubytapas/dpdcart_spec.rb +68 -0
- data/spec/lib/scrapers/rubytapas/episode_spec.rb +140 -0
- data/spec/lib/scrapers/rubytapas/rubytapas_spec.rb +87 -0
- data/spec/lib/scrapers/rubytapas/scraper_spec.rb +83 -0
- data/spec/lib/scrapers/rubytapas/test_data/feed.xml +7038 -0
- data/spec/lib/scrapers/{wunderground_spec.rb → wunderground_spec.rb.no} +0 -0
- data/spec/scrapers/allrecipes_spec.rb +2 -2
- data/spec/scrapers/discoverynews_spec.rb +3 -14
- data/spec/scrapers/download_spec.rb +6 -16
- data/spec/scrapers/gocomics_spec.rb +3 -3
- data/spec/scrapers/imgur_spec.rb +10 -22
- data/spec/scrapers/manning_books_spec.rb +9 -6
- data/spec/scrapers/nasa_apod_spec.rb +12 -14
- data/spec/scrapers/sinfest_spec.rb +3 -3
- data/spec/scrapers/xkcd_spec.rb +1 -0
- data/spec/scrapers_spec.rb +2 -1
- data/spec/spec_helper.rb +1 -8
- data/spec/support/dir_helpers.rb +13 -0
- data/spec/support/use_vcr.rb +9 -0
- data/vcr_cassettes/nasa-apod.yml +348 -0
- data/vcr_cassettes/rubytapas-download-1.yml +6726 -0
- data/vcr_cassettes/rubytapas-download-all.yml +6726 -0
- data/vcr_cassettes/rubytapas_download.yml +982 -0
- data/vcr_cassettes/rubytapas_download_twice.yml +1064 -0
- data/vcr_cassettes/rubytapas_feed.yml +5880 -0
- data/vcr_cassettes/rubytapas_login.yml +849 -0
- metadata +74 -6
@@ -0,0 +1,86 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'date'
|
3
|
+
require 'stringex_lite'
|
4
|
+
|
5
|
+
module Scrapers
|
6
|
+
module RubyTapas
|
7
|
+
|
8
|
+
# Episode holds the data of a single RubyTapas episode: its number, title,
# slug, link, description, guid, publication date and the list of files
# attached to it.
#
# An episode can be built from:
#
# - a String containing the XML of one feed item,
# - a Nokogiri::XML::Element for one feed item,
# - a Hash keyed by the attribute names as symbols, or
# - a positional list of values in the order number, title, slug, link,
#   description, guid, pub_date, file_list.
#
# Any other single argument, or no argument at all, leaves every attribute nil.
class Episode

  # An attached file: the anchor text (used as the file name) and its URL.
  FileLink = Struct.new :filename, :href

  attr_accessor :number, :title, :link, :description, :guid, :pub_date, :file_list, :slug

  def initialize(*args)
    if args.size == 1
      source = args[0]
      # The three supported types are mutually exclusive, so the order of the
      # clauses does not affect which one is chosen.
      case source
      when Hash
        parse_options(source)
      when String
        parse_item(Nokogiri::XML.parse(source) { |c| c.noblanks }.children.first)
      when Nokogiri::XML::Element
        parse_item(source)
      end
    elsif args.size > 1
      assign_from_args(*args)
    end
  end

  # Returns the first run of word characters in the title (for example "001"
  # for "001 Binary Literals"), or nil when the title is nil or has no word
  # characters.
  def number_from_title
    title.to_s.scan(/\w+/).first
  end

  # Returns the title converted to a URL-friendly slug via String#to_url.
  def slug_from_title
    title.to_s.to_url
  end

  # Returns the FileLink entries found in the episode description.
  def file_list_from_description
    find_file_list(description)
  end

  private

  # Populates every attribute from the child elements of a feed item node.
  # number and slug are derived from the title; file_list is derived from the
  # description markup.
  def parse_item(item)
    self.title = item.xpath("title").text
    self.number = number_from_title
    self.slug = slug_from_title
    self.link = item.xpath("link").text
    self.description = item.xpath("description").text
    self.guid = item.xpath("guid").text
    self.pub_date = DateTime.parse(item.xpath("pubDate").text)
    self.file_list = file_list_from_description
  end

  # Copies each attribute from the matching symbol key of *options*.
  # Missing keys leave the attribute nil; nothing is derived.
  def parse_options(options)
    self.number = options[:number]
    self.title = options[:title]
    self.slug = options[:slug]
    self.link = options[:link]
    self.description = options[:description]
    self.guid = options[:guid]
    self.pub_date = options[:pub_date]
    self.file_list = options[:file_list]
  end

  # Assigns the attributes positionally. Missing trailing values become nil
  # and surplus values are ignored, as with any Ruby multiple assignment.
  def assign_from_args(*args)
    self.number, self.title, self.slug, self.link,
      self.description, self.guid, self.pub_date, self.file_list = *args
  end

  # Parses *content* as HTML and returns a FileLink for every anchor whose
  # href contains "download?file_id=".
  #
  # *content* is coerced with to_s so a nil description is parsed as an empty
  # document. The anchor's full text is used as the file name, so an anchor
  # with no child node yields an empty name instead of raising.
  def find_file_list(content)
    anchors = Nokogiri::HTML.parse(content.to_s).css("a")
    downloads = anchors.select { |link| link['href'] =~ /download\?file_id=/ }
    downloads.map { |link| FileLink.new(link.text, link['href']) }
  end
end
|
84
|
+
|
85
|
+
end
|
86
|
+
end
|
@@ -0,0 +1,142 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'fileutils'
|
3
|
+
require 'scrapers/rubytapas/config'
|
4
|
+
require 'scrapers/rubytapas/episode'
|
5
|
+
require 'scrapers/rubytapas/dpdcart'
|
6
|
+
|
7
|
+
module Scrapers
|
8
|
+
module RubyTapas
|
9
|
+
|
10
|
+
# Scraper provides the methods to download, extract and build a collection
|
11
|
+
# of RubyTapas episodes from the RubyTapas RSS feed.
|
12
|
+
class Scraper

  attr_accessor :user, :pw, :destination, :episode_number, :netrc_reader, :dry_run, :debug
  attr_reader :dpdcart

  # *episode_number* is the RubyTapas episode number (note! not the post id!) of the
  # episode to download. If the episode number is the symbol :all, then all episodes
  # will be retrieved. Note that if any of the episodes have been previously retrieved
  # to the destination, i.e., the associated directory already exists, that episode
  # download will be skipped.
  #
  # *options* contains the options from the cli, which include:
  #
  # - "user": the username of the RubyTapas account
  # - "pw": the password of the RubyTapas account
  # - "destination": the root destination of the episode downloads
  #   (defaults to the current working directory)
  # - "dry_run": when truthy, nothing is written to disk
  # - "debug": when truthy, progress messages are written with warn
  def initialize(episode_number, options)
    self.episode_number = episode_number
    self.user = options["user"]
    self.pw = options["pw"]
    self.destination = options.fetch("destination", Dir.pwd)
    self.dry_run = options["dry_run"]
    self.debug = options["debug"]
    @dpdcart = Scrapers::RubyTapas::DpdCart.
      new(user, pw, {dry_run: dry_run, debug: debug})
    warn "DEBUG: episode_number: #{episode_number}, options: #{options.inspect}" if debug
  end

  # Perform the scraping operation.
  #
  # Logs in, then either downloads every episode (skipping those whose
  # directory already exists and pausing between downloads unless dry_run is
  # set) or downloads the single requested episode.
  #
  # Raises RuntimeError when the requested episode is not in the feed.
  def scrape!
    dpdcart.login!
    if all_episodes?
      episodes.each do |episode|
        begin
          download(episode)
          friendly_pause unless dry_run
        rescue Errno::EEXIST
          warn "Episode previously downloaded. Skipping."
        end
      end
    else
      episode = find_by_episode(episode_number)
      raise "Unknown episode for #{episode_number}" if episode.nil?
      download(episode)
    end
  end

  # Print a list of episodes, one formatted line each, through a pager.
  def list!
    with_pager do |pager|
      episodes.each do |episode|
        pager.puts format_episode(episode)
      end
    end
  end

  # Returns the collection of episodes, fetching the feed on first use only.
  def episodes
    @episodes ||= fetch_episodes
  end

  # Retrieves the episode associated with *episode number*, or nil when no
  # episode has exactly that number.
  def find_by_episode(episode_number)
    episodes.detect { |e| e.number == episode_number }
  end

  private

  # True when the requested episode number is "all" in any letter case,
  # given as a string or a symbol.
  def all_episodes?
    episode_number.to_s.downcase.to_sym == :all
  end

  # Builds a collection of all the episodes listed in the feed
  def fetch_episodes
    feed = Nokogiri::XML.parse(dpdcart.feed!)
    feed.xpath("//item").map do |item|
      Episode.new(item)
    end
  end

  # Creates the episode directory and downloads every attached file into it.
  def download(episode)
    download_directory = make_download_directory(episode.slug)
    episode.file_list.each do |file|
      download_file(download_directory, file.href)
    end
  end

  # Fetches *url* and, unless dry_run is set, writes the body into *dir*.
  #
  # The file name comes from the remote side, so only its base name is used
  # when building the target path: a name such as "../x" must not be able to
  # write outside *dir*.
  def download_file(dir, url)
    warn "fetching #{url}" if debug
    name, body = dpdcart.download!(url)
    unless dry_run
      File.binwrite(File.join(dir, File.basename(name)), body)
    end
    warn "saved #{name} to #{dir}" if debug
  end

  # Creates the directory for *slug* under the destination and returns its
  # path. In a dry run nothing is created and a placeholder string is returned.
  #
  # FileUtils.mkdir raises Errno::EEXIST when the directory already exists;
  # scrape! relies on that to skip episodes downloaded earlier.
  def make_download_directory(slug)
    dir = File.join(File.realpath(destination), slug)
    warn "Downloading to #{dir}" if debug
    if dry_run
      "no dir for dry run"
    else
      FileUtils.mkdir(dir).first
    end
  end

  # Sleeps *delay* seconds, printing one dot per second, so that consecutive
  # downloads are spaced out.
  def friendly_pause(delay=5)
    puts
    print "Sleeping #{delay} seconds"
    delay.downto(1) { sleep 1; print "." }
    puts "\n"
  end

  # Yields a writable pipe connected to the "more" pager and returns nil.
  #
  # The block form of IO.popen closes the pipe even when the block raises.
  # Errno::EPIPE is swallowed because it only means the reader quit the pager
  # before all output was written.
  def with_pager(&block)
    raise "Must be called with block" unless block_given?
    IO.popen("more", "w") { |pager| yield pager }
    nil
  rescue Errno::EPIPE
    nil
  end

  # Formats one episode as tab-separated, left-justified columns:
  # number, title and publication date (YYYY-Mon-DD).
  def format_episode(episode)
    "%-5s\t%-40s\t%-15s" % [episode.number, episode.title, episode.pub_date.strftime("%Y-%b-%d")]
  end

end
|
141
|
+
end
|
142
|
+
end
|
data/lib/scrapers/version.rb
CHANGED
data/scrapers.gemspec
CHANGED
@@ -24,6 +24,7 @@ Gem::Specification.new do |spec|
|
|
24
24
|
spec.add_dependency "activesupport", "~> 4.1"
|
25
25
|
spec.add_dependency "highline"
|
26
26
|
spec.add_dependency "awesome_print"
|
27
|
+
spec.add_dependency "stringex"
|
27
28
|
|
28
29
|
spec.add_development_dependency "bundler"
|
29
30
|
spec.add_development_dependency "rake"
|
@@ -32,5 +33,7 @@ Gem::Specification.new do |spec|
|
|
32
33
|
spec.add_development_dependency "guard-rspec"
|
33
34
|
spec.add_development_dependency "webmock"
|
34
35
|
spec.add_development_dependency "vcr"
|
35
|
-
|
36
|
+
spec.add_development_dependency "pry"
|
37
|
+
spec.add_development_dependency "pry-byebug"
|
38
|
+
|
36
39
|
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'support/use_vcr'
|
3
|
+
require 'scrapers/rubytapas/dpdcart'
|
4
|
+
require 'nokogiri'
|
5
|
+
|
6
|
+
# Specs for the DpdCart gateway: the feed, login and download calls, each
# replayed through a VCR cassette.
describe Scrapers::RubyTapas::DpdCart do
  let(:gateway) { described_class.new }

  # Runs the block inside the named cassette. Requests are always matched on
  # method, host and path; any *extra_matchers* are appended to that list.
  def with_cassette(cassette, *extra_matchers, &block)
    matchers = [:method, :host, :path] + extra_matchers
    VCR.use_cassette(cassette, record: :new_episodes, match_requests_on: matchers, &block)
  end

  describe "method signatures" do
    it { is_expected.to respond_to(:feed!) }
    it { is_expected.to respond_to(:login!) }
    it { is_expected.to respond_to(:download!) }
  end

  describe "#feed!" do
    it "returns an rss feed" do
      with_cassette('rubytapas_feed') do
        parsed = Nokogiri::XML.parse(gateway.feed!)
        expect(parsed).to be_a(Nokogiri::XML::Document)
      end
    end
  end

  describe "#login!" do
    it "shows the subscriber content page" do
      with_cassette('rubytapas_login') do
        page_title = gateway.login!.page.title
        expect(page_title).to eq("Subscription Content | RubyTapas")
      end
    end
  end

  describe "#download!" do
    let(:file) { "https://rubytapas.dpdcart.com/subscriber/download?file_id=26" }
    let(:name) { "001-binary-literals.html" }

    let(:file2) { "https://rubytapas.dpdcart.com/subscriber/download?file_id=27" }
    let(:name2) { "001-binary-literals.rb" }

    it "returns the downloaded file" do
      with_cassette('rubytapas_download', :query) do
        gateway.login!
        filename, body = gateway.download!(file)
        expect(filename).to eq(name)
        expect(body.size).to eq(5744)
      end
    end

    it "can download multiple files with a single login" do
      with_cassette('rubytapas_download_twice', :query) do
        gateway.login!

        first_name, _first_body = gateway.download!(file)
        expect(first_name).to eq(name)

        second_name, _second_body = gateway.download!(file2)
        expect(second_name).to eq(name2)
      end
    end
  end
end
|
@@ -0,0 +1,140 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'scrapers/rubytapas/episode'
|
3
|
+
|
4
|
+
# Specs for Episode construction. The same attribute expectations are run
# against each supported way of building an episode: an XML string, a parsed
# XML element, a Hash, and a positional argument list.
describe Scrapers::RubyTapas::Episode do

  # Expected attribute values for episode 001.
  let(:number) { "001" }
  let(:title) { "001 Binary Literals" }
  let(:slug) { "001-binary-literals" }
  let(:link) { "https://rubytapas.dpdcart.com/subscriber/post?id=18" }
  let(:description) do
    <<-DESC
<div class="blog-entry">
<div class="blog-content"><p>In this inaugural episode, a look at a handy syntax for writing out binary numbers.</p>
</div>
<h3>Attached Files</h3>
<ul>
<li><a href="https://rubytapas.dpdcart.com/subscriber/download?file_id=25">RubyTapas001.mp4</a></li>
<li><a href="https://rubytapas.dpdcart.com/subscriber/download?file_id=26">001-binary-literals.html</a></li>
<li><a href="https://rubytapas.dpdcart.com/subscriber/download?file_id=27">001-binary-literals.rb</a></li>
</ul></div>
    DESC
  end
  let(:guid) { "dpd-89e8004c8242e7ad548833bef1e18a5b575c92c1" }
  let(:pub_date) { DateTime.new(2012,9,24,9,0,0,'-4') }
  let(:file_list) do
    {
      "RubyTapas001.mp4" => "https://rubytapas.dpdcart.com/subscriber/download?file_id=25",
      "001-binary-literals.html" => "https://rubytapas.dpdcart.com/subscriber/download?file_id=26",
      "001-binary-literals.rb" => "https://rubytapas.dpdcart.com/subscriber/download?file_id=27"
    }.map { |filename, href| described_class::FileLink.new(filename, href) }
  end

  # One feed <item> as raw XML.
  let(:xml_string) do
    <<-ITEM
<item>
<title><![CDATA[001 Binary Literals]]></title>
<link>https://rubytapas.dpdcart.com/subscriber/post?id=18</link>
<description><![CDATA[<div class="blog-entry">
<div class="blog-content"><p>In this inaugural episode, a look at a handy syntax for writing out binary numbers.</p>
</div>
<h3>Attached Files</h3>
<ul>
<li><a href="https://rubytapas.dpdcart.com/subscriber/download?file_id=25">RubyTapas001.mp4</a></li>
<li><a href="https://rubytapas.dpdcart.com/subscriber/download?file_id=26">001-binary-literals.html</a></li>
<li><a href="https://rubytapas.dpdcart.com/subscriber/download?file_id=27">001-binary-literals.rb</a></li>
</ul></div>]]></description>
<guid isPermaLink="false">dpd-89e8004c8242e7ad548833bef1e18a5b575c92c1</guid>
<pubDate>Mon, 24 Sep 2012 09:00:00 -0400</pubDate>
<enclosure url="https://rubytapas.dpdcart.com/feed/download/25/RubyTapas001.mp4" length="12502397" type="video/mp4"/>
<itunes:image href="https://getdpd.com/uploads/ruby-tapas.png"/>
</item>
    ITEM
  end

  # The same item as the first child of the parsed document.
  let(:xml_item) do
    document = Nokogiri::XML.parse(xml_string) { |c| c.noblanks }
    document.children.first
  end

  describe "#initialize" do

    shared_examples "initialize episode" do
      it "is episode number 001" do
        expect(episode.number).to eq(number)
      end

      it "is episode '001 Binary Literals'" do
        expect(episode.title).to eq(title)
      end

      it "has slug" do
        expect(episode.slug).to eq(slug)
      end

      it "has link" do
        expect(episode.link).to eq(link)
      end

      it "has guid" do
        expect(episode.guid).to eq(guid)
      end

      it "has publication date" do
        expect(episode.pub_date).to eq(pub_date)
      end

      it "has file list" do
        expect(episode.file_list).to match_array(file_list)
      end
    end

    context "when given an xml string" do
      include_examples "initialize episode" do
        let(:episode) { described_class.new(xml_string) }
      end
    end

    context "when given a Nokogiri::XML::Element" do
      include_examples "initialize episode" do
        let(:episode) { described_class.new(xml_item) }
      end
    end

    context "when given a Hash" do
      include_examples "initialize episode" do
        let(:episode) do
          attributes = {
            number: number,
            title: title,
            slug: slug,
            link: link,
            description: description,
            guid: guid,
            pub_date: pub_date,
            file_list: file_list
          }
          described_class.new(attributes)
        end
      end
    end

    context "when given a list of arguments" do
      include_examples "initialize episode" do
        let(:episode) do
          described_class.new(number, title, slug, link, description, guid, pub_date, file_list)
        end
      end
    end

  end
end
|
140
|
+
|