scrapers 2.1.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/ChangeLog +7 -0
- data/Gemfile +0 -8
- data/Guardfile +1 -1
- data/bin/rubytapas +2 -75
- data/lib/scrapers.rb +1 -3
- data/lib/scrapers/manning_books.rb +37 -27
- data/lib/scrapers/rubytapas.rb +6 -81
- data/lib/scrapers/rubytapas/cli.rb +39 -0
- data/lib/scrapers/rubytapas/config.rb +11 -0
- data/lib/scrapers/rubytapas/dpdcart.rb +115 -0
- data/lib/scrapers/rubytapas/episode.rb +86 -0
- data/lib/scrapers/rubytapas/scraper.rb +142 -0
- data/lib/scrapers/version.rb +2 -2
- data/scrapers.gemspec +4 -1
- data/spec/lib/scrapers/rubytapas/dpdcart_spec.rb +68 -0
- data/spec/lib/scrapers/rubytapas/episode_spec.rb +140 -0
- data/spec/lib/scrapers/rubytapas/rubytapas_spec.rb +87 -0
- data/spec/lib/scrapers/rubytapas/scraper_spec.rb +83 -0
- data/spec/lib/scrapers/rubytapas/test_data/feed.xml +7038 -0
- data/spec/lib/scrapers/{wunderground_spec.rb → wunderground_spec.rb.no} +0 -0
- data/spec/scrapers/allrecipes_spec.rb +2 -2
- data/spec/scrapers/discoverynews_spec.rb +3 -14
- data/spec/scrapers/download_spec.rb +6 -16
- data/spec/scrapers/gocomics_spec.rb +3 -3
- data/spec/scrapers/imgur_spec.rb +10 -22
- data/spec/scrapers/manning_books_spec.rb +9 -6
- data/spec/scrapers/nasa_apod_spec.rb +12 -14
- data/spec/scrapers/sinfest_spec.rb +3 -3
- data/spec/scrapers/xkcd_spec.rb +1 -0
- data/spec/scrapers_spec.rb +2 -1
- data/spec/spec_helper.rb +1 -8
- data/spec/support/dir_helpers.rb +13 -0
- data/spec/support/use_vcr.rb +9 -0
- data/vcr_cassettes/nasa-apod.yml +348 -0
- data/vcr_cassettes/rubytapas-download-1.yml +6726 -0
- data/vcr_cassettes/rubytapas-download-all.yml +6726 -0
- data/vcr_cassettes/rubytapas_download.yml +982 -0
- data/vcr_cassettes/rubytapas_download_twice.yml +1064 -0
- data/vcr_cassettes/rubytapas_feed.yml +5880 -0
- data/vcr_cassettes/rubytapas_login.yml +849 -0
- metadata +74 -6
@@ -0,0 +1,86 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'date'
|
3
|
+
require 'stringex_lite'
|
4
|
+
|
5
|
+
module Scrapers
|
6
|
+
module RubyTapas
|
7
|
+
|
8
|
+
class Episode
|
9
|
+
|
10
|
+
FileLink = Struct.new :filename, :href
|
11
|
+
|
12
|
+
attr_accessor :number, :title, :link, :description, :guid, :pub_date, :file_list, :slug
|
13
|
+
|
14
|
+
# Builds an episode from whichever representation the caller supplies:
#
# - a single String is parsed as XML and its first top-level node is read
# - a single Nokogiri::XML::Element is read directly
# - a single Hash is read by symbol key
# - two or more arguments are taken positionally (see assign_from_args)
#
# With no arguments, or a single argument of any other type, every
# attribute is left unset.
def initialize(*args)
  if args.size > 1
    assign_from_args(*args)
  elsif args.size == 1
    source = args.first
    case source
    when String
      document = Nokogiri::XML.parse(source) { |config| config.noblanks }
      parse_item(document.children.first)
    when Nokogiri::XML::Element then parse_item(source)
    when Hash then parse_options(source)
    end
  end
end
|
29
|
+
|
30
|
+
# Returns the first run of word characters in the title (for example
# "001" for "001 Binary Literals"), or nil when the title is nil or
# contains no word characters.
#
# The title is coerced with to_s, as slug_from_title does, so an episode
# without a title yields nil here instead of raising NoMethodError.
def number_from_title
  title.to_s[/\w+/]
end
|
33
|
+
|
34
|
+
# URL-friendly form of the title, obtained by calling to_url on the
# title's string form. A nil title is treated as the empty string.
def slug_from_title
  text = title.to_s
  text.to_url
end
|
37
|
+
|
38
|
+
# File links found in this episode's description markup; the result is
# whatever find_file_list returns for the current description.
def file_list_from_description
  markup = description
  find_file_list(markup)
end
|
41
|
+
|
42
|
+
private
|
43
|
+
|
44
|
+
# Populates every attribute from an RSS item node.
#
# *item* must respond to xpath; the text of its title, link, description,
# guid and pubDate children is read. pubDate is converted with
# DateTime.parse, which raises when the text is not a recognisable date.
def parse_item(item)
  text_of = lambda { |tag| item.xpath(tag).text }

  self.title = text_of.call("title")
  # number and slug are derived from the title, so the title goes first
  self.number = number_from_title
  self.slug = slug_from_title
  self.link = text_of.call("link")
  self.description = text_of.call("description")
  self.guid = text_of.call("guid")
  self.pub_date = DateTime.parse(text_of.call("pubDate"))
  # the file list is extracted from the description assigned above
  self.file_list = file_list_from_description
end
|
54
|
+
|
55
|
+
# Copies each attribute from *options*, looked up by symbol key. A key
# that is absent assigns nil to the corresponding attribute. Nothing is
# derived here: number, slug and file_list are taken as given.
def parse_options(options)
  fields = [:number, :title, :slug, :link, :description, :guid, :pub_date, :file_list]
  assigned = nil
  fields.each do |field|
    assigned = options[field]
    public_send("#{field}=", assigned)
  end
  assigned
end
|
65
|
+
|
66
|
+
# Assigns attributes positionally, in the order number, title, slug,
# link, description, guid, pub_date, file_list. Attributes with no
# corresponding argument are set to nil; arguments beyond the eighth
# are ignored.
def assign_from_args(*args)
  names = [:number, :title, :slug, :link, :description, :guid, :pub_date, :file_list]
  names.zip(args) { |name, value| public_send("#{name}=", value) }
  args
end
|
77
|
+
|
78
|
+
# Extracts the downloadable attachments from a chunk of HTML.
#
# *content* is parsed with Nokogiri::HTML. Every anchor whose href
# matches "download?file_id=" becomes a FileLink holding the text of the
# anchor's first child node and the href; other anchors are skipped.
# Links are returned in the order the anchors are found.
def find_file_list(content)
  anchors = Nokogiri::HTML.parse(content).css("a")
  downloads = anchors.select { |anchor| anchor['href'] =~ /download\?file_id=/ }
  downloads.map do |anchor|
    label = anchor.child
    # An anchor with no child nodes has no label node; record an empty
    # filename instead of calling #text on nil.
    FileLink.new(label ? label.text : "", anchor['href'])
  end
end
|
83
|
+
end
|
84
|
+
|
85
|
+
end
|
86
|
+
end
|
@@ -0,0 +1,142 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'fileutils'
|
3
|
+
require 'scrapers/rubytapas/config'
|
4
|
+
require 'scrapers/rubytapas/episode'
|
5
|
+
require 'scrapers/rubytapas/dpdcart'
|
6
|
+
|
7
|
+
module Scrapers
|
8
|
+
module RubyTapas
|
9
|
+
|
10
|
+
# Scraper provides the methods to download, extract and build a collection
|
11
|
+
# of RubyTapas episodes from the RubyTapas RSS feed.
|
12
|
+
class Scraper
|
13
|
+
|
14
|
+
attr_accessor :user, :pw, :destination, :episode_number, :netrc_reader, :dry_run, :debug
|
15
|
+
attr_reader :dpdcart
|
16
|
+
|
17
|
+
# Prepares a scraper for a single episode or for the whole feed.
#
# *episode_number* is the RubyTapas episode number (not the post id) of
# the episode to download. Passing :all (or any value whose string form
# is "all", ignoring case) selects every episode; in that mode an episode
# whose destination directory already exists is skipped.
#
# *options* contains the options from the cli, read by string key:
#
# - "user": the username of the RubyTapas account
# - "pw": the password of the RubyTapas account
# - "destination": the root destination of the episode downloads;
#   the current working directory is used when the key is absent
# - "dry_run": when truthy, no directories or files are written
# - "debug": when truthy, progress messages are written with warn
def initialize(episode_number, options)
  self.episode_number = episode_number
  self.destination = options.fetch("destination", Dir.pwd)
  self.user, self.pw = options["user"], options["pw"]
  self.dry_run, self.debug = options["dry_run"], options["debug"]

  gateway_options = { dry_run: dry_run, debug: debug }
  @dpdcart = Scrapers::RubyTapas::DpdCart.new(user, pw, gateway_options)

  warn "DEBUG: episode_number: #{episode_number}, options: #{options.inspect}" if debug
end
|
39
|
+
|
40
|
+
# Perform the scraping operation.
#
# Logs in through the gateway, then either downloads the single requested
# episode or walks the whole feed. A single episode that cannot be found
# raises RuntimeError. When walking the feed, an episode whose download
# raises Errno::EEXIST is reported and skipped, and each successful
# download is followed by a pause unless this is a dry run.
def scrape!
  dpdcart.login!

  unless all_episodes?
    wanted = find_by_episode(episode_number)
    raise "Unknown episode for #{episode_number}" if wanted.nil?
    return download(wanted)
  end

  episodes.each do |episode|
    begin
      download(episode)
      friendly_pause unless dry_run
    rescue Errno::EEXIST
      warn "Episode previously downloaded. Skipping."
    end
  end
end
|
63
|
+
|
64
|
+
# Print a list of episodes: one formatted line per episode, written to
# the pager stream that with_pager yields.
def list!
  with_pager do |out|
    episodes.each { |episode| out.puts(format_episode(episode)) }
  end
end
|
72
|
+
|
73
|
+
# Returns the collection of episodes. The feed is fetched on first use
# and the result is kept in @episodes for later calls.
def episodes
  return @episodes if @episodes

  @episodes = fetch_episodes
end
|
77
|
+
|
78
|
+
# Retrieves the episode associated with *episode_number*.
#
# Returns the first episode whose number is == to the argument, or nil
# when there is none. The comparison is exact; no padding or type
# conversion is applied to either side.
def find_by_episode(episode_number)
  episodes.find { |candidate| candidate.number == episode_number }
end
|
82
|
+
|
83
|
+
private
|
84
|
+
|
85
|
+
# True when the requested episode number, as a string and ignoring
# case, is "all" (so :all, "all" and "ALL" all qualify).
def all_episodes?
  episode_number.to_s.downcase == "all"
end
|
88
|
+
|
89
|
+
# Builds a collection of all the episodes listed in the feed: the feed
# returned by the gateway is parsed as XML and every item element in it
# is wrapped in an Episode.
def fetch_episodes
  document = Nokogiri::XML.parse(dpdcart.feed!)
  items = document.xpath("//item")
  items.map { |node| Episode.new(node) }
end
|
96
|
+
|
97
|
+
# Downloads every file attached to *episode* into a directory named
# after the episode's slug. The directory is created first, so a failure
# there (for example Errno::EEXIST) happens before any file is fetched.
def download(episode)
  target_dir = make_download_directory(episode.slug)
  episode.file_list.each { |attachment| download_file(target_dir, attachment.href) }
end
|
103
|
+
|
104
|
+
# Fetches one attachment through the gateway and writes it into *dir*.
#
# *dir* is the directory to write into and *url* the attachment's
# download link. The gateway call is made even on a dry run; only the
# write is skipped. Progress is reported with warn when debug is set.
def download_file(dir, url)
  warn "fetching #{url}" if debug
  name, body = dpdcart.download!(url)
  unless dry_run
    # The name comes back from the gateway (presumably taken from the
    # server's response -- confirm in DpdCart). Keep only its final path
    # component so it cannot direct the write outside dir.
    File.binwrite(File.join(dir, File.basename(name)), body)
  end
  warn "saved #{name} to #{dir}" if debug
end
|
110
|
+
|
111
|
+
|
112
|
+
# Creates the directory for one episode under the destination root and
# returns its path.
#
# The destination is resolved with File.realpath, so it must already
# exist even on a dry run. On a dry run nothing is created and a
# placeholder string is returned instead. FileUtils.mkdir raises
# Errno::EEXIST when the episode directory is already present.
def make_download_directory(slug)
  target = File.join(File.realpath(destination), slug)
  warn "Downloading to #{target}" if debug
  return "no dir for dry run" if dry_run

  FileUtils.mkdir(target).first
end
|
121
|
+
|
122
|
+
# Waits *delay* seconds (5 by default) between downloads, printing a
# message and then one dot per elapsed second to standard output.
def friendly_pause(delay=5)
  puts
  print "Sleeping #{delay} seconds"
  delay.times do
    sleep 1
    print "."
  end
  puts "\n"
end
|
128
|
+
|
129
|
+
# Runs the block with a writable stream connected to the "more" pager.
#
# Yields the pager's input stream; whatever the block writes is shown
# through the pager. The stream is closed when the block finishes, and
# also when it raises, so the pipe is not left open on an error.
# Returns nil. Raises RuntimeError when called without a block.
def with_pager(&block)
  raise "Must be called with block" unless block_given?
  # IO.popen starts the same command as Kernel#open("|more") without
  # relying on the pipe form of open.
  pager = IO.popen("more", "w")
  begin
    yield pager
  ensure
    pager.close
  end
  nil
end
|
135
|
+
|
136
|
+
# One tab-separated listing line for *episode*: number, title and
# publication date (as YYYY-Mon-DD), left-aligned and padded to minimum
# widths of 5, 40 and 15 characters.
def format_episode(episode)
  published = episode.pub_date.strftime("%Y-%b-%d")
  format("%-5s\t%-40s\t%-15s", episode.number, episode.title, published)
end
|
139
|
+
|
140
|
+
end
|
141
|
+
end
|
142
|
+
end
|
data/lib/scrapers/version.rb
CHANGED
data/scrapers.gemspec
CHANGED
@@ -24,6 +24,7 @@ Gem::Specification.new do |spec|
|
|
24
24
|
spec.add_dependency "activesupport", "~> 4.1"
|
25
25
|
spec.add_dependency "highline"
|
26
26
|
spec.add_dependency "awesome_print"
|
27
|
+
spec.add_dependency "stringex"
|
27
28
|
|
28
29
|
spec.add_development_dependency "bundler"
|
29
30
|
spec.add_development_dependency "rake"
|
@@ -32,5 +33,7 @@ Gem::Specification.new do |spec|
|
|
32
33
|
spec.add_development_dependency "guard-rspec"
|
33
34
|
spec.add_development_dependency "webmock"
|
34
35
|
spec.add_development_dependency "vcr"
|
35
|
-
|
36
|
+
spec.add_development_dependency "pry"
|
37
|
+
spec.add_development_dependency "pry-byebug"
|
38
|
+
|
36
39
|
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'support/use_vcr'
|
3
|
+
require 'scrapers/rubytapas/dpdcart'
|
4
|
+
require 'nokogiri'
|
5
|
+
|
6
|
+
describe Scrapers::RubyTapas::DpdCart do
|
7
|
+
let(:gateway) { Scrapers::RubyTapas::DpdCart.new }
|
8
|
+
|
9
|
+
describe "method signatures" do
|
10
|
+
it { is_expected.to respond_to(:feed!) }
|
11
|
+
it { is_expected.to respond_to(:login!)}
|
12
|
+
it { is_expected.to respond_to(:download!)}
|
13
|
+
end
|
14
|
+
|
15
|
+
describe "#feed!" do
|
16
|
+
it "returns an rss feed" do
|
17
|
+
VCR.use_cassette('rubytapas_feed', record: :new_episodes,
|
18
|
+
match_requests_on: [:method, :host, :path]
|
19
|
+
) do
|
20
|
+
expect(Nokogiri::XML.parse(gateway.feed!)).to be_a(Nokogiri::XML::Document)
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
describe "#login!" do
|
26
|
+
it "shows the subscriber content page" do
|
27
|
+
VCR.use_cassette('rubytapas_login', record: :new_episodes,
|
28
|
+
match_requests_on: [:method, :host, :path]
|
29
|
+
) do
|
30
|
+
expect(gateway.login!.page.title).to eq("Subscription Content | RubyTapas")
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
|
36
|
+
describe "#download!" do
|
37
|
+
let(:file) { "https://rubytapas.dpdcart.com/subscriber/download?file_id=26" }
|
38
|
+
let(:name) { "001-binary-literals.html" }
|
39
|
+
|
40
|
+
let(:file2){ "https://rubytapas.dpdcart.com/subscriber/download?file_id=27" }
|
41
|
+
let(:name2) { "001-binary-literals.rb" }
|
42
|
+
|
43
|
+
it "returns the downloaded file" do
|
44
|
+
VCR.use_cassette('rubytapas_download', record: :new_episodes,
|
45
|
+
match_requests_on: [:method, :host, :path,
|
46
|
+
:query]) do
|
47
|
+
gateway.login!
|
48
|
+
filename, body = gateway.download! file
|
49
|
+
expect(filename).to eq(name)
|
50
|
+
expect(body.size).to eq(5744)
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
it "can download multiple files with a single login" do
|
55
|
+
VCR.use_cassette('rubytapas_download_twice', record: :new_episodes,
|
56
|
+
match_requests_on: [:method, :host, :path,
|
57
|
+
:query]) do
|
58
|
+
gateway.login!
|
59
|
+
filename, body = gateway.download! file
|
60
|
+
expect(filename).to eq(name)
|
61
|
+
filename, body = gateway.download! file2
|
62
|
+
expect(filename).to eq(name2)
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
end
|
67
|
+
|
68
|
+
end
|
@@ -0,0 +1,140 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'scrapers/rubytapas/episode'
|
3
|
+
|
4
|
+
describe Scrapers::RubyTapas::Episode do
|
5
|
+
|
6
|
+
let(:number) { "001" }
|
7
|
+
let(:title) { "001 Binary Literals" }
|
8
|
+
let(:slug) { "001-binary-literals" }
|
9
|
+
let(:link) { "https://rubytapas.dpdcart.com/subscriber/post?id=18" }
|
10
|
+
let(:description) {<<-DESC
|
11
|
+
<div class="blog-entry">
|
12
|
+
<div class="blog-content"><p>In this inaugural episode, a look at a handy syntax for writing out binary numbers.</p>
|
13
|
+
</div>
|
14
|
+
<h3>Attached Files</h3>
|
15
|
+
<ul>
|
16
|
+
<li><a href="https://rubytapas.dpdcart.com/subscriber/download?file_id=25">RubyTapas001.mp4</a></li>
|
17
|
+
<li><a href="https://rubytapas.dpdcart.com/subscriber/download?file_id=26">001-binary-literals.html</a></li>
|
18
|
+
<li><a href="https://rubytapas.dpdcart.com/subscriber/download?file_id=27">001-binary-literals.rb</a></li>
|
19
|
+
</ul></div>
|
20
|
+
DESC
|
21
|
+
}
|
22
|
+
let(:guid) { "dpd-89e8004c8242e7ad548833bef1e18a5b575c92c1" }
|
23
|
+
let(:pub_date) { DateTime.new(2012,9,24,9,0,0,'-4') }
|
24
|
+
let(:file_list) do
|
25
|
+
[
|
26
|
+
Scrapers::RubyTapas::Episode::FileLink.new("RubyTapas001.mp4", "https://rubytapas.dpdcart.com/subscriber/download?file_id=25"),
|
27
|
+
Scrapers::RubyTapas::Episode::FileLink.new("001-binary-literals.html", "https://rubytapas.dpdcart.com/subscriber/download?file_id=26"),
|
28
|
+
Scrapers::RubyTapas::Episode::FileLink.new("001-binary-literals.rb", "https://rubytapas.dpdcart.com/subscriber/download?file_id=27")
|
29
|
+
]
|
30
|
+
end
|
31
|
+
|
32
|
+
let(:xml_string) do
|
33
|
+
<<-ITEM
|
34
|
+
<item>
|
35
|
+
<title><![CDATA[001 Binary Literals]]></title>
|
36
|
+
<link>https://rubytapas.dpdcart.com/subscriber/post?id=18</link>
|
37
|
+
<description><![CDATA[<div class="blog-entry">
|
38
|
+
<div class="blog-content"><p>In this inaugural episode, a look at a handy syntax for writing out binary numbers.</p>
|
39
|
+
</div>
|
40
|
+
<h3>Attached Files</h3>
|
41
|
+
<ul>
|
42
|
+
<li><a href="https://rubytapas.dpdcart.com/subscriber/download?file_id=25">RubyTapas001.mp4</a></li>
|
43
|
+
<li><a href="https://rubytapas.dpdcart.com/subscriber/download?file_id=26">001-binary-literals.html</a></li>
|
44
|
+
<li><a href="https://rubytapas.dpdcart.com/subscriber/download?file_id=27">001-binary-literals.rb</a></li>
|
45
|
+
</ul></div>]]></description>
|
46
|
+
<guid isPermaLink="false">dpd-89e8004c8242e7ad548833bef1e18a5b575c92c1</guid>
|
47
|
+
<pubDate>Mon, 24 Sep 2012 09:00:00 -0400</pubDate>
|
48
|
+
<enclosure url="https://rubytapas.dpdcart.com/feed/download/25/RubyTapas001.mp4" length="12502397" type="video/mp4"/>
|
49
|
+
<itunes:image href="https://getdpd.com/uploads/ruby-tapas.png"/>
|
50
|
+
</item>
|
51
|
+
ITEM
|
52
|
+
end
|
53
|
+
|
54
|
+
let(:xml_item) do
|
55
|
+
Nokogiri::XML.parse(xml_string) {|c| c.noblanks }.children.first
|
56
|
+
end
|
57
|
+
|
58
|
+
describe "#initialize" do
|
59
|
+
|
60
|
+
shared_examples "initialize episode" do
|
61
|
+
it "is episode number 001" do
|
62
|
+
expect(episode.number).to eq(number)
|
63
|
+
end
|
64
|
+
|
65
|
+
it "is episode '001 Binary Literals'" do
|
66
|
+
expect(episode.title).to eq(title)
|
67
|
+
end
|
68
|
+
|
69
|
+
it "has slug" do
|
70
|
+
expect(episode.slug).to eq(slug)
|
71
|
+
end
|
72
|
+
|
73
|
+
it "has link" do
|
74
|
+
expect(episode.link).to eq(link)
|
75
|
+
end
|
76
|
+
|
77
|
+
it "has guid" do
|
78
|
+
expect(episode.guid).to eq(guid)
|
79
|
+
end
|
80
|
+
|
81
|
+
it "has publication date" do
|
82
|
+
expect(episode.pub_date).to eq(pub_date)
|
83
|
+
end
|
84
|
+
|
85
|
+
it "has file list" do
|
86
|
+
expect(episode.file_list).to match_array(file_list)
|
87
|
+
end
|
88
|
+
|
89
|
+
end
|
90
|
+
|
91
|
+
context "when given an xml string" do
|
92
|
+
include_examples "initialize episode" do
|
93
|
+
let(:episode) {Scrapers::RubyTapas::Episode.new(xml_string)}
|
94
|
+
end
|
95
|
+
|
96
|
+
end
|
97
|
+
|
98
|
+
context "when given a Nokogiri::XML::Element" do
|
99
|
+
include_examples "initialize episode" do
|
100
|
+
let(:episode) {Scrapers::RubyTapas::Episode.new(xml_item)}
|
101
|
+
end
|
102
|
+
end
|
103
|
+
|
104
|
+
context "when given a Hash" do
|
105
|
+
include_examples "initialize episode" do
|
106
|
+
let(:episode) do
|
107
|
+
Scrapers::RubyTapas::Episode.new(
|
108
|
+
:number => number,
|
109
|
+
:title => title,
|
110
|
+
:slug => slug,
|
111
|
+
:link => link,
|
112
|
+
:description => description,
|
113
|
+
:guid => guid,
|
114
|
+
:pub_date => pub_date,
|
115
|
+
:file_list => file_list
|
116
|
+
)
|
117
|
+
end
|
118
|
+
end
|
119
|
+
end
|
120
|
+
|
121
|
+
context "when given a list of arguments" do
|
122
|
+
include_examples "initialize episode" do
|
123
|
+
let(:episode) do
|
124
|
+
Scrapers::RubyTapas::Episode.new(
|
125
|
+
number,
|
126
|
+
title,
|
127
|
+
slug,
|
128
|
+
link,
|
129
|
+
description,
|
130
|
+
guid,
|
131
|
+
pub_date,
|
132
|
+
file_list
|
133
|
+
)
|
134
|
+
end
|
135
|
+
end
|
136
|
+
end
|
137
|
+
|
138
|
+
end
|
139
|
+
end
|
140
|
+
|