scrapers 2.1.0 → 3.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (43) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/ChangeLog +7 -0
  4. data/Gemfile +0 -8
  5. data/Guardfile +1 -1
  6. data/bin/rubytapas +2 -75
  7. data/lib/scrapers.rb +1 -3
  8. data/lib/scrapers/manning_books.rb +37 -27
  9. data/lib/scrapers/rubytapas.rb +6 -81
  10. data/lib/scrapers/rubytapas/cli.rb +39 -0
  11. data/lib/scrapers/rubytapas/config.rb +11 -0
  12. data/lib/scrapers/rubytapas/dpdcart.rb +115 -0
  13. data/lib/scrapers/rubytapas/episode.rb +86 -0
  14. data/lib/scrapers/rubytapas/scraper.rb +142 -0
  15. data/lib/scrapers/version.rb +2 -2
  16. data/scrapers.gemspec +4 -1
  17. data/spec/lib/scrapers/rubytapas/dpdcart_spec.rb +68 -0
  18. data/spec/lib/scrapers/rubytapas/episode_spec.rb +140 -0
  19. data/spec/lib/scrapers/rubytapas/rubytapas_spec.rb +87 -0
  20. data/spec/lib/scrapers/rubytapas/scraper_spec.rb +83 -0
  21. data/spec/lib/scrapers/rubytapas/test_data/feed.xml +7038 -0
  22. data/spec/lib/scrapers/{wunderground_spec.rb → wunderground_spec.rb.no} +0 -0
  23. data/spec/scrapers/allrecipes_spec.rb +2 -2
  24. data/spec/scrapers/discoverynews_spec.rb +3 -14
  25. data/spec/scrapers/download_spec.rb +6 -16
  26. data/spec/scrapers/gocomics_spec.rb +3 -3
  27. data/spec/scrapers/imgur_spec.rb +10 -22
  28. data/spec/scrapers/manning_books_spec.rb +9 -6
  29. data/spec/scrapers/nasa_apod_spec.rb +12 -14
  30. data/spec/scrapers/sinfest_spec.rb +3 -3
  31. data/spec/scrapers/xkcd_spec.rb +1 -0
  32. data/spec/scrapers_spec.rb +2 -1
  33. data/spec/spec_helper.rb +1 -8
  34. data/spec/support/dir_helpers.rb +13 -0
  35. data/spec/support/use_vcr.rb +9 -0
  36. data/vcr_cassettes/nasa-apod.yml +348 -0
  37. data/vcr_cassettes/rubytapas-download-1.yml +6726 -0
  38. data/vcr_cassettes/rubytapas-download-all.yml +6726 -0
  39. data/vcr_cassettes/rubytapas_download.yml +982 -0
  40. data/vcr_cassettes/rubytapas_download_twice.yml +1064 -0
  41. data/vcr_cassettes/rubytapas_feed.yml +5880 -0
  42. data/vcr_cassettes/rubytapas_login.yml +849 -0
  43. metadata +74 -6
@@ -0,0 +1,86 @@
1
+ require 'nokogiri'
2
+ require 'date'
3
+ require 'stringex_lite'
4
+
5
module Scrapers
  module RubyTapas

    # Episode holds the data for a single RubyTapas episode: its number,
    # title, slug, link, description, guid, publication date and the list
    # of downloadable files attached to it.
    class Episode

      # A downloadable attachment: the anchor text and the href it points to.
      FileLink = Struct.new :filename, :href

      attr_accessor :number, :title, :link, :description, :guid, :pub_date, :file_list, :slug

      # Builds an episode from one of several argument shapes:
      #
      # - a single String: parsed as XML, and the first child of the
      #   resulting document is treated as the feed item
      # - a single Nokogiri::XML::Element: treated as the feed item
      # - a single Hash: attributes are read from the symbol keys :number,
      #   :title, :slug, :link, :description, :guid, :pub_date, :file_list
      # - two or more positional arguments, in the order number, title,
      #   slug, link, description, guid, pub_date, file_list
      #
      # With no arguments, or with a single argument of any other type, the
      # episode is left with all attributes unset.
      def initialize(*args)
        if args.size == 1
          case args[0]
          when String
            parse_item(Nokogiri::XML.parse(args[0]) { |c| c.noblanks }.children.first)
          when Nokogiri::XML::Element
            parse_item(args[0])
          when Hash
            parse_options(args[0])
          end
        elsif args.size > 1
          assign_from_args(*args)
        end
      end

      # Returns the first word of the title (e.g. "001" for
      # "001 Binary Literals"), or nil when the title is unset or contains
      # no word characters.
      def number_from_title
        # to_s keeps this nil-safe, matching slug_from_title.
        title.to_s.scan(/\w+/).first
      end

      # Returns the title converted with String#to_url (from stringex).
      def slug_from_title
        title.to_s.to_url
      end

      # Returns the FileLink entries found in the description HTML.
      def file_list_from_description
        find_file_list(description)
      end

      private

      # Populates every attribute from a feed item node. number and slug
      # are derived from the title; file_list is derived from the
      # description.
      def parse_item(item)
        self.title = item.xpath("title").text
        self.number = number_from_title
        self.slug = slug_from_title
        self.link = item.xpath("link").text
        self.description = item.xpath("description").text
        self.guid = item.xpath("guid").text
        self.pub_date = DateTime.parse(item.xpath("pubDate").text)
        self.file_list = file_list_from_description
      end

      # Populates every attribute from a Hash with symbol keys. Nothing is
      # derived: missing keys leave the attribute nil.
      def parse_options(options)
        self.number = options[:number]
        self.title = options[:title]
        self.slug = options[:slug]
        self.link = options[:link]
        self.description = options[:description]
        self.guid = options[:guid]
        self.pub_date = options[:pub_date]
        self.file_list = options[:file_list]
      end

      # Populates the attributes from positional arguments. Missing
      # trailing arguments leave the corresponding attributes nil.
      def assign_from_args(*args)
        self.number,
        self.title,
        self.slug,
        self.link,
        self.description,
        self.guid,
        self.pub_date,
        self.file_list =
          *args
      end

      # Collects a FileLink for every anchor in the HTML *content* whose
      # href contains "download?file_id=".
      def find_file_list(content)
        Nokogiri::HTML.parse(content).css("a").
          select { |link| link['href'] =~ /download\?file_id=/ }.
          map { |link| FileLink.new(link_label(link), link['href']) }
      end

      # Text of the anchor's first child node, or an empty string when the
      # anchor has no children (Node#child is nil in that case).
      def link_label(link)
        first_child = link.child
        first_child ? first_child.text : ""
      end
    end

  end
end
@@ -0,0 +1,142 @@
1
+ require 'nokogiri'
2
+ require 'fileutils'
3
+ require 'scrapers/rubytapas/config'
4
+ require 'scrapers/rubytapas/episode'
5
+ require 'scrapers/rubytapas/dpdcart'
6
+
7
module Scrapers
  module RubyTapas

    # Scraper provides the methods to download, extract and build a collection
    # of RubyTapas episodes from the RubyTapas RSS feed.
    class Scraper

      attr_accessor :user, :pw, :destination, :episode_number, :netrc_reader, :dry_run, :debug
      attr_reader :dpdcart

      # *episode_number* is the RubyTapas episode number (note! not the post id!) of the
      # episode to download. If the episode number is the symbol :all, then all episodes
      # will be retrieved. Note that if any of the episodes have been previously retrieved
      # to the destination, i.e., the associated directory already exists, that episode
      # download will be skipped.
      #
      # *options* contains the options from the cli, read with String keys:
      #
      # - "user": the username of the RubyTapas account
      # - "pw": the password of the RubyTapas account
      # - "destination": the root destination of the episode downloads
      #   (defaults to the current working directory)
      # - "dry_run": when truthy, no directories are created and no files are written
      # - "debug": when truthy, progress messages are written with warn
      def initialize(episode_number, options)
        self.episode_number = episode_number
        self.user = options["user"]
        self.pw = options["pw"]
        self.destination = options.fetch("destination", Dir.pwd)
        self.dry_run = options["dry_run"]
        self.debug = options["debug"]
        @dpdcart = Scrapers::RubyTapas::DpdCart.
          new(user, pw, {dry_run: dry_run, debug: debug})
        warn "DEBUG: episode_number: #{episode_number}, options: #{options.inspect}" if debug
      end

      # Perform the scraping operation: log in, then download either every
      # episode or the single requested one. Raises RuntimeError when the
      # requested episode is not in the feed.
      def scrape!
        dpdcart.login!
        if all_episodes?
          episodes.each do |episode|
            begin
              download(episode)
              friendly_pause unless dry_run
            rescue Errno::EEXIST
              # The episode directory already exists; skip it (and the pause).
              warn "Episode previously downloaded. Skipping."
            end
          end
        else
          episode = find_by_episode(episode_number)
          raise "Unknown episode for #{episode_number}" if episode.nil?
          download(episode)
        end
      end

      # Print a list of episodes
      def list!
        with_pager do |pager|
          episodes.each do |episode|
            pager.puts format_episode(episode)
          end
        end
      end

      # Returns the collection of episodes (fetched once, then memoized).
      def episodes
        @episodes ||= fetch_episodes
      end

      # Retrieves the episode associated with *episode number*, or nil when
      # there is none. Besides an exact match, purely numeric values are
      # compared by integer value, so "1" and 1 both find episode "001".
      def find_by_episode(episode_number)
        episodes.detect { |e| same_episode_number?(e.number, episode_number) }
      end

      private

      # True when *candidate* and *wanted* are equal, have equal string
      # forms, or are both all-digit and denote the same integer.
      def same_episode_number?(candidate, wanted)
        return true if candidate == wanted
        left = candidate.to_s
        right = wanted.to_s
        return true if left == right
        [left, right].all? { |s| s =~ /\A\d+\z/ } && left.to_i == right.to_i
      end

      def all_episodes?
        episode_number.to_s.downcase.to_sym == :all
      end

      # Builds a collection of all the episodes listed in the feed
      def fetch_episodes
        feed = Nokogiri::XML.parse(dpdcart.feed!)
        feed.xpath("//item").map do |item|
          Episode.new(item)
        end
      end

      # Creates the episode's directory and downloads each attached file into it.
      def download(episode)
        download_directory = make_download_directory(episode.slug)
        episode.file_list.each do |file|
          download_file(download_directory, file.href)
        end
      end

      # Fetches *url* and, unless this is a dry run, writes the body into *dir*.
      def download_file(dir, url)
        warn "fetching #{url}" if debug
        name, body = dpdcart.download!(url)
        # The file name comes back from the remote download; keep only its
        # final path component so it cannot point outside of dir.
        File.binwrite(File.join(dir, File.basename(name)), body) unless dry_run
        warn "saved #{name} to #{dir}" if debug
      end

      # Creates and returns the directory for *slug* beneath the destination.
      # Raises Errno::EEXIST when it already exists, which scrape! treats as
      # "previously downloaded". In a dry run nothing is created and a
      # placeholder string is returned.
      def make_download_directory(slug)
        dir = File.join(File.realpath(destination), slug)
        warn "Downloading to #{dir}" if debug
        return "no dir for dry run" if dry_run
        FileUtils.mkdir(dir).first
      end

      # Sleeps for *delay* seconds, printing one dot per second.
      def friendly_pause(delay=5)
        puts
        print "Sleeping #{delay} seconds"
        delay.downto(1) { sleep 1; print "." }
        puts "\n"
      end

      # Yields a writable pipe to the "more" pager and always returns nil.
      # The block form of IO.popen closes the pipe even if the block raises.
      def with_pager(&block)
        raise "Must be called with block" unless block_given?
        IO.popen("more", "w") { |pager| yield pager }
        nil
      rescue Errno::EPIPE
        # The pager exited before all output was written (e.g. the user quit).
        nil
      end

      def format_episode(episode)
        "%-5s\t%-40s\t%-15s" % [episode.number, episode.title, episode.pub_date.strftime("%Y-%b-%d")]
      end

    end
  end
end
@@ -1,8 +1,8 @@
1
1
  module Scrapers
2
2
  module Version
3
3
 
4
- MAJOR = 2
5
- MINOR = 1
4
+ MAJOR = 3
5
+ MINOR = 0
6
6
  BUILD = 0
7
7
 
8
8
  end
@@ -24,6 +24,7 @@ Gem::Specification.new do |spec|
24
24
  spec.add_dependency "activesupport", "~> 4.1"
25
25
  spec.add_dependency "highline"
26
26
  spec.add_dependency "awesome_print"
27
+ spec.add_dependency "stringex"
27
28
 
28
29
  spec.add_development_dependency "bundler"
29
30
  spec.add_development_dependency "rake"
@@ -32,5 +33,7 @@ Gem::Specification.new do |spec|
32
33
  spec.add_development_dependency "guard-rspec"
33
34
  spec.add_development_dependency "webmock"
34
35
  spec.add_development_dependency "vcr"
35
-
36
+ spec.add_development_dependency "pry"
37
+ spec.add_development_dependency "pry-byebug"
38
+
36
39
  end
@@ -0,0 +1,68 @@
1
+ require 'spec_helper'
2
+ require 'support/use_vcr'
3
+ require 'scrapers/rubytapas/dpdcart'
4
+ require 'nokogiri'
5
+
6
describe Scrapers::RubyTapas::DpdCart do
  let(:gateway) { described_class.new }

  # Runs the block inside the named VCR cassette, recording new episodes and
  # matching requests on method, host and path plus any extra matchers given.
  def with_cassette(cassette, *extra_matchers, &example)
    VCR.use_cassette(cassette,
                     record: :new_episodes,
                     match_requests_on: [:method, :host, :path] + extra_matchers,
                     &example)
  end

  describe "method signatures" do
    it { is_expected.to respond_to(:feed!) }
    it { is_expected.to respond_to(:login!) }
    it { is_expected.to respond_to(:download!) }
  end

  describe "#feed!" do
    it "returns an rss feed" do
      with_cassette('rubytapas_feed') do
        parsed = Nokogiri::XML.parse(gateway.feed!)
        expect(parsed).to be_a(Nokogiri::XML::Document)
      end
    end
  end

  describe "#login!" do
    it "shows the subscriber content page" do
      with_cassette('rubytapas_login') do
        page_title = gateway.login!.page.title
        expect(page_title).to eq("Subscription Content | RubyTapas")
      end
    end
  end

  describe "#download!" do
    let(:file)  { "https://rubytapas.dpdcart.com/subscriber/download?file_id=26" }
    let(:name)  { "001-binary-literals.html" }

    let(:file2) { "https://rubytapas.dpdcart.com/subscriber/download?file_id=27" }
    let(:name2) { "001-binary-literals.rb" }

    it "returns the downloaded file" do
      with_cassette('rubytapas_download', :query) do
        gateway.login!
        fetched_name, fetched_body = gateway.download! file
        expect(fetched_name).to eq(name)
        expect(fetched_body.size).to eq(5744)
      end
    end

    it "can download multiple files with a single login" do
      with_cassette('rubytapas_download_twice', :query) do
        gateway.login!
        # Only the returned file names matter here; the bodies are ignored.
        first_name, _first_body = gateway.download! file
        expect(first_name).to eq(name)
        second_name, _second_body = gateway.download! file2
        expect(second_name).to eq(name2)
      end
    end
  end
end
@@ -0,0 +1,140 @@
1
+ require 'spec_helper'
2
+ require 'scrapers/rubytapas/episode'
3
+
4
describe Scrapers::RubyTapas::Episode do

  # Expected attribute values for episode 001, shared by every context below.
  let(:number) { "001" }
  let(:title) { "001 Binary Literals" }
  let(:slug) { "001-binary-literals" }
  let(:link) { "https://rubytapas.dpdcart.com/subscriber/post?id=18" }
  let(:description) do
    <<-DESC
<div class="blog-entry">
<div class="blog-content"><p>In this inaugural episode, a look at a handy syntax for writing out binary numbers.</p>
</div>
<h3>Attached Files</h3>
<ul>
<li><a href="https://rubytapas.dpdcart.com/subscriber/download?file_id=25">RubyTapas001.mp4</a></li>
<li><a href="https://rubytapas.dpdcart.com/subscriber/download?file_id=26">001-binary-literals.html</a></li>
<li><a href="https://rubytapas.dpdcart.com/subscriber/download?file_id=27">001-binary-literals.rb</a></li>
</ul></div>
    DESC
  end
  let(:guid) { "dpd-89e8004c8242e7ad548833bef1e18a5b575c92c1" }
  let(:pub_date) { DateTime.new(2012,9,24,9,0,0,'-4') }
  let(:file_list) do
    [
      ["RubyTapas001.mp4", "https://rubytapas.dpdcart.com/subscriber/download?file_id=25"],
      ["001-binary-literals.html", "https://rubytapas.dpdcart.com/subscriber/download?file_id=26"],
      ["001-binary-literals.rb", "https://rubytapas.dpdcart.com/subscriber/download?file_id=27"]
    ].map do |filename, href|
      Scrapers::RubyTapas::Episode::FileLink.new(filename, href)
    end
  end

  # A single feed <item> as raw XML, carrying the same values as the lets above.
  let(:xml_string) do
    <<-ITEM
<item>
<title><![CDATA[001 Binary Literals]]></title>
<link>https://rubytapas.dpdcart.com/subscriber/post?id=18</link>
<description><![CDATA[<div class="blog-entry">
<div class="blog-content"><p>In this inaugural episode, a look at a handy syntax for writing out binary numbers.</p>
</div>
<h3>Attached Files</h3>
<ul>
<li><a href="https://rubytapas.dpdcart.com/subscriber/download?file_id=25">RubyTapas001.mp4</a></li>
<li><a href="https://rubytapas.dpdcart.com/subscriber/download?file_id=26">001-binary-literals.html</a></li>
<li><a href="https://rubytapas.dpdcart.com/subscriber/download?file_id=27">001-binary-literals.rb</a></li>
</ul></div>]]></description>
<guid isPermaLink="false">dpd-89e8004c8242e7ad548833bef1e18a5b575c92c1</guid>
<pubDate>Mon, 24 Sep 2012 09:00:00 -0400</pubDate>
<enclosure url="https://rubytapas.dpdcart.com/feed/download/25/RubyTapas001.mp4" length="12502397" type="video/mp4"/>
<itunes:image href="https://getdpd.com/uploads/ruby-tapas.png"/>
</item>
    ITEM
  end

  # The same item, already parsed into a Nokogiri node.
  let(:xml_item) do
    Nokogiri::XML.parse(xml_string) { |config| config.noblanks }.children.first
  end

  describe "#initialize" do

    # Every construction style must yield an episode with identical attributes.
    shared_examples "initialize episode" do
      it "is episode number 001" do
        expect(episode.number).to eq(number)
      end

      it "is episode '001 Binary Literals'" do
        expect(episode.title).to eq(title)
      end

      it "has slug" do
        expect(episode.slug).to eq(slug)
      end

      it "has link" do
        expect(episode.link).to eq(link)
      end

      it "has guid" do
        expect(episode.guid).to eq(guid)
      end

      it "has publication date" do
        expect(episode.pub_date).to eq(pub_date)
      end

      it "has file list" do
        expect(episode.file_list).to match_array(file_list)
      end
    end

    context "when given an xml string" do
      include_examples "initialize episode" do
        let(:episode) { described_class.new(xml_string) }
      end
    end

    context "when given a Nokogiri::XML::Element" do
      include_examples "initialize episode" do
        let(:episode) { described_class.new(xml_item) }
      end
    end

    context "when given a Hash" do
      include_examples "initialize episode" do
        let(:episode) do
          attributes = {
            number: number,
            title: title,
            slug: slug,
            link: link,
            description: description,
            guid: guid,
            pub_date: pub_date,
            file_list: file_list
          }
          described_class.new(attributes)
        end
      end
    end

    context "when given a list of arguments" do
      include_examples "initialize episode" do
        let(:episode) do
          positional = [number, title, slug, link, description, guid, pub_date, file_list]
          described_class.new(*positional)
        end
      end
    end

  end
end
140
+