scrapers 2.1.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/ChangeLog +7 -0
  4. data/Gemfile +0 -8
  5. data/Guardfile +1 -1
  6. data/bin/rubytapas +2 -75
  7. data/lib/scrapers.rb +1 -3
  8. data/lib/scrapers/manning_books.rb +37 -27
  9. data/lib/scrapers/rubytapas.rb +6 -81
  10. data/lib/scrapers/rubytapas/cli.rb +39 -0
  11. data/lib/scrapers/rubytapas/config.rb +11 -0
  12. data/lib/scrapers/rubytapas/dpdcart.rb +115 -0
  13. data/lib/scrapers/rubytapas/episode.rb +86 -0
  14. data/lib/scrapers/rubytapas/scraper.rb +142 -0
  15. data/lib/scrapers/version.rb +2 -2
  16. data/scrapers.gemspec +4 -1
  17. data/spec/lib/scrapers/rubytapas/dpdcart_spec.rb +68 -0
  18. data/spec/lib/scrapers/rubytapas/episode_spec.rb +140 -0
  19. data/spec/lib/scrapers/rubytapas/rubytapas_spec.rb +87 -0
  20. data/spec/lib/scrapers/rubytapas/scraper_spec.rb +83 -0
  21. data/spec/lib/scrapers/rubytapas/test_data/feed.xml +7038 -0
  22. data/spec/lib/scrapers/{wunderground_spec.rb → wunderground_spec.rb.no} +0 -0
  23. data/spec/scrapers/allrecipes_spec.rb +2 -2
  24. data/spec/scrapers/discoverynews_spec.rb +3 -14
  25. data/spec/scrapers/download_spec.rb +6 -16
  26. data/spec/scrapers/gocomics_spec.rb +3 -3
  27. data/spec/scrapers/imgur_spec.rb +10 -22
  28. data/spec/scrapers/manning_books_spec.rb +9 -6
  29. data/spec/scrapers/nasa_apod_spec.rb +12 -14
  30. data/spec/scrapers/sinfest_spec.rb +3 -3
  31. data/spec/scrapers/xkcd_spec.rb +1 -0
  32. data/spec/scrapers_spec.rb +2 -1
  33. data/spec/spec_helper.rb +1 -8
  34. data/spec/support/dir_helpers.rb +13 -0
  35. data/spec/support/use_vcr.rb +9 -0
  36. data/vcr_cassettes/nasa-apod.yml +348 -0
  37. data/vcr_cassettes/rubytapas-download-1.yml +6726 -0
  38. data/vcr_cassettes/rubytapas-download-all.yml +6726 -0
  39. data/vcr_cassettes/rubytapas_download.yml +982 -0
  40. data/vcr_cassettes/rubytapas_download_twice.yml +1064 -0
  41. data/vcr_cassettes/rubytapas_feed.yml +5880 -0
  42. data/vcr_cassettes/rubytapas_login.yml +849 -0
  43. metadata +74 -6
@@ -0,0 +1,86 @@
1
+ require 'nokogiri'
2
+ require 'date'
3
+ require 'stringex_lite'
4
+
5
module Scrapers
  module RubyTapas

    # Episode holds the attributes of a single RubyTapas episode.
    #
    # An episode can be constructed from:
    #
    # - a String holding the XML of one feed <item>
    # - a Nokogiri::XML::Element for one feed <item>
    # - a Hash keyed by attribute name (symbol keys)
    # - a positional list, in the order: number, title, slug, link,
    #   description, guid, pub_date, file_list
    #
    # Called with no argument, or with a single argument of any other type,
    # the episode is created with every attribute left unset (nil).
    class Episode

      # One downloadable attachment: the link text and the URL it points to.
      FileLink = Struct.new :filename, :href

      attr_accessor :number, :title, :link, :description, :guid, :pub_date, :file_list, :slug

      def initialize(*args)
        if args.size == 1
          source = args.first
          case source
          when String
            parse_item(Nokogiri::XML.parse(source) { |config| config.noblanks }.children.first)
          when Hash
            parse_options(source)
          when Nokogiri::XML::Element
            parse_item(source)
          end
          # Any other single argument is ignored on purpose: the episode
          # simply starts out empty.
        elsif args.size > 1
          assign_from_args(*args)
        end
      end

      # Returns the first run of word characters in the title (for a title
      # such as "001 Binary Literals" that is "001"), or nil when the title
      # is unset or contains no word characters.
      def number_from_title
        title.to_s[/\w+/]
      end

      # Returns the title converted with String#to_url (an unset title is
      # treated as an empty string).
      def slug_from_title
        title.to_s.to_url
      end

      # Returns the FileLink entries found in the episode description.
      def file_list_from_description
        find_file_list(description)
      end

      private

      # Populates every attribute from a feed <item> node. The number and
      # slug are derived from the title; the file list is derived from the
      # description markup.
      def parse_item(item)
        self.title = item.xpath("title").text
        self.number = number_from_title
        self.slug = slug_from_title
        self.link = item.xpath("link").text
        self.description = item.xpath("description").text
        self.guid = item.xpath("guid").text
        self.pub_date = DateTime.parse(item.xpath("pubDate").text)
        self.file_list = file_list_from_description
      end

      # Populates every attribute from a Hash with symbol keys. Nothing is
      # derived: missing keys leave the attribute nil.
      def parse_options(options)
        self.number = options[:number]
        self.title = options[:title]
        self.slug = options[:slug]
        self.link = options[:link]
        self.description = options[:description]
        self.guid = options[:guid]
        self.pub_date = options[:pub_date]
        self.file_list = options[:file_list]
      end

      # Populates the attributes from a positional list. Missing trailing
      # values become nil; surplus values are dropped.
      def assign_from_args(*args)
        self.number,
        self.title,
        self.slug,
        self.link,
        self.description,
        self.guid,
        self.pub_date,
        self.file_list =
          *args
      end

      # Parses +content+ as HTML and returns a FileLink for every anchor
      # whose href contains "download?file_id=".
      #
      # The anchor's full text is used as the file name, so an anchor without
      # any child node yields an empty name instead of raising, and an unset
      # description yields an empty list.
      def find_file_list(content)
        Nokogiri::HTML.parse(content.to_s).css("a").
          select { |anchor| anchor['href'] =~ /download\?file_id=/ }.
          map { |anchor| FileLink.new(anchor.text, anchor['href']) }
      end
    end

  end
end
@@ -0,0 +1,142 @@
1
+ require 'nokogiri'
2
+ require 'fileutils'
3
+ require 'scrapers/rubytapas/config'
4
+ require 'scrapers/rubytapas/episode'
5
+ require 'scrapers/rubytapas/dpdcart'
6
+
7
module Scrapers
  module RubyTapas

    # Scraper provides the methods to download, extract and build a collection
    # of RubyTapas episodes from the RubyTapas RSS feed.
    class Scraper

      attr_accessor :user, :pw, :destination, :episode_number, :netrc_reader, :dry_run, :debug
      attr_reader :dpdcart

      # *episode_number* is the RubyTapas episode number (note: not the post id)
      # of the episode to download. If the episode number is :all (or the
      # string "all", in any letter case), every episode is retrieved. When
      # retrieving all episodes, an episode whose directory already exists in
      # the destination is skipped.
      #
      # *options* contains the options from the cli, read with string keys:
      #
      # - "user": the username of the RubyTapas account
      # - "pw": the password of the RubyTapas account
      # - "destination": the root destination of the episode downloads
      #   (defaults to the current working directory)
      # - "dry_run": when truthy, nothing is written to disk
      # - "debug": when truthy, progress is reported with warn
      def initialize(episode_number, options)
        self.episode_number = episode_number
        self.user = options["user"]
        self.pw = options["pw"]
        self.destination = options.fetch("destination", Dir.pwd)
        self.dry_run = options["dry_run"]
        self.debug = options["debug"]
        @dpdcart = Scrapers::RubyTapas::DpdCart.
          new(user, pw, {dry_run: dry_run, debug: debug})
        warn "DEBUG: episode_number: #{episode_number}, options: #{options.inspect}" if debug
      end

      # Perform the scraping operation: log in, then download either every
      # episode or the single requested one.
      #
      # Raises RuntimeError when a single episode is requested and no episode
      # with that number is found.
      def scrape!
        dpdcart.login!
        if all_episodes?
          episodes.each do |episode|
            begin
              download(episode)
              friendly_pause unless dry_run
            rescue Errno::EEXIST
              # The episode directory is already there, so mkdir failed.
              warn "Episode previously downloaded. Skipping."
            end
          end
        else
          episode = find_by_episode(episode_number)
          raise "Unknown episode for #{episode_number}" if episode.nil?
          download(episode)
        end
      end

      # Print a list of episodes through a pager.
      def list!
        with_pager do |pager|
          episodes.each do |episode|
            pager.puts format_episode(episode)
          end
        end
      end

      # Returns the collection of episodes. The feed is fetched once and the
      # result is memoized.
      def episodes
        @episodes ||= fetch_episodes
      end

      # Retrieves the episode associated with *episode number*, or nil.
      # The comparison is an exact == against Episode#number.
      def find_by_episode(episode_number)
        episodes.find { |episode| episode.number == episode_number }
      end

      private

      # True when the requested episode number is "all" in any letter case
      # (String or Symbol). Compared as a string so that caller-supplied
      # input is never interned as a symbol.
      def all_episodes?
        episode_number.to_s.downcase == "all"
      end

      # Builds a collection of all the episodes listed in the feed
      def fetch_episodes
        feed = Nokogiri::XML.parse(dpdcart.feed!)
        feed.xpath("//item").map do |item|
          Episode.new(item)
        end
      end

      # Creates the episode directory (named after the slug) and downloads
      # every file of the episode into it.
      def download(episode)
        download_directory = make_download_directory(episode.slug)
        episode.file_list.each do |file|
          download_file(download_directory, file.href)
        end
      end

      # Fetches *url* through the gateway and, unless this is a dry run,
      # writes the body into *dir* under the name the gateway returned.
      def download_file(dir, url)
        warn "fetching #{url}" if debug
        name, body = dpdcart.download!(url)
        unless dry_run
          # NOTE(review): the name comes back from dpdcart.download! and is
          # presumably derived from the remote response. Only its final path
          # component is used so a value such as "../x" cannot be written
          # outside of dir.
          File.binwrite(File.join(dir, File.basename(name)), body)
        end
        warn "saved #{name} to #{dir}" if debug
      end

      # Creates <destination>/<slug> and returns its path. In a dry run no
      # directory is created and a placeholder string is returned.
      #
      # Raises Errno::EEXIST when the directory already exists.
      def make_download_directory(slug)
        dir = File.join(File.realpath(destination), slug)
        warn "Downloading to #{dir}" if debug
        if dry_run
          "no dir for dry run"
        else
          FileUtils.mkdir(dir).first
        end
      end

      # Sleeps *delay* seconds, printing one dot per second.
      def friendly_pause(delay=5)
        puts
        print "Sleeping #{delay} seconds"
        delay.downto(1) { sleep 1; print "." }
        puts "\n"
      end

      # Yields a writable pipe connected to the `more` command.
      #
      # IO.popen's block form closes the pipe when the block finishes, also
      # when the block raises, so the pager process is never left dangling.
      def with_pager(&block)
        raise "Must be called with block" unless block_given?
        IO.popen("more", "w") { |pager| yield pager }
        nil
      end

      # One listing line: number, title and publication date, tab separated
      # and left-justified.
      def format_episode(episode)
        "%-5s\t%-40s\t%-15s" % [episode.number, episode.title, episode.pub_date.strftime("%Y-%b-%d")]
      end

    end
  end
end
@@ -1,8 +1,8 @@
1
1
  module Scrapers
2
2
  module Version
3
3
 
4
- MAJOR = 2
5
- MINOR = 1
4
+ MAJOR = 3
5
+ MINOR = 0
6
6
  BUILD = 0
7
7
 
8
8
  end
@@ -24,6 +24,7 @@ Gem::Specification.new do |spec|
24
24
  spec.add_dependency "activesupport", "~> 4.1"
25
25
  spec.add_dependency "highline"
26
26
  spec.add_dependency "awesome_print"
27
+ spec.add_dependency "stringex"
27
28
 
28
29
  spec.add_development_dependency "bundler"
29
30
  spec.add_development_dependency "rake"
@@ -32,5 +33,7 @@ Gem::Specification.new do |spec|
32
33
  spec.add_development_dependency "guard-rspec"
33
34
  spec.add_development_dependency "webmock"
34
35
  spec.add_development_dependency "vcr"
35
-
36
+ spec.add_development_dependency "pry"
37
+ spec.add_development_dependency "pry-byebug"
38
+
36
39
  end
@@ -0,0 +1,68 @@
1
+ require 'spec_helper'
2
+ require 'support/use_vcr'
3
+ require 'scrapers/rubytapas/dpdcart'
4
+ require 'nokogiri'
5
+
6
describe Scrapers::RubyTapas::DpdCart do
  let(:gateway) { described_class.new }

  # Runs the given block inside the named VCR cassette. Requests are always
  # matched on method, host and path; extra matchers (e.g. :query) can be
  # appended per example.
  def within_cassette(cassette_name, *extra_matchers, &example)
    VCR.use_cassette(cassette_name,
                     record: :new_episodes,
                     match_requests_on: [:method, :host, :path] + extra_matchers,
                     &example)
  end

  describe "method signatures" do
    [:feed!, :login!, :download!].each do |gateway_method|
      it { is_expected.to respond_to(gateway_method) }
    end
  end

  describe "#feed!" do
    it "returns an rss feed" do
      within_cassette('rubytapas_feed') do
        parsed_feed = Nokogiri::XML.parse(gateway.feed!)
        expect(parsed_feed).to be_a(Nokogiri::XML::Document)
      end
    end
  end

  describe "#login!" do
    it "shows the subscriber content page" do
      within_cassette('rubytapas_login') do
        landing_title = gateway.login!.page.title
        expect(landing_title).to eq("Subscription Content | RubyTapas")
      end
    end
  end

  describe "#download!" do
    # Two attachments of episode 001 and the file names expected for them.
    let(:file)  { "https://rubytapas.dpdcart.com/subscriber/download?file_id=26" }
    let(:name)  { "001-binary-literals.html" }

    let(:file2) { "https://rubytapas.dpdcart.com/subscriber/download?file_id=27" }
    let(:name2) { "001-binary-literals.rb" }

    it "returns the downloaded file" do
      within_cassette('rubytapas_download', :query) do
        gateway.login!
        fetched_name, fetched_body = gateway.download!(file)
        expect(fetched_name).to eq(name)
        expect(fetched_body.size).to eq(5744)
      end
    end

    it "can download multiple files with a single login" do
      within_cassette('rubytapas_download_twice', :query) do
        gateway.login!

        first_name, _first_body = gateway.download!(file)
        expect(first_name).to eq(name)

        second_name, _second_body = gateway.download!(file2)
        expect(second_name).to eq(name2)
      end
    end
  end
end
@@ -0,0 +1,140 @@
1
+ require 'spec_helper'
2
+ require 'scrapers/rubytapas/episode'
3
+
4
describe Scrapers::RubyTapas::Episode do

  # Expected attribute values for episode 001; the XML fixture further down
  # carries the same data.
  let(:number) { "001" }
  let(:title)  { "001 Binary Literals" }
  let(:slug)   { "001-binary-literals" }
  let(:link)   { "https://rubytapas.dpdcart.com/subscriber/post?id=18" }
  let(:description) do
    <<-DESC
<div class="blog-entry">
<div class="blog-content"><p>In this inaugural episode, a look at a handy syntax for writing out binary numbers.</p>
</div>
<h3>Attached Files</h3>
<ul>
<li><a href="https://rubytapas.dpdcart.com/subscriber/download?file_id=25">RubyTapas001.mp4</a></li>
<li><a href="https://rubytapas.dpdcart.com/subscriber/download?file_id=26">001-binary-literals.html</a></li>
<li><a href="https://rubytapas.dpdcart.com/subscriber/download?file_id=27">001-binary-literals.rb</a></li>
</ul></div>
    DESC
  end
  let(:guid)     { "dpd-89e8004c8242e7ad548833bef1e18a5b575c92c1" }
  let(:pub_date) { DateTime.new(2012,9,24,9,0,0,'-4') }
  let(:file_list) do
    file_link = described_class::FileLink
    [
      file_link.new("RubyTapas001.mp4", "https://rubytapas.dpdcart.com/subscriber/download?file_id=25"),
      file_link.new("001-binary-literals.html", "https://rubytapas.dpdcart.com/subscriber/download?file_id=26"),
      file_link.new("001-binary-literals.rb", "https://rubytapas.dpdcart.com/subscriber/download?file_id=27")
    ]
  end

  # A single feed <item> as raw XML.
  let(:xml_string) do
    <<-ITEM
<item>
<title><![CDATA[001 Binary Literals]]></title>
<link>https://rubytapas.dpdcart.com/subscriber/post?id=18</link>
<description><![CDATA[<div class="blog-entry">
<div class="blog-content"><p>In this inaugural episode, a look at a handy syntax for writing out binary numbers.</p>
</div>
<h3>Attached Files</h3>
<ul>
<li><a href="https://rubytapas.dpdcart.com/subscriber/download?file_id=25">RubyTapas001.mp4</a></li>
<li><a href="https://rubytapas.dpdcart.com/subscriber/download?file_id=26">001-binary-literals.html</a></li>
<li><a href="https://rubytapas.dpdcart.com/subscriber/download?file_id=27">001-binary-literals.rb</a></li>
</ul></div>]]></description>
<guid isPermaLink="false">dpd-89e8004c8242e7ad548833bef1e18a5b575c92c1</guid>
<pubDate>Mon, 24 Sep 2012 09:00:00 -0400</pubDate>
<enclosure url="https://rubytapas.dpdcart.com/feed/download/25/RubyTapas001.mp4" length="12502397" type="video/mp4"/>
<itunes:image href="https://getdpd.com/uploads/ruby-tapas.png"/>
</item>
    ITEM
  end

  # The same <item>, already parsed into a node.
  let(:xml_item) do
    parsed = Nokogiri::XML.parse(xml_string) { |config| config.noblanks }
    parsed.children.first
  end

  describe "#initialize" do

    # Every construction style must produce the same attribute values.
    shared_examples "initialize episode" do
      it "is episode number 001" do
        expect(episode.number).to eq(number)
      end

      it "is episode '001 Binary Literals'" do
        expect(episode.title).to eq(title)
      end

      # Generates the "has slug", "has link" and "has guid" examples.
      [:slug, :link, :guid].each do |attribute|
        it "has #{attribute}" do
          expect(episode.send(attribute)).to eq(send(attribute))
        end
      end

      it "has publication date" do
        expect(episode.pub_date).to eq(pub_date)
      end

      it "has file list" do
        expect(episode.file_list).to match_array(file_list)
      end
    end

    context "when given an xml string" do
      include_examples "initialize episode" do
        let(:episode) { described_class.new(xml_string) }
      end
    end

    context "when given a Nokogiri::XML::Element" do
      include_examples "initialize episode" do
        let(:episode) { described_class.new(xml_item) }
      end
    end

    context "when given a Hash" do
      include_examples "initialize episode" do
        let(:episode) do
          attributes = {
            number: number,
            title: title,
            slug: slug,
            link: link,
            description: description,
            guid: guid,
            pub_date: pub_date,
            file_list: file_list
          }
          described_class.new(attributes)
        end
      end
    end

    context "when given a list of arguments" do
      include_examples "initialize episode" do
        let(:episode) do
          positional = [number, title, slug, link, description, guid, pub_date, file_list]
          described_class.new(*positional)
        end
      end
    end

  end
end
140
+