scrapers 1.5.1 → 1.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/rubytapas +29 -0
- data/lib/scrapers.rb +8 -1
- data/lib/scrapers/rubytapas.rb +32 -10
- data/lib/scrapers/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 86724e63b99e28cc9e82a1c6806dce93515f9e3e
|
4
|
+
data.tar.gz: 7f1789114ac38d02a006e99bfb00294fa8c3c5b1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c60d23ca256369982c57e8ad8099403042f411d9989694b3f9d32e65fa1307ba6452bf64d93b895059a457c44916aa2c90149903e075bdb76e0fe33fe640c0e5
|
7
|
+
data.tar.gz: 692ca67d599d2e846dc191827c5f77199f4c2de5d5b5415243995569f153681b80c72a72d910716af92d10c76c87775e705b3d0bc5b76191382092196ace3a70
|
data/bin/rubytapas
CHANGED
@@ -41,6 +41,35 @@ class RubyTapasDownload < Thor
|
|
41
41
|
Scrapers::RubyTapas.scrape url, user, pw, destination
|
42
42
|
end
|
43
43
|
|
44
|
+
# Thor command `all`: fetch the list of episode URLs from the show list page
# and scrape each one into the destination directory.
#
# Credentials are read from the netrc entry for RUBYTAPAS and can be
# overridden with the --user / --password options.
# NOTE(review): RUBYTAPAS is defined outside this block; presumably it is the
# dpdcart host name used as the .netrc machine key -- confirm.
#
# Raises RuntimeError ("Must give url") when no show list url is available.
desc "all", "Download all rubytapas episodes"
method_option :destination, :aliases => %w{-d --dest}, :desc => "Destination of dowload", :default => '.'
method_option :url, :desc => "url of showlist", :default => 'https://rubytapas.dpdcart.com/subscriber/content'
method_option :user, :aliases => %w{-u -U}, :desc => "dpdcart user. Default is read from $HOME/.netrc"
method_option :password, :aliases => %w{-p -pw}, :desc => "dpdcart password. Default is read from $HOME/.netrc"

def all
  # Echo the options for diagnostics, but never the password: the raw hash
  # would put it on stderr in clear text.
  STDERR.puts options.reject { |name, _| name.to_s == "password" }.inspect

  netrc = Netrc.read
  user, pw = netrc[RUBYTAPAS]
  user = options.fetch("user", user)
  pw = options.fetch("password", pw)
  url = options.fetch("url", nil)
  destination = options.fetch("destination", nil)
  # pw is nil when neither .netrc nor --password supplies one; to_s keeps this
  # diagnostic line from raising NoMethodError before the real validation runs.
  STDERR.puts "destination: #{destination}, url: #{url}, user: #{user}, pw: #{pw.to_s.length}"
  raise "Must give url" unless url

  showlist_urls = Scrapers::RubyTapas.showlist(url, user, pw)

  # The block parameter is deliberately not called `url`, so it does not
  # shadow the show list url above.
  showlist_urls.each do |episode_url|
    Scrapers::RubyTapas.scrape episode_url, user, pw, destination
    # Pause five seconds after each episode before fetching the next one.
    print "pausing..."
    sleep 5
    puts "."
  end

end
|
72
|
+
|
44
73
|
end
|
45
74
|
|
46
75
|
RubyTapasDownload.start
|
data/lib/scrapers.rb
CHANGED
@@ -1,9 +1,16 @@
|
|
1
1
|
require 'mechanize'
|
2
|
-
|
2
|
+
require 'uri'
|
3
3
|
Dir[File.join(File.expand_path('../', __FILE__),'**','*.rb')].each {|file| require file}
|
4
4
|
|
5
5
|
# Shared helpers for the individual scrapers.
module Scrapers
  # Returns a Mechanize agent, created on first use and memoized in @agent.
  def self.agent
    @agent ||= Mechanize.new
  end

  # Returns the base of +url+ as a String: the URL with its path, query
  # string and fragment removed (scheme, userinfo, host and port are kept).
  #
  # Clearing only the path would leave the query/fragment glued onto the
  # host (e.g. "https://host?x=1"), which is not a usable prefix for
  # building other URLs.
  #
  # Raises URI::InvalidURIError when +url+ cannot be parsed.
  def self.base(url)
    uri = URI.parse(url)
    uri.path = ''
    uri.query = nil
    uri.fragment = nil
    uri.to_s
  end

end
|
data/lib/scrapers/rubytapas.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
require 'fileutils'
|
2
2
|
require 'ostruct'
|
3
3
|
require 'mechanize'
|
4
|
+
require 'uri'
|
4
5
|
|
5
6
|
module Scrapers
|
6
7
|
|
@@ -32,20 +33,13 @@ module Scrapers
|
|
32
33
|
|
33
34
|
tapas = OpenStruct.new
|
34
35
|
|
35
|
-
|
36
|
-
m.get url
|
37
|
-
m.current_page.form.field_with(:name => "username").value = user
|
38
|
-
m.current_page.form.field_with(:name => "password").value = pw
|
39
|
-
m.current_page.form.submit
|
40
|
-
|
41
|
-
# Second time, we should land on episode page
|
42
|
-
m.get url
|
43
|
-
raise "Not where I expected. #{m.current_page.uri} is not #{url}" unless m.current_page.uri != url
|
36
|
+
m = self.login(m, url, user, pw)
|
44
37
|
|
45
38
|
m.current_page.tap do |page|
|
46
39
|
tapas.title = page.title.strip
|
47
40
|
tapas.episode_dir = File.join(dest,tapas.title.split("|").first.strip.downcase.gsub(%r{\s+},'-'))
|
48
41
|
tapas.attachments = page.links_with(:href => %r{\bdownload\b})
|
42
|
+
puts "Fetching and saving #{tapas.title} into #{tapas.episode_dir}"
|
49
43
|
FileUtils.mkdir(tapas.episode_dir)
|
50
44
|
Dir.chdir(tapas.episode_dir) do |dir|
|
51
45
|
tapas.attachments.each do |att|
|
@@ -61,6 +55,34 @@ module Scrapers
|
|
61
55
|
|
62
56
|
end
|
63
57
|
end
|
58
|
+
|
59
|
+
# retrieve a list of URLs for shows from the showlist
|
60
|
+
# Retrieve the list of episode URLs from the show list page.
#
# Logs in via self.login, collects every link whose text is "Read More" and
# returns their targets as absolute URL strings. Each href is resolved
# against +showlist_url+, so site-relative ("/subscriber/post?id=1"),
# document-relative and already-absolute hrefs all produce valid URLs.
# Links without an href are skipped.
#
# Raises RuntimeError when the url, user or password is missing or empty.
def self.showlist(showlist_url, user=nil, pw=nil)
  if [showlist_url, user, pw].any? { |arg| arg.to_s.empty? }
    raise "Must give showlist url, user, and password"
  end

  Mechanize.start do |m|
    m = self.login(m, showlist_url, user, pw)
    hrefs = m.current_page.links_with(:text => "Read More").map { |link| link.href }
    hrefs.compact.map { |href| URI.join(showlist_url.to_s, href).to_s }
  end
end
|
73
|
+
|
74
|
+
# Log the agent +m+ in and leave it positioned on +url+.
#
# m    - agent responding to #get and #current_page (a Mechanize agent in
#        the callers visible in this file)
# url  - page to request before and after logging in
# user - value written to the form field named "username"
# pw   - value written to the form field named "password"
#
# Returns +m+. Raises RuntimeError when, after submitting the form, the agent
# does not end up on +url+.
def self.login(m, url, user, pw)
  # First time, we will get redirected to the login page
  m.get url
  form = m.current_page.form
  form.field_with(:name => "username").value = user
  form.field_with(:name => "password").value = pw
  form.submit

  # Second time, we should land on the requested page
  m.get url
  # Compare normalized URIs on both sides. Comparing the page's URI object
  # with the url String is never equal, so such a check could never fire.
  landed = URI.parse(m.current_page.uri.to_s).normalize
  wanted = URI.parse(url.to_s).normalize
  raise "Not where I expected. #{m.current_page.uri} is not #{url}" unless landed == wanted
  m
end
|
86
|
+
|
64
87
|
end
|
65
|
-
|
66
88
|
end
|
data/lib/scrapers/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: scrapers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.5.1
|
4
|
+
version: 1.5.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tamara Temple
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-02-
|
11
|
+
date: 2014-02-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mechanize
|