coursera_downloader 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +3 -0
- data/Gemfile.lock +18 -0
- data/LICENSE +19 -0
- data/README.markdown +63 -0
- data/bin/coursera_downloader +6 -0
- data/coursera_downloader.gemspec +19 -0
- data/download_policy.yml +22 -0
- data/lib/coursera_downloader.rb +4 -0
- data/lib/coursera_downloader/course.rb +58 -0
- data/lib/coursera_downloader/document.rb +26 -0
- data/lib/coursera_downloader/document_processor.rb +116 -0
- data/lib/coursera_downloader/downloader.rb +142 -0
- data/lib/coursera_downloader/file_store.rb +56 -0
- data/lib/coursera_downloader/log_formatter.rb +19 -0
- data/lib/coursera_downloader/policy.rb +46 -0
- data/lib/coursera_downloader/runner.rb +34 -0
- data/lib/coursera_downloader/util.rb +21 -0
- metadata +110 -0
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
coursera_downloader (0.1.0)
|
5
|
+
curb (~> 0.8)
|
6
|
+
nokogiri (~> 1.5)
|
7
|
+
|
8
|
+
GEM
|
9
|
+
remote: http://www.rubygems.org/
|
10
|
+
specs:
|
11
|
+
curb (0.8.3)
|
12
|
+
nokogiri (1.5.5)
|
13
|
+
|
14
|
+
PLATFORMS
|
15
|
+
ruby
|
16
|
+
|
17
|
+
DEPENDENCIES
|
18
|
+
coursera_downloader!
|
data/LICENSE
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
Copyright (C) 2012 by Nick Ewing
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
of this software and associated documentation files (the "Software"), to deal
|
5
|
+
in the Software without restriction, including without limitation the rights
|
6
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
copies of the Software, and to permit persons to whom the Software is
|
8
|
+
furnished to do so, subject to the following conditions:
|
9
|
+
|
10
|
+
The above copyright notice and this permission notice shall be included in
|
11
|
+
all copies or substantial portions of the Software.
|
12
|
+
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
THE SOFTWARE.
|
data/README.markdown
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
Coursera Downloader
|
2
|
+
===================
|
3
|
+
|
4
|
+
Download static versions of Coursera course websites for offline reference.
|
5
|
+
|
6
|
+
Install
|
7
|
+
-------
|
8
|
+
Download and install with the following command:
|
9
|
+
|
10
|
+
gem install coursera_downloader
|
11
|
+
|
12
|
+
Use
|
13
|
+
---
|
14
|
+
After installing the Ruby gem, use the `coursera_downloader` command to download
|
15
|
+
a course like so:
|
16
|
+
|
17
|
+
coursera_downloader course-identifier email password destination-directory [policy-file]
|
18
|
+
|
19
|
+
For example, to download the first offering of the Neural Networks class, run
|
20
|
+
the following command:
|
21
|
+
|
22
|
+
coursera_downloader neuralnets-2012-001 foo.bar@example.com password123 neuralnets
|
23
|
+
|
24
|
+
Default Policy File
|
25
|
+
-------------------
|
26
|
+
The policy file defines Ruby regular expressions used to determine how to handle
|
27
|
+
new URLs found on the course page. It is in the YAML format.
|
28
|
+
|
29
|
+
The default policy file is shown below.
|
30
|
+
|
31
|
+
---
|
32
|
+
whitelist:
|
33
|
+
- ^https?://class\.coursera\.org/[^/]+/
|
34
|
+
- ^https?://[^\.]+\.s3\.amazonaws\.com
|
35
|
+
- ^https?://s3\.amazonaws\.com
|
36
|
+
- ^https?://[^\.]+\.cloudfront\.net
|
37
|
+
blacklist:
|
38
|
+
- \.(exe|dmg)(\?.*)?$
|
39
|
+
disable:
|
40
|
+
# - ^https?://class\.coursera\.org/[^/]+/quiz
|
41
|
+
# - ^https?://class\.coursera\.org/[^/]+/forum
|
42
|
+
# - ^https?://class\.coursera\.org/[^/]+/lecture
|
43
|
+
# - ^https?://class\.coursera\.org/[^/]+/wiki
|
44
|
+
- ^https?://class\.coursera\.org/[^/]+/forum/thread?.*view=.*
|
45
|
+
- ^https?://class\.coursera\.org/[^/]+/forum/tag?.*view=.*
|
46
|
+
- ^https?://class\.coursera\.org/[^/]+/forum/list?.*view=.*
|
47
|
+
- ^https?://class\.coursera\.org/[^/]+/forum/toggle
|
48
|
+
- ^https?://class\.coursera\.org/[^/]+/forum/tag_modify
|
49
|
+
- ^https?://class\.coursera\.org/[^/]+/quiz/start
|
50
|
+
- ^https?://class\.coursera\.org/[^/]+/generic/apply_late_days
|
51
|
+
- ^https?://class\.coursera\.org/[^/]+/auth/logout
|
52
|
+
- ^https?://class\.coursera\.org/[^/]+/class/preferences
|
53
|
+
|
54
|
+
License
|
55
|
+
-------
|
56
|
+
Coursera Downloader is available under an MIT-style license.
|
57
|
+
|
58
|
+
See LICENSE.
|
59
|
+
|
60
|
+
This software should only be used in accordance with Coursera's Terms of Service.
|
61
|
+
Please see: https://www.coursera.org/about/terms
|
62
|
+
|
63
|
+
Copyright (C) 2012 by Nick Ewing
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# Gem specification for coursera_downloader.
#
# File lists are taken from git, so the gem must be built from inside a git
# checkout of the project.
Gem::Specification.new do |s|
  s.name     = "coursera_downloader"
  s.version  = "1.0.0"
  s.authors  = ["Nick Ewing"]
  s.email    = ""
  s.homepage = "https://github.com/nickewing/coursera_downloader"
  s.summary  = "Download static versions of Coursera course websites."
  # The summary already ends with a period; the previous "#{s.summary}."
  # interpolation published a description ending in "..".
  s.description = "Download static versions of Coursera course websites for offline reference."

  # Package every tracked file except .gitignore files.
  s.files = `git ls-files`.split("\n").reject { |path| path =~ /\.gitignore$/ }
  # Every tracked file under bin/ is installed as an executable.
  s.executables = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  s.platform = Gem::Platform::RUBY
  s.require_path = "lib"
  s.rubyforge_project = "[none]"

  s.add_dependency "curb", "~> 0.8"
  s.add_dependency "nokogiri", "~> 1.5"
  s.add_dependency "colored", "~> 1.2"
end
|
data/download_policy.yml
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
---
|
2
|
+
whitelist:
|
3
|
+
- ^https?://class\.coursera\.org/[^/]+/
|
4
|
+
- ^https?://[^\.]+\.s3\.amazonaws\.com
|
5
|
+
- ^https?://s3\.amazonaws\.com
|
6
|
+
- ^https?://[^\.]+\.cloudfront\.net
|
7
|
+
blacklist:
|
8
|
+
- \.(exe|dmg)(\?.*)?$
|
9
|
+
disable:
|
10
|
+
# - ^https?://class\.coursera\.org/[^/]+/quiz
|
11
|
+
# - ^https?://class\.coursera\.org/[^/]+/forum
|
12
|
+
# - ^https?://class\.coursera\.org/[^/]+/lecture
|
13
|
+
# - ^https?://class\.coursera\.org/[^/]+/wiki
|
14
|
+
- ^https?://class\.coursera\.org/[^/]+/forum/thread?.*view=.*
|
15
|
+
- ^https?://class\.coursera\.org/[^/]+/forum/tag?.*view=.*
|
16
|
+
- ^https?://class\.coursera\.org/[^/]+/forum/list?.*view=.*
|
17
|
+
- ^https?://class\.coursera\.org/[^/]+/forum/toggle
|
18
|
+
- ^https?://class\.coursera\.org/[^/]+/forum/tag_modify
|
19
|
+
- ^https?://class\.coursera\.org/[^/]+/quiz/start
|
20
|
+
- ^https?://class\.coursera\.org/[^/]+/generic/apply_late_days
|
21
|
+
- ^https?://class\.coursera\.org/[^/]+/auth/logout
|
22
|
+
- ^https?://class\.coursera\.org/[^/]+/class/preferences
|
@@ -0,0 +1,58 @@
|
|
1
|
+
require "curb"
|
2
|
+
require "uri"
|
3
|
+
require "tempfile"
|
4
|
+
|
5
|
+
module CourseraDownloader
  # One Coursera course offering, identified by its course name
  # (e.g. "neuralnets-2012-001").
  #
  # Owns the temporary cookie file that is filled in by #login and later
  # handed to the downloader so that its requests reuse the same session.
  class Course
    # Tempfile holding the session cookies written during #login.
    attr_reader :cookie_file

    # @param name [String] course identifier used in class.coursera.org URLs
    def initialize(name)
      @name = name
      @cookie_file = Tempfile.new('coursera_cookies')
    end

    # Logs in with the given credentials, storing session cookies in
    # #cookie_file.
    #
    # @param email [String]
    # @param password [String]
    # @return [Boolean] true when the course index answers 200 (without
    #   following redirects) after the login form has been posted
    def login(email, password)
      curl = Curl::Easy.new

      curl.verbose = false
      curl.enable_cookies = true
      curl.cookiefile = @cookie_file.path
      curl.cookiejar = @cookie_file.path
      curl.follow_location = true

      # Follow the auth redirector to wherever the login form actually lives.
      curl.url = login_redirect_url
      curl.http_get

      # Post the credentials to the page the redirector ended on.
      curl.url = curl.last_effective_url
      curl.http_post([
        Curl::PostField.content('email', email),
        Curl::PostField.content('password', password),
        Curl::PostField.content('login', "Login")
      ])

      # Redirects are turned off for the final check so that a redirect away
      # from the index is not reported as a 200.
      curl.follow_location = false
      curl.url = index_url.to_s
      curl.http_get

      curl.response_code == 200
    ensure
      # Release the handle even when one of the requests above raised.
      curl.close if curl
    end

    # @return [URI] URL of the course's index page
    def index_url
      URI.parse("#{host_url}/#{@name}/class/index")
    end

    private

    # Scheme and host shared by all course URLs.
    def host_url
      "https://class.coursera.org"
    end

    # URL of the course's auth redirector, requested first during login.
    def login_redirect_url
      "#{host_url}/#{@name}/auth/auth_redirector?type=login&subtype=normal&email=&visiting=&minimal=true"
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require "uri"
|
2
|
+
|
3
|
+
module CourseraDownloader
  # A fetched resource: the URL it came from, its (rewritable) body and the
  # content type reported for it.
  class Document
    attr_accessor :body
    attr_reader :url, :content_type

    def initialize(url, body, content_type)
      @url, @body, @content_type = url, body, content_type
    end

    # Each predicate returns the match offset (truthy) or nil.

    def is_html?
      type_offset(%r{text/html})
    end

    def is_css?
      type_offset(%r{text/css})
    end

    def is_javascript?
      type_offset(/javascript/)
    end

    private

    # Position of +pattern+ within the content type, or nil when absent.
    def type_offset(pattern)
      content_type =~ pattern
    end
  end
end
|
@@ -0,0 +1,116 @@
|
|
1
|
+
require "nokogiri"
|
2
|
+
|
3
|
+
module CourseraDownloader
  # Rewrites the URLs inside an HTML or CSS document so they point at the
  # locally stored copies, and collects the URLs that still have to be
  # downloaded.
  #
  # For every URL found, the policy decides what happens:
  # * :disable  - the reference is neutralised (link replaced, element with a
  #               src removed, CSS url() emptied)
  # * :download - the reference is rewritten to a relative local path and the
  #               absolute URL is added to #resource_urls
  # * anything else - the reference is left untouched
  class DocumentProcessor
    DISABLED_HREF = "javascript:alert('Link disabled during download.');"

    # resource_urls: Set of absolute URIs the policy marked for download.
    # document: the document being processed; its body is replaced by #process.
    attr_reader :resource_urls, :document

    # @param document [#url, #body, #body=, #is_html?, #is_css?]
    # @param store [#relative_resource_path] maps URLs to local paths
    # @param policy [#url_action] decides the action for each URL
    def initialize(document, store, policy)
      @document = document
      @store = store
      @policy = policy
      @resource_urls = Set.new
    end

    # Rewrites the document body in place when it is HTML or CSS; other
    # content types are left alone.
    def process
      if @document.is_html?
        @document.body = process_html(@document.body)
      elsif @document.is_css?
        @document.body = process_css(@document.body)
      end
    end

    private

    # Rewrites href/src attributes of a, img, script and link elements, then
    # runs every inline style attribute through #process_css.
    #
    # @return [String] the serialised, rewritten HTML
    def process_html(html)
      doc = Nokogiri::HTML(html)

      urls = Set.new
      doc.css("a, img, script, link").each do |element|
        url = normalize_url(element.attr("href") || element.attr("src"))

        case @policy.url_action(url)
        when :disable
          disable_html_element(element)
        when :download
          urls << url
          localize_element(element, url)
        end
      end

      doc.xpath("//*[@style]").each do |element|
        element["style"] = process_css(element.attr("style"))
      end

      @resource_urls.merge(urls)

      doc.to_s
    end

    # Points the element's href (or, failing that, src) at the local copy.
    def localize_element(element, url)
      path = @store.relative_resource_path(@document.url, url, true)
      if element.attr("href")
        element["href"] = path
      elsif element.attr("src")
        element["src"] = path
      end
    end

    # Links get a placeholder href; elements that load a resource via src are
    # removed from the document altogether.
    def disable_html_element(element)
      if element.attr("href")
        element["href"] = DISABLED_HREF
      elsif element.attr("src")
        element.remove
      end
    end

    # Rewrites every url(...) reference in +css+. The string is modified in
    # place and also returned.
    def process_css(css)
      # Capture groups: 0 = everything between the parentheses,
      # 1/2 = double quote and its contents, 3/4 = single quote and its contents.
      matches = css.scan(/url\(((")([^"]*)"|(')([^']*)'|[^\)]*)\)/)

      urls = Set.new
      matches.each do |match|
        quote = match[1] || match[3]
        raw_url = match[2] || match[4] || match[0]
        # Exact text as it appears in the stylesheet, used as the replace target.
        reference = "url(#{quote}#{raw_url}#{quote})"

        # Unquoted references may be padded with whitespace ("url( a.png )"),
        # which URI.parse rejects; strip it before resolving.
        url = normalize_url(raw_url.strip)

        case @policy.url_action(url)
        when :disable
          css.gsub!(reference, "url()")
        when :download
          urls << url
          path = @store.relative_resource_path(@document.url, url, true)
          css.gsub!(reference, "url('#{path}')")
        end
      end

      @resource_urls.merge(urls)

      css
    end

    # Resolves +url+ against the document's own URL.
    #
    # @param url [String, nil]
    # @return [URI, nil] the parsed URI (joined onto the document URL when it
    #   has no host), or nil when the input is empty or cannot be
    #   parsed/resolved
    def normalize_url(url)
      return nil if !url || url.length == 0

      parsed = URI.parse(url)
      parsed.host ? parsed : URI.join(@document.url, parsed)
    rescue URI::Error
      # One unusable reference must not abort processing of the whole document.
      nil
    end
  end
end
|
@@ -0,0 +1,142 @@
|
|
1
|
+
require "set"
|
2
|
+
require "yaml"
|
3
|
+
require "uri"
|
4
|
+
|
5
|
+
module CourseraDownloader
  # Downloads a start URL and everything reachable from it that the policy
  # allows, in batches of up to MAX_BATCH_SIZE parallel requests.
  #
  # The pending queue and the set of already-enqueued URLs are saved to
  # "manifest.yml" in the store's directory, so an interrupted run can be
  # resumed by running the same command again.
  class Downloader
    MAX_BATCH_SIZE = 10

    # @param cookie_file [#path] cookie jar used for every request
    # @param policy [#url_action] passed on to DocumentProcessor
    # @param store [#containing_dir, #write] destination for documents
    # @param logger [#info, #warn, #error]
    def initialize(cookie_file, policy, store, logger)
      @cookie_file = cookie_file
      @store = store
      @policy = policy
      @logger = logger

      @queue = []
      @enqueued = Set.new # every URL ever enqueued, including those restored from the manifest

      read_state
    end

    # Enqueues +url+ (unless it was already enqueued) and processes the queue
    # until it is empty or the user interrupts.
    #
    # @param url [URI, String]
    def get(url)
      url = URI.parse(url) unless url.is_a?(URI)

      enqueue_new_url(url)
      fetch_all
      @logger.info("Downloaded #{@enqueued.length} total files")
    end

    private

    def state_save_path
      File.join(@store.containing_dir, "manifest.yml")
    end

    # Saves the queue and enqueued set. Nothing is written when the
    # destination directory does not exist (i.e. nothing was stored yet).
    def write_state
      return unless File.exist?(@store.containing_dir)

      state = {
        :queue => @queue,
        :enqueued => @enqueued.to_a
      }
      File.open(state_save_path, "wb") { |file| file.write(YAML.dump(state)) }
    end

    # Restores queue and enqueued set from a previous run, if a manifest exists.
    def read_state
      return unless File.exist?(state_save_path)

      saved_state = load_manifest(File.read(state_save_path))

      @queue = saved_state[:queue]
      @enqueued = Set.new(saved_state[:enqueued])
    end

    # Parses manifest YAML produced by #write_state.
    #
    # The manifest holds serialised URI objects, which Psych 4's safe-by-default
    # YAML.load refuses to build, so unsafe_load is used where it exists.
    # NOTE(review): this trusts manifest.yml in the destination directory; it is
    # expected to have been written by this tool only.
    def load_manifest(yaml)
      YAML.respond_to?(:unsafe_load) ? YAML.unsafe_load(yaml) : YAML.load(yaml)
    end

    # Rewrites HTML/CSS documents, enqueues the resources they reference and
    # writes the document to the store.
    def handle_document(document)
      if document.is_html? || document.is_css?
        processor = DocumentProcessor.new(document, @store, @policy)
        processor.process

        processor.resource_urls.each { |resource_url| enqueue_new_url(resource_url) }
      end

      @store.write(document)
    end

    # Drains the queue batch by batch. Ctrl-C lets the current batch finish and
    # then stops; state is saved however the loop ends.
    def fetch_all
      interrupted = false
      # NOTE(review): the handler logs from trap context; confirm the logger in
      # use can write from there.
      trap("INT") do
        interrupted = true
        show_interrupt_message
      end

      while @queue.length > 0 && !interrupted
        batch = @queue.shift(MAX_BATCH_SIZE)
        fetch_batch(batch)
      end
    ensure
      # Restore default Ctrl-C handling even when a batch raised.
      trap("INT", "DEFAULT")
      write_state
    end

    # Performs all requests of one batch in parallel.
    def fetch_batch(batch)
      multi = Curl::Multi.new
      batch.each { |url| multi.add(build_request(url)) }
      multi.perform
    end

    # Builds the request handle for +url+, sharing the login cookies.
    def build_request(url)
      Curl::Easy.new(url.to_s) do |easy|
        easy.enable_cookies = true
        easy.cookiefile = @cookie_file.path
        easy.cookiejar = @cookie_file.path
        easy.follow_location = true

        easy.on_complete { |result| handle_response(url, result) }
      end
    end

    # Stores a successful response; logs anything else. Errors are logged
    # rather than raised so one bad document does not abort the batch.
    def handle_response(url, result)
      if result.response_code == 200
        @logger.info("Downloaded: #{url}")

        content_type = result.content_type
        content_type.force_encoding("ASCII") if content_type.respond_to?(:force_encoding)

        handle_document(Document.new(url, result.body_str, content_type))
      else
        @logger.warn("Failed to get URL '#{url}'. Response code #{result.response_code}")
      end
    rescue => e
      @logger.error(e.message + "\n " + e.backtrace.join("\n "))
    end

    # Adds +url+ to the queue unless it has been enqueued before.
    def enqueue_new_url(url)
      return if @enqueued.include?(url)

      @enqueued << url
      @queue.push(url)
    end

    # Explains, once per downloader, what happens after an interrupt.
    def show_interrupt_message
      return if @interrupt_message_shown

      @logger.warn("Finishing current download batch.")
      @logger.warn("This download can be resumed by rerunning the same command.")

      @interrupt_message_shown = true
    end
  end
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
require "fileutils"
|
2
|
+
require "cgi"
|
3
|
+
|
4
|
+
module CourseraDownloader
  # Maps resource URLs onto files below a destination directory and writes
  # documents there.
  class FileStore
    attr_reader :containing_dir

    def initialize(containing_dir)
      @containing_dir = containing_dir
    end

    # Local location for +url+ as a [directory, file path] pair.
    #
    # The layout is <containing_dir>/<host>/<url directory>/<name>, where the
    # name is the URL's base name, then the CGI-escaped "?query" (if any), then
    # the extension (".html" when the URL has none). With +path_in_source+ both
    # parts are additionally escaped segment by segment so they can be embedded
    # in a document.
    def path(url, path_in_source = false)
      url_path = url.path
      suffix = File.extname(url_path)
      stem = File.basename(url_path, suffix)
      suffix = ".html" if suffix.empty?

      escaped_query = url.query ? CGI.escape("?#{url.query}") : ""

      directory = File.join(@containing_dir, url.host, File.dirname(url_path))
      name = "#{stem}#{escaped_query}#{suffix}"

      if path_in_source
        directory = Util.escape_path(directory)
        name = Util.escape_path(name)
      end

      [directory, File.join(directory, name)]
    end

    # Path of the resource's local file relative to the containing document's
    # local file.
    def relative_resource_path(containing_url, resource_url, path_in_source = true)
      from_file = path(containing_url, path_in_source).last
      to_file = path(resource_url, path_in_source).last

      Util.path_relative_to_path(from_file, to_file)
    end

    # Writes the document body to its local file, creating directories as needed.
    def write(document)
      directory, target = path(document.url)

      FileUtils.mkdir_p(directory)

      File.open(target, "wb") { |out| out.write(document.body) }
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
require "logger"
|
2
|
+
require "colored"
|
3
|
+
|
4
|
+
module CourseraDownloader
  # Log formatter that prints "SEVERITY: message", colouring the severity
  # label for INFO (green), WARN (yellow) and ERROR (red). Timestamp and
  # program name are not printed.
  class LogFormatter < ::Logger::Formatter
    def call(severity, time, progname, msg)
      label =
        if severity == "INFO"
          severity.green
        elsif severity == "WARN"
          severity.yellow
        elsif severity == "ERROR"
          severity.red
        else
          severity
        end

      "#{label}: #{msg}\n"
    end
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require "yaml"
|
2
|
+
|
3
|
+
module CourseraDownloader
  # Decides what to do with a URL, based on regular expressions loaded from a
  # YAML policy file with the groups "whitelist", "blacklist" and "disable".
  class Policy
    GROUPS = %w[whitelist blacklist disable].freeze
    HTTP_SCHEMES = %w[http https].freeze

    # @param file [String] path of the YAML policy file. Each group is a list
    #   of Ruby regular expression sources; a group that is missing or has no
    #   entries is treated as empty.
    def initialize(file)
      loaded = YAML.load(File.read(file)) || {}

      @patterns = {}
      GROUPS.each { |group| @patterns[group] = [] }
      loaded.each_pair do |group, patterns|
        # Array() turns the nil of an entry-less group ("disable:") into [].
        @patterns[group] = Array(patterns).map { |pattern| Regexp.new(pattern) }
      end
    end

    # @param url [URI, nil]
    # @return [Symbol, nil]
    #   :none     - url is nil or has a scheme other than http/https
    #   :disable  - url matches a "disable" pattern
    #   :download - url matches a "whitelist" pattern and no "blacklist" pattern
    #   nil       - none of the above; callers treat this like :none
    def url_action(url)
      return :none unless url
      # URLs without a scheme fall through and are matched as-is.
      return :none if url.scheme && !HTTP_SCHEMES.include?(url.scheme)

      url = url.to_s
      return :disable if disable_url?(url)
      return :download if download_url?(url)
    end

    private

    # The blacklist wins over the whitelist.
    def download_url?(url)
      return false if @patterns["blacklist"].any? { |pattern| url.match(pattern) }

      @patterns["whitelist"].any? { |pattern| url.match(pattern) }
    end

    def disable_url?(url)
      @patterns["disable"].any? { |pattern| url.match(pattern) }
    end
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require "logger"
|
2
|
+
|
3
|
+
module CourseraDownloader
  # Command-line entry point.
  class Runner
    # Reads course-identifier, email, password, destination-directory and an
    # optional policy-file from ARGV, logs in and downloads the course.
    # Prints usage and exits with status 1 when fewer than four arguments
    # are given.
    def self.run
      if ARGV.length < 4
        $stderr.puts "Usage:\n coursera_downloader course-identifier email password destination-directory [policy-file]"
        Process.exit(1)
      end

      course_name, email, password, file_store_dir = ARGV.first(4)

      logger = Logger.new(STDOUT)
      logger.formatter = LogFormatter.new

      course = Course.new(course_name)
      return logger.error("Failed to login.") unless course.login(email, password)

      # Without an explicit policy file, use the one shipped next to lib/.
      policy_file = ARGV[4] || File.expand_path("../download_policy.yml", File.dirname(__FILE__))
      store = FileStore.new(file_store_dir)

      Downloader.new(course.cookie_file, Policy.new(policy_file), store, logger).get(course.index_url)

      index_store_path = store.path(course.index_url).last
      logger.info("The course index can be found at #{index_store_path}")
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require "cgi"
|
2
|
+
|
3
|
+
module CourseraDownloader
  # Path helpers shared by the file store.
  module Util
    class << self
      # Expresses +path+ relative to the directory containing +target_path+.
      #
      # Leading segments common to both are dropped (the final segment of
      # +target_path+, its file name, is never dropped); one ".." is emitted
      # for each directory of +target_path+ that remains.
      def path_relative_to_path(target_path, path)
        from_parts = target_path.split("/")
        to_parts = path.split("/")

        shared = 0
        shared += 1 while shared < from_parts.length - 1 && from_parts[shared] == to_parts[shared]

        climbs = [".."] * (from_parts.length - shared - 1)
        File.join(climbs + to_parts.drop(shared))
      end

      # CGI-escapes every "/"-separated segment of +path+, keeping the slashes.
      def escape_path(path)
        segments = path.split("/")
        segments.map { |segment| CGI.escape(segment) }.join("/")
      end
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,110 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: coursera_downloader
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Nick Ewing
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-12-10 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: curb
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0.8'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0.8'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: nokogiri
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ~>
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '1.5'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ~>
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '1.5'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: colored
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ~>
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '1.2'
|
54
|
+
type: :runtime
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ~>
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '1.2'
|
62
|
+
description: Download static versions of Coursera course websites..
|
63
|
+
email: ''
|
64
|
+
executables:
|
65
|
+
- coursera_downloader
|
66
|
+
extensions: []
|
67
|
+
extra_rdoc_files: []
|
68
|
+
files:
|
69
|
+
- Gemfile
|
70
|
+
- Gemfile.lock
|
71
|
+
- LICENSE
|
72
|
+
- README.markdown
|
73
|
+
- bin/coursera_downloader
|
74
|
+
- coursera_downloader.gemspec
|
75
|
+
- download_policy.yml
|
76
|
+
- lib/coursera_downloader.rb
|
77
|
+
- lib/coursera_downloader/course.rb
|
78
|
+
- lib/coursera_downloader/document.rb
|
79
|
+
- lib/coursera_downloader/document_processor.rb
|
80
|
+
- lib/coursera_downloader/downloader.rb
|
81
|
+
- lib/coursera_downloader/file_store.rb
|
82
|
+
- lib/coursera_downloader/log_formatter.rb
|
83
|
+
- lib/coursera_downloader/policy.rb
|
84
|
+
- lib/coursera_downloader/runner.rb
|
85
|
+
- lib/coursera_downloader/util.rb
|
86
|
+
homepage: https://github.com/nickewing/coursera_downloader
|
87
|
+
licenses: []
|
88
|
+
post_install_message:
|
89
|
+
rdoc_options: []
|
90
|
+
require_paths:
|
91
|
+
- lib
|
92
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
93
|
+
none: false
|
94
|
+
requirements:
|
95
|
+
- - ! '>='
|
96
|
+
- !ruby/object:Gem::Version
|
97
|
+
version: '0'
|
98
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
99
|
+
none: false
|
100
|
+
requirements:
|
101
|
+
- - ! '>='
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
requirements: []
|
105
|
+
rubyforge_project: ! '[none]'
|
106
|
+
rubygems_version: 1.8.24
|
107
|
+
signing_key:
|
108
|
+
specification_version: 3
|
109
|
+
summary: Download static versions of Coursera course websites.
|
110
|
+
test_files: []
|