coursera_downloader 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +3 -0
- data/Gemfile.lock +18 -0
- data/LICENSE +19 -0
- data/README.markdown +63 -0
- data/bin/coursera_downloader +6 -0
- data/coursera_downloader.gemspec +19 -0
- data/download_policy.yml +22 -0
- data/lib/coursera_downloader.rb +4 -0
- data/lib/coursera_downloader/course.rb +58 -0
- data/lib/coursera_downloader/document.rb +26 -0
- data/lib/coursera_downloader/document_processor.rb +116 -0
- data/lib/coursera_downloader/downloader.rb +142 -0
- data/lib/coursera_downloader/file_store.rb +56 -0
- data/lib/coursera_downloader/log_formatter.rb +19 -0
- data/lib/coursera_downloader/policy.rb +46 -0
- data/lib/coursera_downloader/runner.rb +34 -0
- data/lib/coursera_downloader/util.rb +21 -0
- metadata +110 -0
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
coursera_downloader (0.1.0)
|
5
|
+
curb (~> 0.8)
|
6
|
+
nokogiri (~> 1.5)
|
7
|
+
|
8
|
+
GEM
|
9
|
+
remote: http://www.rubygems.org/
|
10
|
+
specs:
|
11
|
+
curb (0.8.3)
|
12
|
+
nokogiri (1.5.5)
|
13
|
+
|
14
|
+
PLATFORMS
|
15
|
+
ruby
|
16
|
+
|
17
|
+
DEPENDENCIES
|
18
|
+
coursera_downloader!
|
data/LICENSE
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
Copyright (C) 2012 by Nick Ewing
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
4
|
+
of this software and associated documentation files (the "Software"), to deal
|
5
|
+
in the Software without restriction, including without limitation the rights
|
6
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7
|
+
copies of the Software, and to permit persons to whom the Software is
|
8
|
+
furnished to do so, subject to the following conditions:
|
9
|
+
|
10
|
+
The above copyright notice and this permission notice shall be included in
|
11
|
+
all copies or substantial portions of the Software.
|
12
|
+
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
16
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
17
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
18
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
19
|
+
THE SOFTWARE.
|
data/README.markdown
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
Coursera Downloader
|
2
|
+
===================
|
3
|
+
|
4
|
+
Download static versions of Coursera course websites for offline reference.
|
5
|
+
|
6
|
+
Install
|
7
|
+
-------
|
8
|
+
Download and install with the following command:
|
9
|
+
|
10
|
+
gem install coursera_downloader
|
11
|
+
|
12
|
+
Use
|
13
|
+
---
|
14
|
+
After installing the Ruby gem, use the `coursera_downloader` command to download
|
15
|
+
a course like so:
|
16
|
+
|
17
|
+
coursera_downloader course-identifier email password destination-directory [policy-file]
|
18
|
+
|
19
|
+
For example, to download the first offering of the Neural Networks class, run
|
20
|
+
the following command:
|
21
|
+
|
22
|
+
coursera_downloader neuralnets-2012-001 foo.bar@example.com password123 neuralnets
|
23
|
+
|
24
|
+
Default Policy File
|
25
|
+
-------------------
|
26
|
+
The policy file defines Ruby regular expressions used to determine how to handle
|
27
|
+
new URLs found on the course page. It is in the YAML format.
|
28
|
+
|
29
|
+
The default policy file is shown below.
|
30
|
+
|
31
|
+
---
|
32
|
+
whitelist:
|
33
|
+
- ^https?://class\.coursera\.org/[^/]+/
|
34
|
+
- ^https?://[^\.]+\.s3\.amazonaws\.com
|
35
|
+
- ^https?://s3\.amazonaws\.com
|
36
|
+
- ^https?://[^\.]+\.cloudfront\.net
|
37
|
+
blacklist:
|
38
|
+
- \.(exe|dmg)(\?.*)?$
|
39
|
+
disable:
|
40
|
+
# - ^https?://class\.coursera\.org/[^/]+/quiz
|
41
|
+
# - ^https?://class\.coursera\.org/[^/]+/forum
|
42
|
+
# - ^https?://class\.coursera\.org/[^/]+/lecture
|
43
|
+
# - ^https?://class\.coursera\.org/[^/]+/wiki
|
44
|
+
- ^https?://class\.coursera\.org/[^/]+/forum/thread?.*view=.*
|
45
|
+
- ^https?://class\.coursera\.org/[^/]+/forum/tag?.*view=.*
|
46
|
+
- ^https?://class\.coursera\.org/[^/]+/forum/list?.*view=.*
|
47
|
+
- ^https?://class\.coursera\.org/[^/]+/forum/toggle
|
48
|
+
- ^https?://class\.coursera\.org/[^/]+/forum/tag_modify
|
49
|
+
- ^https?://class\.coursera\.org/[^/]+/quiz/start
|
50
|
+
- ^https?://class\.coursera\.org/[^/]+/generic/apply_late_days
|
51
|
+
- ^https?://class\.coursera\.org/[^/]+/auth/logout
|
52
|
+
- ^https?://class\.coursera\.org/[^/]+/class/preferences
|
53
|
+
|
54
|
+
License
|
55
|
+
-------
|
56
|
+
Coursera Downloader is available under an MIT-style license.
|
57
|
+
|
58
|
+
See LICENSE.
|
59
|
+
|
60
|
+
This software should only be used in accordance with Coursera's Terms of Service.
|
61
|
+
Please see: https://www.coursera.org/about/terms
|
62
|
+
|
63
|
+
Copyright (C) 2012 by Nick Ewing
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# Gem specification for coursera_downloader.
#
# The file and executable lists are taken from `git ls-files`, so the gem must
# be built from inside a git checkout.
Gem::Specification.new do |s|
  s.name        = "coursera_downloader"
  s.version     = "1.0.0"
  s.authors     = ["Nick Ewing"]
  s.email       = ""
  s.homepage    = "https://github.com/nickewing/coursera_downloader"
  s.summary     = "Download static versions of Coursera course websites."
  # The summary already ends with a period; interpolating it followed by "."
  # published a description ending in "..". Use a distinct sentence instead.
  s.description = "Download static versions of Coursera course websites for offline reference."
  # The bundled LICENSE file is the MIT license text.
  s.license     = "MIT"

  s.files        = `git ls-files`.split("\n").reject { |path| path =~ /\.gitignore$/ }
  s.executables  = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  s.platform     = Gem::Platform::RUBY
  s.require_path = "lib"
  # s.rubyforge_project was dropped: the attribute is deprecated and ignored by RubyGems.

  s.add_dependency "curb", "~> 0.8"
  s.add_dependency "nokogiri", "~> 1.5"
  s.add_dependency "colored", "~> 1.2"
end
|
data/download_policy.yml
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
---
|
2
|
+
whitelist:
|
3
|
+
- ^https?://class\.coursera\.org/[^/]+/
|
4
|
+
- ^https?://[^\.]+\.s3\.amazonaws\.com
|
5
|
+
- ^https?://s3\.amazonaws\.com
|
6
|
+
- ^https?://[^\.]+\.cloudfront\.net
|
7
|
+
blacklist:
|
8
|
+
- \.(exe|dmg)(\?.*)?$
|
9
|
+
disable:
|
10
|
+
# - ^https?://class\.coursera\.org/[^/]+/quiz
|
11
|
+
# - ^https?://class\.coursera\.org/[^/]+/forum
|
12
|
+
# - ^https?://class\.coursera\.org/[^/]+/lecture
|
13
|
+
# - ^https?://class\.coursera\.org/[^/]+/wiki
|
14
|
+
- ^https?://class\.coursera\.org/[^/]+/forum/thread?.*view=.*
|
15
|
+
- ^https?://class\.coursera\.org/[^/]+/forum/tag?.*view=.*
|
16
|
+
- ^https?://class\.coursera\.org/[^/]+/forum/list?.*view=.*
|
17
|
+
- ^https?://class\.coursera\.org/[^/]+/forum/toggle
|
18
|
+
- ^https?://class\.coursera\.org/[^/]+/forum/tag_modify
|
19
|
+
- ^https?://class\.coursera\.org/[^/]+/quiz/start
|
20
|
+
- ^https?://class\.coursera\.org/[^/]+/generic/apply_late_days
|
21
|
+
- ^https?://class\.coursera\.org/[^/]+/auth/logout
|
22
|
+
- ^https?://class\.coursera\.org/[^/]+/class/preferences
|
@@ -0,0 +1,58 @@
|
|
1
|
+
require "curb"
|
2
|
+
require "uri"
|
3
|
+
require "tempfile"
|
4
|
+
|
5
|
+
module CourseraDownloader
  # A single Coursera course offering, identified by its course name
  # (for example "neuralnets-2012-001"). Handles logging in and owns the
  # cookie file that later requests reuse.
  class Course
    # Tempfile used as both cookie file and cookie jar for the session.
    attr_reader :cookie_file

    # name - the course identifier used in class.coursera.org URLs.
    def initialize(name)
      @name = name
      @cookie_file = Tempfile.new('coursera_cookies')
    end

    # Logs in with the given credentials, storing session cookies in
    # #cookie_file.
    #
    # The flow is: GET the auth redirector (following redirects), POST the
    # credentials to whatever URL that landed on, then GET the course index
    # without following redirects.
    #
    # Returns true when the final index request answers with HTTP 200.
    def login(email, password)
      curl = Curl::Easy.new

      curl.verbose = false
      curl.enable_cookies = true
      curl.cookiefile = @cookie_file.path
      curl.cookiejar = @cookie_file.path
      curl.follow_location = true

      curl.url = login_redirect_url
      curl.http_get

      curl.url = curl.last_effective_url
      curl.http_post([
        Curl::PostField.content('email', email),
        Curl::PostField.content('password', password),
        Curl::PostField.content('login', "Login")
      ])

      # A redirect here is treated as "not logged in", so do not follow it.
      curl.follow_location = false
      curl.url = index_url.to_s
      curl.http_get

      curl.response_code == 200
    ensure
      # Always release the handle, even when a request raised.
      # NOTE(review): libcurl presumably writes the cookie jar when the handle
      # is closed - confirm against the curb/libcurl documentation.
      curl.close if curl
    end

    # Returns the URI of the course's index page.
    def index_url
      URI.parse("#{host_url}/#{@name}/class/index")
    end

    private

    def host_url
      "https://class.coursera.org"
    end

    def login_redirect_url
      "#{host_url}/#{@name}/auth/auth_redirector?type=login&subtype=normal&email=&visiting=&minimal=true"
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require "uri"
|
2
|
+
|
3
|
+
module CourseraDownloader
  # A downloaded resource: where it came from, its (rewritable) body and the
  # content type reported for it.
  class Document
    attr_accessor :body
    attr_reader :url, :content_type

    def initialize(url, body, content_type)
      @url, @body, @content_type = url, body, content_type
    end

    # Truthy (match offset) when the content type mentions text/html.
    def is_html?
      @content_type =~ %r{text/html}
    end

    # Truthy (match offset) when the content type mentions text/css.
    def is_css?
      @content_type =~ %r{text/css}
    end

    # Truthy (match offset) when the content type mentions javascript.
    def is_javascript?
      @content_type =~ %r{javascript}
    end
  end
end
|
@@ -0,0 +1,116 @@
|
|
1
|
+
require "nokogiri"
require "set"
|
2
|
+
|
3
|
+
module CourseraDownloader
  # Rewrites the URLs inside an HTML or CSS document so they point at the
  # local copies kept by the store, and collects every URL the policy wants
  # downloaded in #resource_urls.
  class DocumentProcessor
    DISABLED_HREF = "javascript:alert('Link disabled during download.');"

    # Matches one CSS url(...) reference.
    # Group 1 is everything between the parentheses; groups 2/3 are the quote
    # and content of a double-quoted URL; groups 4/5 the same for single quotes.
    CSS_URL_PATTERN = /url\(((")([^"]*)"|(')([^']*)'|[^\)]*)\)/

    attr_reader :resource_urls, :document

    # document - object answering url, body, body=, is_html? and is_css?.
    # store    - object answering relative_resource_path(from_url, to_url, in_source).
    # policy   - object answering url_action(url) with :disable, :download or
    #            anything else for "leave untouched".
    def initialize(document, store, policy)
      @document = document
      @store = store
      @policy = policy
      @resource_urls = Set.new
    end

    # Rewrites the document body in place (HTML and CSS only; other content
    # types are left alone).
    def process
      if @document.is_html?
        @document.body = process_html(@document.body)
      elsif @document.is_css?
        @document.body = process_css(@document.body)
      end
    end

    private

    # Rewrites href/src attributes of links, images, scripts and stylesheets,
    # then any inline style attributes. Returns the serialized HTML.
    def process_html(html)
      doc = Nokogiri::HTML(html)

      doc.css("a, img, script, link").each do |element|
        url = normalize_url(element.attr("href") || element.attr("src"))

        case @policy.url_action(url)
        when :disable
          disable_html_element(element)
        when :download
          @resource_urls << url
          localize_element(element, url)
        end
      end

      doc.xpath("//*[@style]").each do |element|
        element["style"] = process_css(element.attr("style"))
      end

      doc.to_s
    end

    # Points the element's href (or, failing that, src) at the local copy.
    def localize_element(element, url)
      path = @store.relative_resource_path(@document.url, url, true)
      if element.attr("href")
        element["href"] = path
      elsif element.attr("src")
        element["src"] = path
      end
    end

    # Links get a placeholder href; elements loaded via src are removed.
    def disable_html_element(element)
      if element.attr("href")
        element["href"] = DISABLED_HREF
      elsif element.attr("src")
        element.remove
      end
    end

    # Returns a copy of +css+ with every url(...) reference rewritten
    # according to the policy. The input string is not modified.
    #
    # All references are replaced in a single pass, so text produced by one
    # replacement can never be matched and rewritten again by a later one.
    def process_css(css)
      css.gsub(CSS_URL_PATTERN) do |reference|
        # Capture the match before calling helpers that run their own regexes.
        match = Regexp.last_match
        raw_url = match[3] || match[5] || match[1]
        url = normalize_url(raw_url)

        case @policy.url_action(url)
        when :disable
          "url()"
        when :download
          @resource_urls << url
          "url('#{@store.relative_resource_path(@document.url, url, true)}')"
        else
          reference
        end
      end
    end

    # Resolves +url+ against the document's own URL.
    #
    # Absolute URLs come back unchanged; relative and protocol-relative
    # ("//host/path") URLs are completed from the document URL. Returns nil
    # for blank input or anything the URI library rejects.
    def normalize_url(url)
      return nil if !url || url.empty?

      URI.join(@document.url, URI.parse(url))
    rescue URI::Error
      nil
    end
  end
end
|
@@ -0,0 +1,142 @@
|
|
1
|
+
require "set"
|
2
|
+
require "yaml"
|
3
|
+
require "uri"
|
4
|
+
|
5
|
+
module CourseraDownloader
  # Crawls outward from a start URL: downloads queued URLs in batches, lets
  # DocumentProcessor rewrite HTML/CSS and discover further URLs, and stores
  # every document through the file store.
  #
  # Progress (pending queue plus every URL ever enqueued) is persisted to
  # manifest.yml inside the store directory so an interrupted run can resume.
  class Downloader
    # Number of URLs handed to one Curl::Multi run.
    MAX_BATCH_SIZE = 10

    # cookie_file - object answering #path; used as curl cookie file and jar.
    # policy      - passed through to DocumentProcessor.
    # store       - answers #containing_dir and #write(document).
    # logger      - answers #info, #warn and #error.
    def initialize(cookie_file, policy, store, logger)
      @cookie_file = cookie_file
      @store = store
      @policy = policy
      @logger = logger

      @queue = []
      @enqueued = Set.new # all URLs that have been ever enqueued during this run

      read_state
    end

    # Enqueues +url+ (String or URI) unless already seen, then downloads
    # until the queue is empty or the user interrupts.
    def get(url)
      url = URI.parse(url) unless url.is_a?(URI)

      enqueue_new_url(url)
      fetch_all
      @logger.info("Downloaded #{@enqueued.length} total files")
    end

    private

    def state_save_path
      File.join(@store.containing_dir, "manifest.yml")
    end

    # Persists the queue and the seen-set. Nothing is written when the store
    # directory does not exist yet.
    #
    # URLs are stored as plain strings: YAML.load in Psych 4 (Ruby 3.1+)
    # refuses to deserialize URI objects, which made manifests unreadable.
    def write_state
      return unless File.exist?(@store.containing_dir)

      state = {
        :queue => @queue.map { |url| url.to_s },
        :enqueued => @enqueued.map { |url| url.to_s }
      }
      File.open(state_save_path, "wb") { |file| file.write(YAML.dump(state)) }
    end

    # Restores queue and seen-set from a previous run, if a manifest exists.
    # Entries are re-parsed via their string form, so manifests holding either
    # strings or serialized URI objects yield URI instances.
    def read_state
      return unless File.exist?(state_save_path)

      saved_state = YAML.load(File.read(state_save_path))

      @queue = saved_state[:queue].map { |entry| URI.parse(entry.to_s) }
      @enqueued = Set.new(saved_state[:enqueued].map { |entry| URI.parse(entry.to_s) })
    end

    # Rewrites HTML/CSS documents, enqueues the URLs they reference, then
    # writes the (possibly rewritten) document to the store.
    def handle_document(document)
      if document.is_html? || document.is_css?
        processor = DocumentProcessor.new(document, @store, @policy)
        processor.process

        processor.resource_urls.each do |resource_url|
          enqueue_new_url(resource_url)
        end
      end

      @store.write(document)
    end

    # Drains the queue batch by batch. Ctrl-C lets the current batch finish
    # and then stops. The INT handler is reset and the state saved even when
    # a batch raises.
    def fetch_all
      interrupted = false
      # NOTE(review): the handler logs from signal context; confirm the
      # logger in use tolerates that on the targeted Ruby versions.
      trap("INT") do
        interrupted = true
        show_interrupt_message
      end

      while @queue.length > 0 && !interrupted
        fetch_batch(@queue.shift(MAX_BATCH_SIZE))
      end
    ensure
      trap("INT", "DEFAULT")
      write_state
    end

    # Downloads every URL of +batch+ concurrently through Curl::Multi.
    def fetch_batch(batch)
      multi = Curl::Multi.new

      batch.each do |url|
        easy = Curl::Easy.new(url.to_s) do |handle|
          handle.enable_cookies = true
          handle.cookiefile = @cookie_file.path
          handle.cookiejar = @cookie_file.path
          handle.follow_location = true

          handle.on_complete do |result|
            # A failure in one document must not abort the rest of the batch.
            begin
              if result.response_code == 200
                @logger.info("Downloaded: #{url}")

                content_type = result.content_type
                content_type.force_encoding("ASCII") if content_type.respond_to?(:force_encoding)

                document = Document.new(url, result.body_str, content_type)
                handle_document(document)
              else
                @logger.warn("Failed to get URL '#{url}'. Response code #{result.response_code}")
              end
            rescue => e
              @logger.error(e.message + "\n " + e.backtrace.join("\n "))
            end
          end
        end

        multi.add(easy)
      end

      multi.perform
    end

    # Adds +url+ to the queue unless it was ever enqueued before.
    def enqueue_new_url(url)
      if !@enqueued.include?(url)
        @enqueued << url
        @queue.push(url)
      end
    end

    # Explains, once per instance, what happens after an interrupt.
    def show_interrupt_message
      return if @interrupt_message_shown

      @logger.warn("Finishing current download batch.")
      @logger.warn("This download can be resumed by rerunning the same command.")

      @interrupt_message_shown = true
    end
  end
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
require "fileutils"
|
2
|
+
require "cgi"
|
3
|
+
|
4
|
+
module CourseraDownloader
  # Lays downloaded documents out on disk as
  # <containing_dir>/<host>/<url directory>/<name><escaped query><extension>.
  class FileStore
    attr_reader :containing_dir

    def initialize(containing_dir)
      @containing_dir = containing_dir
    end

    # Computes where +url+ lives in the store.
    #
    # Returns [directory, file path]. URLs without a file extension get
    # ".html"; a query string is kept in the file name, CGI-escaped together
    # with its leading "?". With +path_in_source+ both parts are additionally
    # escaped segment by segment, for use inside rewritten documents.
    def path(url, path_in_source = false)
      url_path = url.path
      suffix = File.extname(url_path)
      stem = File.basename(url_path, suffix)
      suffix = ".html" if suffix.empty?

      escaped_query = url.query ? CGI.escape("?#{url.query}") : ""

      directory = File.join(@containing_dir, url.host, File.dirname(url_path))
      leaf = "#{stem}#{escaped_query}#{suffix}"

      if path_in_source
        directory = Util.escape_path(directory)
        leaf = Util.escape_path(leaf)
      end

      [directory, File.join(directory, leaf)]
    end

    # Path of +resource_url+'s stored file relative to the stored file of
    # +containing_url+.
    def relative_resource_path(containing_url, resource_url, path_in_source = true)
      from_file = path(containing_url, path_in_source).last
      to_file = path(resource_url, path_in_source).last

      Util.path_relative_to_path(from_file, to_file)
    end

    # Writes the document body to its location in the store, creating
    # directories as needed.
    def write(document)
      directory, destination = path(document.url)

      FileUtils.mkdir_p(directory)

      File.open(destination, "wb") { |io| io.write(document.body) }
    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
require "logger"
|
2
|
+
require "colored"
|
3
|
+
|
4
|
+
module CourseraDownloader
  # Logger formatter printing "<SEVERITY>: <message>", with the severity
  # label colored for INFO, WARN and ERROR.
  class LogFormatter < ::Logger::Formatter
    def call(severity, time, progname, msg)
      # Color method per severity; other severities stay uncolored.
      color = { "INFO" => :green, "WARN" => :yellow, "ERROR" => :red }[severity]
      label = color ? severity.public_send(color) : severity

      "#{label}: #{msg}\n"
    end
  end
end
|
@@ -0,0 +1,46 @@
|
|
1
|
+
require "yaml"
|
2
|
+
|
3
|
+
module CourseraDownloader
  # Decides what to do with a URL found in a downloaded document, based on
  # regular expressions read from a YAML policy file with the groups
  # "whitelist", "blacklist" and "disable".
  class Policy
    # file - path of the YAML policy file. Every entry of every group is
    #        compiled with Regexp.new. Groups that are missing or empty are
    #        treated as having no patterns.
    def initialize(file)
      @patterns = {}

      (YAML.load(File.read(file)) || {}).each_pair do |group, patterns|
        @patterns[group] = Array(patterns).map { |pattern| Regexp.new(pattern) }
      end
    end

    # Classifies +url+ (a URI or nil).
    #
    # Returns :disable when a "disable" pattern matches, :download when a
    # "whitelist" pattern matches and no "blacklist" pattern does, and :none
    # otherwise - including for nil and for URLs whose scheme is something
    # other than http or https. URLs without a scheme are still matched
    # against the patterns.
    def url_action(url)
      return :none unless url
      # The previous guard, !(scheme != "http" || scheme != "https"), could
      # never be true, so non-HTTP schemes were not filtered out.
      return :none if url.scheme && !%w[http https].include?(url.scheme.downcase)

      url = url.to_s
      return :disable if disable_url?(url)
      return :download if download_url?(url)

      :none
    end

    private

    # Blacklist wins over whitelist.
    def download_url?(url)
      return false if matches_any?("blacklist", url)

      matches_any?("whitelist", url)
    end

    def disable_url?(url)
      matches_any?("disable", url)
    end

    # True when any pattern of +group+ matches +url+; false for unknown groups.
    def matches_any?(group, url)
      Array(@patterns[group]).any? { |pattern| url =~ pattern }
    end
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
require "logger"
|
2
|
+
|
3
|
+
module CourseraDownloader
  # Command-line entry point.
  class Runner
    # Reads course-identifier, email, password, destination-directory and an
    # optional policy-file from ARGV, logs in and downloads the course.
    #
    # Prints usage to stderr and exits with status 1 when fewer than four
    # arguments are given. A failed login is logged as an error.
    def self.run
      if ARGV.length < 4
        $stderr.puts "Usage:\n coursera_downloader course-identifier email password destination-directory [policy-file]"
        Process.exit(1)
      end

      course_name = ARGV[0]
      email = ARGV[1]
      password = ARGV[2]
      file_store_dir = ARGV[3]

      logger = Logger.new(STDOUT)
      logger.formatter = LogFormatter.new

      course = Course.new(course_name)
      if course.login(email, password)
        policy_file = ARGV[4] || default_policy_file
        policy = Policy.new(policy_file)
        store = FileStore.new(file_store_dir)
        downloader = Downloader.new(course.cookie_file, policy, store, logger)
        downloader.get(course.index_url)

        _, index_store_path = store.path(course.index_url)
        logger.info("The course index can be found at #{index_store_path}")
      else
        logger.error("Failed to login.")
      end
    end

    # Path of the download_policy.yml shipped at the gem root.
    #
    # This file lives in lib/coursera_downloader/, so the gem root is two
    # directories above it; going up only one level pointed at
    # lib/download_policy.yml, which the gem does not contain.
    def self.default_policy_file
      File.expand_path("../../download_policy.yml", File.dirname(__FILE__))
    end
    private_class_method :default_policy_file
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require "cgi"
|
2
|
+
|
3
|
+
module CourseraDownloader
  # Path helpers shared by the file store.
  module Util
    class << self
      # Expresses +path+ relative to the directory containing +target_path+.
      #
      # Leading segments shared by both paths are dropped (the last segment of
      # +target_path+, its file name, is never consumed); one ".." is emitted
      # for each remaining directory of +target_path+.
      def path_relative_to_path(target_path, path)
        target_parts = target_path.split("/")
        parts = path.split("/")

        shared = 0
        while target_parts.length - shared > 1 && target_parts[shared] == parts[shared]
          shared += 1
        end

        climbs = Array.new(target_parts.length - shared - 1, "..")
        File.join(climbs + parts.drop(shared))
      end

      # CGI-escapes every "/"-separated segment of +path+, keeping the
      # separators themselves.
      def escape_path(path)
        segments = path.split("/")
        escaped = segments.map { |segment| CGI.escape(segment) }
        escaped.join("/")
      end
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,110 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: coursera_downloader
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Nick Ewing
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-12-10 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: curb
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0.8'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ~>
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0.8'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: nokogiri
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ~>
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '1.5'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ~>
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '1.5'
|
46
|
+
- !ruby/object:Gem::Dependency
|
47
|
+
name: colored
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
49
|
+
none: false
|
50
|
+
requirements:
|
51
|
+
- - ~>
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '1.2'
|
54
|
+
type: :runtime
|
55
|
+
prerelease: false
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ~>
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '1.2'
|
62
|
+
description: Download static versions of Coursera course websites..
|
63
|
+
email: ''
|
64
|
+
executables:
|
65
|
+
- coursera_downloader
|
66
|
+
extensions: []
|
67
|
+
extra_rdoc_files: []
|
68
|
+
files:
|
69
|
+
- Gemfile
|
70
|
+
- Gemfile.lock
|
71
|
+
- LICENSE
|
72
|
+
- README.markdown
|
73
|
+
- bin/coursera_downloader
|
74
|
+
- coursera_downloader.gemspec
|
75
|
+
- download_policy.yml
|
76
|
+
- lib/coursera_downloader.rb
|
77
|
+
- lib/coursera_downloader/course.rb
|
78
|
+
- lib/coursera_downloader/document.rb
|
79
|
+
- lib/coursera_downloader/document_processor.rb
|
80
|
+
- lib/coursera_downloader/downloader.rb
|
81
|
+
- lib/coursera_downloader/file_store.rb
|
82
|
+
- lib/coursera_downloader/log_formatter.rb
|
83
|
+
- lib/coursera_downloader/policy.rb
|
84
|
+
- lib/coursera_downloader/runner.rb
|
85
|
+
- lib/coursera_downloader/util.rb
|
86
|
+
homepage: https://github.com/nickewing/coursera_downloader
|
87
|
+
licenses: []
|
88
|
+
post_install_message:
|
89
|
+
rdoc_options: []
|
90
|
+
require_paths:
|
91
|
+
- lib
|
92
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
93
|
+
none: false
|
94
|
+
requirements:
|
95
|
+
- - ! '>='
|
96
|
+
- !ruby/object:Gem::Version
|
97
|
+
version: '0'
|
98
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
99
|
+
none: false
|
100
|
+
requirements:
|
101
|
+
- - ! '>='
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
requirements: []
|
105
|
+
rubyforge_project: ! '[none]'
|
106
|
+
rubygems_version: 1.8.24
|
107
|
+
signing_key:
|
108
|
+
specification_version: 3
|
109
|
+
summary: Download static versions of Coursera course websites.
|
110
|
+
test_files: []
|