mkwebook 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/mkwebook/app.rb +79 -16
- data/lib/mkwebook/cli.rb +1 -1
- data/lib/mkwebook/config.rb +2 -1
- data/lib/mkwebook/ext/string.rb +7 -5
- data/lib/mkwebook/version.rb +1 -1
- data/lib/template/mkwebook.yml +2 -0
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a7e29166ba302805e68e70779ef8de58870671aab0ae684d1cec2290f5a0b4bf
|
4
|
+
data.tar.gz: 5e530d48d11ce6c26ac5255b7b294b15b6f90bde7b4ecc4e36ee2bc0e0ea7d54
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1b90f0fbd51ad20e65847ca7fde950fc40651c3639a24f28b73c52580547e19e9b93f8e8a60247e3d56046afd2cfb9d758a5903c569b871093c841ad2513a52b
|
7
|
+
data.tar.gz: f5f17d96c4700ddd423fffe592a702812049a21e65ee113cccf12ba5c38b3dbc8af9a1307711ec96c03e4757e12a68c586e15497d33ccbf99e036078962e7cca
|
data/Gemfile.lock
CHANGED
data/lib/mkwebook/app.rb
CHANGED
@@ -16,6 +16,8 @@ module Mkwebook
|
|
16
16
|
end
|
17
17
|
@cli_options = cli_options
|
18
18
|
@config = Mkwebook::Config.new(@cli_options)
|
19
|
+
@downloaded_depth = 0
|
20
|
+
@downloaded_pages = []
|
19
21
|
end
|
20
22
|
|
21
23
|
def create_config
|
@@ -28,7 +30,10 @@ module Mkwebook
|
|
28
30
|
|
29
31
|
def download
|
30
32
|
download_index
|
33
|
+
append_extra_pages
|
31
34
|
download_pages
|
35
|
+
modify_page_links
|
36
|
+
post_process
|
32
37
|
end
|
33
38
|
|
34
39
|
def prepare_browser
|
@@ -51,10 +56,14 @@ module Mkwebook
|
|
51
56
|
end
|
52
57
|
end
|
53
58
|
|
54
|
-
def download_index
|
59
|
+
def download_index(only_index = false)
|
55
60
|
prepare_browser
|
56
61
|
index_page = @browser_context.create_page
|
62
|
+
begin
|
57
63
|
index_page.go_to(@config[:index_page][:url])
|
64
|
+
rescue Ferrum::PendingConnectionsError => e
|
65
|
+
index_page.go_to(@config[:index_page][:url])
|
66
|
+
end
|
58
67
|
index_page.network.wait_for_idle(timeout: 10) rescue nil
|
59
68
|
modifier = @config[:index_page][:modifier]
|
60
69
|
if modifier && File.file?(modifier)
|
@@ -67,8 +76,15 @@ module Mkwebook
|
|
67
76
|
@page_urls = index_elements.flat_map do |element|
|
68
77
|
url = element.css(@config[:index_page][:link_selector]).map { |a| a.evaluate('this.href') }
|
69
78
|
element.css(@config[:index_page][:link_selector]).each do |a|
|
70
|
-
u = a.evaluate('this.href')
|
71
|
-
|
79
|
+
u = a.evaluate('this.href')
|
80
|
+
href = u.normalize_uri('.html').relative_path_from(@config[:index_page][:output])
|
81
|
+
file = @config[:index_page][:output]
|
82
|
+
a.evaluate <<~JS
|
83
|
+
(function(that) {
|
84
|
+
that.setAttribute('data-mkwebook-href', '#{href.gsub("'", "\\\\'")}')
|
85
|
+
that.setAttribute('data-mkwebook-file', '#{file.gsub("'", "\\\\'")}')
|
86
|
+
})(this);
|
87
|
+
JS
|
72
88
|
end
|
73
89
|
url
|
74
90
|
end.uniq
|
@@ -77,9 +93,6 @@ module Mkwebook
|
|
77
93
|
@config[:pages].any? { |page| url =~ Regexp.new(page[:url_pattern]) }
|
78
94
|
end
|
79
95
|
|
80
|
-
@page_urls = @page_urls[0, @cli_options[:limit]] if @cli_options[:limit]
|
81
|
-
|
82
|
-
|
83
96
|
@config[:index_page][:title].try do |title|
|
84
97
|
index_page.execute("document.title = '#{title}'")
|
85
98
|
end
|
@@ -98,19 +111,25 @@ module Mkwebook
|
|
98
111
|
end.join("\n").tap do |html|
|
99
112
|
File.write(@config[:index_page][:output], html)
|
100
113
|
end
|
114
|
+
@downloaded_pages << {file: @config[:index_page][:output], url: @config[:index_page][:url]}
|
115
|
+
modify_page_links if only_index
|
101
116
|
rescue Ferrum::Error => e
|
102
117
|
binding.pry
|
103
118
|
end
|
104
119
|
|
105
120
|
def download_pages
|
106
|
-
|
107
|
-
append_extra_pages
|
121
|
+
return unless @downloaded_depth < @config[:max_recursion]
|
108
122
|
|
109
123
|
pool = Concurrent::FixedThreadPool.new(@config[:concurrency])
|
110
124
|
|
125
|
+
@page_urls = @page_urls[0, @cli_options[:limit]] if @cli_options[:limit]
|
126
|
+
|
127
|
+
@page_links = @page_urls.map { |url| [url, []] }.to_h
|
128
|
+
|
111
129
|
@page_urls.each do |url|
|
112
130
|
page_config = @config[:pages].find { |page| url =~ Regexp.new(page[:url_pattern]) }
|
113
131
|
next unless page_config
|
132
|
+
next if @downloaded_pages.any? { |page| page[:url] == url }
|
114
133
|
|
115
134
|
pool.post do
|
116
135
|
page = @browser_context.create_page
|
@@ -131,6 +150,13 @@ module Mkwebook
|
|
131
150
|
page.execute("document.title = '#{title}'")
|
132
151
|
end
|
133
152
|
|
153
|
+
if page_link_selector = page_config[:page_link_selector]
|
154
|
+
page_links = page_elements.flat_map do |element|
|
155
|
+
element.css(page_link_selector).map { |a| a.evaluate('this.href') }
|
156
|
+
end.uniq
|
157
|
+
@page_links[url] = page_links
|
158
|
+
end
|
159
|
+
|
134
160
|
page.execute <<-JS
|
135
161
|
for (var e of document.querySelectorAll('[integrity]')) {
|
136
162
|
e.removeAttribute('integrity');
|
@@ -142,18 +168,25 @@ module Mkwebook
|
|
142
168
|
|
143
169
|
page_elements.map do |element|
|
144
170
|
element.css('a').each do |a|
|
145
|
-
u = a.evaluate('this.href')
|
146
|
-
next unless
|
147
|
-
|
148
|
-
|
149
|
-
a.evaluate
|
171
|
+
u = a.evaluate('this.href') rescue nil
|
172
|
+
next unless u.present?
|
173
|
+
href = u.normalize_uri('.html').relative_path_from(url.normalize_uri('.html'))
|
174
|
+
file = u.normalize_file_path('.html')
|
175
|
+
a.evaluate <<~JS
|
176
|
+
(function(that) {
|
177
|
+
that.setAttribute('data-mkwebook-href', '#{href.gsub("'", "\\\\'")}')
|
178
|
+
that.setAttribute('data-mkwebook-file', '#{file.gsub("'", "\\\\'")}')
|
179
|
+
})(this);
|
180
|
+
JS
|
150
181
|
end
|
151
182
|
element.evaluate('this.outerHTML')
|
152
183
|
end.join("\n").tap do |html|
|
153
184
|
FileUtils.mkdir_p(File.dirname(output))
|
154
185
|
File.write(output, html)
|
155
186
|
end
|
156
|
-
|
187
|
+
|
188
|
+
@downloaded_pages << {file: output, url: url}
|
189
|
+
rescue => e
|
157
190
|
$stderr.puts e.message
|
158
191
|
$stderr.puts e.backtrace
|
159
192
|
binding.pry if @cli_options[:pause_on_error]
|
@@ -161,13 +194,14 @@ module Mkwebook
|
|
161
194
|
page.close
|
162
195
|
end
|
163
196
|
end
|
164
|
-
|
165
197
|
end
|
166
198
|
|
167
199
|
pool.shutdown
|
168
200
|
pool.wait_for_termination
|
169
201
|
|
170
|
-
|
202
|
+
@page_urls = @page_links.flat_map(&:last).uniq
|
203
|
+
@downloaded_depth += 1
|
204
|
+
download_pages
|
171
205
|
end
|
172
206
|
|
173
207
|
def post_process
|
@@ -305,6 +339,35 @@ module Mkwebook
|
|
305
339
|
puts IO.read("#{__dir__}/entry_types.txt")
|
306
340
|
end
|
307
341
|
|
342
|
+
def modify_page_links
|
343
|
+
pool = Concurrent::FixedThreadPool.new(@config[:concurrency])
|
344
|
+
downloaded_files = @downloaded_pages.map { |page| page[:file] }
|
345
|
+
downloaded_files.each do |file|
|
346
|
+
pool.post do
|
347
|
+
begin
|
348
|
+
page = @browser_context.create_page
|
349
|
+
page.go_to("file://#{File.expand_path(file)}")
|
350
|
+
page.css('a').each do |a|
|
351
|
+
href = a.evaluate('this.getAttribute("data-mkwebook-href")') rescue nil
|
352
|
+
next unless href
|
353
|
+
f = a.evaluate('this.getAttribute("data-mkwebook-file")')
|
354
|
+
next unless href && f && downloaded_files.include?(f)
|
355
|
+
a.evaluate("this.href = this.getAttribute('data-mkwebook-href')")
|
356
|
+
end
|
357
|
+
File.write(file, page.evaluate('document.querySelector("html").outerHTML'))
|
358
|
+
rescue Ferrum::Error => e
|
359
|
+
$stderr.puts e.message
|
360
|
+
$stderr.puts e.backtrace
|
361
|
+
binding.pry if @cli_options[:pause_on_error]
|
362
|
+
ensure
|
363
|
+
page.close
|
364
|
+
end
|
365
|
+
end
|
366
|
+
end
|
367
|
+
pool.shutdown
|
368
|
+
pool.wait_for_termination
|
369
|
+
end
|
370
|
+
|
308
371
|
private
|
309
372
|
|
310
373
|
def browser_options
|
data/lib/mkwebook/cli.rb
CHANGED
@@ -19,7 +19,7 @@ module Mkwebook
|
|
19
19
|
option :pause, :type => :boolean, :aliases => '-p', :desc => 'Pause after processing index page'
|
20
20
|
desc 'download_index', 'Download and process index page'
|
21
21
|
def download_index
|
22
|
-
Mkwebook::App.new(options).download_index
|
22
|
+
Mkwebook::App.new(options).download_index(true)
|
23
23
|
end
|
24
24
|
|
25
25
|
option :limit, :type => :numeric, :aliases => '-l', :desc => 'Limit number of pages, specially for debugging'
|
data/lib/mkwebook/config.rb
CHANGED
@@ -22,7 +22,8 @@ module Mkwebook
|
|
22
22
|
'browser' => {
|
23
23
|
'headless' => true,
|
24
24
|
},
|
25
|
-
'concurrency': 1
|
25
|
+
'concurrency': 1,
|
26
|
+
'max-recursion': 1
|
26
27
|
}
|
27
28
|
config = YAML.load_file(config_file)
|
28
29
|
config = default_config.deep_merge(config).deep_transform_keys! { |k| k.to_s.underscore.to_sym }
|
data/lib/mkwebook/ext/string.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'uri'
|
2
|
+
|
1
3
|
class String
|
2
4
|
def p
|
3
5
|
puts self
|
@@ -12,24 +14,24 @@ class String
|
|
12
14
|
end
|
13
15
|
|
14
16
|
def normalize_file_path(force_extname = nil)
|
17
|
+
return self unless present?
|
15
18
|
uri = URI.parse(self)
|
16
19
|
file_path = uri.path[1..]
|
17
|
-
extname = File.extname(file_path)
|
20
|
+
extname = force_extname || File.extname(file_path)
|
18
21
|
basename = File.basename(file_path, extname)
|
19
22
|
origin = "#{uri.scheme.try { |s| s + '_' }}#{uri.host}#{uri.port.try { |p| '_' + p.to_s }}"
|
20
23
|
basename += "_#{Digest::MD5.hexdigest(uri.query)}" if uri.query.present?
|
21
|
-
|
22
|
-
File.join(origin, File.dirname(file_path), basename + extname)
|
24
|
+
URI.decode_www_form_component(File.join(origin, File.dirname(file_path), basename + extname))
|
23
25
|
end
|
24
26
|
|
25
27
|
def normalize_uri(force_extname = nil)
|
28
|
+
return self unless present?
|
26
29
|
uri = URI.parse(self)
|
27
30
|
file_path = uri.path[1..]
|
28
|
-
extname = File.extname(file_path)
|
31
|
+
extname = force_extname || File.extname(file_path)
|
29
32
|
basename = File.basename(file_path, extname)
|
30
33
|
basename += "_#{Digest::MD5.hexdigest(uri.query)}" if uri.query.present?
|
31
34
|
origin = "#{uri.scheme.try { |s| s + '_' }}#{uri.host}#{uri.port.try { |p| '_' + p.to_s }}"
|
32
|
-
extname = force_extname if force_extname && extname.empty?
|
33
35
|
file_path = File.join(origin, File.dirname(file_path), basename + extname)
|
34
36
|
if uri.fragment.present?
|
35
37
|
file_path += "##{uri.fragment}"
|
data/lib/mkwebook/version.rb
CHANGED
data/lib/template/mkwebook.yml
CHANGED
@@ -30,6 +30,7 @@ index-page: # index page settings
|
|
30
30
|
- selector: "script[src]"
|
31
31
|
attr: src
|
32
32
|
|
33
|
+
max-recursion: 2 # max depth of recursive downloading
|
33
34
|
|
34
35
|
pages: # settings for content pages
|
35
36
|
- url-pattern: '.*' # URL pattern for content page, only pages' URL matching this pattern will be processed
|
@@ -41,6 +42,7 @@ pages: # settings for content pages
|
|
41
42
|
style.innerHTML = '.clj-content-container { margin-left: 0; }';
|
42
43
|
document.body.appendChild(style);
|
43
44
|
selector: html # CSS selector for the content to be saved
|
45
|
+
page-link-selector: "a:not([href='../guides'])" # links to be downloaded recursively which are extracted from page content
|
44
46
|
assets: # assets to be downloaded
|
45
47
|
- selector: img # CSS selector for assets
|
46
48
|
attr: src # attribute name for the asset URL
|