mkwebook 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/mkwebook/app.rb +79 -16
- data/lib/mkwebook/cli.rb +1 -1
- data/lib/mkwebook/config.rb +2 -1
- data/lib/mkwebook/ext/string.rb +7 -5
- data/lib/mkwebook/version.rb +1 -1
- data/lib/template/mkwebook.yml +2 -0
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a7e29166ba302805e68e70779ef8de58870671aab0ae684d1cec2290f5a0b4bf
|
4
|
+
data.tar.gz: 5e530d48d11ce6c26ac5255b7b294b15b6f90bde7b4ecc4e36ee2bc0e0ea7d54
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1b90f0fbd51ad20e65847ca7fde950fc40651c3639a24f28b73c52580547e19e9b93f8e8a60247e3d56046afd2cfb9d758a5903c569b871093c841ad2513a52b
|
7
|
+
data.tar.gz: f5f17d96c4700ddd423fffe592a702812049a21e65ee113cccf12ba5c38b3dbc8af9a1307711ec96c03e4757e12a68c586e15497d33ccbf99e036078962e7cca
|
data/Gemfile.lock
CHANGED
data/lib/mkwebook/app.rb
CHANGED
@@ -16,6 +16,8 @@ module Mkwebook
|
|
16
16
|
end
|
17
17
|
@cli_options = cli_options
|
18
18
|
@config = Mkwebook::Config.new(@cli_options)
|
19
|
+
@downloaded_depth = 0
|
20
|
+
@downloaded_pages = []
|
19
21
|
end
|
20
22
|
|
21
23
|
def create_config
|
@@ -28,7 +30,10 @@ module Mkwebook
|
|
28
30
|
|
29
31
|
def download
|
30
32
|
download_index
|
33
|
+
append_extra_pages
|
31
34
|
download_pages
|
35
|
+
modify_page_links
|
36
|
+
post_process
|
32
37
|
end
|
33
38
|
|
34
39
|
def prepare_browser
|
@@ -51,10 +56,14 @@ module Mkwebook
|
|
51
56
|
end
|
52
57
|
end
|
53
58
|
|
54
|
-
def download_index
|
59
|
+
def download_index(only_index = false)
|
55
60
|
prepare_browser
|
56
61
|
index_page = @browser_context.create_page
|
62
|
+
begin
|
57
63
|
index_page.go_to(@config[:index_page][:url])
|
64
|
+
rescue Ferrum::PendingConnectionsError => e
|
65
|
+
index_page.go_to(@config[:index_page][:url])
|
66
|
+
end
|
58
67
|
index_page.network.wait_for_idle(timeout: 10) rescue nil
|
59
68
|
modifier = @config[:index_page][:modifier]
|
60
69
|
if modifier && File.file?(modifier)
|
@@ -67,8 +76,15 @@ module Mkwebook
|
|
67
76
|
@page_urls = index_elements.flat_map do |element|
|
68
77
|
url = element.css(@config[:index_page][:link_selector]).map { |a| a.evaluate('this.href') }
|
69
78
|
element.css(@config[:index_page][:link_selector]).each do |a|
|
70
|
-
u = a.evaluate('this.href')
|
71
|
-
|
79
|
+
u = a.evaluate('this.href')
|
80
|
+
href = u.normalize_uri('.html').relative_path_from(@config[:index_page][:output])
|
81
|
+
file = @config[:index_page][:output]
|
82
|
+
a.evaluate <<~JS
|
83
|
+
(function(that) {
|
84
|
+
that.setAttribute('data-mkwebook-href', '#{href.gsub("'", "\\\\'")}')
|
85
|
+
that.setAttribute('data-mkwebook-file', '#{file.gsub("'", "\\\\'")}')
|
86
|
+
})(this);
|
87
|
+
JS
|
72
88
|
end
|
73
89
|
url
|
74
90
|
end.uniq
|
@@ -77,9 +93,6 @@ module Mkwebook
|
|
77
93
|
@config[:pages].any? { |page| url =~ Regexp.new(page[:url_pattern]) }
|
78
94
|
end
|
79
95
|
|
80
|
-
@page_urls = @page_urls[0, @cli_options[:limit]] if @cli_options[:limit]
|
81
|
-
|
82
|
-
|
83
96
|
@config[:index_page][:title].try do |title|
|
84
97
|
index_page.execute("document.title = '#{title}'")
|
85
98
|
end
|
@@ -98,19 +111,25 @@ module Mkwebook
|
|
98
111
|
end.join("\n").tap do |html|
|
99
112
|
File.write(@config[:index_page][:output], html)
|
100
113
|
end
|
114
|
+
@downloaded_pages << {file: @config[:index_page][:output], url: @config[:index_page][:url]}
|
115
|
+
modify_page_links if only_index
|
101
116
|
rescue Ferrum::Error => e
|
102
117
|
binding.pry
|
103
118
|
end
|
104
119
|
|
105
120
|
def download_pages
|
106
|
-
|
107
|
-
append_extra_pages
|
121
|
+
return unless @downloaded_depth < @config[:max_recursion]
|
108
122
|
|
109
123
|
pool = Concurrent::FixedThreadPool.new(@config[:concurrency])
|
110
124
|
|
125
|
+
@page_urls = @page_urls[0, @cli_options[:limit]] if @cli_options[:limit]
|
126
|
+
|
127
|
+
@page_links = @page_urls.map { |url| [url, []] }.to_h
|
128
|
+
|
111
129
|
@page_urls.each do |url|
|
112
130
|
page_config = @config[:pages].find { |page| url =~ Regexp.new(page[:url_pattern]) }
|
113
131
|
next unless page_config
|
132
|
+
next if @downloaded_pages.any? { |page| page[:url] == url }
|
114
133
|
|
115
134
|
pool.post do
|
116
135
|
page = @browser_context.create_page
|
@@ -131,6 +150,13 @@ module Mkwebook
|
|
131
150
|
page.execute("document.title = '#{title}'")
|
132
151
|
end
|
133
152
|
|
153
|
+
if page_link_selector = page_config[:page_link_selector]
|
154
|
+
page_links = page_elements.flat_map do |element|
|
155
|
+
element.css(page_link_selector).map { |a| a.evaluate('this.href') }
|
156
|
+
end.uniq
|
157
|
+
@page_links[url] = page_links
|
158
|
+
end
|
159
|
+
|
134
160
|
page.execute <<-JS
|
135
161
|
for (var e of document.querySelectorAll('[integrity]')) {
|
136
162
|
e.removeAttribute('integrity');
|
@@ -142,18 +168,25 @@ module Mkwebook
|
|
142
168
|
|
143
169
|
page_elements.map do |element|
|
144
170
|
element.css('a').each do |a|
|
145
|
-
u = a.evaluate('this.href')
|
146
|
-
next unless
|
147
|
-
|
148
|
-
|
149
|
-
a.evaluate
|
171
|
+
u = a.evaluate('this.href') rescue nil
|
172
|
+
next unless u.present?
|
173
|
+
href = u.normalize_uri('.html').relative_path_from(url.normalize_uri('.html'))
|
174
|
+
file = u.normalize_file_path('.html')
|
175
|
+
a.evaluate <<~JS
|
176
|
+
(function(that) {
|
177
|
+
that.setAttribute('data-mkwebook-href', '#{href.gsub("'", "\\\\'")}')
|
178
|
+
that.setAttribute('data-mkwebook-file', '#{file.gsub("'", "\\\\'")}')
|
179
|
+
})(this);
|
180
|
+
JS
|
150
181
|
end
|
151
182
|
element.evaluate('this.outerHTML')
|
152
183
|
end.join("\n").tap do |html|
|
153
184
|
FileUtils.mkdir_p(File.dirname(output))
|
154
185
|
File.write(output, html)
|
155
186
|
end
|
156
|
-
|
187
|
+
|
188
|
+
@downloaded_pages << {file: output, url: url}
|
189
|
+
rescue => e
|
157
190
|
$stderr.puts e.message
|
158
191
|
$stderr.puts e.backtrace
|
159
192
|
binding.pry if @cli_options[:pause_on_error]
|
@@ -161,13 +194,14 @@ module Mkwebook
|
|
161
194
|
page.close
|
162
195
|
end
|
163
196
|
end
|
164
|
-
|
165
197
|
end
|
166
198
|
|
167
199
|
pool.shutdown
|
168
200
|
pool.wait_for_termination
|
169
201
|
|
170
|
-
|
202
|
+
@page_urls = @page_links.flat_map(&:last).uniq
|
203
|
+
@downloaded_depth += 1
|
204
|
+
download_pages
|
171
205
|
end
|
172
206
|
|
173
207
|
def post_process
|
@@ -305,6 +339,35 @@ module Mkwebook
|
|
305
339
|
puts IO.read("#{__dir__}/entry_types.txt")
|
306
340
|
end
|
307
341
|
|
342
|
+
def modify_page_links
|
343
|
+
pool = Concurrent::FixedThreadPool.new(@config[:concurrency])
|
344
|
+
downloaded_files = @downloaded_pages.map { |page| page[:file] }
|
345
|
+
downloaded_files.each do |file|
|
346
|
+
pool.post do
|
347
|
+
begin
|
348
|
+
page = @browser_context.create_page
|
349
|
+
page.go_to("file://#{File.expand_path(file)}")
|
350
|
+
page.css('a').each do |a|
|
351
|
+
href = a.evaluate('this.getAttribute("data-mkwebook-href")') rescue nil
|
352
|
+
next unless href
|
353
|
+
f = a.evaluate('this.getAttribute("data-mkwebook-file")')
|
354
|
+
next unless href && f && downloaded_files.include?(f)
|
355
|
+
a.evaluate("this.href = this.getAttribute('data-mkwebook-href')")
|
356
|
+
end
|
357
|
+
File.write(file, page.evaluate('document.querySelector("html").outerHTML'))
|
358
|
+
rescue Ferrum::Error => e
|
359
|
+
$stderr.puts e.message
|
360
|
+
$stderr.puts e.backtrace
|
361
|
+
binding.pry if @cli_options[:pause_on_error]
|
362
|
+
ensure
|
363
|
+
page.close
|
364
|
+
end
|
365
|
+
end
|
366
|
+
end
|
367
|
+
pool.shutdown
|
368
|
+
pool.wait_for_termination
|
369
|
+
end
|
370
|
+
|
308
371
|
private
|
309
372
|
|
310
373
|
def browser_options
|
data/lib/mkwebook/cli.rb
CHANGED
@@ -19,7 +19,7 @@ module Mkwebook
|
|
19
19
|
option :pause, :type => :boolean, :aliases => '-p', :desc => 'Pause after processing index page'
|
20
20
|
desc 'download_index', 'Download and process index page'
|
21
21
|
def download_index
|
22
|
-
Mkwebook::App.new(options).download_index
|
22
|
+
Mkwebook::App.new(options).download_index(true)
|
23
23
|
end
|
24
24
|
|
25
25
|
option :limit, :type => :numeric, :aliases => '-l', :desc => 'Limit number of pages, specially for debugging'
|
data/lib/mkwebook/config.rb
CHANGED
@@ -22,7 +22,8 @@ module Mkwebook
|
|
22
22
|
'browser' => {
|
23
23
|
'headless' => true,
|
24
24
|
},
|
25
|
-
'concurrency': 1
|
25
|
+
'concurrency': 1,
|
26
|
+
'max-recursion': 1
|
26
27
|
}
|
27
28
|
config = YAML.load_file(config_file)
|
28
29
|
config = default_config.deep_merge(config).deep_transform_keys! { |k| k.to_s.underscore.to_sym }
|
data/lib/mkwebook/ext/string.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'uri'
|
2
|
+
|
1
3
|
class String
|
2
4
|
def p
|
3
5
|
puts self
|
@@ -12,24 +14,24 @@ class String
|
|
12
14
|
end
|
13
15
|
|
14
16
|
def normalize_file_path(force_extname = nil)
|
17
|
+
return self unless present?
|
15
18
|
uri = URI.parse(self)
|
16
19
|
file_path = uri.path[1..]
|
17
|
-
extname = File.extname(file_path)
|
20
|
+
extname = force_extname || File.extname(file_path)
|
18
21
|
basename = File.basename(file_path, extname)
|
19
22
|
origin = "#{uri.scheme.try { |s| s + '_' }}#{uri.host}#{uri.port.try { |p| '_' + p.to_s }}"
|
20
23
|
basename += "_#{Digest::MD5.hexdigest(uri.query)}" if uri.query.present?
|
21
|
-
|
22
|
-
File.join(origin, File.dirname(file_path), basename + extname)
|
24
|
+
URI.decode_www_form_component(File.join(origin, File.dirname(file_path), basename + extname))
|
23
25
|
end
|
24
26
|
|
25
27
|
def normalize_uri(force_extname = nil)
|
28
|
+
return self unless present?
|
26
29
|
uri = URI.parse(self)
|
27
30
|
file_path = uri.path[1..]
|
28
|
-
extname = File.extname(file_path)
|
31
|
+
extname = force_extname || File.extname(file_path)
|
29
32
|
basename = File.basename(file_path, extname)
|
30
33
|
basename += "_#{Digest::MD5.hexdigest(uri.query)}" if uri.query.present?
|
31
34
|
origin = "#{uri.scheme.try { |s| s + '_' }}#{uri.host}#{uri.port.try { |p| '_' + p.to_s }}"
|
32
|
-
extname = force_extname if force_extname && extname.empty?
|
33
35
|
file_path = File.join(origin, File.dirname(file_path), basename + extname)
|
34
36
|
if uri.fragment.present?
|
35
37
|
file_path += "##{uri.fragment}"
|
data/lib/mkwebook/version.rb
CHANGED
data/lib/template/mkwebook.yml
CHANGED
@@ -30,6 +30,7 @@ index-page: # index page settings
|
|
30
30
|
- selector: "script[src]"
|
31
31
|
attr: src
|
32
32
|
|
33
|
+
max-recursion: 2 # max depth of recursive downloading
|
33
34
|
|
34
35
|
pages: # settings for content pages
|
35
36
|
- url-pattern: '.*' # URL pattern for content page, only pages' URL matching this pattern will be processed
|
@@ -41,6 +42,7 @@ pages: # settings for content pages
|
|
41
42
|
style.innerHTML = '.clj-content-container { margin-left: 0; }';
|
42
43
|
document.body.appendChild(style);
|
43
44
|
selector: html # CSS selector for the content to be saved
|
45
|
+
page-link-selector: "a:not([href='../guides'])" # links to be downloaded recursively which are extracted from page content
|
44
46
|
assets: # assets to be downloaded
|
45
47
|
- selector: img # CSS selector for assets
|
46
48
|
attr: src # attribute name for the asset URL
|