mkwebook 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/mkwebook/app.rb +79 -16
- data/lib/mkwebook/cli.rb +1 -1
- data/lib/mkwebook/config.rb +2 -1
- data/lib/mkwebook/ext/string.rb +7 -5
- data/lib/mkwebook/version.rb +1 -1
- data/lib/template/mkwebook.yml +24 -22
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0caf9753412a2f7479f03a442361ad896c31982ca7225f604311f286d1111685
|
4
|
+
data.tar.gz: 2502a70592588c4b2e62c3c36a5ae38b9801109ac8a8eba3d449f9be50088261
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c32259253e673b94040eea46bcbbd4356b5dcc9622943892c32a343029863eb6c059e7a6b45981d3c03e8fc90a7aaf475f5d952d2e4fa315796f5658dcb1e7b4
|
7
|
+
data.tar.gz: f18163c7ec7a720ba600ae9d3a66d48f7df350d3c59dfd2b903f17bdeacc81bf419e6e1ecab858ee26c674078ed853c3f60c99a70643f01f9b100546828a324b
|
data/Gemfile.lock
CHANGED
data/lib/mkwebook/app.rb
CHANGED
@@ -16,6 +16,8 @@ module Mkwebook
|
|
16
16
|
end
|
17
17
|
@cli_options = cli_options
|
18
18
|
@config = Mkwebook::Config.new(@cli_options)
|
19
|
+
@downloaded_depth = 0
|
20
|
+
@downloaded_pages = []
|
19
21
|
end
|
20
22
|
|
21
23
|
def create_config
|
@@ -28,7 +30,10 @@ module Mkwebook
|
|
28
30
|
|
29
31
|
def download
|
30
32
|
download_index
|
33
|
+
append_extra_pages
|
31
34
|
download_pages
|
35
|
+
modify_page_links
|
36
|
+
post_process
|
32
37
|
end
|
33
38
|
|
34
39
|
def prepare_browser
|
@@ -51,10 +56,14 @@ module Mkwebook
|
|
51
56
|
end
|
52
57
|
end
|
53
58
|
|
54
|
-
def download_index
|
59
|
+
def download_index(only_index = false)
|
55
60
|
prepare_browser
|
56
61
|
index_page = @browser_context.create_page
|
62
|
+
begin
|
57
63
|
index_page.go_to(@config[:index_page][:url])
|
64
|
+
rescue Ferrum::PendingConnectionsError => e
|
65
|
+
index_page.go_to(@config[:index_page][:url])
|
66
|
+
end
|
58
67
|
index_page.network.wait_for_idle(timeout: 10) rescue nil
|
59
68
|
modifier = @config[:index_page][:modifier]
|
60
69
|
if modifier && File.file?(modifier)
|
@@ -67,8 +76,15 @@ module Mkwebook
|
|
67
76
|
@page_urls = index_elements.flat_map do |element|
|
68
77
|
url = element.css(@config[:index_page][:link_selector]).map { |a| a.evaluate('this.href') }
|
69
78
|
element.css(@config[:index_page][:link_selector]).each do |a|
|
70
|
-
u = a.evaluate('this.href')
|
71
|
-
|
79
|
+
u = a.evaluate('this.href')
|
80
|
+
href = u.normalize_uri('.html').relative_path_from(@config[:index_page][:output])
|
81
|
+
file = @config[:index_page][:output]
|
82
|
+
a.evaluate <<~JS
|
83
|
+
(function(that) {
|
84
|
+
that.setAttribute('data-mkwebook-href', '#{href.gsub("'", "\\\\'")}')
|
85
|
+
that.setAttribute('data-mkwebook-file', '#{file.gsub("'", "\\\\'")}')
|
86
|
+
})(this);
|
87
|
+
JS
|
72
88
|
end
|
73
89
|
url
|
74
90
|
end.uniq
|
@@ -77,9 +93,6 @@ module Mkwebook
|
|
77
93
|
@config[:pages].any? { |page| url =~ Regexp.new(page[:url_pattern]) }
|
78
94
|
end
|
79
95
|
|
80
|
-
@page_urls = @page_urls[0, @cli_options[:limit]] if @cli_options[:limit]
|
81
|
-
|
82
|
-
|
83
96
|
@config[:index_page][:title].try do |title|
|
84
97
|
index_page.execute("document.title = '#{title}'")
|
85
98
|
end
|
@@ -98,19 +111,25 @@ module Mkwebook
|
|
98
111
|
end.join("\n").tap do |html|
|
99
112
|
File.write(@config[:index_page][:output], html)
|
100
113
|
end
|
114
|
+
@downloaded_pages << {file: @config[:index_page][:output], url: @config[:index_page][:url]}
|
115
|
+
modify_page_links if only_index
|
101
116
|
rescue Ferrum::Error => e
|
102
117
|
binding.pry
|
103
118
|
end
|
104
119
|
|
105
120
|
def download_pages
|
106
|
-
|
107
|
-
append_extra_pages
|
121
|
+
return unless @downloaded_depth < @config[:max_recursion]
|
108
122
|
|
109
123
|
pool = Concurrent::FixedThreadPool.new(@config[:concurrency])
|
110
124
|
|
125
|
+
@page_urls = @page_urls[0, @cli_options[:limit]] if @cli_options[:limit]
|
126
|
+
|
127
|
+
@page_links = @page_urls.map { |url| [url, []] }.to_h
|
128
|
+
|
111
129
|
@page_urls.each do |url|
|
112
130
|
page_config = @config[:pages].find { |page| url =~ Regexp.new(page[:url_pattern]) }
|
113
131
|
next unless page_config
|
132
|
+
next if @downloaded_pages.any? { |page| page[:url] == url }
|
114
133
|
|
115
134
|
pool.post do
|
116
135
|
page = @browser_context.create_page
|
@@ -131,6 +150,13 @@ module Mkwebook
|
|
131
150
|
page.execute("document.title = '#{title}'")
|
132
151
|
end
|
133
152
|
|
153
|
+
if page_link_selector = page_config[:page_link_selector]
|
154
|
+
page_links = page_elements.flat_map do |element|
|
155
|
+
element.css(page_link_selector).map { |a| a.evaluate('this.href') }
|
156
|
+
end.uniq
|
157
|
+
@page_links[url] = page_links
|
158
|
+
end
|
159
|
+
|
134
160
|
page.execute <<-JS
|
135
161
|
for (var e of document.querySelectorAll('[integrity]')) {
|
136
162
|
e.removeAttribute('integrity');
|
@@ -142,18 +168,25 @@ module Mkwebook
|
|
142
168
|
|
143
169
|
page_elements.map do |element|
|
144
170
|
element.css('a').each do |a|
|
145
|
-
u = a.evaluate('this.href')
|
146
|
-
next unless
|
147
|
-
|
148
|
-
|
149
|
-
a.evaluate
|
171
|
+
u = a.evaluate('this.href') rescue nil
|
172
|
+
next unless u.present?
|
173
|
+
href = u.normalize_uri('.html').relative_path_from(url.normalize_uri('.html'))
|
174
|
+
file = u.normalize_file_path('.html')
|
175
|
+
a.evaluate <<~JS
|
176
|
+
(function(that) {
|
177
|
+
that.setAttribute('data-mkwebook-href', '#{href.gsub("'", "\\\\'")}')
|
178
|
+
that.setAttribute('data-mkwebook-file', '#{file.gsub("'", "\\\\'")}')
|
179
|
+
})(this);
|
180
|
+
JS
|
150
181
|
end
|
151
182
|
element.evaluate('this.outerHTML')
|
152
183
|
end.join("\n").tap do |html|
|
153
184
|
FileUtils.mkdir_p(File.dirname(output))
|
154
185
|
File.write(output, html)
|
155
186
|
end
|
156
|
-
|
187
|
+
|
188
|
+
@downloaded_pages << {file: output, url: url}
|
189
|
+
rescue => e
|
157
190
|
$stderr.puts e.message
|
158
191
|
$stderr.puts e.backtrace
|
159
192
|
binding.pry if @cli_options[:pause_on_error]
|
@@ -161,13 +194,14 @@ module Mkwebook
|
|
161
194
|
page.close
|
162
195
|
end
|
163
196
|
end
|
164
|
-
|
165
197
|
end
|
166
198
|
|
167
199
|
pool.shutdown
|
168
200
|
pool.wait_for_termination
|
169
201
|
|
170
|
-
|
202
|
+
@page_urls = @page_links.flat_map(&:last).uniq
|
203
|
+
@downloaded_depth += 1
|
204
|
+
download_pages
|
171
205
|
end
|
172
206
|
|
173
207
|
def post_process
|
@@ -305,6 +339,35 @@ module Mkwebook
|
|
305
339
|
puts IO.read("#{__dir__}/entry_types.txt")
|
306
340
|
end
|
307
341
|
|
342
|
+
def modify_page_links
|
343
|
+
pool = Concurrent::FixedThreadPool.new(@config[:concurrency])
|
344
|
+
downloaded_files = @downloaded_pages.map { |page| page[:file] }
|
345
|
+
downloaded_files.each do |file|
|
346
|
+
pool.post do
|
347
|
+
begin
|
348
|
+
page = @browser_context.create_page
|
349
|
+
page.go_to("file://#{File.expand_path(file)}")
|
350
|
+
page.css('a').each do |a|
|
351
|
+
href = a.evaluate('this.getAttribute("data-mkwebook-href")') rescue nil
|
352
|
+
next unless href
|
353
|
+
f = a.evaluate('this.getAttribute("data-mkwebook-file")')
|
354
|
+
next unless href && f && downloaded_files.include?(f)
|
355
|
+
a.evaluate("this.href = this.getAttribute('data-mkwebook-href')")
|
356
|
+
end
|
357
|
+
File.write(file, page.evaluate('document.querySelector("html").outerHTML'))
|
358
|
+
rescue Ferrum::Error => e
|
359
|
+
$stderr.puts e.message
|
360
|
+
$stderr.puts e.backtrace
|
361
|
+
binding.pry if @cli_options[:pause_on_error]
|
362
|
+
ensure
|
363
|
+
page.close
|
364
|
+
end
|
365
|
+
end
|
366
|
+
end
|
367
|
+
pool.shutdown
|
368
|
+
pool.wait_for_termination
|
369
|
+
end
|
370
|
+
|
308
371
|
private
|
309
372
|
|
310
373
|
def browser_options
|
data/lib/mkwebook/cli.rb
CHANGED
@@ -19,7 +19,7 @@ module Mkwebook
|
|
19
19
|
option :pause, :type => :boolean, :aliases => '-p', :desc => 'Pause after processing index page'
|
20
20
|
desc 'download_index', 'Download and process index page'
|
21
21
|
def download_index
|
22
|
-
Mkwebook::App.new(options).download_index
|
22
|
+
Mkwebook::App.new(options).download_index(true)
|
23
23
|
end
|
24
24
|
|
25
25
|
option :limit, :type => :numeric, :aliases => '-l', :desc => 'Limit number of pages, specially for debugging'
|
data/lib/mkwebook/config.rb
CHANGED
@@ -22,7 +22,8 @@ module Mkwebook
|
|
22
22
|
'browser' => {
|
23
23
|
'headless' => true,
|
24
24
|
},
|
25
|
-
'concurrency': 1
|
25
|
+
'concurrency': 1,
|
26
|
+
'max-recursion': 1
|
26
27
|
}
|
27
28
|
config = YAML.load_file(config_file)
|
28
29
|
config = default_config.deep_merge(config).deep_transform_keys! { |k| k.to_s.underscore.to_sym }
|
data/lib/mkwebook/ext/string.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
require 'uri'
|
2
|
+
|
1
3
|
class String
|
2
4
|
def p
|
3
5
|
puts self
|
@@ -12,24 +14,24 @@ class String
|
|
12
14
|
end
|
13
15
|
|
14
16
|
def normalize_file_path(force_extname = nil)
|
17
|
+
return self unless present?
|
15
18
|
uri = URI.parse(self)
|
16
19
|
file_path = uri.path[1..]
|
17
|
-
extname = File.extname(file_path)
|
20
|
+
extname = force_extname || File.extname(file_path)
|
18
21
|
basename = File.basename(file_path, extname)
|
19
22
|
origin = "#{uri.scheme.try { |s| s + '_' }}#{uri.host}#{uri.port.try { |p| '_' + p.to_s }}"
|
20
23
|
basename += "_#{Digest::MD5.hexdigest(uri.query)}" if uri.query.present?
|
21
|
-
|
22
|
-
File.join(origin, File.dirname(file_path), basename + extname)
|
24
|
+
URI.decode_www_form_component(File.join(origin, File.dirname(file_path), basename + extname))
|
23
25
|
end
|
24
26
|
|
25
27
|
def normalize_uri(force_extname = nil)
|
28
|
+
return self unless present?
|
26
29
|
uri = URI.parse(self)
|
27
30
|
file_path = uri.path[1..]
|
28
|
-
extname = File.extname(file_path)
|
31
|
+
extname = force_extname || File.extname(file_path)
|
29
32
|
basename = File.basename(file_path, extname)
|
30
33
|
basename += "_#{Digest::MD5.hexdigest(uri.query)}" if uri.query.present?
|
31
34
|
origin = "#{uri.scheme.try { |s| s + '_' }}#{uri.host}#{uri.port.try { |p| '_' + p.to_s }}"
|
32
|
-
extname = force_extname if force_extname && extname.empty?
|
33
35
|
file_path = File.join(origin, File.dirname(file_path), basename + extname)
|
34
36
|
if uri.fragment.present?
|
35
37
|
file_path += "##{uri.fragment}"
|
data/lib/mkwebook/version.rb
CHANGED
data/lib/template/mkwebook.yml
CHANGED
@@ -1,47 +1,49 @@
|
|
1
|
-
browser: # browser settings
|
2
|
-
headless: false # headless mode
|
3
|
-
window_size: [
|
1
|
+
browser: # browser settings, this setting is optional
|
2
|
+
headless: false # headless mode, the -H CLI option could be used to override this setting
|
3
|
+
window_size: [1920, 1200] # browser window size
|
4
4
|
timeout: 30 # timeout for waiting for page loading
|
5
5
|
# Any options accepted by Ferrum::Browser.new are allowed here
|
6
6
|
|
7
7
|
concurrency: 16 # number of concurrent threads, default is 1 (no concurrency)
|
8
8
|
|
9
|
-
authentication: # authentication settings
|
9
|
+
authentication: # authentication settings, this setting is optional
|
10
10
|
url: https://example.com/login # URL of any page on the site, used to inject the cookies and local storage
|
11
11
|
cookies: "auth_cookie_id=demo" # cookie string to be injected
|
12
12
|
local-storage: # local storage to be injected
|
13
13
|
username: demo # key and value
|
14
14
|
auth_token: demo # key and value
|
15
15
|
|
16
|
-
index-page: # index page settings
|
16
|
+
index-page: # index page settings, this setting is mandatory
|
17
17
|
url: https://clojure.org/guides/repl/introduction # URL of index page
|
18
18
|
title: Clojure Guides # title for the book, use page's title if not set
|
19
|
-
modifier: | # JavaScript code to modify the page
|
19
|
+
modifier: | # JavaScript code to modify the page, this setting is optional
|
20
20
|
document.body.innerHTML = document.querySelector('.clj-section-nav-container').outerHTML;
|
21
21
|
document.querySelector('.clj-section-nav-container').style.width = '100%';
|
22
22
|
document.body.style.backgroundColor = 'white';
|
23
23
|
|
24
|
-
selector: "html" # CSS selector for the content to be saved
|
25
|
-
output: "index.html" # output file name
|
26
|
-
link-selector: "a:not([href='../guides'])" # CSS selector for links of content pages
|
27
|
-
assets: # assets to be downloaded
|
24
|
+
selector: "html" # CSS selector for the content to be saved, this setting is mandatory
|
25
|
+
output: "index.html" # output file name, this setting is mandatory
|
26
|
+
link-selector: "a:not([href='../guides'])" # CSS selector for links of content pages, this setting is mandatory
|
27
|
+
assets: # assets to be downloaded, this setting is optional
|
28
28
|
- selector: "link[rel=stylesheet]" # CSS selector for assets
|
29
29
|
attr: href # attribute name for the asset URL
|
30
30
|
- selector: "script[src]"
|
31
31
|
attr: src
|
32
32
|
|
33
|
+
max-recursion: 2 # max depth of recursive downloading, default is 1
|
33
34
|
|
34
|
-
pages: # settings for content pages
|
35
|
+
pages: # settings for content pages, this setting is mandatory
|
35
36
|
- url-pattern: '.*' # URL pattern for content page, only pages' URL matching this pattern will be processed
|
36
|
-
modifier: | # JavaScript code to modify the page
|
37
|
+
modifier: | # JavaScript code to modify the page, this setting is optional
|
37
38
|
document.body.innerHTML = document.querySelector('.clj-content-container').outerHTML;
|
38
39
|
document.querySelector('.clj-content-container').style.width = '100%';
|
39
40
|
document.body.style.backgroundColor = 'white';
|
40
41
|
var style = document.createElement('style');
|
41
42
|
style.innerHTML = '.clj-content-container { margin-left: 0; }';
|
42
43
|
document.body.appendChild(style);
|
43
|
-
selector: html # CSS selector for the content to be saved
|
44
|
-
|
44
|
+
selector: html # CSS selector for the content to be saved, this setting is mandatory
|
45
|
+
page-link-selector: "a:not([href='../guides'])" # CSS selector for links in the page content to be downloaded recursively; this setting is optional, and if it is set, consider also setting max-recursion
|
46
|
+
assets: # assets to be downloaded, this setting is optional
|
45
47
|
- selector: img # CSS selector for assets
|
46
48
|
attr: src # attribute name for the asset URL
|
47
49
|
- selector: "link[rel=stylesheet]"
|
@@ -49,20 +51,20 @@ pages: # settings for content pages
|
|
49
51
|
- selector: "script[src]"
|
50
52
|
attr: src
|
51
53
|
|
52
|
-
extra-pages: # settings for extra pages
|
54
|
+
extra-pages: # settings for extra pages, this setting is optional
|
53
55
|
- https://www.example.com/extra-page-1
|
54
56
|
|
55
|
-
post-process: | # Shell script to be executed after the book is downloaded
|
57
|
+
post-process: | # Shell script to be executed after the book is downloaded, this setting is optional
|
56
58
|
find . -name '*.html' -exec sed -i 's/https:\/\/clojure.org\/guides\/repl\/introduction/..\/index.html/g' {} \;
|
57
59
|
|
58
|
-
docset: # config for generate docset
|
59
|
-
name: "Clojure Guides" # docset name
|
60
|
-
keyword: "clojure" # docset keyword
|
61
|
-
icon: "clojure.png" # docset icon
|
62
|
-
index: "/index.html" # docset index page
|
60
|
+
docset: # config for generating the docset, mandatory if you run the docset command
|
61
|
+
name: "Clojure Guides" # docset name, this setting is mandatory
|
62
|
+
keyword: "clojure" # docset keyword, this setting is mandatory
|
63
|
+
icon: "clojure.png" # docset icon, this setting is mandatory
|
64
|
+
index: "/index.html" # docset index page, this setting is mandatory
|
63
65
|
pages: # docset pages config
|
64
66
|
- url-pattern: 'index.html' # URL pattern for docset page, will match against local downloaded pages
|
65
|
-
extractor: | # JavaScript code to extract the content for docset page
|
67
|
+
extractor: | # JavaScript code to extract the content for docset page, this setting is mandatory
|
66
68
|
(function() {
|
67
69
|
var links = [...document.querySelectorAll('a.data-url')];
|
68
70
|
return links.map(link => {
|