mkwebook 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: cac7d7fac942ec8612546dbb09f9a7063ef5370bb8760f925364d9c7105d707a
4
- data.tar.gz: 522c06fa9782203daedfdb2d97ac1b1e820b9f9d079b746281c876c63672bc7f
3
+ metadata.gz: a7e29166ba302805e68e70779ef8de58870671aab0ae684d1cec2290f5a0b4bf
4
+ data.tar.gz: 5e530d48d11ce6c26ac5255b7b294b15b6f90bde7b4ecc4e36ee2bc0e0ea7d54
5
5
  SHA512:
6
- metadata.gz: 84fbb2098303f5a4781fb9c1940c90d292099705dbbee3481c5ec48f2bd43b1fa219ed66a53418a7b8103555864ca3b00e1e09339aa6e3f7ea50c92259269c6b
7
- data.tar.gz: 03a676487e80b8bfb2daf1fe2680369c2fcb064d22cc22ca408682477adb13eca00b472bd7299c759a092cbde4551d495af5ba8c6294cf94801f14a4c11213f8
6
+ metadata.gz: 1b90f0fbd51ad20e65847ca7fde950fc40651c3639a24f28b73c52580547e19e9b93f8e8a60247e3d56046afd2cfb9d758a5903c569b871093c841ad2513a52b
7
+ data.tar.gz: f5f17d96c4700ddd423fffe592a702812049a21e65ee113cccf12ba5c38b3dbc8af9a1307711ec96c03e4757e12a68c586e15497d33ccbf99e036078962e7cca
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- mkwebook (0.1.1)
4
+ mkwebook (0.1.2)
5
5
  activesupport (>= 6.1.5)
6
6
  concurrent-ruby
7
7
  ferrum (>= 0.13)
data/lib/mkwebook/app.rb CHANGED
@@ -16,6 +16,8 @@ module Mkwebook
16
16
  end
17
17
  @cli_options = cli_options
18
18
  @config = Mkwebook::Config.new(@cli_options)
19
+ @downloaded_depth = 0
20
+ @downloaded_pages = []
19
21
  end
20
22
 
21
23
  def create_config
@@ -28,7 +30,10 @@ module Mkwebook
28
30
 
29
31
  def download
30
32
  download_index
33
+ append_extra_pages
31
34
  download_pages
35
+ modify_page_links
36
+ post_process
32
37
  end
33
38
 
34
39
  def prepare_browser
@@ -51,10 +56,14 @@ module Mkwebook
51
56
  end
52
57
  end
53
58
 
54
- def download_index
59
+ def download_index(only_index = false)
55
60
  prepare_browser
56
61
  index_page = @browser_context.create_page
62
+ begin
57
63
  index_page.go_to(@config[:index_page][:url])
64
+ rescue Ferrum::PendingConnectionsError => e
65
+ index_page.go_to(@config[:index_page][:url])
66
+ end
58
67
  index_page.network.wait_for_idle(timeout: 10) rescue nil
59
68
  modifier = @config[:index_page][:modifier]
60
69
  if modifier && File.file?(modifier)
@@ -67,8 +76,15 @@ module Mkwebook
67
76
  @page_urls = index_elements.flat_map do |element|
68
77
  url = element.css(@config[:index_page][:link_selector]).map { |a| a.evaluate('this.href') }
69
78
  element.css(@config[:index_page][:link_selector]).each do |a|
70
- u = a.evaluate('this.href').normalize_uri('.html').relative_path_from(@config[:index_page][:output])
71
- a.evaluate("this.href = '#{u}'")
79
+ u = a.evaluate('this.href')
80
+ href = u.normalize_uri('.html').relative_path_from(@config[:index_page][:output])
81
+ file = @config[:index_page][:output]
82
+ a.evaluate <<~JS
83
+ (function(that) {
84
+ that.setAttribute('data-mkwebook-href', '#{href.gsub("'", "\\\\'")}')
85
+ that.setAttribute('data-mkwebook-file', '#{file.gsub("'", "\\\\'")}')
86
+ })(this);
87
+ JS
72
88
  end
73
89
  url
74
90
  end.uniq
@@ -77,9 +93,6 @@ module Mkwebook
77
93
  @config[:pages].any? { |page| url =~ Regexp.new(page[:url_pattern]) }
78
94
  end
79
95
 
80
- @page_urls = @page_urls[0, @cli_options[:limit]] if @cli_options[:limit]
81
-
82
-
83
96
  @config[:index_page][:title].try do |title|
84
97
  index_page.execute("document.title = '#{title}'")
85
98
  end
@@ -98,19 +111,25 @@ module Mkwebook
98
111
  end.join("\n").tap do |html|
99
112
  File.write(@config[:index_page][:output], html)
100
113
  end
114
+ @downloaded_pages << {file: @config[:index_page][:output], url: @config[:index_page][:url]}
115
+ modify_page_links if only_index
101
116
  rescue Ferrum::Error => e
102
117
  binding.pry
103
118
  end
104
119
 
105
120
  def download_pages
106
-
107
- append_extra_pages
121
+ return unless @downloaded_depth < @config[:max_recursion]
108
122
 
109
123
  pool = Concurrent::FixedThreadPool.new(@config[:concurrency])
110
124
 
125
+ @page_urls = @page_urls[0, @cli_options[:limit]] if @cli_options[:limit]
126
+
127
+ @page_links = @page_urls.map { |url| [url, []] }.to_h
128
+
111
129
  @page_urls.each do |url|
112
130
  page_config = @config[:pages].find { |page| url =~ Regexp.new(page[:url_pattern]) }
113
131
  next unless page_config
132
+ next if @downloaded_pages.any? { |page| page[:url] == url }
114
133
 
115
134
  pool.post do
116
135
  page = @browser_context.create_page
@@ -131,6 +150,13 @@ module Mkwebook
131
150
  page.execute("document.title = '#{title}'")
132
151
  end
133
152
 
153
+ if page_link_selector = page_config[:page_link_selector]
154
+ page_links = page_elements.flat_map do |element|
155
+ element.css(page_link_selector).map { |a| a.evaluate('this.href') }
156
+ end.uniq
157
+ @page_links[url] = page_links
158
+ end
159
+
134
160
  page.execute <<-JS
135
161
  for (var e of document.querySelectorAll('[integrity]')) {
136
162
  e.removeAttribute('integrity');
@@ -142,18 +168,25 @@ module Mkwebook
142
168
 
143
169
  page_elements.map do |element|
144
170
  element.css('a').each do |a|
145
- u = a.evaluate('this.href')
146
- next unless @page_urls.include?(u)
147
-
148
- u = u.normalize_uri('.html').relative_path_from(url.normalize_uri('.html'))
149
- a.evaluate("this.href = '#{u}'")
171
+ u = a.evaluate('this.href') rescue nil
172
+ next unless u.present?
173
+ href = u.normalize_uri('.html').relative_path_from(url.normalize_uri('.html'))
174
+ file = u.normalize_file_path('.html')
175
+ a.evaluate <<~JS
176
+ (function(that) {
177
+ that.setAttribute('data-mkwebook-href', '#{href.gsub("'", "\\\\'")}')
178
+ that.setAttribute('data-mkwebook-file', '#{file.gsub("'", "\\\\'")}')
179
+ })(this);
180
+ JS
150
181
  end
151
182
  element.evaluate('this.outerHTML')
152
183
  end.join("\n").tap do |html|
153
184
  FileUtils.mkdir_p(File.dirname(output))
154
185
  File.write(output, html)
155
186
  end
156
- rescue Ferrum::Error => e
187
+
188
+ @downloaded_pages << {file: output, url: url}
189
+ rescue => e
157
190
  $stderr.puts e.message
158
191
  $stderr.puts e.backtrace
159
192
  binding.pry if @cli_options[:pause_on_error]
@@ -161,13 +194,14 @@ module Mkwebook
161
194
  page.close
162
195
  end
163
196
  end
164
-
165
197
  end
166
198
 
167
199
  pool.shutdown
168
200
  pool.wait_for_termination
169
201
 
170
- post_process
202
+ @page_urls = @page_links.flat_map(&:last).uniq
203
+ @downloaded_depth += 1
204
+ download_pages
171
205
  end
172
206
 
173
207
  def post_process
@@ -305,6 +339,35 @@ module Mkwebook
305
339
  puts IO.read("#{__dir__}/entry_types.txt")
306
340
  end
307
341
 
342
+ def modify_page_links
343
+ pool = Concurrent::FixedThreadPool.new(@config[:concurrency])
344
+ downloaded_files = @downloaded_pages.map { |page| page[:file] }
345
+ downloaded_files.each do |file|
346
+ pool.post do
347
+ begin
348
+ page = @browser_context.create_page
349
+ page.go_to("file://#{File.expand_path(file)}")
350
+ page.css('a').each do |a|
351
+ href = a.evaluate('this.getAttribute("data-mkwebook-href")') rescue nil
352
+ next unless href
353
+ f = a.evaluate('this.getAttribute("data-mkwebook-file")')
354
+ next unless href && f && downloaded_files.include?(f)
355
+ a.evaluate("this.href = this.getAttribute('data-mkwebook-href')")
356
+ end
357
+ File.write(file, page.evaluate('document.querySelector("html").outerHTML'))
358
+ rescue Ferrum::Error => e
359
+ $stderr.puts e.message
360
+ $stderr.puts e.backtrace
361
+ binding.pry if @cli_options[:pause_on_error]
362
+ ensure
363
+ page.close
364
+ end
365
+ end
366
+ end
367
+ pool.shutdown
368
+ pool.wait_for_termination
369
+ end
370
+
308
371
  private
309
372
 
310
373
  def browser_options
data/lib/mkwebook/cli.rb CHANGED
@@ -19,7 +19,7 @@ module Mkwebook
19
19
  option :pause, :type => :boolean, :aliases => '-p', :desc => 'Pause after processing index page'
20
20
  desc 'download_index', 'Download and process index page'
21
21
  def download_index
22
- Mkwebook::App.new(options).download_index
22
+ Mkwebook::App.new(options).download_index(true)
23
23
  end
24
24
 
25
25
  option :limit, :type => :numeric, :aliases => '-l', :desc => 'Limit number of pages, specially for debugging'
@@ -22,7 +22,8 @@ module Mkwebook
22
22
  'browser' => {
23
23
  'headless' => true,
24
24
  },
25
- 'concurrency': 1
25
+ 'concurrency': 1,
26
+ 'max-recursion': 1
26
27
  }
27
28
  config = YAML.load_file(config_file)
28
29
  config = default_config.deep_merge(config).deep_transform_keys! { |k| k.to_s.underscore.to_sym }
@@ -1,3 +1,5 @@
1
+ require 'uri'
2
+
1
3
  class String
2
4
  def p
3
5
  puts self
@@ -12,24 +14,24 @@ class String
12
14
  end
13
15
 
14
16
  def normalize_file_path(force_extname = nil)
17
+ return self unless present?
15
18
  uri = URI.parse(self)
16
19
  file_path = uri.path[1..]
17
- extname = File.extname(file_path)
20
+ extname = force_extname || File.extname(file_path)
18
21
  basename = File.basename(file_path, extname)
19
22
  origin = "#{uri.scheme.try { |s| s + '_' }}#{uri.host}#{uri.port.try { |p| '_' + p.to_s }}"
20
23
  basename += "_#{Digest::MD5.hexdigest(uri.query)}" if uri.query.present?
21
- extname = force_extname if force_extname && extname.empty?
22
- File.join(origin, File.dirname(file_path), basename + extname)
24
+ URI.decode_www_form_component(File.join(origin, File.dirname(file_path), basename + extname))
23
25
  end
24
26
 
25
27
  def normalize_uri(force_extname = nil)
28
+ return self unless present?
26
29
  uri = URI.parse(self)
27
30
  file_path = uri.path[1..]
28
- extname = File.extname(file_path)
31
+ extname = force_extname || File.extname(file_path)
29
32
  basename = File.basename(file_path, extname)
30
33
  basename += "_#{Digest::MD5.hexdigest(uri.query)}" if uri.query.present?
31
34
  origin = "#{uri.scheme.try { |s| s + '_' }}#{uri.host}#{uri.port.try { |p| '_' + p.to_s }}"
32
- extname = force_extname if force_extname && extname.empty?
33
35
  file_path = File.join(origin, File.dirname(file_path), basename + extname)
34
36
  if uri.fragment.present?
35
37
  file_path += "##{uri.fragment}"
@@ -1,3 +1,3 @@
1
1
  module Mkwebook
2
- VERSION = "0.1.1"
2
+ VERSION = "0.1.2"
3
3
  end
@@ -30,6 +30,7 @@ index-page: # index page settings
30
30
  - selector: "script[src]"
31
31
  attr: src
32
32
 
33
+ max-recursion: 2 # max depth of recursive downloading
33
34
 
34
35
  pages: # settings for content pages
35
36
  - url-pattern: '.*' # URL pattern for content page, only pages' URL matching this pattern will be processed
@@ -41,6 +42,7 @@ pages: # settings for content pages
41
42
  style.innerHTML = '.clj-content-container { margin-left: 0; }';
42
43
  document.body.appendChild(style);
43
44
  selector: html # CSS selector for the content to be saved
45
+ page-link-selector: "a:not([href='../guides'])" # links to be downloaded recursively which are extracted from page content
44
46
  assets: # assets to be downloaded
45
47
  - selector: img # CSS selector for assets
46
48
  attr: src # attribute name for the asset URL
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: mkwebook
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Liu Xiang