mkwebook 0.1.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: cac7d7fac942ec8612546dbb09f9a7063ef5370bb8760f925364d9c7105d707a
4
- data.tar.gz: 522c06fa9782203daedfdb2d97ac1b1e820b9f9d079b746281c876c63672bc7f
3
+ metadata.gz: a7e29166ba302805e68e70779ef8de58870671aab0ae684d1cec2290f5a0b4bf
4
+ data.tar.gz: 5e530d48d11ce6c26ac5255b7b294b15b6f90bde7b4ecc4e36ee2bc0e0ea7d54
5
5
  SHA512:
6
- metadata.gz: 84fbb2098303f5a4781fb9c1940c90d292099705dbbee3481c5ec48f2bd43b1fa219ed66a53418a7b8103555864ca3b00e1e09339aa6e3f7ea50c92259269c6b
7
- data.tar.gz: 03a676487e80b8bfb2daf1fe2680369c2fcb064d22cc22ca408682477adb13eca00b472bd7299c759a092cbde4551d495af5ba8c6294cf94801f14a4c11213f8
6
+ metadata.gz: 1b90f0fbd51ad20e65847ca7fde950fc40651c3639a24f28b73c52580547e19e9b93f8e8a60247e3d56046afd2cfb9d758a5903c569b871093c841ad2513a52b
7
+ data.tar.gz: f5f17d96c4700ddd423fffe592a702812049a21e65ee113cccf12ba5c38b3dbc8af9a1307711ec96c03e4757e12a68c586e15497d33ccbf99e036078962e7cca
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- mkwebook (0.1.1)
4
+ mkwebook (0.1.2)
5
5
  activesupport (>= 6.1.5)
6
6
  concurrent-ruby
7
7
  ferrum (>= 0.13)
data/lib/mkwebook/app.rb CHANGED
@@ -16,6 +16,8 @@ module Mkwebook
16
16
  end
17
17
  @cli_options = cli_options
18
18
  @config = Mkwebook::Config.new(@cli_options)
19
+ @downloaded_depth = 0
20
+ @downloaded_pages = []
19
21
  end
20
22
 
21
23
  def create_config
@@ -28,7 +30,10 @@ module Mkwebook
28
30
 
29
31
  def download
30
32
  download_index
33
+ append_extra_pages
31
34
  download_pages
35
+ modify_page_links
36
+ post_process
32
37
  end
33
38
 
34
39
  def prepare_browser
@@ -51,10 +56,14 @@ module Mkwebook
51
56
  end
52
57
  end
53
58
 
54
- def download_index
59
+ def download_index(only_index = false)
55
60
  prepare_browser
56
61
  index_page = @browser_context.create_page
62
+ begin
57
63
  index_page.go_to(@config[:index_page][:url])
64
+ rescue Ferrum::PendingConnectionsError => e
65
+ index_page.go_to(@config[:index_page][:url])
66
+ end
58
67
  index_page.network.wait_for_idle(timeout: 10) rescue nil
59
68
  modifier = @config[:index_page][:modifier]
60
69
  if modifier && File.file?(modifier)
@@ -67,8 +76,15 @@ module Mkwebook
67
76
  @page_urls = index_elements.flat_map do |element|
68
77
  url = element.css(@config[:index_page][:link_selector]).map { |a| a.evaluate('this.href') }
69
78
  element.css(@config[:index_page][:link_selector]).each do |a|
70
- u = a.evaluate('this.href').normalize_uri('.html').relative_path_from(@config[:index_page][:output])
71
- a.evaluate("this.href = '#{u}'")
79
+ u = a.evaluate('this.href')
80
+ href = u.normalize_uri('.html').relative_path_from(@config[:index_page][:output])
81
+ file = @config[:index_page][:output]
82
+ a.evaluate <<~JS
83
+ (function(that) {
84
+ that.setAttribute('data-mkwebook-href', '#{href.gsub("'", "\\\\'")}')
85
+ that.setAttribute('data-mkwebook-file', '#{file.gsub("'", "\\\\'")}')
86
+ })(this);
87
+ JS
72
88
  end
73
89
  url
74
90
  end.uniq
@@ -77,9 +93,6 @@ module Mkwebook
77
93
  @config[:pages].any? { |page| url =~ Regexp.new(page[:url_pattern]) }
78
94
  end
79
95
 
80
- @page_urls = @page_urls[0, @cli_options[:limit]] if @cli_options[:limit]
81
-
82
-
83
96
  @config[:index_page][:title].try do |title|
84
97
  index_page.execute("document.title = '#{title}'")
85
98
  end
@@ -98,19 +111,25 @@ module Mkwebook
98
111
  end.join("\n").tap do |html|
99
112
  File.write(@config[:index_page][:output], html)
100
113
  end
114
+ @downloaded_pages << {file: @config[:index_page][:output], url: @config[:index_page][:url]}
115
+ modify_page_links if only_index
101
116
  rescue Ferrum::Error => e
102
117
  binding.pry
103
118
  end
104
119
 
105
120
  def download_pages
106
-
107
- append_extra_pages
121
+ return unless @downloaded_depth < @config[:max_recursion]
108
122
 
109
123
  pool = Concurrent::FixedThreadPool.new(@config[:concurrency])
110
124
 
125
+ @page_urls = @page_urls[0, @cli_options[:limit]] if @cli_options[:limit]
126
+
127
+ @page_links = @page_urls.map { |url| [url, []] }.to_h
128
+
111
129
  @page_urls.each do |url|
112
130
  page_config = @config[:pages].find { |page| url =~ Regexp.new(page[:url_pattern]) }
113
131
  next unless page_config
132
+ next if @downloaded_pages.any? { |page| page[:url] == url }
114
133
 
115
134
  pool.post do
116
135
  page = @browser_context.create_page
@@ -131,6 +150,13 @@ module Mkwebook
131
150
  page.execute("document.title = '#{title}'")
132
151
  end
133
152
 
153
+ if page_link_selector = page_config[:page_link_selector]
154
+ page_links = page_elements.flat_map do |element|
155
+ element.css(page_link_selector).map { |a| a.evaluate('this.href') }
156
+ end.uniq
157
+ @page_links[url] = page_links
158
+ end
159
+
134
160
  page.execute <<-JS
135
161
  for (var e of document.querySelectorAll('[integrity]')) {
136
162
  e.removeAttribute('integrity');
@@ -142,18 +168,25 @@ module Mkwebook
142
168
 
143
169
  page_elements.map do |element|
144
170
  element.css('a').each do |a|
145
- u = a.evaluate('this.href')
146
- next unless @page_urls.include?(u)
147
-
148
- u = u.normalize_uri('.html').relative_path_from(url.normalize_uri('.html'))
149
- a.evaluate("this.href = '#{u}'")
171
+ u = a.evaluate('this.href') rescue nil
172
+ next unless u.present?
173
+ href = u.normalize_uri('.html').relative_path_from(url.normalize_uri('.html'))
174
+ file = u.normalize_file_path('.html')
175
+ a.evaluate <<~JS
176
+ (function(that) {
177
+ that.setAttribute('data-mkwebook-href', '#{href.gsub("'", "\\\\'")}')
178
+ that.setAttribute('data-mkwebook-file', '#{file.gsub("'", "\\\\'")}')
179
+ })(this);
180
+ JS
150
181
  end
151
182
  element.evaluate('this.outerHTML')
152
183
  end.join("\n").tap do |html|
153
184
  FileUtils.mkdir_p(File.dirname(output))
154
185
  File.write(output, html)
155
186
  end
156
- rescue Ferrum::Error => e
187
+
188
+ @downloaded_pages << {file: output, url: url}
189
+ rescue => e
157
190
  $stderr.puts e.message
158
191
  $stderr.puts e.backtrace
159
192
  binding.pry if @cli_options[:pause_on_error]
@@ -161,13 +194,14 @@ module Mkwebook
161
194
  page.close
162
195
  end
163
196
  end
164
-
165
197
  end
166
198
 
167
199
  pool.shutdown
168
200
  pool.wait_for_termination
169
201
 
170
- post_process
202
+ @page_urls = @page_links.flat_map(&:last).uniq
203
+ @downloaded_depth += 1
204
+ download_pages
171
205
  end
172
206
 
173
207
  def post_process
@@ -305,6 +339,35 @@ module Mkwebook
305
339
  puts IO.read("#{__dir__}/entry_types.txt")
306
340
  end
307
341
 
342
+ def modify_page_links
343
+ pool = Concurrent::FixedThreadPool.new(@config[:concurrency])
344
+ downloaded_files = @downloaded_pages.map { |page| page[:file] }
345
+ downloaded_files.each do |file|
346
+ pool.post do
347
+ begin
348
+ page = @browser_context.create_page
349
+ page.go_to("file://#{File.expand_path(file)}")
350
+ page.css('a').each do |a|
351
+ href = a.evaluate('this.getAttribute("data-mkwebook-href")') rescue nil
352
+ next unless href
353
+ f = a.evaluate('this.getAttribute("data-mkwebook-file")')
354
+ next unless href && f && downloaded_files.include?(f)
355
+ a.evaluate("this.href = this.getAttribute('data-mkwebook-href')")
356
+ end
357
+ File.write(file, page.evaluate('document.querySelector("html").outerHTML'))
358
+ rescue Ferrum::Error => e
359
+ $stderr.puts e.message
360
+ $stderr.puts e.backtrace
361
+ binding.pry if @cli_options[:pause_on_error]
362
+ ensure
363
+ page.close
364
+ end
365
+ end
366
+ end
367
+ pool.shutdown
368
+ pool.wait_for_termination
369
+ end
370
+
308
371
  private
309
372
 
310
373
  def browser_options
data/lib/mkwebook/cli.rb CHANGED
@@ -19,7 +19,7 @@ module Mkwebook
19
19
  option :pause, :type => :boolean, :aliases => '-p', :desc => 'Pause after processing index page'
20
20
  desc 'download_index', 'Download and process index page'
21
21
  def download_index
22
- Mkwebook::App.new(options).download_index
22
+ Mkwebook::App.new(options).download_index(true)
23
23
  end
24
24
 
25
25
  option :limit, :type => :numeric, :aliases => '-l', :desc => 'Limit number of pages, specially for debugging'
@@ -22,7 +22,8 @@ module Mkwebook
22
22
  'browser' => {
23
23
  'headless' => true,
24
24
  },
25
- 'concurrency': 1
25
+ 'concurrency': 1,
26
+ 'max-recursion': 1
26
27
  }
27
28
  config = YAML.load_file(config_file)
28
29
  config = default_config.deep_merge(config).deep_transform_keys! { |k| k.to_s.underscore.to_sym }
@@ -1,3 +1,5 @@
1
+ require 'uri'
2
+
1
3
  class String
2
4
  def p
3
5
  puts self
@@ -12,24 +14,24 @@ class String
12
14
  end
13
15
 
14
16
  def normalize_file_path(force_extname = nil)
17
+ return self unless present?
15
18
  uri = URI.parse(self)
16
19
  file_path = uri.path[1..]
17
- extname = File.extname(file_path)
20
+ extname = force_extname || File.extname(file_path)
18
21
  basename = File.basename(file_path, extname)
19
22
  origin = "#{uri.scheme.try { |s| s + '_' }}#{uri.host}#{uri.port.try { |p| '_' + p.to_s }}"
20
23
  basename += "_#{Digest::MD5.hexdigest(uri.query)}" if uri.query.present?
21
- extname = force_extname if force_extname && extname.empty?
22
- File.join(origin, File.dirname(file_path), basename + extname)
24
+ URI.decode_www_form_component(File.join(origin, File.dirname(file_path), basename + extname))
23
25
  end
24
26
 
25
27
  def normalize_uri(force_extname = nil)
28
+ return self unless present?
26
29
  uri = URI.parse(self)
27
30
  file_path = uri.path[1..]
28
- extname = File.extname(file_path)
31
+ extname = force_extname || File.extname(file_path)
29
32
  basename = File.basename(file_path, extname)
30
33
  basename += "_#{Digest::MD5.hexdigest(uri.query)}" if uri.query.present?
31
34
  origin = "#{uri.scheme.try { |s| s + '_' }}#{uri.host}#{uri.port.try { |p| '_' + p.to_s }}"
32
- extname = force_extname if force_extname && extname.empty?
33
35
  file_path = File.join(origin, File.dirname(file_path), basename + extname)
34
36
  if uri.fragment.present?
35
37
  file_path += "##{uri.fragment}"
@@ -1,3 +1,3 @@
1
1
  module Mkwebook
2
- VERSION = "0.1.1"
2
+ VERSION = "0.1.2"
3
3
  end
@@ -30,6 +30,7 @@ index-page: # index page settings
30
30
  - selector: "script[src]"
31
31
  attr: src
32
32
 
33
+ max-recursion: 2 # max depth of recursive downloading
33
34
 
34
35
  pages: # settings for content pages
35
36
  - url-pattern: '.*' # URL pattern for content page, only pages' URL matching this pattern will be processed
@@ -41,6 +42,7 @@ pages: # settings for content pages
41
42
  style.innerHTML = '.clj-content-container { margin-left: 0; }';
42
43
  document.body.appendChild(style);
43
44
  selector: html # CSS selector for the content to be saved
45
+ page-link-selector: "a:not([href='../guides'])" # links to be downloaded recursively which are extracted from page content
44
46
  assets: # assets to be downloaded
45
47
  - selector: img # CSS selector for assets
46
48
  attr: src # attribute name for the asset URL
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: mkwebook
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Liu Xiang