html2doc 1.1.0 → 1.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/rake.yml +1 -11
- data/.hound.yml +3 -1
- data/.rubocop.yml +4 -8
- data/Gemfile +2 -2
- data/Rakefile +1 -1
- data/bin/html2doc +1 -2
- data/bin/rspec +1 -1
- data/html2doc.gemspec +8 -9
- data/lib/html2doc/base.rb +48 -46
- data/lib/html2doc/lists.rb +47 -42
- data/lib/html2doc/math.rb +100 -79
- data/lib/html2doc/mime.rb +41 -34
- data/lib/html2doc/notes.rb +42 -36
- data/lib/html2doc/version.rb +1 -1
- data/lib/html2doc.rb +0 -3
- data/spec/html2doc_spec.rb +566 -521
- metadata +42 -42
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b9ed3f5d01d7910a104f86dfe54090ffc3ddf56730f5885293801b3848b24735
|
4
|
+
data.tar.gz: 98428b2016bba38f17cb66226e2fb8d96a28c6ad28bd47a3bc0b998ea1c81228
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ede857348aa47a2f09df5c0c1929056251729b358815130ed6c7823f14e9a49cbb1439d43eb45104cb6be2104f47b4dda15b156680dfefd687c4d6439e162c89
|
7
|
+
data.tar.gz: 4027da3d313f7efb834efc96666d6aedfa509d3b2fc7335b367259833a0050e29b13da92e40514b2afee76b9f84420b81951d1fb9d577643a077643823dcf23c
|
data/.github/workflows/rake.yml
CHANGED
@@ -16,19 +16,9 @@ jobs:
|
|
16
16
|
strategy:
|
17
17
|
fail-fast: false
|
18
18
|
matrix:
|
19
|
-
ruby: [ '
|
19
|
+
ruby: [ '3.0', '2.7', '2.6', '2.5' ]
|
20
20
|
os: [ ubuntu-latest, windows-latest, macos-latest ]
|
21
21
|
experimental: [ false ]
|
22
|
-
include:
|
23
|
-
- ruby: '3.0'
|
24
|
-
os: 'ubuntu-latest'
|
25
|
-
experimental: true
|
26
|
-
- ruby: '3.0'
|
27
|
-
os: 'windows-latest'
|
28
|
-
experimental: true
|
29
|
-
- ruby: '3.0'
|
30
|
-
os: 'macos-latest'
|
31
|
-
experimental: true
|
32
22
|
steps:
|
33
23
|
- uses: actions/checkout@v2
|
34
24
|
with:
|
data/.hound.yml
CHANGED
data/.rubocop.yml
CHANGED
@@ -1,14 +1,10 @@
|
|
1
|
-
#
|
2
|
-
# https://github.com/
|
3
|
-
# All project-specific additions and overrides should be specified in this file.
|
1
|
+
# Auto-generated by Cimas: Do not edit it manually!
|
2
|
+
# See https://github.com/metanorma/cimas
|
4
3
|
inherit_from:
|
5
4
|
- https://raw.githubusercontent.com/riboseinc/oss-guides/master/ci/rubocop.yml
|
6
5
|
|
7
6
|
# local repo-specific modifications
|
7
|
+
# ...
|
8
8
|
|
9
9
|
AllCops:
|
10
|
-
|
11
|
-
StyleGuideCopsOnly: false
|
12
|
-
TargetRubyVersion: 2.4
|
13
|
-
Rails:
|
14
|
-
Enabled: true
|
10
|
+
TargetRubyVersion: 2.5
|
data/Gemfile
CHANGED
@@ -10,6 +10,6 @@ end
|
|
10
10
|
|
11
11
|
gemspec
|
12
12
|
|
13
|
-
if File.exist?
|
14
|
-
eval File.read(
|
13
|
+
if File.exist? "Gemfile.devel"
|
14
|
+
eval File.read("Gemfile.devel"), nil, "Gemfile.devel" # rubocop:disable Security/Eval
|
15
15
|
end
|
data/Rakefile
CHANGED
data/bin/html2doc
CHANGED
data/bin/rspec
CHANGED
data/html2doc.gemspec
CHANGED
@@ -1,5 +1,4 @@
|
|
1
|
-
|
2
|
-
lib = File.expand_path("../lib", __FILE__)
|
1
|
+
lib = File.expand_path("lib", __dir__)
|
3
2
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
3
|
require "html2doc/version"
|
5
4
|
|
@@ -16,23 +15,23 @@ Gem::Specification.new do |spec|
|
|
16
15
|
This gem is in active development.
|
17
16
|
DESCRIPTION
|
18
17
|
|
19
|
-
spec.homepage
|
20
|
-
spec.licenses
|
18
|
+
spec.homepage = "https://github.com/metanorma/html2doc"
|
19
|
+
spec.licenses = ["CC-BY-SA-3.0", "BSD-2-Clause"]
|
21
20
|
|
22
21
|
spec.bindir = "bin"
|
23
22
|
spec.require_paths = ["lib"]
|
24
23
|
spec.files = `git ls-files`.split("\n")
|
25
24
|
spec.test_files = `git ls-files -- {spec}/*`.split("\n")
|
26
|
-
spec.required_ruby_version = Gem::Requirement.new(">= 2.
|
25
|
+
spec.required_ruby_version = Gem::Requirement.new(">= 2.5.0")
|
27
26
|
|
27
|
+
spec.add_dependency "asciimath", "~> 2.0.2"
|
28
28
|
spec.add_dependency "htmlentities", "~> 4.3.4"
|
29
29
|
spec.add_dependency "image_size"
|
30
30
|
spec.add_dependency "mime-types"
|
31
|
-
spec.add_dependency "nokogiri", "~> 1.
|
31
|
+
spec.add_dependency "nokogiri", "~> 1.12"
|
32
|
+
spec.add_dependency "plane1converter", "~> 0.0.1"
|
32
33
|
spec.add_dependency "thread_safe"
|
33
34
|
spec.add_dependency "uuidtools"
|
34
|
-
spec.add_dependency "asciimath", "~> 2.0.2"
|
35
|
-
spec.add_dependency "plane1converter", "~> 0.0.1"
|
36
35
|
|
37
36
|
spec.add_development_dependency "byebug", "~> 9.1"
|
38
37
|
spec.add_development_dependency "equivalent-xml", "~> 0.6"
|
@@ -40,8 +39,8 @@ Gem::Specification.new do |spec|
|
|
40
39
|
spec.add_development_dependency "guard-rspec", "~> 4.7"
|
41
40
|
spec.add_development_dependency "rake", "~> 12.0"
|
42
41
|
spec.add_development_dependency "rspec", "~> 3.6"
|
42
|
+
spec.add_development_dependency "rspec-match_fuzzy", "~> 0.1.3"
|
43
43
|
spec.add_development_dependency "rubocop", "~> 1.5.2"
|
44
44
|
spec.add_development_dependency "simplecov", "~> 0.15"
|
45
45
|
spec.add_development_dependency "timecop", "~> 0.9"
|
46
|
-
spec.add_development_dependency "rspec-match_fuzzy", "~> 0.1.3"
|
47
46
|
end
|
data/lib/html2doc/base.rb
CHANGED
@@ -2,8 +2,6 @@ require "uuidtools"
|
|
2
2
|
require "asciimath"
|
3
3
|
require "htmlentities"
|
4
4
|
require "nokogiri"
|
5
|
-
#require "xml/xslt"
|
6
|
-
require "pp"
|
7
5
|
require "fileutils"
|
8
6
|
|
9
7
|
module Html2Doc
|
@@ -19,15 +17,17 @@ module Html2Doc
|
|
19
17
|
|
20
18
|
def self.process_header(headerfile, hash)
|
21
19
|
return if headerfile.nil?
|
20
|
+
|
22
21
|
doc = File.read(headerfile, encoding: "utf-8")
|
23
|
-
doc = header_image_cleanup(doc, hash[:dir1], hash[:filename],
|
22
|
+
doc = header_image_cleanup(doc, hash[:dir1], hash[:filename],
|
23
|
+
File.dirname(hash[:filename]))
|
24
24
|
File.open("#{hash[:dir1]}/header.html", "w:UTF-8") { |f| f.write(doc) }
|
25
25
|
end
|
26
26
|
|
27
27
|
def self.clear_dir(dir)
|
28
28
|
Dir.foreach(dir) do |f|
|
29
29
|
fn = File.join(dir, f)
|
30
|
-
File.delete(fn) if f !=
|
30
|
+
File.delete(fn) if f != "." && f != ".."
|
31
31
|
end
|
32
32
|
dir
|
33
33
|
end
|
@@ -72,7 +72,7 @@ module Html2Doc
|
|
72
72
|
|
73
73
|
def self.to_xhtml(xml)
|
74
74
|
xml.gsub!(/<\?xml[^>]*>/, "")
|
75
|
-
unless /<!DOCTYPE /.match xml
|
75
|
+
unless /<!DOCTYPE /.match? xml
|
76
76
|
xml = '<!DOCTYPE html SYSTEM
|
77
77
|
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' + xml
|
78
78
|
end
|
@@ -84,34 +84,35 @@ module Html2Doc
|
|
84
84
|
DOCTYPE
|
85
85
|
|
86
86
|
def self.from_xhtml(xml)
|
87
|
-
xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
|
88
|
-
sub(DOCTYPE, "")
|
89
|
-
gsub(%{ />}, "/>")
|
87
|
+
xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
|
88
|
+
.sub(DOCTYPE, "")
|
89
|
+
.gsub(%{ />}, "/>")
|
90
90
|
end
|
91
91
|
|
92
|
-
def self.msword_fix(
|
92
|
+
def self.msword_fix(doc)
|
93
93
|
# brain damage in MSWord parser
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
94
|
+
doc.gsub!(%r{<span style="mso-special-character:footnote"/>},
|
95
|
+
'<span style="mso-special-character:footnote"></span>')
|
96
|
+
doc.gsub!(%r{<div style="mso-element:footnote-list"></div>},
|
97
|
+
'<div style="mso-element:footnote-list"/>')
|
98
|
+
doc.gsub!(%r{(<a style="mso-comment-reference:[^>/]+)/>}, "\\1></a>")
|
99
|
+
doc.gsub!(%r{<link rel="File-List"}, "<link rel=File-List")
|
100
|
+
doc.gsub!(%r{<meta http-equiv="Content-Type"},
|
101
|
+
"<meta http-equiv=Content-Type")
|
102
|
+
doc.gsub!(%r{></m:jc>}, "/>")
|
103
|
+
doc.gsub!(%r{></v:stroke>}, "/>")
|
104
|
+
doc.gsub!(%r{></v:f>}, "/>")
|
105
|
+
doc.gsub!(%r{></v:path>}, "/>")
|
106
|
+
doc.gsub!(%r{></o:lock>}, "/>")
|
107
|
+
doc.gsub!(%r{></v:imagedata>}, "/>")
|
108
|
+
doc.gsub!(%r{></w:wrap>}, "/>")
|
109
|
+
doc.gsub!(%r{<(/)?m:(span|em)\b}, "<\\1\\2")
|
110
|
+
doc.gsub!(%r{&tab;|&tab;},
|
111
|
+
'<span style="mso-tab-count:1">  </span>')
|
112
|
+
doc.split(%r{(<m:oMath>|</m:oMath>)}).each_slice(4).map do |a|
|
111
113
|
a.size > 2 and a[2] = a[2].gsub(/>\s+</, "><")
|
112
114
|
a
|
113
115
|
end.join
|
114
|
-
r
|
115
116
|
end
|
116
117
|
|
117
118
|
PRINT_VIEW = <<~XML.freeze
|
@@ -127,30 +128,30 @@ module Html2Doc
|
|
127
128
|
<meta http-equiv=Content-Type content="text/html; charset=utf-8"/>
|
128
129
|
XML
|
129
130
|
|
130
|
-
def self.define_head1(docxml,
|
131
|
+
def self.define_head1(docxml, _dir)
|
131
132
|
docxml.xpath("//*[local-name() = 'head']").each do |h|
|
132
133
|
h.children.first.add_previous_sibling <<~XML
|
133
|
-
|
134
|
-
|
134
|
+
#{PRINT_VIEW}
|
135
|
+
<link rel="File-List" href="cid:filelist.xml"/>
|
135
136
|
XML
|
136
137
|
end
|
137
138
|
end
|
138
139
|
|
139
|
-
def self.filename_substitute(
|
140
|
-
if header_filename.nil?
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
/FILENAME/.match(m) ? "url(cid:header.html)" : m
|
140
|
+
def self.filename_substitute(head, header_filename)
|
141
|
+
return if header_filename.nil?
|
142
|
+
|
143
|
+
head.xpath(".//*[local-name() = 'style']").each do |s|
|
144
|
+
s1 = s.to_xml.gsub(/url\("[^"]+"\)/) do |m|
|
145
|
+
/FILENAME/.match?(m) ? "url(cid:header.html)" : m
|
145
146
|
end
|
147
|
+
s.replace(s1)
|
146
148
|
end
|
147
149
|
end
|
148
150
|
|
149
|
-
def self.stylesheet(
|
150
|
-
(fn.nil? || fn.empty?)
|
151
|
+
def self.stylesheet(_filename, _header_filename, fn)
|
152
|
+
(fn.nil? || fn.empty?) and
|
151
153
|
fn = File.join(File.dirname(__FILE__), "wordstyle.css")
|
152
154
|
stylesheet = File.read(fn, encoding: "UTF-8")
|
153
|
-
stylesheet = filename_substitute(stylesheet, header_filename, filename)
|
154
155
|
xml = Nokogiri::XML("<style/>")
|
155
156
|
xml.children.first << Nokogiri::XML::Comment.new(xml, "\n#{stylesheet}\n")
|
156
157
|
xml.root.to_s
|
@@ -161,6 +162,7 @@ module Html2Doc
|
|
161
162
|
head = docxml.at("//*[local-name() = 'head']")
|
162
163
|
css = stylesheet(hash[:filename], hash[:header_file], hash[:stylesheet])
|
163
164
|
add_stylesheet(head, title, css)
|
165
|
+
filename_substitute(head, hash[:header_file])
|
164
166
|
define_head1(docxml, hash[:dir1])
|
165
167
|
rootnamespace(docxml.root)
|
166
168
|
end
|
@@ -189,13 +191,13 @@ module Html2Doc
|
|
189
191
|
end
|
190
192
|
|
191
193
|
def self.bookmarks(docxml)
|
192
|
-
docxml.xpath("//*[@id][not(@name)][not(@style = 'mso-element:footnote')]")
|
193
|
-
|
194
|
-
next if
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
194
|
+
docxml.xpath("//*[@id][not(@name)][not(@style = 'mso-element:footnote')]")
|
195
|
+
.each do |x|
|
196
|
+
next if x["id"].empty? ||
|
197
|
+
%w(shapetype v:shapetype shape v:shape).include?(x.name)
|
198
|
+
|
199
|
+
if x.children.empty? then x.add_child("<a name='#{x['id']}'></a>")
|
200
|
+
else x.children.first.previous = "<a name='#{x['id']}'></a>"
|
199
201
|
end
|
200
202
|
x.delete("id")
|
201
203
|
end
|
data/lib/html2doc/lists.rb
CHANGED
@@ -2,83 +2,87 @@ require "uuidtools"
|
|
2
2
|
require "asciimath"
|
3
3
|
require "htmlentities"
|
4
4
|
require "nokogiri"
|
5
|
-
require "uuidtools"
|
6
5
|
|
7
6
|
module Html2Doc
|
8
|
-
def self.style_list(
|
7
|
+
def self.style_list(elem, level, liststyle, listnumber)
|
9
8
|
return unless liststyle
|
10
|
-
|
11
|
-
|
9
|
+
|
10
|
+
if elem["style"]
|
11
|
+
elem["style"] += ";"
|
12
12
|
else
|
13
|
-
|
13
|
+
elem["style"] = ""
|
14
14
|
end
|
15
|
-
|
15
|
+
elem["style"] += "mso-list:#{liststyle} level#{level} lfo#{listnumber};"
|
16
16
|
end
|
17
17
|
|
18
|
-
def self.list_add1(
|
19
|
-
if [
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
18
|
+
def self.list_add1(elem, liststyles, listtype, level)
|
19
|
+
if %i[ul ol].include? listtype
|
20
|
+
list_add(elem.xpath(".//ul") - elem.xpath(".//ul//ul | .//ol//ul"),
|
21
|
+
liststyles, :ul, level + 1)
|
22
|
+
list_add(elem.xpath(".//ol") - elem.xpath(".//ul//ol | .//ol//ol"),
|
23
|
+
liststyles, :ol, level + 1)
|
24
|
+
else
|
25
|
+
list_add(elem.xpath(".//ul") - elem.xpath(".//ul//ul | .//ol//ul"),
|
26
|
+
liststyles, listtype, level + 1)
|
27
|
+
list_add(elem.xpath(".//ol") - elem.xpath(".//ul//ol | .//ol//ol"),
|
28
|
+
liststyles, listtype, level + 1)
|
29
|
+
end
|
30
30
|
end
|
31
31
|
|
32
32
|
def self.list_add(xpath, liststyles, listtype, level)
|
33
|
-
xpath.each_with_index do |
|
33
|
+
xpath.each_with_index do |l, _i|
|
34
34
|
@listnumber += 1 if level == 1
|
35
|
-
|
36
|
-
|
37
|
-
(
|
35
|
+
l["seen"] = true if level == 1
|
36
|
+
l["id"] ||= UUIDTools::UUID.random_create
|
37
|
+
(l.xpath(".//li") - l.xpath(".//ol//li | .//ul//li")).each do |li|
|
38
38
|
style_list(li, level, liststyles[listtype], @listnumber)
|
39
39
|
list_add1(li, liststyles, listtype, level)
|
40
40
|
end
|
41
|
-
|
42
|
-
|
43
|
-
|
41
|
+
l.xpath(".//ul[not(ancestor::li/ancestor::*/@id = '#{l['id']}')] | "\
|
42
|
+
".//ol[not(ancestor::li/ancestor::*/@id = '#{l['id']}')]")
|
43
|
+
.each do |li|
|
44
|
+
list_add1(li.parent, liststyles, listtype, level - 1)
|
44
45
|
end
|
45
46
|
end
|
46
47
|
end
|
47
48
|
|
48
|
-
def self.list2para(
|
49
|
-
return if
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
49
|
+
def self.list2para(list)
|
50
|
+
return if list.xpath("./li").empty?
|
51
|
+
|
52
|
+
list.xpath("./li").first["class"] ||= "MsoListParagraphCxSpFirst"
|
53
|
+
list.xpath("./li").last["class"] ||= "MsoListParagraphCxSpLast"
|
54
|
+
list.xpath("./li/p").each { |p| p["class"] ||= "MsoListParagraphCxSpMiddle" }
|
55
|
+
list.xpath("./li").each do |l|
|
54
56
|
l.name = "p"
|
55
57
|
l["class"] ||= "MsoListParagraphCxSpMiddle"
|
56
58
|
l&.first_element_child&.name == "p" and
|
57
59
|
l.first_element_child.replace(l.first_element_child.children)
|
58
60
|
end
|
59
|
-
|
61
|
+
list.replace(list.children)
|
60
62
|
end
|
61
63
|
|
62
64
|
TOPLIST = "[not(ancestor::ul) and not(ancestor::ol)]".freeze
|
63
65
|
|
64
|
-
def self.lists1(docxml, liststyles,
|
65
|
-
case
|
66
|
+
def self.lists1(docxml, liststyles, style)
|
67
|
+
case style
|
66
68
|
when :ul then list_add(docxml.xpath("//ul[not(@class)]#{TOPLIST}"),
|
67
|
-
|
69
|
+
liststyles, :ul, 1)
|
68
70
|
when :ol then list_add(docxml.xpath("//ol[not(@class)]#{TOPLIST}"),
|
69
71
|
liststyles, :ol, 1)
|
70
72
|
else
|
71
|
-
list_add(docxml.xpath("//ol[@class = '#{
|
72
|
-
"//ul[@class = '#{
|
73
|
-
liststyles,
|
73
|
+
list_add(docxml.xpath("//ol[@class = '#{style}']#{TOPLIST} | "\
|
74
|
+
"//ul[@class = '#{style}']#{TOPLIST}"),
|
75
|
+
liststyles, style, 1)
|
74
76
|
end
|
75
77
|
end
|
76
78
|
|
77
79
|
def self.lists_unstyled(docxml, liststyles)
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
80
|
+
liststyles.has_key?(:ul) and
|
81
|
+
list_add(docxml.xpath("//ul#{TOPLIST}[not(@seen)]"),
|
82
|
+
liststyles, :ul, 1)
|
83
|
+
liststyles.has_key?(:ol) and
|
84
|
+
list_add(docxml.xpath("//ol#{TOPLIST}[not(@seen)]"),
|
85
|
+
liststyles, :ul, 1)
|
82
86
|
docxml.xpath("//ul[@seen] | //ol[@seen]").each do |l|
|
83
87
|
l.delete("seen")
|
84
88
|
end
|
@@ -86,6 +90,7 @@ module Html2Doc
|
|
86
90
|
|
87
91
|
def self.lists(docxml, liststyles)
|
88
92
|
return if liststyles.nil?
|
93
|
+
|
89
94
|
@listnumber = 0
|
90
95
|
liststyles.each_key { |k| lists1(docxml, liststyles, k) }
|
91
96
|
lists_unstyled(docxml, liststyles)
|