html2doc 1.1.0 → 1.1.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/rake.yml +1 -11
- data/.hound.yml +3 -1
- data/.rubocop.yml +4 -8
- data/Gemfile +2 -2
- data/Rakefile +1 -1
- data/bin/html2doc +1 -2
- data/bin/rspec +1 -1
- data/html2doc.gemspec +8 -9
- data/lib/html2doc/base.rb +48 -46
- data/lib/html2doc/lists.rb +47 -42
- data/lib/html2doc/math.rb +100 -79
- data/lib/html2doc/mime.rb +41 -34
- data/lib/html2doc/notes.rb +42 -36
- data/lib/html2doc/version.rb +1 -1
- data/lib/html2doc.rb +0 -3
- data/spec/html2doc_spec.rb +566 -521
- metadata +42 -42
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b9ed3f5d01d7910a104f86dfe54090ffc3ddf56730f5885293801b3848b24735
|
4
|
+
data.tar.gz: 98428b2016bba38f17cb66226e2fb8d96a28c6ad28bd47a3bc0b998ea1c81228
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ede857348aa47a2f09df5c0c1929056251729b358815130ed6c7823f14e9a49cbb1439d43eb45104cb6be2104f47b4dda15b156680dfefd687c4d6439e162c89
|
7
|
+
data.tar.gz: 4027da3d313f7efb834efc96666d6aedfa509d3b2fc7335b367259833a0050e29b13da92e40514b2afee76b9f84420b81951d1fb9d577643a077643823dcf23c
|
data/.github/workflows/rake.yml
CHANGED
@@ -16,19 +16,9 @@ jobs:
|
|
16
16
|
strategy:
|
17
17
|
fail-fast: false
|
18
18
|
matrix:
|
19
|
-
ruby: [ '
|
19
|
+
ruby: [ '3.0', '2.7', '2.6', '2.5' ]
|
20
20
|
os: [ ubuntu-latest, windows-latest, macos-latest ]
|
21
21
|
experimental: [ false ]
|
22
|
-
include:
|
23
|
-
- ruby: '3.0'
|
24
|
-
os: 'ubuntu-latest'
|
25
|
-
experimental: true
|
26
|
-
- ruby: '3.0'
|
27
|
-
os: 'windows-latest'
|
28
|
-
experimental: true
|
29
|
-
- ruby: '3.0'
|
30
|
-
os: 'macos-latest'
|
31
|
-
experimental: true
|
32
22
|
steps:
|
33
23
|
- uses: actions/checkout@v2
|
34
24
|
with:
|
data/.hound.yml
CHANGED
data/.rubocop.yml
CHANGED
@@ -1,14 +1,10 @@
|
|
1
|
-
#
|
2
|
-
# https://github.com/
|
3
|
-
# All project-specific additions and overrides should be specified in this file.
|
1
|
+
# Auto-generated by Cimas: Do not edit it manually!
|
2
|
+
# See https://github.com/metanorma/cimas
|
4
3
|
inherit_from:
|
5
4
|
- https://raw.githubusercontent.com/riboseinc/oss-guides/master/ci/rubocop.yml
|
6
5
|
|
7
6
|
# local repo-specific modifications
|
7
|
+
# ...
|
8
8
|
|
9
9
|
AllCops:
|
10
|
-
|
11
|
-
StyleGuideCopsOnly: false
|
12
|
-
TargetRubyVersion: 2.4
|
13
|
-
Rails:
|
14
|
-
Enabled: true
|
10
|
+
TargetRubyVersion: 2.5
|
data/Gemfile
CHANGED
@@ -10,6 +10,6 @@ end
|
|
10
10
|
|
11
11
|
gemspec
|
12
12
|
|
13
|
-
if File.exist?
|
14
|
-
eval File.read(
|
13
|
+
if File.exist? "Gemfile.devel"
|
14
|
+
eval File.read("Gemfile.devel"), nil, "Gemfile.devel" # rubocop:disable Security/Eval
|
15
15
|
end
|
data/Rakefile
CHANGED
data/bin/html2doc
CHANGED
data/bin/rspec
CHANGED
data/html2doc.gemspec
CHANGED
@@ -1,5 +1,4 @@
|
|
1
|
-
|
2
|
-
lib = File.expand_path("../lib", __FILE__)
|
1
|
+
lib = File.expand_path("lib", __dir__)
|
3
2
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
3
|
require "html2doc/version"
|
5
4
|
|
@@ -16,23 +15,23 @@ Gem::Specification.new do |spec|
|
|
16
15
|
This gem is in active development.
|
17
16
|
DESCRIPTION
|
18
17
|
|
19
|
-
spec.homepage
|
20
|
-
spec.licenses
|
18
|
+
spec.homepage = "https://github.com/metanorma/html2doc"
|
19
|
+
spec.licenses = ["CC-BY-SA-3.0", "BSD-2-Clause"]
|
21
20
|
|
22
21
|
spec.bindir = "bin"
|
23
22
|
spec.require_paths = ["lib"]
|
24
23
|
spec.files = `git ls-files`.split("\n")
|
25
24
|
spec.test_files = `git ls-files -- {spec}/*`.split("\n")
|
26
|
-
spec.required_ruby_version = Gem::Requirement.new(">= 2.
|
25
|
+
spec.required_ruby_version = Gem::Requirement.new(">= 2.5.0")
|
27
26
|
|
27
|
+
spec.add_dependency "asciimath", "~> 2.0.2"
|
28
28
|
spec.add_dependency "htmlentities", "~> 4.3.4"
|
29
29
|
spec.add_dependency "image_size"
|
30
30
|
spec.add_dependency "mime-types"
|
31
|
-
spec.add_dependency "nokogiri", "~> 1.
|
31
|
+
spec.add_dependency "nokogiri", "~> 1.12"
|
32
|
+
spec.add_dependency "plane1converter", "~> 0.0.1"
|
32
33
|
spec.add_dependency "thread_safe"
|
33
34
|
spec.add_dependency "uuidtools"
|
34
|
-
spec.add_dependency "asciimath", "~> 2.0.2"
|
35
|
-
spec.add_dependency "plane1converter", "~> 0.0.1"
|
36
35
|
|
37
36
|
spec.add_development_dependency "byebug", "~> 9.1"
|
38
37
|
spec.add_development_dependency "equivalent-xml", "~> 0.6"
|
@@ -40,8 +39,8 @@ Gem::Specification.new do |spec|
|
|
40
39
|
spec.add_development_dependency "guard-rspec", "~> 4.7"
|
41
40
|
spec.add_development_dependency "rake", "~> 12.0"
|
42
41
|
spec.add_development_dependency "rspec", "~> 3.6"
|
42
|
+
spec.add_development_dependency "rspec-match_fuzzy", "~> 0.1.3"
|
43
43
|
spec.add_development_dependency "rubocop", "~> 1.5.2"
|
44
44
|
spec.add_development_dependency "simplecov", "~> 0.15"
|
45
45
|
spec.add_development_dependency "timecop", "~> 0.9"
|
46
|
-
spec.add_development_dependency "rspec-match_fuzzy", "~> 0.1.3"
|
47
46
|
end
|
data/lib/html2doc/base.rb
CHANGED
@@ -2,8 +2,6 @@ require "uuidtools"
|
|
2
2
|
require "asciimath"
|
3
3
|
require "htmlentities"
|
4
4
|
require "nokogiri"
|
5
|
-
#require "xml/xslt"
|
6
|
-
require "pp"
|
7
5
|
require "fileutils"
|
8
6
|
|
9
7
|
module Html2Doc
|
@@ -19,15 +17,17 @@ module Html2Doc
|
|
19
17
|
|
20
18
|
def self.process_header(headerfile, hash)
|
21
19
|
return if headerfile.nil?
|
20
|
+
|
22
21
|
doc = File.read(headerfile, encoding: "utf-8")
|
23
|
-
doc = header_image_cleanup(doc, hash[:dir1], hash[:filename],
|
22
|
+
doc = header_image_cleanup(doc, hash[:dir1], hash[:filename],
|
23
|
+
File.dirname(hash[:filename]))
|
24
24
|
File.open("#{hash[:dir1]}/header.html", "w:UTF-8") { |f| f.write(doc) }
|
25
25
|
end
|
26
26
|
|
27
27
|
def self.clear_dir(dir)
|
28
28
|
Dir.foreach(dir) do |f|
|
29
29
|
fn = File.join(dir, f)
|
30
|
-
File.delete(fn) if f !=
|
30
|
+
File.delete(fn) if f != "." && f != ".."
|
31
31
|
end
|
32
32
|
dir
|
33
33
|
end
|
@@ -72,7 +72,7 @@ module Html2Doc
|
|
72
72
|
|
73
73
|
def self.to_xhtml(xml)
|
74
74
|
xml.gsub!(/<\?xml[^>]*>/, "")
|
75
|
-
unless /<!DOCTYPE /.match xml
|
75
|
+
unless /<!DOCTYPE /.match? xml
|
76
76
|
xml = '<!DOCTYPE html SYSTEM
|
77
77
|
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' + xml
|
78
78
|
end
|
@@ -84,34 +84,35 @@ module Html2Doc
|
|
84
84
|
DOCTYPE
|
85
85
|
|
86
86
|
def self.from_xhtml(xml)
|
87
|
-
xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
|
88
|
-
sub(DOCTYPE, "")
|
89
|
-
gsub(%{ />}, "/>")
|
87
|
+
xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
|
88
|
+
.sub(DOCTYPE, "")
|
89
|
+
.gsub(%{ />}, "/>")
|
90
90
|
end
|
91
91
|
|
92
|
-
def self.msword_fix(
|
92
|
+
def self.msword_fix(doc)
|
93
93
|
# brain damage in MSWord parser
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
94
|
+
doc.gsub!(%r{<span style="mso-special-character:footnote"/>},
|
95
|
+
'<span style="mso-special-character:footnote"></span>')
|
96
|
+
doc.gsub!(%r{<div style="mso-element:footnote-list"></div>},
|
97
|
+
'<div style="mso-element:footnote-list"/>')
|
98
|
+
doc.gsub!(%r{(<a style="mso-comment-reference:[^>/]+)/>}, "\\1></a>")
|
99
|
+
doc.gsub!(%r{<link rel="File-List"}, "<link rel=File-List")
|
100
|
+
doc.gsub!(%r{<meta http-equiv="Content-Type"},
|
101
|
+
"<meta http-equiv=Content-Type")
|
102
|
+
doc.gsub!(%r{></m:jc>}, "/>")
|
103
|
+
doc.gsub!(%r{></v:stroke>}, "/>")
|
104
|
+
doc.gsub!(%r{></v:f>}, "/>")
|
105
|
+
doc.gsub!(%r{></v:path>}, "/>")
|
106
|
+
doc.gsub!(%r{></o:lock>}, "/>")
|
107
|
+
doc.gsub!(%r{></v:imagedata>}, "/>")
|
108
|
+
doc.gsub!(%r{></w:wrap>}, "/>")
|
109
|
+
doc.gsub!(%r{<(/)?m:(span|em)\b}, "<\\1\\2")
|
110
|
+
doc.gsub!(%r{&tab;|&amp;tab;},
|
111
|
+
'<span style="mso-tab-count:1">&#xA0; </span>')
|
112
|
+
doc.split(%r{(<m:oMath>|</m:oMath>)}).each_slice(4).map do |a|
|
111
113
|
a.size > 2 and a[2] = a[2].gsub(/>\s+</, "><")
|
112
114
|
a
|
113
115
|
end.join
|
114
|
-
r
|
115
116
|
end
|
116
117
|
|
117
118
|
PRINT_VIEW = <<~XML.freeze
|
@@ -127,30 +128,30 @@ module Html2Doc
|
|
127
128
|
<meta http-equiv=Content-Type content="text/html; charset=utf-8"/>
|
128
129
|
XML
|
129
130
|
|
130
|
-
def self.define_head1(docxml,
|
131
|
+
def self.define_head1(docxml, _dir)
|
131
132
|
docxml.xpath("//*[local-name() = 'head']").each do |h|
|
132
133
|
h.children.first.add_previous_sibling <<~XML
|
133
|
-
|
134
|
-
|
134
|
+
#{PRINT_VIEW}
|
135
|
+
<link rel="File-List" href="cid:filelist.xml"/>
|
135
136
|
XML
|
136
137
|
end
|
137
138
|
end
|
138
139
|
|
139
|
-
def self.filename_substitute(
|
140
|
-
if header_filename.nil?
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
/FILENAME/.match(m) ? "url(cid:header.html)" : m
|
140
|
+
def self.filename_substitute(head, header_filename)
|
141
|
+
return if header_filename.nil?
|
142
|
+
|
143
|
+
head.xpath(".//*[local-name() = 'style']").each do |s|
|
144
|
+
s1 = s.to_xml.gsub(/url\("[^"]+"\)/) do |m|
|
145
|
+
/FILENAME/.match?(m) ? "url(cid:header.html)" : m
|
145
146
|
end
|
147
|
+
s.replace(s1)
|
146
148
|
end
|
147
149
|
end
|
148
150
|
|
149
|
-
def self.stylesheet(
|
150
|
-
(fn.nil? || fn.empty?)
|
151
|
+
def self.stylesheet(_filename, _header_filename, fn)
|
152
|
+
(fn.nil? || fn.empty?) and
|
151
153
|
fn = File.join(File.dirname(__FILE__), "wordstyle.css")
|
152
154
|
stylesheet = File.read(fn, encoding: "UTF-8")
|
153
|
-
stylesheet = filename_substitute(stylesheet, header_filename, filename)
|
154
155
|
xml = Nokogiri::XML("<style/>")
|
155
156
|
xml.children.first << Nokogiri::XML::Comment.new(xml, "\n#{stylesheet}\n")
|
156
157
|
xml.root.to_s
|
@@ -161,6 +162,7 @@ module Html2Doc
|
|
161
162
|
head = docxml.at("//*[local-name() = 'head']")
|
162
163
|
css = stylesheet(hash[:filename], hash[:header_file], hash[:stylesheet])
|
163
164
|
add_stylesheet(head, title, css)
|
165
|
+
filename_substitute(head, hash[:header_file])
|
164
166
|
define_head1(docxml, hash[:dir1])
|
165
167
|
rootnamespace(docxml.root)
|
166
168
|
end
|
@@ -189,13 +191,13 @@ module Html2Doc
|
|
189
191
|
end
|
190
192
|
|
191
193
|
def self.bookmarks(docxml)
|
192
|
-
docxml.xpath("//*[@id][not(@name)][not(@style = 'mso-element:footnote')]")
|
193
|
-
|
194
|
-
next if
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
194
|
+
docxml.xpath("//*[@id][not(@name)][not(@style = 'mso-element:footnote')]")
|
195
|
+
.each do |x|
|
196
|
+
next if x["id"].empty? ||
|
197
|
+
%w(shapetype v:shapetype shape v:shape).include?(x.name)
|
198
|
+
|
199
|
+
if x.children.empty? then x.add_child("<a name='#{x['id']}'></a>")
|
200
|
+
else x.children.first.previous = "<a name='#{x['id']}'></a>"
|
199
201
|
end
|
200
202
|
x.delete("id")
|
201
203
|
end
|
data/lib/html2doc/lists.rb
CHANGED
@@ -2,83 +2,87 @@ require "uuidtools"
|
|
2
2
|
require "asciimath"
|
3
3
|
require "htmlentities"
|
4
4
|
require "nokogiri"
|
5
|
-
require "uuidtools"
|
6
5
|
|
7
6
|
module Html2Doc
|
8
|
-
def self.style_list(
|
7
|
+
def self.style_list(elem, level, liststyle, listnumber)
|
9
8
|
return unless liststyle
|
10
|
-
|
11
|
-
|
9
|
+
|
10
|
+
if elem["style"]
|
11
|
+
elem["style"] += ";"
|
12
12
|
else
|
13
|
-
|
13
|
+
elem["style"] = ""
|
14
14
|
end
|
15
|
-
|
15
|
+
elem["style"] += "mso-list:#{liststyle} level#{level} lfo#{listnumber};"
|
16
16
|
end
|
17
17
|
|
18
|
-
def self.list_add1(
|
19
|
-
if [
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
18
|
+
def self.list_add1(elem, liststyles, listtype, level)
|
19
|
+
if %i[ul ol].include? listtype
|
20
|
+
list_add(elem.xpath(".//ul") - elem.xpath(".//ul//ul | .//ol//ul"),
|
21
|
+
liststyles, :ul, level + 1)
|
22
|
+
list_add(elem.xpath(".//ol") - elem.xpath(".//ul//ol | .//ol//ol"),
|
23
|
+
liststyles, :ol, level + 1)
|
24
|
+
else
|
25
|
+
list_add(elem.xpath(".//ul") - elem.xpath(".//ul//ul | .//ol//ul"),
|
26
|
+
liststyles, listtype, level + 1)
|
27
|
+
list_add(elem.xpath(".//ol") - elem.xpath(".//ul//ol | .//ol//ol"),
|
28
|
+
liststyles, listtype, level + 1)
|
29
|
+
end
|
30
30
|
end
|
31
31
|
|
32
32
|
def self.list_add(xpath, liststyles, listtype, level)
|
33
|
-
xpath.each_with_index do |
|
33
|
+
xpath.each_with_index do |l, _i|
|
34
34
|
@listnumber += 1 if level == 1
|
35
|
-
|
36
|
-
|
37
|
-
(
|
35
|
+
l["seen"] = true if level == 1
|
36
|
+
l["id"] ||= UUIDTools::UUID.random_create
|
37
|
+
(l.xpath(".//li") - l.xpath(".//ol//li | .//ul//li")).each do |li|
|
38
38
|
style_list(li, level, liststyles[listtype], @listnumber)
|
39
39
|
list_add1(li, liststyles, listtype, level)
|
40
40
|
end
|
41
|
-
|
42
|
-
|
43
|
-
|
41
|
+
l.xpath(".//ul[not(ancestor::li/ancestor::*/@id = '#{l['id']}')] | "\
|
42
|
+
".//ol[not(ancestor::li/ancestor::*/@id = '#{l['id']}')]")
|
43
|
+
.each do |li|
|
44
|
+
list_add1(li.parent, liststyles, listtype, level - 1)
|
44
45
|
end
|
45
46
|
end
|
46
47
|
end
|
47
48
|
|
48
|
-
def self.list2para(
|
49
|
-
return if
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
49
|
+
def self.list2para(list)
|
50
|
+
return if list.xpath("./li").empty?
|
51
|
+
|
52
|
+
list.xpath("./li").first["class"] ||= "MsoListParagraphCxSpFirst"
|
53
|
+
list.xpath("./li").last["class"] ||= "MsoListParagraphCxSpLast"
|
54
|
+
list.xpath("./li/p").each { |p| p["class"] ||= "MsoListParagraphCxSpMiddle" }
|
55
|
+
list.xpath("./li").each do |l|
|
54
56
|
l.name = "p"
|
55
57
|
l["class"] ||= "MsoListParagraphCxSpMiddle"
|
56
58
|
l&.first_element_child&.name == "p" and
|
57
59
|
l.first_element_child.replace(l.first_element_child.children)
|
58
60
|
end
|
59
|
-
|
61
|
+
list.replace(list.children)
|
60
62
|
end
|
61
63
|
|
62
64
|
TOPLIST = "[not(ancestor::ul) and not(ancestor::ol)]".freeze
|
63
65
|
|
64
|
-
def self.lists1(docxml, liststyles,
|
65
|
-
case
|
66
|
+
def self.lists1(docxml, liststyles, style)
|
67
|
+
case style
|
66
68
|
when :ul then list_add(docxml.xpath("//ul[not(@class)]#{TOPLIST}"),
|
67
|
-
|
69
|
+
liststyles, :ul, 1)
|
68
70
|
when :ol then list_add(docxml.xpath("//ol[not(@class)]#{TOPLIST}"),
|
69
71
|
liststyles, :ol, 1)
|
70
72
|
else
|
71
|
-
list_add(docxml.xpath("//ol[@class = '#{
|
72
|
-
"//ul[@class = '#{
|
73
|
-
liststyles,
|
73
|
+
list_add(docxml.xpath("//ol[@class = '#{style}']#{TOPLIST} | "\
|
74
|
+
"//ul[@class = '#{style}']#{TOPLIST}"),
|
75
|
+
liststyles, style, 1)
|
74
76
|
end
|
75
77
|
end
|
76
78
|
|
77
79
|
def self.lists_unstyled(docxml, liststyles)
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
80
|
+
liststyles.has_key?(:ul) and
|
81
|
+
list_add(docxml.xpath("//ul#{TOPLIST}[not(@seen)]"),
|
82
|
+
liststyles, :ul, 1)
|
83
|
+
liststyles.has_key?(:ol) and
|
84
|
+
list_add(docxml.xpath("//ol#{TOPLIST}[not(@seen)]"),
|
85
|
+
liststyles, :ul, 1)
|
82
86
|
docxml.xpath("//ul[@seen] | //ol[@seen]").each do |l|
|
83
87
|
l.delete("seen")
|
84
88
|
end
|
@@ -86,6 +90,7 @@ module Html2Doc
|
|
86
90
|
|
87
91
|
def self.lists(docxml, liststyles)
|
88
92
|
return if liststyles.nil?
|
93
|
+
|
89
94
|
@listnumber = 0
|
90
95
|
liststyles.each_key { |k| lists1(docxml, liststyles, k) }
|
91
96
|
lists_unstyled(docxml, liststyles)
|