html2doc 1.5.4 → 1.5.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/html2doc/base.rb +10 -91
- data/lib/html2doc/mime.rb +19 -10
- data/lib/html2doc/version.rb +1 -1
- data/lib/html2doc/xml.rb +83 -0
- data/lib/html2doc.rb +1 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 46856bf56ad5dd95f8f5781dc11049bb4600060c28c49715a262837ece8028bf
|
4
|
+
data.tar.gz: 866ba19867f233b45aeee436df719679623d671902b30476b61952f7a6357e1f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b949f47c356437ce418f65ce7fd1c497648d0d0e960fe1e05d7318d280ddf6de23ddad8e5ab94a18f447b3eaba948b2a4db69ca2d00f0dcfebd692933a64c1da
|
7
|
+
data.tar.gz: 953999bd39aa1c1b6a0e1a34c939dcbdba01242c898f5a587eab546b4c9a4578e8051c9c68b49900390fcb3ab5d2de4ad6f0f1bce1af5a8ee6e7bd4daf300966
|
data/lib/html2doc/base.rb
CHANGED
@@ -30,8 +30,7 @@ class Html2Doc
|
|
30
30
|
end
|
31
31
|
|
32
32
|
def process_header(headerfile)
|
33
|
-
|
34
|
-
|
33
|
+
headerfile.nil? and return
|
35
34
|
doc = File.read(headerfile, encoding: "utf-8")
|
36
35
|
doc = header_image_cleanup(doc, @dir1, @filename,
|
37
36
|
File.dirname(@filename))
|
@@ -66,6 +65,7 @@ class Html2Doc
|
|
66
65
|
end
|
67
66
|
|
68
67
|
def cleanup(docxml)
|
68
|
+
locate_landscape(docxml)
|
69
69
|
namespace(docxml.root)
|
70
70
|
image_cleanup(docxml, @dir1, @imagedir)
|
71
71
|
mathml_to_ooml(docxml)
|
@@ -76,76 +76,11 @@ class Html2Doc
|
|
76
76
|
docxml
|
77
77
|
end
|
78
78
|
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
<body> </body> </html>
|
85
|
-
HERE
|
86
|
-
|
87
|
-
def to_xhtml(xml)
|
88
|
-
xml.gsub!(/<\?xml[^>]*>/, "")
|
89
|
-
unless /<!DOCTYPE /.match? xml
|
90
|
-
xml = '<!DOCTYPE html SYSTEM
|
91
|
-
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' + xml
|
92
|
-
end
|
93
|
-
xml = xml.gsub(/<!--\s*\[([^\]]+)\]>/, "<!-- MSWORD-COMMENT \\1 -->")
|
94
|
-
.gsub(/<!\s*\[endif\]\s*-->/, "<!-- MSWORD-COMMENT-END -->")
|
95
|
-
Nokogiri::XML.parse(xml)
|
96
|
-
end
|
97
|
-
|
98
|
-
DOCTYPE = <<~"DOCTYPE".freeze
|
99
|
-
<!DOCTYPE html SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
100
|
-
DOCTYPE
|
101
|
-
|
102
|
-
def from_xhtml(xml)
|
103
|
-
xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
|
104
|
-
.sub(DOCTYPE, "").gsub(%{ />}, "/>")
|
105
|
-
.gsub(/<!-- MSWORD-COMMENT (.+?) -->/, "<!--[\\1]>")
|
106
|
-
.gsub(/<!-- MSWORD-COMMENT-END -->/, "<![endif]-->")
|
107
|
-
.gsub("\n-->\n", "\n-->\n")
|
108
|
-
end
|
109
|
-
|
110
|
-
def msword_fix(doc)
|
111
|
-
# brain damage in MSWord parser
|
112
|
-
doc.gsub!(%r{<w:DoNotOptimizeForBrowser></w:DoNotOptimizeForBrowser>},
|
113
|
-
"<w:DoNotOptimizeForBrowser/>")
|
114
|
-
doc.gsub!(%r{<span style="mso-special-character:footnote"/>},
|
115
|
-
'<span style="mso-special-character:footnote"></span>')
|
116
|
-
doc.gsub!(%r{<div style="mso-element:footnote-list"></div>},
|
117
|
-
'<div style="mso-element:footnote-list"/>')
|
118
|
-
doc.gsub!(%r{(<a style="mso-comment-reference:[^>/]+)/>}, "\\1></a>")
|
119
|
-
doc.gsub!(%r{<link rel="File-List"}, "<link rel=File-List")
|
120
|
-
doc.gsub!(%r{<meta http-equiv="Content-Type"},
|
121
|
-
"<meta http-equiv=Content-Type")
|
122
|
-
doc.gsub!(%r{></m:jc>}, "/>")
|
123
|
-
doc.gsub!(%r{></v:stroke>}, "/>")
|
124
|
-
doc.gsub!(%r{></v:f>}, "/>")
|
125
|
-
doc.gsub!(%r{></v:path>}, "/>")
|
126
|
-
doc.gsub!(%r{></o:lock>}, "/>")
|
127
|
-
doc.gsub!(%r{></v:imagedata>}, "/>")
|
128
|
-
doc.gsub!(%r{></w:wrap>}, "/>")
|
129
|
-
doc.gsub!(%r{<(/)?m:(span|em)\b}, "<\\1\\2")
|
130
|
-
doc.gsub!(%r{&tab;|&tab;},
|
131
|
-
'<span style="mso-tab-count:1">  </span>')
|
132
|
-
doc.split(%r{(<m:oMath>|</m:oMath>)}).each_slice(4).map do |a|
|
133
|
-
a.size > 2 and a[2] = a[2].gsub(/>\s+</, "><")
|
134
|
-
a
|
135
|
-
end.join
|
136
|
-
end
|
137
|
-
|
138
|
-
PRINT_VIEW = <<~XML.freeze
|
139
|
-
|
140
|
-
<xml>
|
141
|
-
<w:WordDocument>
|
142
|
-
<w:View>Print</w:View>
|
143
|
-
<w:Zoom>100</w:Zoom>
|
144
|
-
<w:DoNotOptimizeForBrowser/>
|
145
|
-
</w:WordDocument>
|
146
|
-
</xml>
|
147
|
-
<meta http-equiv='Content-Type' content="text/html; charset=utf-8"/>
|
148
|
-
XML
|
79
|
+
def locate_landscape(_docxml)
|
80
|
+
css = read_stylesheet(@stylesheet)
|
81
|
+
@landscape = css.scan(/div\.\S+\s+\{\s*page:\s*[^;]+L;\s*\}/m)
|
82
|
+
.map { |e| e.sub(/^div\.(\S+).*$/m, "\\1") }
|
83
|
+
end
|
149
84
|
|
150
85
|
def define_head1(docxml, _dir)
|
151
86
|
docxml.xpath("//*[local-name() = 'head']").each do |h|
|
@@ -174,7 +109,6 @@ class Html2Doc
|
|
174
109
|
# xml.children.first << Nokogiri::XML::Comment.new(xml, s)
|
175
110
|
xml.children.first << Nokogiri::XML::CDATA
|
176
111
|
.new(xml, "\n<!--\n#{stylesheet}\n-->\n")
|
177
|
-
|
178
112
|
xml.root.to_s
|
179
113
|
end
|
180
114
|
|
@@ -199,30 +133,15 @@ class Html2Doc
|
|
199
133
|
head.add_child css
|
200
134
|
elsif title.nil?
|
201
135
|
head.children.first.add_previous_sibling css
|
202
|
-
else
|
203
|
-
title.add_next_sibling css
|
136
|
+
else title.add_next_sibling css
|
204
137
|
end
|
205
138
|
end
|
206
139
|
|
207
|
-
def namespace(root)
|
208
|
-
{
|
209
|
-
o: "urn:schemas-microsoft-com:office:office",
|
210
|
-
w: "urn:schemas-microsoft-com:office:word",
|
211
|
-
v: "urn:schemas-microsoft-com:vml",
|
212
|
-
m: "http://schemas.microsoft.com/office/2004/12/omml",
|
213
|
-
}.each { |k, v| root.add_namespace_definition(k.to_s, v) }
|
214
|
-
end
|
215
|
-
|
216
|
-
def rootnamespace(root)
|
217
|
-
root.add_namespace(nil, "http://www.w3.org/TR/REC-html40")
|
218
|
-
end
|
219
|
-
|
220
140
|
def bookmarks(docxml)
|
221
141
|
docxml.xpath("//*[@id][not(@name)][not(@style = 'mso-element:footnote')]")
|
222
142
|
.each do |x|
|
223
|
-
|
224
|
-
%w(shapetype v:shapetype shape v:shape).include?(x.name)
|
225
|
-
|
143
|
+
(x["id"].empty? ||
|
144
|
+
%w(shapetype v:shapetype shape v:shape).include?(x.name)) and next
|
226
145
|
if x.children.empty? then x.add_child("<a name='#{x['id']}'></a>")
|
227
146
|
else x.children.first.previous = "<a name='#{x['id']}'></a>"
|
228
147
|
end
|
data/lib/html2doc/mime.rb
CHANGED
@@ -78,8 +78,7 @@ class Html2Doc
|
|
78
78
|
|
79
79
|
def image_resize(img, path, maxheight, maxwidth)
|
80
80
|
s, realsize = get_image_size(img, path)
|
81
|
-
|
82
|
-
|
81
|
+
s[0] == nil && s[1] == nil and return s
|
83
82
|
if img.name == "svg" && !img["viewBox"]
|
84
83
|
img["viewBox"] = "0 0 #{s[0]} #{s[1]}"
|
85
84
|
end
|
@@ -118,12 +117,24 @@ class Html2Doc
|
|
118
117
|
docxml.traverse do |i|
|
119
118
|
skip_image_cleanup?(i) and next
|
120
119
|
local_filename = rename_image(i, dir, localdir)
|
121
|
-
i["width"], i["height"] =
|
122
|
-
|
120
|
+
i["width"], i["height"] =
|
121
|
+
if landscape?(i)
|
122
|
+
image_resize(i, local_filename, maxwidth, maxheight)
|
123
|
+
else
|
124
|
+
image_resize(i, local_filename, maxheight, maxwidth)
|
125
|
+
end
|
123
126
|
end
|
124
127
|
docxml
|
125
128
|
end
|
126
129
|
|
130
|
+
def landscape?(img)
|
131
|
+
img.ancestors.each do |a|
|
132
|
+
a.name == "div" or next
|
133
|
+
@landscape.include?(a["class"]) and return true
|
134
|
+
end
|
135
|
+
false
|
136
|
+
end
|
137
|
+
|
127
138
|
def rename_image(img, dir, localdir)
|
128
139
|
local_filename = localname(img["src"], localdir)
|
129
140
|
new_filename = "#{mkuuid}#{File.extname(img['src'])}"
|
@@ -134,10 +145,9 @@ class Html2Doc
|
|
134
145
|
|
135
146
|
def skip_image_cleanup?(img)
|
136
147
|
src = img["src"]
|
137
|
-
|
138
|
-
|
139
|
-
%r{^data:(image|application)/[^;]+;base64}.match?(src)
|
140
|
-
|
148
|
+
(img.element? && %w(img v:imagedata).include?(img.name)) or return true
|
149
|
+
(src.nil? || src.empty? || /^http/.match?(src) ||
|
150
|
+
%r{^data:(image|application)/[^;]+;base64}.match?(src)) and return true
|
141
151
|
false
|
142
152
|
end
|
143
153
|
|
@@ -222,8 +232,7 @@ class Html2Doc
|
|
222
232
|
f.write %{<xml xmlns:o="urn:schemas-microsoft-com:office:office">
|
223
233
|
<o:MainFile HRef="../#{filename}.htm"/>}
|
224
234
|
Dir.entries(dir).sort.each do |item|
|
225
|
-
|
226
|
-
|
235
|
+
(item == "." || item == ".." || /^\./.match(item)) and next
|
227
236
|
f.write %{ <o:File HRef="#{item}"/>\n}
|
228
237
|
end
|
229
238
|
f.write("</xml>\n")
|
data/lib/html2doc/version.rb
CHANGED
data/lib/html2doc/xml.rb
ADDED
@@ -0,0 +1,83 @@
|
|
1
|
+
class Html2Doc
|
2
|
+
NOKOHEAD = <<~HERE.freeze
|
3
|
+
<!DOCTYPE html SYSTEM
|
4
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
5
|
+
<html xmlns="http://www.w3.org/1999/xhtml">
|
6
|
+
<head> <title></title> <meta charset="UTF-8" /> </head>
|
7
|
+
<body> </body> </html>
|
8
|
+
HERE
|
9
|
+
|
10
|
+
def to_xhtml(xml)
|
11
|
+
xml.gsub!(/<\?xml[^>]*>/, "")
|
12
|
+
unless /<!DOCTYPE /.match? xml
|
13
|
+
xml = '<!DOCTYPE html SYSTEM
|
14
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' + xml
|
15
|
+
end
|
16
|
+
xml = xml.gsub(/<!--\s*\[([^\]]+)\]>/, "<!-- MSWORD-COMMENT \\1 -->")
|
17
|
+
.gsub(/<!\s*\[endif\]\s*-->/, "<!-- MSWORD-COMMENT-END -->")
|
18
|
+
Nokogiri::XML.parse(xml)
|
19
|
+
end
|
20
|
+
|
21
|
+
DOCTYPE = <<~DOCTYPE.freeze
|
22
|
+
<!DOCTYPE html SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
23
|
+
DOCTYPE
|
24
|
+
|
25
|
+
def from_xhtml(xml)
|
26
|
+
xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
|
27
|
+
.sub(DOCTYPE, "").gsub(%{ />}, "/>")
|
28
|
+
.gsub(/<!-- MSWORD-COMMENT (.+?) -->/, "<!--[\\1]>")
|
29
|
+
.gsub(/<!-- MSWORD-COMMENT-END -->/, "<![endif]-->")
|
30
|
+
.gsub("\n-->\n", "\n-->\n")
|
31
|
+
end
|
32
|
+
|
33
|
+
def msword_fix(doc)
|
34
|
+
# brain damage in MSWord parser
|
35
|
+
doc.gsub!(%r{<w:DoNotOptimizeForBrowser></w:DoNotOptimizeForBrowser>},
|
36
|
+
"<w:DoNotOptimizeForBrowser/>")
|
37
|
+
doc.gsub!(%r{<span style="mso-special-character:footnote"/>},
|
38
|
+
'<span style="mso-special-character:footnote"></span>')
|
39
|
+
doc.gsub!(%r{<div style="mso-element:footnote-list"></div>},
|
40
|
+
'<div style="mso-element:footnote-list"/>')
|
41
|
+
doc.gsub!(%r{(<a style="mso-comment-reference:[^>/]+)/>}, "\\1></a>")
|
42
|
+
doc.gsub!(%r{<link rel="File-List"}, "<link rel=File-List")
|
43
|
+
doc.gsub!(%r{<meta http-equiv="Content-Type"},
|
44
|
+
"<meta http-equiv=Content-Type")
|
45
|
+
doc.gsub!(%r{></m:jc>}, "/>")
|
46
|
+
doc.gsub!(%r{></v:stroke>}, "/>")
|
47
|
+
doc.gsub!(%r{></v:f>}, "/>")
|
48
|
+
doc.gsub!(%r{></v:path>}, "/>")
|
49
|
+
doc.gsub!(%r{></o:lock>}, "/>")
|
50
|
+
doc.gsub!(%r{></v:imagedata>}, "/>")
|
51
|
+
doc.gsub!(%r{></w:wrap>}, "/>")
|
52
|
+
doc.gsub!(%r{<(/)?m:(span|em)\b}, "<\\1\\2")
|
53
|
+
doc.gsub!(%r{&tab;|&tab;},
|
54
|
+
'<span style="mso-tab-count:1">  </span>')
|
55
|
+
doc.split(%r{(<m:oMath>|</m:oMath>)}).each_slice(4).map do |a|
|
56
|
+
a.size > 2 and a[2] = a[2].gsub(/>\s+</, "><")
|
57
|
+
a
|
58
|
+
end.join
|
59
|
+
end
|
60
|
+
|
61
|
+
PRINT_VIEW = <<~XML.freeze
|
62
|
+
|
63
|
+
<xml>
|
64
|
+
<w:WordDocument>
|
65
|
+
<w:View>Print</w:View>
|
66
|
+
<w:Zoom>100</w:Zoom>
|
67
|
+
<w:DoNotOptimizeForBrowser/>
|
68
|
+
</w:WordDocument>
|
69
|
+
</xml>
|
70
|
+
<meta http-equiv='Content-Type' content="text/html; charset=utf-8"/>
|
71
|
+
XML
|
72
|
+
|
73
|
+
def namespace(root)
|
74
|
+
{ o: "urn:schemas-microsoft-com:office:office",
|
75
|
+
w: "urn:schemas-microsoft-com:office:word",
|
76
|
+
v: "urn:schemas-microsoft-com:vml",
|
77
|
+
m: "http://schemas.microsoft.com/office/2004/12/omml" }.each { |k, v| root.add_namespace_definition(k.to_s, v) }
|
78
|
+
end
|
79
|
+
|
80
|
+
def rootnamespace(root)
|
81
|
+
root.add_namespace(nil, "http://www.w3.org/TR/REC-html40")
|
82
|
+
end
|
83
|
+
end
|
data/lib/html2doc.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2doc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.5.4
|
4
|
+
version: 1.5.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-06-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: htmlentities
|
@@ -293,6 +293,7 @@ files:
|
|
293
293
|
- lib/html2doc/notes.rb
|
294
294
|
- lib/html2doc/version.rb
|
295
295
|
- lib/html2doc/wordstyle.css
|
296
|
+
- lib/html2doc/xml.rb
|
296
297
|
homepage: https://github.com/metanorma/html2doc
|
297
298
|
licenses:
|
298
299
|
- CC-BY-SA-3.0
|