html2doc 1.6.0 → 1.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/html2doc.gemspec +2 -1
- data/lib/html2doc/base.rb +1 -1
- data/lib/html2doc/math.rb +106 -47
- data/lib/html2doc/version.rb +1 -1
- metadata +20 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 020bfc8d51718ee0fbc78ba2ce57fe1d306a50e93cb41949e98088228190dee5
|
4
|
+
data.tar.gz: 4fbad0486ed1cc59f7b67be1a5c2629ff840063b2bd8f1ffdbe50a7af082cc4e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 45e4f66e7ebc8591620a04e35362761ec48a20b51f735dd9977f05de6698e6cdbca67bb0ddf1615596d2924b7ee336b6e4b196f0dbccb00132e59ec660fab425
|
7
|
+
data.tar.gz: 868e01b43b930657accc0861c917116f78a9342512163803d038a70da8926c401ffe58a486d824a80115283cfc4a327743ab8d04931ce02c720a0f4aef5d8207
|
data/html2doc.gemspec
CHANGED
@@ -28,10 +28,11 @@ Gem::Specification.new do |spec|
|
|
28
28
|
|
29
29
|
spec.add_dependency "htmlentities", "~> 4.3.4"
|
30
30
|
spec.add_dependency "image_size", ">= 3.2.0"
|
31
|
+
spec.add_dependency "metanorma-utils"
|
31
32
|
spec.add_dependency "mime-types"
|
32
33
|
spec.add_dependency "nokogiri", "~> 1.14"
|
33
34
|
spec.add_dependency "plane1converter", "~> 0.0.1"
|
34
|
-
spec.add_dependency "plurimath"
|
35
|
+
spec.add_dependency "plurimath", "~> 0.5.0"
|
35
36
|
spec.add_dependency "thread_safe"
|
36
37
|
spec.add_dependency "uuidtools"
|
37
38
|
|
data/lib/html2doc/base.rb
CHANGED
@@ -141,7 +141,7 @@ class Html2Doc
|
|
141
141
|
docxml.xpath("//*[@id][not(@name)][not(@style = 'mso-element:footnote')]")
|
142
142
|
.each do |x|
|
143
143
|
(x["id"].empty? ||
|
144
|
-
%w(shapetype v:
|
144
|
+
%w(v:shapetype v:shape v:rect v:line v:group).include?(x.name)) and next
|
145
145
|
if x.children.empty? then x.add_child("<a name='#{x['id']}'></a>")
|
146
146
|
else x.children.first.previous = "<a name='#{x['id']}'></a>"
|
147
147
|
end
|
data/lib/html2doc/math.rb
CHANGED
@@ -3,6 +3,20 @@ require "plurimath"
|
|
3
3
|
require "htmlentities"
|
4
4
|
require "nokogiri"
|
5
5
|
require "plane1converter"
|
6
|
+
require "metanorma-utils"
|
7
|
+
|
8
|
+
module Nokogiri
|
9
|
+
module XML
|
10
|
+
class Node
|
11
|
+
OOXML_NS = "http://schemas.openxmlformats.org/officeDocument/2006/math".freeze
|
12
|
+
|
13
|
+
def ooxml_xpath(path)
|
14
|
+
p = Metanorma::Utils::ns(path).gsub("xmlns:", "m:")
|
15
|
+
xpath(p, "m" => OOXML_NS)
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
6
20
|
|
7
21
|
class Html2Doc
|
8
22
|
def progress_conv(idx, step, total, threshold, msg)
|
@@ -20,17 +34,30 @@ class Html2Doc
|
|
20
34
|
doc
|
21
35
|
end
|
22
36
|
|
37
|
+
MATHML_NS = "http://www.w3.org/1998/Math/MathML".freeze
|
38
|
+
|
23
39
|
# random fixes to MathML input that OOXML needs to render properly
|
24
40
|
def ooxml_cleanup(math, docnamespaces)
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
41
|
+
#encode_math(
|
42
|
+
unwrap_accents(
|
43
|
+
mathml_preserve_space(
|
44
|
+
mathml_insert_rows(math, docnamespaces), docnamespaces
|
45
|
+
),
|
46
|
+
)
|
47
|
+
#)
|
48
|
+
math.add_namespace(nil, MATHML_NS)
|
31
49
|
math
|
32
50
|
end
|
33
51
|
|
52
|
+
def encode_math(elem)
|
53
|
+
elem.traverse do |e|
|
54
|
+
e.text? or next
|
55
|
+
e.text.strip.empty? and next
|
56
|
+
e.replace(@c.encode(e.text, :hexadecimal))
|
57
|
+
end
|
58
|
+
elem
|
59
|
+
end
|
60
|
+
|
34
61
|
def mathml_insert_rows(math, docnamespaces)
|
35
62
|
math.xpath(%w(msup msub msubsup munder mover munderover)
|
36
63
|
.map { |m| ".//xmlns:#{m}" }.join(" | "), docnamespaces).each do |x|
|
@@ -50,47 +77,57 @@ class Html2Doc
|
|
50
77
|
|
51
78
|
HTML_NS = 'xmlns="http://www.w3.org/1999/xhtml"'.freeze
|
52
79
|
|
80
|
+
def wrap_text(elem, wrapper)
|
81
|
+
elem.traverse do |e|
|
82
|
+
e.text? or next
|
83
|
+
e.text.strip.empty? and next
|
84
|
+
e.wrap(wrapper)
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
53
88
|
def unitalic(math)
|
54
|
-
math.
|
55
|
-
x
|
89
|
+
math.ooxml_xpath(".//r[rPr[not(m:scr)]/sty[@m:val = 'p']]").each do |x|
|
90
|
+
wrap_text(x, "<span #{HTML_NS} style='font-style:normal;'></span>")
|
56
91
|
end
|
57
|
-
math.
|
58
|
-
x
|
92
|
+
math.ooxml_xpath(".//r[rPr[not(m:scr)]/sty[@m:val = 'bi']]").each do |x|
|
93
|
+
wrap_text(x,
|
94
|
+
"<span #{HTML_NS} class='nostem' style='font-weight:bold;'><em></em></span>")
|
59
95
|
end
|
60
|
-
math.
|
61
|
-
x
|
96
|
+
math.ooxml_xpath(".//r[rPr[not(m:scr)]/sty[@m:val = 'i']]").each do |x|
|
97
|
+
wrap_text(x, "<span #{HTML_NS} class='nostem'><em></em></span>")
|
62
98
|
end
|
63
|
-
math.
|
64
|
-
x
|
99
|
+
math.ooxml_xpath(".//r[rPr[not(m:scr)]/sty[@m:val = 'b']]").each do |x|
|
100
|
+
wrap_text(x,
|
101
|
+
"<span #{HTML_NS} style='font-style:normal;font-weight:bold;'></span>")
|
65
102
|
end
|
66
|
-
math.
|
103
|
+
math.ooxml_xpath(".//r[rPr/scr[@m:val = 'monospace']]").each do |x|
|
67
104
|
to_plane1(x, :monospace)
|
68
105
|
end
|
69
|
-
math.
|
106
|
+
math.ooxml_xpath(".//r[rPr/scr[@m:val = 'double-struck']]").each do |x|
|
70
107
|
to_plane1(x, :doublestruck)
|
71
108
|
end
|
72
|
-
math.
|
109
|
+
math.ooxml_xpath(".//r[rPr[not(m:sty) or m:sty/@m:val = 'p']/scr[@m:val = 'script']]").each do |x|
|
73
110
|
to_plane1(x, :script)
|
74
111
|
end
|
75
|
-
math.
|
112
|
+
math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'b']/scr[@m:val = 'script']]").each do |x|
|
76
113
|
to_plane1(x, :scriptbold)
|
77
114
|
end
|
78
|
-
math.
|
115
|
+
math.ooxml_xpath(".//r[rPr[not(m:sty) or m:sty/@m:val = 'p']/scr[@m:val = 'fraktur']]").each do |x|
|
79
116
|
to_plane1(x, :fraktur)
|
80
117
|
end
|
81
|
-
math.
|
118
|
+
math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'b']/scr[@m:val = 'fraktur']]").each do |x|
|
82
119
|
to_plane1(x, :frakturbold)
|
83
120
|
end
|
84
|
-
math.
|
121
|
+
math.ooxml_xpath(".//r[rPr[not(m:sty) or m:sty/@m:val = 'p']/scr[@m:val = 'sans-serif']]").each do |x|
|
85
122
|
to_plane1(x, :sans)
|
86
123
|
end
|
87
|
-
math.
|
124
|
+
math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'b']/scr[@m:val = 'sans-serif']]").each do |x|
|
88
125
|
to_plane1(x, :sansbold)
|
89
126
|
end
|
90
|
-
math.
|
127
|
+
math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'i']/scr[@m:val = 'sans-serif']]").each do |x|
|
91
128
|
to_plane1(x, :sansitalic)
|
92
129
|
end
|
93
|
-
math.
|
130
|
+
math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'bi']/scr[@m:val = 'sans-serif']]").each do |x|
|
94
131
|
to_plane1(x, :sansbolditalic)
|
95
132
|
end
|
96
133
|
math
|
@@ -119,22 +156,26 @@ class Html2Doc
|
|
119
156
|
# We will end up stripping them out again under Nokogiri 1.11, which correctly
|
120
157
|
# insists on inheriting namespace from parent.
|
121
158
|
def ooml_clean(xml)
|
122
|
-
xml.
|
159
|
+
xml.to_xml(indent: 0)
|
123
160
|
.gsub(/<\?[^>]+>\s*/, "")
|
124
161
|
.gsub(/ xmlns(:[^=]+)?="[^"]+"/, "")
|
125
|
-
|
162
|
+
# .gsub(%r{<(/)?(?!span)(?!em)([a-z])}, "<\\1m:\\2")
|
126
163
|
end
|
127
164
|
|
128
165
|
def mathml_to_ooml1(xml, docnamespaces)
|
129
166
|
doc = Nokogiri::XML::Document::new
|
130
167
|
doc.root = ooxml_cleanup(xml, docnamespaces)
|
131
|
-
ooxml =
|
132
|
-
|
168
|
+
# ooxml = @xsltemplate.transform(doc)
|
169
|
+
d = xml.parent["block"] != "false" # display_style
|
170
|
+
ooxml = Nokogiri::XML(Plurimath::Math.parse(doc.to_xml(indent: 0),
|
171
|
+
:mathml).to_omml)
|
172
|
+
ooxml = unitalic(accent_tr(ooxml))
|
173
|
+
ooxml = ooml_clean(uncenter(xml, ooxml))
|
133
174
|
xml.swap(ooxml)
|
134
175
|
end
|
135
176
|
|
136
177
|
def accent_tr(xml)
|
137
|
-
xml.
|
178
|
+
xml.ooxml_xpath(".//accPr/chr").each do |x|
|
138
179
|
x["m:val"] &&= accent_tr1(x["m:val"])
|
139
180
|
x["val"] &&= accent_tr1(x["val"])
|
140
181
|
end
|
@@ -150,30 +191,48 @@ class Html2Doc
|
|
150
191
|
end
|
151
192
|
end
|
152
193
|
|
153
|
-
|
154
|
-
# XML indentation
|
155
|
-
def esc_space(xml)
|
156
|
-
xml.traverse do |n|
|
157
|
-
next unless n.text?
|
194
|
+
OOXML_NS = "http://schemas.openxmlformats.org/officeDocument/2006/math".freeze
|
158
195
|
|
159
|
-
|
160
|
-
|
161
|
-
|
196
|
+
def math_only_para?(node)
|
197
|
+
x = node.dup
|
198
|
+
x.xpath(".//m:math", "m" => MATHML_NS).each(&:remove)
|
199
|
+
x.xpath(".//m:oMathPara | .//m:oMath", "m" => OOXML_NS).each(&:remove)
|
200
|
+
x.xpath(".//m:oMathPara | .//m:oMath").each(&:remove)
|
201
|
+
# namespace can go missing during processing
|
202
|
+
x.text.strip.empty?
|
203
|
+
end
|
204
|
+
|
205
|
+
def math_block?(_ooxml, mathml)
|
206
|
+
# ooxml.name == "oMathPara" || mathml["displaystyle"] == "true"
|
207
|
+
mathml["displaystyle"] == "true"
|
162
208
|
end
|
163
209
|
|
210
|
+
STYLE_BEARING_NODE =
|
211
|
+
%w(p div td th li).map { |x| ".//ancestor::#{x}" }.join(" | ").freeze
|
212
|
+
|
164
213
|
# if oomml has no siblings, by default it is centered; override this with
|
165
214
|
# left/right if parent is so tagged
|
215
|
+
# also if ooml has mathPara already, or is in para with only oMath content
|
166
216
|
def uncenter(math, ooxml)
|
167
|
-
alignnode = math.
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
end
|
217
|
+
alignnode = math.xpath(STYLE_BEARING_NODE).last
|
218
|
+
ooxml.document? and ooxml = ooxml.root
|
219
|
+
ret = uncenter_unneeded(math, ooxml, alignnode) and return ret
|
220
|
+
dir = "left"
|
221
|
+
alignnode["style"]&.include?("text-align:right") and dir = "right"
|
222
|
+
ooxml.name == "oMathPara" or
|
223
|
+
ooxml.wrap("<m:oMathPara></m:oMathPara>")
|
224
|
+
ooxml.elements.first.previous =
|
225
|
+
"<m:oMathParaPr><m:jc m:val='#{dir}'/></m:oMathParaPr>"
|
177
226
|
ooxml
|
178
227
|
end
|
228
|
+
|
229
|
+
def uncenter_unneeded(math, ooxml, alignnode)
|
230
|
+
(math_block?(ooxml, math) || !alignnode) and return ooxml
|
231
|
+
if !math_only_para?(alignnode)
|
232
|
+
ooxml.name == "oMathPara" and
|
233
|
+
ooxml = ooxml.elements.detect { |x| x.name == "oMath" }
|
234
|
+
return ooxml
|
235
|
+
end
|
236
|
+
nil
|
237
|
+
end
|
179
238
|
end
|
data/lib/html2doc/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2doc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.7.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-08
|
11
|
+
date: 2023-09-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: htmlentities
|
@@ -38,6 +38,20 @@ dependencies:
|
|
38
38
|
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: 3.2.0
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: metanorma-utils
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
56
|
name: mime-types
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -84,16 +98,16 @@ dependencies:
|
|
84
98
|
name: plurimath
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|
86
100
|
requirements:
|
87
|
-
- - "
|
101
|
+
- - "~>"
|
88
102
|
- !ruby/object:Gem::Version
|
89
|
-
version:
|
103
|
+
version: 0.5.0
|
90
104
|
type: :runtime
|
91
105
|
prerelease: false
|
92
106
|
version_requirements: !ruby/object:Gem::Requirement
|
93
107
|
requirements:
|
94
|
-
- - "
|
108
|
+
- - "~>"
|
95
109
|
- !ruby/object:Gem::Version
|
96
|
-
version:
|
110
|
+
version: 0.5.0
|
97
111
|
- !ruby/object:Gem::Dependency
|
98
112
|
name: thread_safe
|
99
113
|
requirement: !ruby/object:Gem::Requirement
|