html2doc 1.6.1 → 1.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/html2doc.gemspec +2 -1
- data/lib/html2doc/base.rb +1 -1
- data/lib/html2doc/math.rb +89 -46
- data/lib/html2doc/version.rb +1 -1
- metadata +20 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 020bfc8d51718ee0fbc78ba2ce57fe1d306a50e93cb41949e98088228190dee5
|
4
|
+
data.tar.gz: 4fbad0486ed1cc59f7b67be1a5c2629ff840063b2bd8f1ffdbe50a7af082cc4e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 45e4f66e7ebc8591620a04e35362761ec48a20b51f735dd9977f05de6698e6cdbca67bb0ddf1615596d2924b7ee336b6e4b196f0dbccb00132e59ec660fab425
|
7
|
+
data.tar.gz: 868e01b43b930657accc0861c917116f78a9342512163803d038a70da8926c401ffe58a486d824a80115283cfc4a327743ab8d04931ce02c720a0f4aef5d8207
|
data/html2doc.gemspec
CHANGED
@@ -28,10 +28,11 @@ Gem::Specification.new do |spec|
|
|
28
28
|
|
29
29
|
spec.add_dependency "htmlentities", "~> 4.3.4"
|
30
30
|
spec.add_dependency "image_size", ">= 3.2.0"
|
31
|
+
spec.add_dependency "metanorma-utils"
|
31
32
|
spec.add_dependency "mime-types"
|
32
33
|
spec.add_dependency "nokogiri", "~> 1.14"
|
33
34
|
spec.add_dependency "plane1converter", "~> 0.0.1"
|
34
|
-
spec.add_dependency "plurimath"
|
35
|
+
spec.add_dependency "plurimath", "~> 0.5.0"
|
35
36
|
spec.add_dependency "thread_safe"
|
36
37
|
spec.add_dependency "uuidtools"
|
37
38
|
|
data/lib/html2doc/base.rb
CHANGED
@@ -141,7 +141,7 @@ class Html2Doc
|
|
141
141
|
docxml.xpath("//*[@id][not(@name)][not(@style = 'mso-element:footnote')]")
|
142
142
|
.each do |x|
|
143
143
|
(x["id"].empty? ||
|
144
|
-
%w(shapetype v:
|
144
|
+
%w(v:shapetype v:shape v:rect v:line v:group).include?(x.name)) and next
|
145
145
|
if x.children.empty? then x.add_child("<a name='#{x['id']}'></a>")
|
146
146
|
else x.children.first.previous = "<a name='#{x['id']}'></a>"
|
147
147
|
end
|
data/lib/html2doc/math.rb
CHANGED
@@ -3,6 +3,20 @@ require "plurimath"
|
|
3
3
|
require "htmlentities"
|
4
4
|
require "nokogiri"
|
5
5
|
require "plane1converter"
|
6
|
+
require "metanorma-utils"
|
7
|
+
|
8
|
+
module Nokogiri
|
9
|
+
module XML
|
10
|
+
class Node
|
11
|
+
OOXML_NS = "http://schemas.openxmlformats.org/officeDocument/2006/math".freeze
|
12
|
+
|
13
|
+
def ooxml_xpath(path)
|
14
|
+
p = Metanorma::Utils::ns(path).gsub("xmlns:", "m:")
|
15
|
+
xpath(p, "m" => OOXML_NS)
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
6
20
|
|
7
21
|
class Html2Doc
|
8
22
|
def progress_conv(idx, step, total, threshold, msg)
|
@@ -24,15 +38,26 @@ class Html2Doc
|
|
24
38
|
|
25
39
|
# random fixes to MathML input that OOXML needs to render properly
|
26
40
|
def ooxml_cleanup(math, docnamespaces)
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
41
|
+
#encode_math(
|
42
|
+
unwrap_accents(
|
43
|
+
mathml_preserve_space(
|
44
|
+
mathml_insert_rows(math, docnamespaces), docnamespaces
|
45
|
+
),
|
46
|
+
)
|
47
|
+
#)
|
32
48
|
math.add_namespace(nil, MATHML_NS)
|
33
49
|
math
|
34
50
|
end
|
35
51
|
|
52
|
+
def encode_math(elem)
|
53
|
+
elem.traverse do |e|
|
54
|
+
e.text? or next
|
55
|
+
e.text.strip.empty? and next
|
56
|
+
e.replace(@c.encode(e.text, :hexadecimal))
|
57
|
+
end
|
58
|
+
elem
|
59
|
+
end
|
60
|
+
|
36
61
|
def mathml_insert_rows(math, docnamespaces)
|
37
62
|
math.xpath(%w(msup msub msubsup munder mover munderover)
|
38
63
|
.map { |m| ".//xmlns:#{m}" }.join(" | "), docnamespaces).each do |x|
|
@@ -52,47 +77,57 @@ class Html2Doc
|
|
52
77
|
|
53
78
|
HTML_NS = 'xmlns="http://www.w3.org/1999/xhtml"'.freeze
|
54
79
|
|
80
|
+
def wrap_text(elem, wrapper)
|
81
|
+
elem.traverse do |e|
|
82
|
+
e.text? or next
|
83
|
+
e.text.strip.empty? and next
|
84
|
+
e.wrap(wrapper)
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
55
88
|
def unitalic(math)
|
56
|
-
math.
|
57
|
-
x
|
89
|
+
math.ooxml_xpath(".//r[rPr[not(m:scr)]/sty[@m:val = 'p']]").each do |x|
|
90
|
+
wrap_text(x, "<span #{HTML_NS} style='font-style:normal;'></span>")
|
58
91
|
end
|
59
|
-
math.
|
60
|
-
x
|
92
|
+
math.ooxml_xpath(".//r[rPr[not(m:scr)]/sty[@m:val = 'bi']]").each do |x|
|
93
|
+
wrap_text(x,
|
94
|
+
"<span #{HTML_NS} class='nostem' style='font-weight:bold;'><em></em></span>")
|
61
95
|
end
|
62
|
-
math.
|
63
|
-
x
|
96
|
+
math.ooxml_xpath(".//r[rPr[not(m:scr)]/sty[@m:val = 'i']]").each do |x|
|
97
|
+
wrap_text(x, "<span #{HTML_NS} class='nostem'><em></em></span>")
|
64
98
|
end
|
65
|
-
math.
|
66
|
-
x
|
99
|
+
math.ooxml_xpath(".//r[rPr[not(m:scr)]/sty[@m:val = 'b']]").each do |x|
|
100
|
+
wrap_text(x,
|
101
|
+
"<span #{HTML_NS} style='font-style:normal;font-weight:bold;'></span>")
|
67
102
|
end
|
68
|
-
math.
|
103
|
+
math.ooxml_xpath(".//r[rPr/scr[@m:val = 'monospace']]").each do |x|
|
69
104
|
to_plane1(x, :monospace)
|
70
105
|
end
|
71
|
-
math.
|
106
|
+
math.ooxml_xpath(".//r[rPr/scr[@m:val = 'double-struck']]").each do |x|
|
72
107
|
to_plane1(x, :doublestruck)
|
73
108
|
end
|
74
|
-
math.
|
109
|
+
math.ooxml_xpath(".//r[rPr[not(m:sty) or m:sty/@m:val = 'p']/scr[@m:val = 'script']]").each do |x|
|
75
110
|
to_plane1(x, :script)
|
76
111
|
end
|
77
|
-
math.
|
112
|
+
math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'b']/scr[@m:val = 'script']]").each do |x|
|
78
113
|
to_plane1(x, :scriptbold)
|
79
114
|
end
|
80
|
-
math.
|
115
|
+
math.ooxml_xpath(".//r[rPr[not(m:sty) or m:sty/@m:val = 'p']/scr[@m:val = 'fraktur']]").each do |x|
|
81
116
|
to_plane1(x, :fraktur)
|
82
117
|
end
|
83
|
-
math.
|
118
|
+
math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'b']/scr[@m:val = 'fraktur']]").each do |x|
|
84
119
|
to_plane1(x, :frakturbold)
|
85
120
|
end
|
86
|
-
math.
|
121
|
+
math.ooxml_xpath(".//r[rPr[not(m:sty) or m:sty/@m:val = 'p']/scr[@m:val = 'sans-serif']]").each do |x|
|
87
122
|
to_plane1(x, :sans)
|
88
123
|
end
|
89
|
-
math.
|
124
|
+
math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'b']/scr[@m:val = 'sans-serif']]").each do |x|
|
90
125
|
to_plane1(x, :sansbold)
|
91
126
|
end
|
92
|
-
math.
|
127
|
+
math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'i']/scr[@m:val = 'sans-serif']]").each do |x|
|
93
128
|
to_plane1(x, :sansitalic)
|
94
129
|
end
|
95
|
-
math.
|
130
|
+
math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'bi']/scr[@m:val = 'sans-serif']]").each do |x|
|
96
131
|
to_plane1(x, :sansbolditalic)
|
97
132
|
end
|
98
133
|
math
|
@@ -121,22 +156,26 @@ class Html2Doc
|
|
121
156
|
# We will end up stripping them out again under Nokogiri 1.11, which correctly
|
122
157
|
# insists on inheriting namespace from parent.
|
123
158
|
def ooml_clean(xml)
|
124
|
-
xml.
|
159
|
+
xml.to_xml(indent: 0)
|
125
160
|
.gsub(/<\?[^>]+>\s*/, "")
|
126
161
|
.gsub(/ xmlns(:[^=]+)?="[^"]+"/, "")
|
127
|
-
|
162
|
+
# .gsub(%r{<(/)?(?!span)(?!em)([a-z])}, "<\\1m:\\2")
|
128
163
|
end
|
129
164
|
|
130
165
|
def mathml_to_ooml1(xml, docnamespaces)
|
131
166
|
doc = Nokogiri::XML::Document::new
|
132
167
|
doc.root = ooxml_cleanup(xml, docnamespaces)
|
133
|
-
ooxml =
|
168
|
+
# ooxml = @xsltemplate.transform(doc)
|
169
|
+
d = xml.parent["block"] != "false" # display_style
|
170
|
+
ooxml = Nokogiri::XML(Plurimath::Math.parse(doc.to_xml(indent: 0),
|
171
|
+
:mathml).to_omml)
|
172
|
+
ooxml = unitalic(accent_tr(ooxml))
|
134
173
|
ooxml = ooml_clean(uncenter(xml, ooxml))
|
135
174
|
xml.swap(ooxml)
|
136
175
|
end
|
137
176
|
|
138
177
|
def accent_tr(xml)
|
139
|
-
xml.
|
178
|
+
xml.ooxml_xpath(".//accPr/chr").each do |x|
|
140
179
|
x["m:val"] &&= accent_tr1(x["m:val"])
|
141
180
|
x["val"] &&= accent_tr1(x["val"])
|
142
181
|
end
|
@@ -152,28 +191,20 @@ class Html2Doc
|
|
152
191
|
end
|
153
192
|
end
|
154
193
|
|
155
|
-
|
156
|
-
# XML indentation
|
157
|
-
def esc_space(xml)
|
158
|
-
xml.traverse do |n|
|
159
|
-
next unless n.text?
|
160
|
-
|
161
|
-
n = n.text.gsub(/ /, "2")
|
162
|
-
end
|
163
|
-
xml
|
164
|
-
end
|
165
|
-
|
166
|
-
OOXML_NS = "http://schemas.microsoft.com/office/2004/12/omml".freeze
|
194
|
+
OOXML_NS = "http://schemas.openxmlformats.org/officeDocument/2006/math".freeze
|
167
195
|
|
168
196
|
def math_only_para?(node)
|
169
197
|
x = node.dup
|
170
198
|
x.xpath(".//m:math", "m" => MATHML_NS).each(&:remove)
|
171
199
|
x.xpath(".//m:oMathPara | .//m:oMath", "m" => OOXML_NS).each(&:remove)
|
200
|
+
x.xpath(".//m:oMathPara | .//m:oMath").each(&:remove)
|
201
|
+
# namespace can go missing during processing
|
172
202
|
x.text.strip.empty?
|
173
203
|
end
|
174
204
|
|
175
|
-
def math_block?(
|
176
|
-
ooxml.name == "oMathPara" || mathml["displaystyle"] == "true"
|
205
|
+
def math_block?(_ooxml, mathml)
|
206
|
+
# ooxml.name == "oMathPara" || mathml["displaystyle"] == "true"
|
207
|
+
mathml["displaystyle"] == "true"
|
177
208
|
end
|
178
209
|
|
179
210
|
STYLE_BEARING_NODE =
|
@@ -184,12 +215,24 @@ class Html2Doc
|
|
184
215
|
# also if ooml has mathPara already, or is in para with only oMath content
|
185
216
|
def uncenter(math, ooxml)
|
186
217
|
alignnode = math.xpath(STYLE_BEARING_NODE).last
|
187
|
-
|
188
|
-
(
|
189
|
-
!alignnode) || !math_only_para?(alignnode) and return ret
|
218
|
+
ooxml.document? and ooxml = ooxml.root
|
219
|
+
ret = uncenter_unneeded(math, ooxml, alignnode) and return ret
|
190
220
|
dir = "left"
|
191
221
|
alignnode["style"]&.include?("text-align:right") and dir = "right"
|
192
|
-
"
|
193
|
-
"m:
|
222
|
+
ooxml.name == "oMathPara" or
|
223
|
+
ooxml.wrap("<m:oMathPara></m:oMathPara>")
|
224
|
+
ooxml.elements.first.previous =
|
225
|
+
"<m:oMathParaPr><m:jc m:val='#{dir}'/></m:oMathParaPr>"
|
226
|
+
ooxml
|
227
|
+
end
|
228
|
+
|
229
|
+
def uncenter_unneeded(math, ooxml, alignnode)
|
230
|
+
(math_block?(ooxml, math) || !alignnode) and return ooxml
|
231
|
+
if !math_only_para?(alignnode)
|
232
|
+
ooxml.name == "oMathPara" and
|
233
|
+
ooxml = ooxml.elements.detect { |x| x.name == "oMath" }
|
234
|
+
return ooxml
|
235
|
+
end
|
236
|
+
nil
|
194
237
|
end
|
195
238
|
end
|
data/lib/html2doc/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2doc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.7.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-08
|
11
|
+
date: 2023-09-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: htmlentities
|
@@ -38,6 +38,20 @@ dependencies:
|
|
38
38
|
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: 3.2.0
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: metanorma-utils
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
56
|
name: mime-types
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -84,16 +98,16 @@ dependencies:
|
|
84
98
|
name: plurimath
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|
86
100
|
requirements:
|
87
|
-
- - "
|
101
|
+
- - "~>"
|
88
102
|
- !ruby/object:Gem::Version
|
89
|
-
version:
|
103
|
+
version: 0.5.0
|
90
104
|
type: :runtime
|
91
105
|
prerelease: false
|
92
106
|
version_requirements: !ruby/object:Gem::Requirement
|
93
107
|
requirements:
|
94
|
-
- - "
|
108
|
+
- - "~>"
|
95
109
|
- !ruby/object:Gem::Version
|
96
|
-
version:
|
110
|
+
version: 0.5.0
|
97
111
|
- !ruby/object:Gem::Dependency
|
98
112
|
name: thread_safe
|
99
113
|
requirement: !ruby/object:Gem::Requirement
|