html2doc 1.6.0 → 1.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/html2doc.gemspec +2 -1
- data/lib/html2doc/base.rb +1 -1
- data/lib/html2doc/math.rb +106 -47
- data/lib/html2doc/version.rb +1 -1
- metadata +20 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 020bfc8d51718ee0fbc78ba2ce57fe1d306a50e93cb41949e98088228190dee5
|
4
|
+
data.tar.gz: 4fbad0486ed1cc59f7b67be1a5c2629ff840063b2bd8f1ffdbe50a7af082cc4e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 45e4f66e7ebc8591620a04e35362761ec48a20b51f735dd9977f05de6698e6cdbca67bb0ddf1615596d2924b7ee336b6e4b196f0dbccb00132e59ec660fab425
|
7
|
+
data.tar.gz: 868e01b43b930657accc0861c917116f78a9342512163803d038a70da8926c401ffe58a486d824a80115283cfc4a327743ab8d04931ce02c720a0f4aef5d8207
|
data/html2doc.gemspec
CHANGED
@@ -28,10 +28,11 @@ Gem::Specification.new do |spec|
|
|
28
28
|
|
29
29
|
spec.add_dependency "htmlentities", "~> 4.3.4"
|
30
30
|
spec.add_dependency "image_size", ">= 3.2.0"
|
31
|
+
spec.add_dependency "metanorma-utils"
|
31
32
|
spec.add_dependency "mime-types"
|
32
33
|
spec.add_dependency "nokogiri", "~> 1.14"
|
33
34
|
spec.add_dependency "plane1converter", "~> 0.0.1"
|
34
|
-
spec.add_dependency "plurimath"
|
35
|
+
spec.add_dependency "plurimath", "~> 0.5.0"
|
35
36
|
spec.add_dependency "thread_safe"
|
36
37
|
spec.add_dependency "uuidtools"
|
37
38
|
|
data/lib/html2doc/base.rb
CHANGED
@@ -141,7 +141,7 @@ class Html2Doc
|
|
141
141
|
docxml.xpath("//*[@id][not(@name)][not(@style = 'mso-element:footnote')]")
|
142
142
|
.each do |x|
|
143
143
|
(x["id"].empty? ||
|
144
|
-
%w(shapetype v:
|
144
|
+
%w(v:shapetype v:shape v:rect v:line v:group).include?(x.name)) and next
|
145
145
|
if x.children.empty? then x.add_child("<a name='#{x['id']}'></a>")
|
146
146
|
else x.children.first.previous = "<a name='#{x['id']}'></a>"
|
147
147
|
end
|
data/lib/html2doc/math.rb
CHANGED
@@ -3,6 +3,20 @@ require "plurimath"
|
|
3
3
|
require "htmlentities"
|
4
4
|
require "nokogiri"
|
5
5
|
require "plane1converter"
|
6
|
+
require "metanorma-utils"
|
7
|
+
|
8
|
+
module Nokogiri
|
9
|
+
module XML
|
10
|
+
class Node
|
11
|
+
OOXML_NS = "http://schemas.openxmlformats.org/officeDocument/2006/math".freeze
|
12
|
+
|
13
|
+
def ooxml_xpath(path)
|
14
|
+
p = Metanorma::Utils::ns(path).gsub("xmlns:", "m:")
|
15
|
+
xpath(p, "m" => OOXML_NS)
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
6
20
|
|
7
21
|
class Html2Doc
|
8
22
|
def progress_conv(idx, step, total, threshold, msg)
|
@@ -20,17 +34,30 @@ class Html2Doc
|
|
20
34
|
doc
|
21
35
|
end
|
22
36
|
|
37
|
+
MATHML_NS = "http://www.w3.org/1998/Math/MathML".freeze
|
38
|
+
|
23
39
|
# random fixes to MathML input that OOXML needs to render properly
|
24
40
|
def ooxml_cleanup(math, docnamespaces)
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
41
|
+
#encode_math(
|
42
|
+
unwrap_accents(
|
43
|
+
mathml_preserve_space(
|
44
|
+
mathml_insert_rows(math, docnamespaces), docnamespaces
|
45
|
+
),
|
46
|
+
)
|
47
|
+
#)
|
48
|
+
math.add_namespace(nil, MATHML_NS)
|
31
49
|
math
|
32
50
|
end
|
33
51
|
|
52
|
+
def encode_math(elem)
|
53
|
+
elem.traverse do |e|
|
54
|
+
e.text? or next
|
55
|
+
e.text.strip.empty? and next
|
56
|
+
e.replace(@c.encode(e.text, :hexadecimal))
|
57
|
+
end
|
58
|
+
elem
|
59
|
+
end
|
60
|
+
|
34
61
|
def mathml_insert_rows(math, docnamespaces)
|
35
62
|
math.xpath(%w(msup msub msubsup munder mover munderover)
|
36
63
|
.map { |m| ".//xmlns:#{m}" }.join(" | "), docnamespaces).each do |x|
|
@@ -50,47 +77,57 @@ class Html2Doc
|
|
50
77
|
|
51
78
|
HTML_NS = 'xmlns="http://www.w3.org/1999/xhtml"'.freeze
|
52
79
|
|
80
|
+
def wrap_text(elem, wrapper)
|
81
|
+
elem.traverse do |e|
|
82
|
+
e.text? or next
|
83
|
+
e.text.strip.empty? and next
|
84
|
+
e.wrap(wrapper)
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
53
88
|
def unitalic(math)
|
54
|
-
math.
|
55
|
-
x
|
89
|
+
math.ooxml_xpath(".//r[rPr[not(m:scr)]/sty[@m:val = 'p']]").each do |x|
|
90
|
+
wrap_text(x, "<span #{HTML_NS} style='font-style:normal;'></span>")
|
56
91
|
end
|
57
|
-
math.
|
58
|
-
x
|
92
|
+
math.ooxml_xpath(".//r[rPr[not(m:scr)]/sty[@m:val = 'bi']]").each do |x|
|
93
|
+
wrap_text(x,
|
94
|
+
"<span #{HTML_NS} class='nostem' style='font-weight:bold;'><em></em></span>")
|
59
95
|
end
|
60
|
-
math.
|
61
|
-
x
|
96
|
+
math.ooxml_xpath(".//r[rPr[not(m:scr)]/sty[@m:val = 'i']]").each do |x|
|
97
|
+
wrap_text(x, "<span #{HTML_NS} class='nostem'><em></em></span>")
|
62
98
|
end
|
63
|
-
math.
|
64
|
-
x
|
99
|
+
math.ooxml_xpath(".//r[rPr[not(m:scr)]/sty[@m:val = 'b']]").each do |x|
|
100
|
+
wrap_text(x,
|
101
|
+
"<span #{HTML_NS} style='font-style:normal;font-weight:bold;'></span>")
|
65
102
|
end
|
66
|
-
math.
|
103
|
+
math.ooxml_xpath(".//r[rPr/scr[@m:val = 'monospace']]").each do |x|
|
67
104
|
to_plane1(x, :monospace)
|
68
105
|
end
|
69
|
-
math.
|
106
|
+
math.ooxml_xpath(".//r[rPr/scr[@m:val = 'double-struck']]").each do |x|
|
70
107
|
to_plane1(x, :doublestruck)
|
71
108
|
end
|
72
|
-
math.
|
109
|
+
math.ooxml_xpath(".//r[rPr[not(m:sty) or m:sty/@m:val = 'p']/scr[@m:val = 'script']]").each do |x|
|
73
110
|
to_plane1(x, :script)
|
74
111
|
end
|
75
|
-
math.
|
112
|
+
math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'b']/scr[@m:val = 'script']]").each do |x|
|
76
113
|
to_plane1(x, :scriptbold)
|
77
114
|
end
|
78
|
-
math.
|
115
|
+
math.ooxml_xpath(".//r[rPr[not(m:sty) or m:sty/@m:val = 'p']/scr[@m:val = 'fraktur']]").each do |x|
|
79
116
|
to_plane1(x, :fraktur)
|
80
117
|
end
|
81
|
-
math.
|
118
|
+
math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'b']/scr[@m:val = 'fraktur']]").each do |x|
|
82
119
|
to_plane1(x, :frakturbold)
|
83
120
|
end
|
84
|
-
math.
|
121
|
+
math.ooxml_xpath(".//r[rPr[not(m:sty) or m:sty/@m:val = 'p']/scr[@m:val = 'sans-serif']]").each do |x|
|
85
122
|
to_plane1(x, :sans)
|
86
123
|
end
|
87
|
-
math.
|
124
|
+
math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'b']/scr[@m:val = 'sans-serif']]").each do |x|
|
88
125
|
to_plane1(x, :sansbold)
|
89
126
|
end
|
90
|
-
math.
|
127
|
+
math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'i']/scr[@m:val = 'sans-serif']]").each do |x|
|
91
128
|
to_plane1(x, :sansitalic)
|
92
129
|
end
|
93
|
-
math.
|
130
|
+
math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'bi']/scr[@m:val = 'sans-serif']]").each do |x|
|
94
131
|
to_plane1(x, :sansbolditalic)
|
95
132
|
end
|
96
133
|
math
|
@@ -119,22 +156,26 @@ class Html2Doc
|
|
119
156
|
# We will end up stripping them out again under Nokogiri 1.11, which correctly
|
120
157
|
# insists on inheriting namespace from parent.
|
121
158
|
def ooml_clean(xml)
|
122
|
-
xml.
|
159
|
+
xml.to_xml(indent: 0)
|
123
160
|
.gsub(/<\?[^>]+>\s*/, "")
|
124
161
|
.gsub(/ xmlns(:[^=]+)?="[^"]+"/, "")
|
125
|
-
|
162
|
+
# .gsub(%r{<(/)?(?!span)(?!em)([a-z])}, "<\\1m:\\2")
|
126
163
|
end
|
127
164
|
|
128
165
|
def mathml_to_ooml1(xml, docnamespaces)
|
129
166
|
doc = Nokogiri::XML::Document::new
|
130
167
|
doc.root = ooxml_cleanup(xml, docnamespaces)
|
131
|
-
ooxml =
|
132
|
-
|
168
|
+
# ooxml = @xsltemplate.transform(doc)
|
169
|
+
d = xml.parent["block"] != "false" # display_style
|
170
|
+
ooxml = Nokogiri::XML(Plurimath::Math.parse(doc.to_xml(indent: 0),
|
171
|
+
:mathml).to_omml)
|
172
|
+
ooxml = unitalic(accent_tr(ooxml))
|
173
|
+
ooxml = ooml_clean(uncenter(xml, ooxml))
|
133
174
|
xml.swap(ooxml)
|
134
175
|
end
|
135
176
|
|
136
177
|
def accent_tr(xml)
|
137
|
-
xml.
|
178
|
+
xml.ooxml_xpath(".//accPr/chr").each do |x|
|
138
179
|
x["m:val"] &&= accent_tr1(x["m:val"])
|
139
180
|
x["val"] &&= accent_tr1(x["val"])
|
140
181
|
end
|
@@ -150,30 +191,48 @@ class Html2Doc
|
|
150
191
|
end
|
151
192
|
end
|
152
193
|
|
153
|
-
|
154
|
-
# XML indentation
|
155
|
-
def esc_space(xml)
|
156
|
-
xml.traverse do |n|
|
157
|
-
next unless n.text?
|
194
|
+
OOXML_NS = "http://schemas.openxmlformats.org/officeDocument/2006/math".freeze
|
158
195
|
|
159
|
-
|
160
|
-
|
161
|
-
|
196
|
+
def math_only_para?(node)
|
197
|
+
x = node.dup
|
198
|
+
x.xpath(".//m:math", "m" => MATHML_NS).each(&:remove)
|
199
|
+
x.xpath(".//m:oMathPara | .//m:oMath", "m" => OOXML_NS).each(&:remove)
|
200
|
+
x.xpath(".//m:oMathPara | .//m:oMath").each(&:remove)
|
201
|
+
# namespace can go missing during processing
|
202
|
+
x.text.strip.empty?
|
203
|
+
end
|
204
|
+
|
205
|
+
def math_block?(_ooxml, mathml)
|
206
|
+
# ooxml.name == "oMathPara" || mathml["displaystyle"] == "true"
|
207
|
+
mathml["displaystyle"] == "true"
|
162
208
|
end
|
163
209
|
|
210
|
+
STYLE_BEARING_NODE =
|
211
|
+
%w(p div td th li).map { |x| ".//ancestor::#{x}" }.join(" | ").freeze
|
212
|
+
|
164
213
|
# if oomml has no siblings, by default it is centered; override this with
|
165
214
|
# left/right if parent is so tagged
|
215
|
+
# also if ooml has mathPara already, or is in para with only oMath content
|
166
216
|
def uncenter(math, ooxml)
|
167
|
-
alignnode = math.
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
end
|
217
|
+
alignnode = math.xpath(STYLE_BEARING_NODE).last
|
218
|
+
ooxml.document? and ooxml = ooxml.root
|
219
|
+
ret = uncenter_unneeded(math, ooxml, alignnode) and return ret
|
220
|
+
dir = "left"
|
221
|
+
alignnode["style"]&.include?("text-align:right") and dir = "right"
|
222
|
+
ooxml.name == "oMathPara" or
|
223
|
+
ooxml.wrap("<m:oMathPara></m:oMathPara>")
|
224
|
+
ooxml.elements.first.previous =
|
225
|
+
"<m:oMathParaPr><m:jc m:val='#{dir}'/></m:oMathParaPr>"
|
177
226
|
ooxml
|
178
227
|
end
|
228
|
+
|
229
|
+
def uncenter_unneeded(math, ooxml, alignnode)
|
230
|
+
(math_block?(ooxml, math) || !alignnode) and return ooxml
|
231
|
+
if !math_only_para?(alignnode)
|
232
|
+
ooxml.name == "oMathPara" and
|
233
|
+
ooxml = ooxml.elements.detect { |x| x.name == "oMath" }
|
234
|
+
return ooxml
|
235
|
+
end
|
236
|
+
nil
|
237
|
+
end
|
179
238
|
end
|
data/lib/html2doc/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2doc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.7.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-08
|
11
|
+
date: 2023-09-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: htmlentities
|
@@ -38,6 +38,20 @@ dependencies:
|
|
38
38
|
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: 3.2.0
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: metanorma-utils
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
56
|
name: mime-types
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -84,16 +98,16 @@ dependencies:
|
|
84
98
|
name: plurimath
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|
86
100
|
requirements:
|
87
|
-
- - "
|
101
|
+
- - "~>"
|
88
102
|
- !ruby/object:Gem::Version
|
89
|
-
version:
|
103
|
+
version: 0.5.0
|
90
104
|
type: :runtime
|
91
105
|
prerelease: false
|
92
106
|
version_requirements: !ruby/object:Gem::Requirement
|
93
107
|
requirements:
|
94
|
-
- - "
|
108
|
+
- - "~>"
|
95
109
|
- !ruby/object:Gem::Version
|
96
|
-
version:
|
110
|
+
version: 0.5.0
|
97
111
|
- !ruby/object:Gem::Dependency
|
98
112
|
name: thread_safe
|
99
113
|
requirement: !ruby/object:Gem::Requirement
|