html2doc 1.6.1 → 1.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/html2doc.gemspec +2 -1
- data/lib/html2doc/base.rb +1 -1
- data/lib/html2doc/math.rb +89 -46
- data/lib/html2doc/version.rb +1 -1
- metadata +20 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 020bfc8d51718ee0fbc78ba2ce57fe1d306a50e93cb41949e98088228190dee5
|
4
|
+
data.tar.gz: 4fbad0486ed1cc59f7b67be1a5c2629ff840063b2bd8f1ffdbe50a7af082cc4e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 45e4f66e7ebc8591620a04e35362761ec48a20b51f735dd9977f05de6698e6cdbca67bb0ddf1615596d2924b7ee336b6e4b196f0dbccb00132e59ec660fab425
|
7
|
+
data.tar.gz: 868e01b43b930657accc0861c917116f78a9342512163803d038a70da8926c401ffe58a486d824a80115283cfc4a327743ab8d04931ce02c720a0f4aef5d8207
|
data/html2doc.gemspec
CHANGED
@@ -28,10 +28,11 @@ Gem::Specification.new do |spec|
|
|
28
28
|
|
29
29
|
spec.add_dependency "htmlentities", "~> 4.3.4"
|
30
30
|
spec.add_dependency "image_size", ">= 3.2.0"
|
31
|
+
spec.add_dependency "metanorma-utils"
|
31
32
|
spec.add_dependency "mime-types"
|
32
33
|
spec.add_dependency "nokogiri", "~> 1.14"
|
33
34
|
spec.add_dependency "plane1converter", "~> 0.0.1"
|
34
|
-
spec.add_dependency "plurimath"
|
35
|
+
spec.add_dependency "plurimath", "~> 0.5.0"
|
35
36
|
spec.add_dependency "thread_safe"
|
36
37
|
spec.add_dependency "uuidtools"
|
37
38
|
|
data/lib/html2doc/base.rb
CHANGED
@@ -141,7 +141,7 @@ class Html2Doc
|
|
141
141
|
docxml.xpath("//*[@id][not(@name)][not(@style = 'mso-element:footnote')]")
|
142
142
|
.each do |x|
|
143
143
|
(x["id"].empty? ||
|
144
|
-
%w(shapetype v:
|
144
|
+
%w(v:shapetype v:shape v:rect v:line v:group).include?(x.name)) and next
|
145
145
|
if x.children.empty? then x.add_child("<a name='#{x['id']}'></a>")
|
146
146
|
else x.children.first.previous = "<a name='#{x['id']}'></a>"
|
147
147
|
end
|
data/lib/html2doc/math.rb
CHANGED
@@ -3,6 +3,20 @@ require "plurimath"
|
|
3
3
|
require "htmlentities"
|
4
4
|
require "nokogiri"
|
5
5
|
require "plane1converter"
|
6
|
+
require "metanorma-utils"
|
7
|
+
|
8
|
+
module Nokogiri
|
9
|
+
module XML
|
10
|
+
class Node
|
11
|
+
OOXML_NS = "http://schemas.openxmlformats.org/officeDocument/2006/math".freeze
|
12
|
+
|
13
|
+
def ooxml_xpath(path)
|
14
|
+
p = Metanorma::Utils::ns(path).gsub("xmlns:", "m:")
|
15
|
+
xpath(p, "m" => OOXML_NS)
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
6
20
|
|
7
21
|
class Html2Doc
|
8
22
|
def progress_conv(idx, step, total, threshold, msg)
|
@@ -24,15 +38,26 @@ class Html2Doc
|
|
24
38
|
|
25
39
|
# random fixes to MathML input that OOXML needs to render properly
|
26
40
|
def ooxml_cleanup(math, docnamespaces)
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
41
|
+
#encode_math(
|
42
|
+
unwrap_accents(
|
43
|
+
mathml_preserve_space(
|
44
|
+
mathml_insert_rows(math, docnamespaces), docnamespaces
|
45
|
+
),
|
46
|
+
)
|
47
|
+
#)
|
32
48
|
math.add_namespace(nil, MATHML_NS)
|
33
49
|
math
|
34
50
|
end
|
35
51
|
|
52
|
+
def encode_math(elem)
|
53
|
+
elem.traverse do |e|
|
54
|
+
e.text? or next
|
55
|
+
e.text.strip.empty? and next
|
56
|
+
e.replace(@c.encode(e.text, :hexadecimal))
|
57
|
+
end
|
58
|
+
elem
|
59
|
+
end
|
60
|
+
|
36
61
|
def mathml_insert_rows(math, docnamespaces)
|
37
62
|
math.xpath(%w(msup msub msubsup munder mover munderover)
|
38
63
|
.map { |m| ".//xmlns:#{m}" }.join(" | "), docnamespaces).each do |x|
|
@@ -52,47 +77,57 @@ class Html2Doc
|
|
52
77
|
|
53
78
|
HTML_NS = 'xmlns="http://www.w3.org/1999/xhtml"'.freeze
|
54
79
|
|
80
|
+
def wrap_text(elem, wrapper)
|
81
|
+
elem.traverse do |e|
|
82
|
+
e.text? or next
|
83
|
+
e.text.strip.empty? and next
|
84
|
+
e.wrap(wrapper)
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
55
88
|
def unitalic(math)
|
56
|
-
math.
|
57
|
-
x
|
89
|
+
math.ooxml_xpath(".//r[rPr[not(m:scr)]/sty[@m:val = 'p']]").each do |x|
|
90
|
+
wrap_text(x, "<span #{HTML_NS} style='font-style:normal;'></span>")
|
58
91
|
end
|
59
|
-
math.
|
60
|
-
x
|
92
|
+
math.ooxml_xpath(".//r[rPr[not(m:scr)]/sty[@m:val = 'bi']]").each do |x|
|
93
|
+
wrap_text(x,
|
94
|
+
"<span #{HTML_NS} class='nostem' style='font-weight:bold;'><em></em></span>")
|
61
95
|
end
|
62
|
-
math.
|
63
|
-
x
|
96
|
+
math.ooxml_xpath(".//r[rPr[not(m:scr)]/sty[@m:val = 'i']]").each do |x|
|
97
|
+
wrap_text(x, "<span #{HTML_NS} class='nostem'><em></em></span>")
|
64
98
|
end
|
65
|
-
math.
|
66
|
-
x
|
99
|
+
math.ooxml_xpath(".//r[rPr[not(m:scr)]/sty[@m:val = 'b']]").each do |x|
|
100
|
+
wrap_text(x,
|
101
|
+
"<span #{HTML_NS} style='font-style:normal;font-weight:bold;'></span>")
|
67
102
|
end
|
68
|
-
math.
|
103
|
+
math.ooxml_xpath(".//r[rPr/scr[@m:val = 'monospace']]").each do |x|
|
69
104
|
to_plane1(x, :monospace)
|
70
105
|
end
|
71
|
-
math.
|
106
|
+
math.ooxml_xpath(".//r[rPr/scr[@m:val = 'double-struck']]").each do |x|
|
72
107
|
to_plane1(x, :doublestruck)
|
73
108
|
end
|
74
|
-
math.
|
109
|
+
math.ooxml_xpath(".//r[rPr[not(m:sty) or m:sty/@m:val = 'p']/scr[@m:val = 'script']]").each do |x|
|
75
110
|
to_plane1(x, :script)
|
76
111
|
end
|
77
|
-
math.
|
112
|
+
math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'b']/scr[@m:val = 'script']]").each do |x|
|
78
113
|
to_plane1(x, :scriptbold)
|
79
114
|
end
|
80
|
-
math.
|
115
|
+
math.ooxml_xpath(".//r[rPr[not(m:sty) or m:sty/@m:val = 'p']/scr[@m:val = 'fraktur']]").each do |x|
|
81
116
|
to_plane1(x, :fraktur)
|
82
117
|
end
|
83
|
-
math.
|
118
|
+
math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'b']/scr[@m:val = 'fraktur']]").each do |x|
|
84
119
|
to_plane1(x, :frakturbold)
|
85
120
|
end
|
86
|
-
math.
|
121
|
+
math.ooxml_xpath(".//r[rPr[not(m:sty) or m:sty/@m:val = 'p']/scr[@m:val = 'sans-serif']]").each do |x|
|
87
122
|
to_plane1(x, :sans)
|
88
123
|
end
|
89
|
-
math.
|
124
|
+
math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'b']/scr[@m:val = 'sans-serif']]").each do |x|
|
90
125
|
to_plane1(x, :sansbold)
|
91
126
|
end
|
92
|
-
math.
|
127
|
+
math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'i']/scr[@m:val = 'sans-serif']]").each do |x|
|
93
128
|
to_plane1(x, :sansitalic)
|
94
129
|
end
|
95
|
-
math.
|
130
|
+
math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'bi']/scr[@m:val = 'sans-serif']]").each do |x|
|
96
131
|
to_plane1(x, :sansbolditalic)
|
97
132
|
end
|
98
133
|
math
|
@@ -121,22 +156,26 @@ class Html2Doc
|
|
121
156
|
# We will end up stripping them out again under Nokogiri 1.11, which correctly
|
122
157
|
# insists on inheriting namespace from parent.
|
123
158
|
def ooml_clean(xml)
|
124
|
-
xml.
|
159
|
+
xml.to_xml(indent: 0)
|
125
160
|
.gsub(/<\?[^>]+>\s*/, "")
|
126
161
|
.gsub(/ xmlns(:[^=]+)?="[^"]+"/, "")
|
127
|
-
|
162
|
+
# .gsub(%r{<(/)?(?!span)(?!em)([a-z])}, "<\\1m:\\2")
|
128
163
|
end
|
129
164
|
|
130
165
|
def mathml_to_ooml1(xml, docnamespaces)
|
131
166
|
doc = Nokogiri::XML::Document::new
|
132
167
|
doc.root = ooxml_cleanup(xml, docnamespaces)
|
133
|
-
ooxml =
|
168
|
+
# ooxml = @xsltemplate.transform(doc)
|
169
|
+
d = xml.parent["block"] != "false" # display_style
|
170
|
+
ooxml = Nokogiri::XML(Plurimath::Math.parse(doc.to_xml(indent: 0),
|
171
|
+
:mathml).to_omml)
|
172
|
+
ooxml = unitalic(accent_tr(ooxml))
|
134
173
|
ooxml = ooml_clean(uncenter(xml, ooxml))
|
135
174
|
xml.swap(ooxml)
|
136
175
|
end
|
137
176
|
|
138
177
|
def accent_tr(xml)
|
139
|
-
xml.
|
178
|
+
xml.ooxml_xpath(".//accPr/chr").each do |x|
|
140
179
|
x["m:val"] &&= accent_tr1(x["m:val"])
|
141
180
|
x["val"] &&= accent_tr1(x["val"])
|
142
181
|
end
|
@@ -152,28 +191,20 @@ class Html2Doc
|
|
152
191
|
end
|
153
192
|
end
|
154
193
|
|
155
|
-
|
156
|
-
# XML indentation
|
157
|
-
def esc_space(xml)
|
158
|
-
xml.traverse do |n|
|
159
|
-
next unless n.text?
|
160
|
-
|
161
|
-
n = n.text.gsub(/ /, "2")
|
162
|
-
end
|
163
|
-
xml
|
164
|
-
end
|
165
|
-
|
166
|
-
OOXML_NS = "http://schemas.microsoft.com/office/2004/12/omml".freeze
|
194
|
+
OOXML_NS = "http://schemas.openxmlformats.org/officeDocument/2006/math".freeze
|
167
195
|
|
168
196
|
def math_only_para?(node)
|
169
197
|
x = node.dup
|
170
198
|
x.xpath(".//m:math", "m" => MATHML_NS).each(&:remove)
|
171
199
|
x.xpath(".//m:oMathPara | .//m:oMath", "m" => OOXML_NS).each(&:remove)
|
200
|
+
x.xpath(".//m:oMathPara | .//m:oMath").each(&:remove)
|
201
|
+
# namespace can go missing during processing
|
172
202
|
x.text.strip.empty?
|
173
203
|
end
|
174
204
|
|
175
|
-
def math_block?(
|
176
|
-
ooxml.name == "oMathPara" || mathml["displaystyle"] == "true"
|
205
|
+
def math_block?(_ooxml, mathml)
|
206
|
+
# ooxml.name == "oMathPara" || mathml["displaystyle"] == "true"
|
207
|
+
mathml["displaystyle"] == "true"
|
177
208
|
end
|
178
209
|
|
179
210
|
STYLE_BEARING_NODE =
|
@@ -184,12 +215,24 @@ class Html2Doc
|
|
184
215
|
# also if ooml has mathPara already, or is in para with only oMath content
|
185
216
|
def uncenter(math, ooxml)
|
186
217
|
alignnode = math.xpath(STYLE_BEARING_NODE).last
|
187
|
-
|
188
|
-
(
|
189
|
-
!alignnode) || !math_only_para?(alignnode) and return ret
|
218
|
+
ooxml.document? and ooxml = ooxml.root
|
219
|
+
ret = uncenter_unneeded(math, ooxml, alignnode) and return ret
|
190
220
|
dir = "left"
|
191
221
|
alignnode["style"]&.include?("text-align:right") and dir = "right"
|
192
|
-
"
|
193
|
-
"m:
|
222
|
+
ooxml.name == "oMathPara" or
|
223
|
+
ooxml.wrap("<m:oMathPara></m:oMathPara>")
|
224
|
+
ooxml.elements.first.previous =
|
225
|
+
"<m:oMathParaPr><m:jc m:val='#{dir}'/></m:oMathParaPr>"
|
226
|
+
ooxml
|
227
|
+
end
|
228
|
+
|
229
|
+
def uncenter_unneeded(math, ooxml, alignnode)
|
230
|
+
(math_block?(ooxml, math) || !alignnode) and return ooxml
|
231
|
+
if !math_only_para?(alignnode)
|
232
|
+
ooxml.name == "oMathPara" and
|
233
|
+
ooxml = ooxml.elements.detect { |x| x.name == "oMath" }
|
234
|
+
return ooxml
|
235
|
+
end
|
236
|
+
nil
|
194
237
|
end
|
195
238
|
end
|
data/lib/html2doc/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2doc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.6.1
|
4
|
+
version: 1.7.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-08
|
11
|
+
date: 2023-09-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: htmlentities
|
@@ -38,6 +38,20 @@ dependencies:
|
|
38
38
|
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: 3.2.0
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: metanorma-utils
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
56
|
name: mime-types
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -84,16 +98,16 @@ dependencies:
|
|
84
98
|
name: plurimath
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|
86
100
|
requirements:
|
87
|
-
- - ">="
|
101
|
+
- - "~>"
|
88
102
|
- !ruby/object:Gem::Version
|
89
|
-
version: '0'
|
103
|
+
version: 0.5.0
|
90
104
|
type: :runtime
|
91
105
|
prerelease: false
|
92
106
|
version_requirements: !ruby/object:Gem::Requirement
|
93
107
|
requirements:
|
94
|
-
- - ">="
|
108
|
+
- - "~>"
|
95
109
|
- !ruby/object:Gem::Version
|
96
|
-
version: '0'
|
110
|
+
version: 0.5.0
|
97
111
|
- !ruby/object:Gem::Dependency
|
98
112
|
name: thread_safe
|
99
113
|
requirement: !ruby/object:Gem::Requirement
|