html2doc 1.6.0 → 1.7.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 47535bf46876ee49a732b6c136f78b58a9ac009f880b95c5a73c8770293f3735
4
- data.tar.gz: a052e0c3ba3ee27ca208b2624d7a832cba67bcc959f1ec5da36f9a7049c26c35
3
+ metadata.gz: 020bfc8d51718ee0fbc78ba2ce57fe1d306a50e93cb41949e98088228190dee5
4
+ data.tar.gz: 4fbad0486ed1cc59f7b67be1a5c2629ff840063b2bd8f1ffdbe50a7af082cc4e
5
5
  SHA512:
6
- metadata.gz: '096dc5a7fe4b35e5afdec632f37b28f9980fcbcba4222e1ec1eb81fe4653a62cc00c5d9b90ed38ae54d4320ea8bf7e0fd0698625045504d371bcb90fb6247a54'
7
- data.tar.gz: 7aeebef3892dc2273bc4ab9899624fc113b1989c3af097204b4f73eb250a7d52e8b3cfe62439e84fb179c1e9bbc96563af669d87f56b15dd82b4fc99953a2227
6
+ metadata.gz: 45e4f66e7ebc8591620a04e35362761ec48a20b51f735dd9977f05de6698e6cdbca67bb0ddf1615596d2924b7ee336b6e4b196f0dbccb00132e59ec660fab425
7
+ data.tar.gz: 868e01b43b930657accc0861c917116f78a9342512163803d038a70da8926c401ffe58a486d824a80115283cfc4a327743ab8d04931ce02c720a0f4aef5d8207
data/html2doc.gemspec CHANGED
@@ -28,10 +28,11 @@ Gem::Specification.new do |spec|
28
28
 
29
29
  spec.add_dependency "htmlentities", "~> 4.3.4"
30
30
  spec.add_dependency "image_size", ">= 3.2.0"
31
+ spec.add_dependency "metanorma-utils"
31
32
  spec.add_dependency "mime-types"
32
33
  spec.add_dependency "nokogiri", "~> 1.14"
33
34
  spec.add_dependency "plane1converter", "~> 0.0.1"
34
- spec.add_dependency "plurimath"
35
+ spec.add_dependency "plurimath", "~> 0.5.0"
35
36
  spec.add_dependency "thread_safe"
36
37
  spec.add_dependency "uuidtools"
37
38
 
data/lib/html2doc/base.rb CHANGED
@@ -141,7 +141,7 @@ class Html2Doc
141
141
  docxml.xpath("//*[@id][not(@name)][not(@style = 'mso-element:footnote')]")
142
142
  .each do |x|
143
143
  (x["id"].empty? ||
144
- %w(shapetype v:shapetype shape v:shape).include?(x.name)) and next
144
+ %w(v:shapetype v:shape v:rect v:line v:group).include?(x.name)) and next
145
145
  if x.children.empty? then x.add_child("<a name='#{x['id']}'></a>")
146
146
  else x.children.first.previous = "<a name='#{x['id']}'></a>"
147
147
  end
data/lib/html2doc/math.rb CHANGED
@@ -3,6 +3,20 @@ require "plurimath"
3
3
  require "htmlentities"
4
4
  require "nokogiri"
5
5
  require "plane1converter"
6
+ require "metanorma-utils"
7
+
8
+ module Nokogiri
9
+ module XML
10
+ class Node
11
+ OOXML_NS = "http://schemas.openxmlformats.org/officeDocument/2006/math".freeze
12
+
13
+ def ooxml_xpath(path)
14
+ p = Metanorma::Utils::ns(path).gsub("xmlns:", "m:")
15
+ xpath(p, "m" => OOXML_NS)
16
+ end
17
+ end
18
+ end
19
+ end
6
20
 
7
21
  class Html2Doc
8
22
  def progress_conv(idx, step, total, threshold, msg)
@@ -20,17 +34,30 @@ class Html2Doc
20
34
  doc
21
35
  end
22
36
 
37
+ MATHML_NS = "http://www.w3.org/1998/Math/MathML".freeze
38
+
23
39
  # random fixes to MathML input that OOXML needs to render properly
24
40
  def ooxml_cleanup(math, docnamespaces)
25
- math = unwrap_accents(
26
- mathml_preserve_space(
27
- mathml_insert_rows(math, docnamespaces), docnamespaces
28
- ),
29
- )
30
- math.add_namespace(nil, "http://www.w3.org/1998/Math/MathML")
41
+ #encode_math(
42
+ unwrap_accents(
43
+ mathml_preserve_space(
44
+ mathml_insert_rows(math, docnamespaces), docnamespaces
45
+ ),
46
+ )
47
+ #)
48
+ math.add_namespace(nil, MATHML_NS)
31
49
  math
32
50
  end
33
51
 
52
+ def encode_math(elem)
53
+ elem.traverse do |e|
54
+ e.text? or next
55
+ e.text.strip.empty? and next
56
+ e.replace(@c.encode(e.text, :hexadecimal))
57
+ end
58
+ elem
59
+ end
60
+
34
61
  def mathml_insert_rows(math, docnamespaces)
35
62
  math.xpath(%w(msup msub msubsup munder mover munderover)
36
63
  .map { |m| ".//xmlns:#{m}" }.join(" | "), docnamespaces).each do |x|
@@ -50,47 +77,57 @@ class Html2Doc
50
77
 
51
78
  HTML_NS = 'xmlns="http://www.w3.org/1999/xhtml"'.freeze
52
79
 
80
+ def wrap_text(elem, wrapper)
81
+ elem.traverse do |e|
82
+ e.text? or next
83
+ e.text.strip.empty? and next
84
+ e.wrap(wrapper)
85
+ end
86
+ end
87
+
53
88
  def unitalic(math)
54
- math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'p']]").each do |x|
55
- x.wrap("<span #{HTML_NS} style='font-style:normal;'></span>")
89
+ math.ooxml_xpath(".//r[rPr[not(m:scr)]/sty[@m:val = 'p']]").each do |x|
90
+ wrap_text(x, "<span #{HTML_NS} style='font-style:normal;'></span>")
56
91
  end
57
- math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'bi']]").each do |x|
58
- x.wrap("<span #{HTML_NS} class='nostem' style='font-weight:bold;'><em></em></span>")
92
+ math.ooxml_xpath(".//r[rPr[not(m:scr)]/sty[@m:val = 'bi']]").each do |x|
93
+ wrap_text(x,
94
+ "<span #{HTML_NS} class='nostem' style='font-weight:bold;'><em></em></span>")
59
95
  end
60
- math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'i']]").each do |x|
61
- x.wrap("<span #{HTML_NS} class='nostem'><em></em></span>")
96
+ math.ooxml_xpath(".//r[rPr[not(m:scr)]/sty[@m:val = 'i']]").each do |x|
97
+ wrap_text(x, "<span #{HTML_NS} class='nostem'><em></em></span>")
62
98
  end
63
- math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'b']]").each do |x|
64
- x.wrap("<span #{HTML_NS} style='font-style:normal;font-weight:bold;'></span>")
99
+ math.ooxml_xpath(".//r[rPr[not(m:scr)]/sty[@m:val = 'b']]").each do |x|
100
+ wrap_text(x,
101
+ "<span #{HTML_NS} style='font-style:normal;font-weight:bold;'></span>")
65
102
  end
66
- math.xpath(".//xmlns:r[xmlns:rPr/xmlns:scr[@m:val = 'monospace']]").each do |x|
103
+ math.ooxml_xpath(".//r[rPr/scr[@m:val = 'monospace']]").each do |x|
67
104
  to_plane1(x, :monospace)
68
105
  end
69
- math.xpath(".//xmlns:r[xmlns:rPr/xmlns:scr[@m:val = 'double-struck']]").each do |x|
106
+ math.ooxml_xpath(".//r[rPr/scr[@m:val = 'double-struck']]").each do |x|
70
107
  to_plane1(x, :doublestruck)
71
108
  end
72
- math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'script']]").each do |x|
109
+ math.ooxml_xpath(".//r[rPr[not(m:sty) or m:sty/@m:val = 'p']/scr[@m:val = 'script']]").each do |x|
73
110
  to_plane1(x, :script)
74
111
  end
75
- math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'script']]").each do |x|
112
+ math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'b']/scr[@m:val = 'script']]").each do |x|
76
113
  to_plane1(x, :scriptbold)
77
114
  end
78
- math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'fraktur']]").each do |x|
115
+ math.ooxml_xpath(".//r[rPr[not(m:sty) or m:sty/@m:val = 'p']/scr[@m:val = 'fraktur']]").each do |x|
79
116
  to_plane1(x, :fraktur)
80
117
  end
81
- math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'fraktur']]").each do |x|
118
+ math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'b']/scr[@m:val = 'fraktur']]").each do |x|
82
119
  to_plane1(x, :frakturbold)
83
120
  end
84
- math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
121
+ math.ooxml_xpath(".//r[rPr[not(m:sty) or m:sty/@m:val = 'p']/scr[@m:val = 'sans-serif']]").each do |x|
85
122
  to_plane1(x, :sans)
86
123
  end
87
- math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
124
+ math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'b']/scr[@m:val = 'sans-serif']]").each do |x|
88
125
  to_plane1(x, :sansbold)
89
126
  end
90
- math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'i']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
127
+ math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'i']/scr[@m:val = 'sans-serif']]").each do |x|
91
128
  to_plane1(x, :sansitalic)
92
129
  end
93
- math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'bi']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
130
+ math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'bi']/scr[@m:val = 'sans-serif']]").each do |x|
94
131
  to_plane1(x, :sansbolditalic)
95
132
  end
96
133
  math
@@ -119,22 +156,26 @@ class Html2Doc
119
156
  # We will end up stripping them out again under Nokogiri 1.11, which correctly
120
157
  # insists on inheriting namespace from parent.
121
158
  def ooml_clean(xml)
122
- xml.to_s
159
+ xml.to_xml(indent: 0)
123
160
  .gsub(/<\?[^>]+>\s*/, "")
124
161
  .gsub(/ xmlns(:[^=]+)?="[^"]+"/, "")
125
- .gsub(%r{<(/)?(?!span)(?!em)([a-z])}, "<\\1m:\\2")
162
+ # .gsub(%r{<(/)?(?!span)(?!em)([a-z])}, "<\\1m:\\2")
126
163
  end
127
164
 
128
165
  def mathml_to_ooml1(xml, docnamespaces)
129
166
  doc = Nokogiri::XML::Document::new
130
167
  doc.root = ooxml_cleanup(xml, docnamespaces)
131
- ooxml = ooml_clean(unitalic(esc_space(accent_tr(@xsltemplate.transform(doc)))))
132
- ooxml = uncenter(xml, ooxml)
168
+ # ooxml = @xsltemplate.transform(doc)
169
+ d = xml.parent["block"] != "false" # display_style
170
+ ooxml = Nokogiri::XML(Plurimath::Math.parse(doc.to_xml(indent: 0),
171
+ :mathml).to_omml)
172
+ ooxml = unitalic(accent_tr(ooxml))
173
+ ooxml = ooml_clean(uncenter(xml, ooxml))
133
174
  xml.swap(ooxml)
134
175
  end
135
176
 
136
177
  def accent_tr(xml)
137
- xml.xpath(".//*[local-name()='accPr']/*[local-name()='chr']").each do |x|
178
+ xml.ooxml_xpath(".//accPr/chr").each do |x|
138
179
  x["m:val"] &&= accent_tr1(x["m:val"])
139
180
  x["val"] &&= accent_tr1(x["val"])
140
181
  end
@@ -150,30 +191,48 @@ class Html2Doc
150
191
  end
151
192
  end
152
193
 
153
- # escape space as &#x32;; we are removing any spaces generated by
154
- # XML indentation
155
- def esc_space(xml)
156
- xml.traverse do |n|
157
- next unless n.text?
194
+ OOXML_NS = "http://schemas.openxmlformats.org/officeDocument/2006/math".freeze
158
195
 
159
- n = n.text.gsub(/ /, "&#x32;")
160
- end
161
- xml
196
+ def math_only_para?(node)
197
+ x = node.dup
198
+ x.xpath(".//m:math", "m" => MATHML_NS).each(&:remove)
199
+ x.xpath(".//m:oMathPara | .//m:oMath", "m" => OOXML_NS).each(&:remove)
200
+ x.xpath(".//m:oMathPara | .//m:oMath").each(&:remove)
201
+ # namespace can go missing during processing
202
+ x.text.strip.empty?
203
+ end
204
+
205
+ def math_block?(_ooxml, mathml)
206
+ # ooxml.name == "oMathPara" || mathml["displaystyle"] == "true"
207
+ mathml["displaystyle"] == "true"
162
208
  end
163
209
 
210
+ STYLE_BEARING_NODE =
211
+ %w(p div td th li).map { |x| ".//ancestor::#{x}" }.join(" | ").freeze
212
+
164
213
  # if oomml has no siblings, by default it is centered; override this with
165
214
  # left/right if parent is so tagged
215
+ # also if ooml has mathPara already, or is in para with only oMath content
166
216
  def uncenter(math, ooxml)
167
- alignnode = math.at(".//ancestor::*[@style][local-name() = 'p' or "\
168
- "local-name() = 'div' or local-name() = 'td']/@style")
169
- return ooxml unless alignnode && (math.next == nil && math.previous == nil)
170
-
171
- %w(left right).each do |dir|
172
- if alignnode.text.include? ("text-align:#{dir}")
173
- ooxml = "<m:oMathPara><m:oMathParaPr><m:jc "\
174
- "m:val='#{dir}'/></m:oMathParaPr>#{ooxml}</m:oMathPara>"
175
- end
176
- end
217
+ alignnode = math.xpath(STYLE_BEARING_NODE).last
218
+ ooxml.document? and ooxml = ooxml.root
219
+ ret = uncenter_unneeded(math, ooxml, alignnode) and return ret
220
+ dir = "left"
221
+ alignnode["style"]&.include?("text-align:right") and dir = "right"
222
+ ooxml.name == "oMathPara" or
223
+ ooxml.wrap("<m:oMathPara></m:oMathPara>")
224
+ ooxml.elements.first.previous =
225
+ "<m:oMathParaPr><m:jc m:val='#{dir}'/></m:oMathParaPr>"
177
226
  ooxml
178
227
  end
228
+
229
+ def uncenter_unneeded(math, ooxml, alignnode)
230
+ (math_block?(ooxml, math) || !alignnode) and return ooxml
231
+ if !math_only_para?(alignnode)
232
+ ooxml.name == "oMathPara" and
233
+ ooxml = ooxml.elements.detect { |x| x.name == "oMath" }
234
+ return ooxml
235
+ end
236
+ nil
237
+ end
179
238
  end
@@ -1,3 +1,3 @@
1
1
  class Html2Doc
2
- VERSION = "1.6.0".freeze
2
+ VERSION = "1.7.0".freeze
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2doc
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.6.0
4
+ version: 1.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-08-07 00:00:00.000000000 Z
11
+ date: 2023-09-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: htmlentities
@@ -38,6 +38,20 @@ dependencies:
38
38
  - - ">="
39
39
  - !ruby/object:Gem::Version
40
40
  version: 3.2.0
41
+ - !ruby/object:Gem::Dependency
42
+ name: metanorma-utils
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
41
55
  - !ruby/object:Gem::Dependency
42
56
  name: mime-types
43
57
  requirement: !ruby/object:Gem::Requirement
@@ -84,16 +98,16 @@ dependencies:
84
98
  name: plurimath
85
99
  requirement: !ruby/object:Gem::Requirement
86
100
  requirements:
87
- - - ">="
101
+ - - "~>"
88
102
  - !ruby/object:Gem::Version
89
- version: '0'
103
+ version: 0.5.0
90
104
  type: :runtime
91
105
  prerelease: false
92
106
  version_requirements: !ruby/object:Gem::Requirement
93
107
  requirements:
94
- - - ">="
108
+ - - "~>"
95
109
  - !ruby/object:Gem::Version
96
- version: '0'
110
+ version: 0.5.0
97
111
  - !ruby/object:Gem::Dependency
98
112
  name: thread_safe
99
113
  requirement: !ruby/object:Gem::Requirement