html2doc 1.6.0 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 47535bf46876ee49a732b6c136f78b58a9ac009f880b95c5a73c8770293f3735
4
- data.tar.gz: a052e0c3ba3ee27ca208b2624d7a832cba67bcc959f1ec5da36f9a7049c26c35
3
+ metadata.gz: 020bfc8d51718ee0fbc78ba2ce57fe1d306a50e93cb41949e98088228190dee5
4
+ data.tar.gz: 4fbad0486ed1cc59f7b67be1a5c2629ff840063b2bd8f1ffdbe50a7af082cc4e
5
5
  SHA512:
6
- metadata.gz: '096dc5a7fe4b35e5afdec632f37b28f9980fcbcba4222e1ec1eb81fe4653a62cc00c5d9b90ed38ae54d4320ea8bf7e0fd0698625045504d371bcb90fb6247a54'
7
- data.tar.gz: 7aeebef3892dc2273bc4ab9899624fc113b1989c3af097204b4f73eb250a7d52e8b3cfe62439e84fb179c1e9bbc96563af669d87f56b15dd82b4fc99953a2227
6
+ metadata.gz: 45e4f66e7ebc8591620a04e35362761ec48a20b51f735dd9977f05de6698e6cdbca67bb0ddf1615596d2924b7ee336b6e4b196f0dbccb00132e59ec660fab425
7
+ data.tar.gz: 868e01b43b930657accc0861c917116f78a9342512163803d038a70da8926c401ffe58a486d824a80115283cfc4a327743ab8d04931ce02c720a0f4aef5d8207
data/html2doc.gemspec CHANGED
@@ -28,10 +28,11 @@ Gem::Specification.new do |spec|
28
28
 
29
29
  spec.add_dependency "htmlentities", "~> 4.3.4"
30
30
  spec.add_dependency "image_size", ">= 3.2.0"
31
+ spec.add_dependency "metanorma-utils"
31
32
  spec.add_dependency "mime-types"
32
33
  spec.add_dependency "nokogiri", "~> 1.14"
33
34
  spec.add_dependency "plane1converter", "~> 0.0.1"
34
- spec.add_dependency "plurimath"
35
+ spec.add_dependency "plurimath", "~> 0.5.0"
35
36
  spec.add_dependency "thread_safe"
36
37
  spec.add_dependency "uuidtools"
37
38
 
data/lib/html2doc/base.rb CHANGED
@@ -141,7 +141,7 @@ class Html2Doc
141
141
  docxml.xpath("//*[@id][not(@name)][not(@style = 'mso-element:footnote')]")
142
142
  .each do |x|
143
143
  (x["id"].empty? ||
144
- %w(shapetype v:shapetype shape v:shape).include?(x.name)) and next
144
+ %w(v:shapetype v:shape v:rect v:line v:group).include?(x.name)) and next
145
145
  if x.children.empty? then x.add_child("<a name='#{x['id']}'></a>")
146
146
  else x.children.first.previous = "<a name='#{x['id']}'></a>"
147
147
  end
data/lib/html2doc/math.rb CHANGED
@@ -3,6 +3,20 @@ require "plurimath"
3
3
  require "htmlentities"
4
4
  require "nokogiri"
5
5
  require "plane1converter"
6
+ require "metanorma-utils"
7
+
8
+ module Nokogiri
9
+ module XML
10
+ class Node
11
+ OOXML_NS = "http://schemas.openxmlformats.org/officeDocument/2006/math".freeze
12
+
13
+ def ooxml_xpath(path)
14
+ p = Metanorma::Utils::ns(path).gsub("xmlns:", "m:")
15
+ xpath(p, "m" => OOXML_NS)
16
+ end
17
+ end
18
+ end
19
+ end
6
20
 
7
21
  class Html2Doc
8
22
  def progress_conv(idx, step, total, threshold, msg)
@@ -20,17 +34,30 @@ class Html2Doc
20
34
  doc
21
35
  end
22
36
 
37
+ MATHML_NS = "http://www.w3.org/1998/Math/MathML".freeze
38
+
23
39
  # random fixes to MathML input that OOXML needs to render properly
24
40
  def ooxml_cleanup(math, docnamespaces)
25
- math = unwrap_accents(
26
- mathml_preserve_space(
27
- mathml_insert_rows(math, docnamespaces), docnamespaces
28
- ),
29
- )
30
- math.add_namespace(nil, "http://www.w3.org/1998/Math/MathML")
41
+ #encode_math(
42
+ unwrap_accents(
43
+ mathml_preserve_space(
44
+ mathml_insert_rows(math, docnamespaces), docnamespaces
45
+ ),
46
+ )
47
+ #)
48
+ math.add_namespace(nil, MATHML_NS)
31
49
  math
32
50
  end
33
51
 
52
+ def encode_math(elem)
53
+ elem.traverse do |e|
54
+ e.text? or next
55
+ e.text.strip.empty? and next
56
+ e.replace(@c.encode(e.text, :hexadecimal))
57
+ end
58
+ elem
59
+ end
60
+
34
61
  def mathml_insert_rows(math, docnamespaces)
35
62
  math.xpath(%w(msup msub msubsup munder mover munderover)
36
63
  .map { |m| ".//xmlns:#{m}" }.join(" | "), docnamespaces).each do |x|
@@ -50,47 +77,57 @@ class Html2Doc
50
77
 
51
78
  HTML_NS = 'xmlns="http://www.w3.org/1999/xhtml"'.freeze
52
79
 
80
+ def wrap_text(elem, wrapper)
81
+ elem.traverse do |e|
82
+ e.text? or next
83
+ e.text.strip.empty? and next
84
+ e.wrap(wrapper)
85
+ end
86
+ end
87
+
53
88
  def unitalic(math)
54
- math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'p']]").each do |x|
55
- x.wrap("<span #{HTML_NS} style='font-style:normal;'></span>")
89
+ math.ooxml_xpath(".//r[rPr[not(m:scr)]/sty[@m:val = 'p']]").each do |x|
90
+ wrap_text(x, "<span #{HTML_NS} style='font-style:normal;'></span>")
56
91
  end
57
- math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'bi']]").each do |x|
58
- x.wrap("<span #{HTML_NS} class='nostem' style='font-weight:bold;'><em></em></span>")
92
+ math.ooxml_xpath(".//r[rPr[not(m:scr)]/sty[@m:val = 'bi']]").each do |x|
93
+ wrap_text(x,
94
+ "<span #{HTML_NS} class='nostem' style='font-weight:bold;'><em></em></span>")
59
95
  end
60
- math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'i']]").each do |x|
61
- x.wrap("<span #{HTML_NS} class='nostem'><em></em></span>")
96
+ math.ooxml_xpath(".//r[rPr[not(m:scr)]/sty[@m:val = 'i']]").each do |x|
97
+ wrap_text(x, "<span #{HTML_NS} class='nostem'><em></em></span>")
62
98
  end
63
- math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'b']]").each do |x|
64
- x.wrap("<span #{HTML_NS} style='font-style:normal;font-weight:bold;'></span>")
99
+ math.ooxml_xpath(".//r[rPr[not(m:scr)]/sty[@m:val = 'b']]").each do |x|
100
+ wrap_text(x,
101
+ "<span #{HTML_NS} style='font-style:normal;font-weight:bold;'></span>")
65
102
  end
66
- math.xpath(".//xmlns:r[xmlns:rPr/xmlns:scr[@m:val = 'monospace']]").each do |x|
103
+ math.ooxml_xpath(".//r[rPr/scr[@m:val = 'monospace']]").each do |x|
67
104
  to_plane1(x, :monospace)
68
105
  end
69
- math.xpath(".//xmlns:r[xmlns:rPr/xmlns:scr[@m:val = 'double-struck']]").each do |x|
106
+ math.ooxml_xpath(".//r[rPr/scr[@m:val = 'double-struck']]").each do |x|
70
107
  to_plane1(x, :doublestruck)
71
108
  end
72
- math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'script']]").each do |x|
109
+ math.ooxml_xpath(".//r[rPr[not(m:sty) or m:sty/@m:val = 'p']/scr[@m:val = 'script']]").each do |x|
73
110
  to_plane1(x, :script)
74
111
  end
75
- math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'script']]").each do |x|
112
+ math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'b']/scr[@m:val = 'script']]").each do |x|
76
113
  to_plane1(x, :scriptbold)
77
114
  end
78
- math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'fraktur']]").each do |x|
115
+ math.ooxml_xpath(".//r[rPr[not(m:sty) or m:sty/@m:val = 'p']/scr[@m:val = 'fraktur']]").each do |x|
79
116
  to_plane1(x, :fraktur)
80
117
  end
81
- math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'fraktur']]").each do |x|
118
+ math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'b']/scr[@m:val = 'fraktur']]").each do |x|
82
119
  to_plane1(x, :frakturbold)
83
120
  end
84
- math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
121
+ math.ooxml_xpath(".//r[rPr[not(m:sty) or m:sty/@m:val = 'p']/scr[@m:val = 'sans-serif']]").each do |x|
85
122
  to_plane1(x, :sans)
86
123
  end
87
- math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
124
+ math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'b']/scr[@m:val = 'sans-serif']]").each do |x|
88
125
  to_plane1(x, :sansbold)
89
126
  end
90
- math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'i']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
127
+ math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'i']/scr[@m:val = 'sans-serif']]").each do |x|
91
128
  to_plane1(x, :sansitalic)
92
129
  end
93
- math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'bi']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
130
+ math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'bi']/scr[@m:val = 'sans-serif']]").each do |x|
94
131
  to_plane1(x, :sansbolditalic)
95
132
  end
96
133
  math
@@ -119,22 +156,26 @@ class Html2Doc
119
156
  # We will end up stripping them out again under Nokogiri 1.11, which correctly
120
157
  # insists on inheriting namespace from parent.
121
158
  def ooml_clean(xml)
122
- xml.to_s
159
+ xml.to_xml(indent: 0)
123
160
  .gsub(/<\?[^>]+>\s*/, "")
124
161
  .gsub(/ xmlns(:[^=]+)?="[^"]+"/, "")
125
- .gsub(%r{<(/)?(?!span)(?!em)([a-z])}, "<\\1m:\\2")
162
+ # .gsub(%r{<(/)?(?!span)(?!em)([a-z])}, "<\\1m:\\2")
126
163
  end
127
164
 
128
165
  def mathml_to_ooml1(xml, docnamespaces)
129
166
  doc = Nokogiri::XML::Document::new
130
167
  doc.root = ooxml_cleanup(xml, docnamespaces)
131
- ooxml = ooml_clean(unitalic(esc_space(accent_tr(@xsltemplate.transform(doc)))))
132
- ooxml = uncenter(xml, ooxml)
168
+ # ooxml = @xsltemplate.transform(doc)
169
+ d = xml.parent["block"] != "false" # display_style
170
+ ooxml = Nokogiri::XML(Plurimath::Math.parse(doc.to_xml(indent: 0),
171
+ :mathml).to_omml)
172
+ ooxml = unitalic(accent_tr(ooxml))
173
+ ooxml = ooml_clean(uncenter(xml, ooxml))
133
174
  xml.swap(ooxml)
134
175
  end
135
176
 
136
177
  def accent_tr(xml)
137
- xml.xpath(".//*[local-name()='accPr']/*[local-name()='chr']").each do |x|
178
+ xml.ooxml_xpath(".//accPr/chr").each do |x|
138
179
  x["m:val"] &&= accent_tr1(x["m:val"])
139
180
  x["val"] &&= accent_tr1(x["val"])
140
181
  end
@@ -150,30 +191,48 @@ class Html2Doc
150
191
  end
151
192
  end
152
193
 
153
- # escape space as &#x32;; we are removing any spaces generated by
154
- # XML indentation
155
- def esc_space(xml)
156
- xml.traverse do |n|
157
- next unless n.text?
194
+ OOXML_NS = "http://schemas.openxmlformats.org/officeDocument/2006/math".freeze
158
195
 
159
- n = n.text.gsub(/ /, "&#x32;")
160
- end
161
- xml
196
+ def math_only_para?(node)
197
+ x = node.dup
198
+ x.xpath(".//m:math", "m" => MATHML_NS).each(&:remove)
199
+ x.xpath(".//m:oMathPara | .//m:oMath", "m" => OOXML_NS).each(&:remove)
200
+ x.xpath(".//m:oMathPara | .//m:oMath").each(&:remove)
201
+ # namespace can go missing during processing
202
+ x.text.strip.empty?
203
+ end
204
+
205
+ def math_block?(_ooxml, mathml)
206
+ # ooxml.name == "oMathPara" || mathml["displaystyle"] == "true"
207
+ mathml["displaystyle"] == "true"
162
208
  end
163
209
 
210
+ STYLE_BEARING_NODE =
211
+ %w(p div td th li).map { |x| ".//ancestor::#{x}" }.join(" | ").freeze
212
+
164
213
  # if ooml has no siblings, by default it is centered; override this with
165
214
  # left/right if parent is so tagged
215
+ # also if ooml has mathPara already, or is in para with only oMath content
166
216
  def uncenter(math, ooxml)
167
- alignnode = math.at(".//ancestor::*[@style][local-name() = 'p' or "\
168
- "local-name() = 'div' or local-name() = 'td']/@style")
169
- return ooxml unless alignnode && (math.next == nil && math.previous == nil)
170
-
171
- %w(left right).each do |dir|
172
- if alignnode.text.include? ("text-align:#{dir}")
173
- ooxml = "<m:oMathPara><m:oMathParaPr><m:jc "\
174
- "m:val='#{dir}'/></m:oMathParaPr>#{ooxml}</m:oMathPara>"
175
- end
176
- end
217
+ alignnode = math.xpath(STYLE_BEARING_NODE).last
218
+ ooxml.document? and ooxml = ooxml.root
219
+ ret = uncenter_unneeded(math, ooxml, alignnode) and return ret
220
+ dir = "left"
221
+ alignnode["style"]&.include?("text-align:right") and dir = "right"
222
+ ooxml.name == "oMathPara" or
223
+ ooxml.wrap("<m:oMathPara></m:oMathPara>")
224
+ ooxml.elements.first.previous =
225
+ "<m:oMathParaPr><m:jc m:val='#{dir}'/></m:oMathParaPr>"
177
226
  ooxml
178
227
  end
228
+
229
+ def uncenter_unneeded(math, ooxml, alignnode)
230
+ (math_block?(ooxml, math) || !alignnode) and return ooxml
231
+ if !math_only_para?(alignnode)
232
+ ooxml.name == "oMathPara" and
233
+ ooxml = ooxml.elements.detect { |x| x.name == "oMath" }
234
+ return ooxml
235
+ end
236
+ nil
237
+ end
179
238
  end
@@ -1,3 +1,3 @@
1
1
  class Html2Doc
2
- VERSION = "1.6.0".freeze
2
+ VERSION = "1.7.0".freeze
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2doc
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.6.0
4
+ version: 1.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-08-07 00:00:00.000000000 Z
11
+ date: 2023-09-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: htmlentities
@@ -38,6 +38,20 @@ dependencies:
38
38
  - - ">="
39
39
  - !ruby/object:Gem::Version
40
40
  version: 3.2.0
41
+ - !ruby/object:Gem::Dependency
42
+ name: metanorma-utils
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
41
55
  - !ruby/object:Gem::Dependency
42
56
  name: mime-types
43
57
  requirement: !ruby/object:Gem::Requirement
@@ -84,16 +98,16 @@ dependencies:
84
98
  name: plurimath
85
99
  requirement: !ruby/object:Gem::Requirement
86
100
  requirements:
87
- - - ">="
101
+ - - "~>"
88
102
  - !ruby/object:Gem::Version
89
- version: '0'
103
+ version: 0.5.0
90
104
  type: :runtime
91
105
  prerelease: false
92
106
  version_requirements: !ruby/object:Gem::Requirement
93
107
  requirements:
94
- - - ">="
108
+ - - "~>"
95
109
  - !ruby/object:Gem::Version
96
- version: '0'
110
+ version: 0.5.0
97
111
  - !ruby/object:Gem::Dependency
98
112
  name: thread_safe
99
113
  requirement: !ruby/object:Gem::Requirement