html2doc 1.6.1 → 1.7.0

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6b08c4ee810280f83835a9884c17d54a04837195c1d133bedd48f8a103780316
4
- data.tar.gz: f47560825288a3297700d048c3fff8c90ca61ba6bcb2b2e8fef3ac9749d9e094
3
+ metadata.gz: 020bfc8d51718ee0fbc78ba2ce57fe1d306a50e93cb41949e98088228190dee5
4
+ data.tar.gz: 4fbad0486ed1cc59f7b67be1a5c2629ff840063b2bd8f1ffdbe50a7af082cc4e
5
5
  SHA512:
6
- metadata.gz: 5a94da368aa84ae4abcbcf6c3bc401b349811b32b1d8097b82e96b479cd684d545ac66fc9fb6f41367db85bfeb86590a15bbac7af0e989c41a0e1af0e2c79966
7
- data.tar.gz: 4dd0add251285b7c23a82b3b5d709d5421542b96ef0025357672616003d5e31ed35c4c064deb652925666a4a6e33962d6e73d72b137c8cea3592e5d4bb131574
6
+ metadata.gz: 45e4f66e7ebc8591620a04e35362761ec48a20b51f735dd9977f05de6698e6cdbca67bb0ddf1615596d2924b7ee336b6e4b196f0dbccb00132e59ec660fab425
7
+ data.tar.gz: 868e01b43b930657accc0861c917116f78a9342512163803d038a70da8926c401ffe58a486d824a80115283cfc4a327743ab8d04931ce02c720a0f4aef5d8207
data/html2doc.gemspec CHANGED
@@ -28,10 +28,11 @@ Gem::Specification.new do |spec|
28
28
 
29
29
  spec.add_dependency "htmlentities", "~> 4.3.4"
30
30
  spec.add_dependency "image_size", ">= 3.2.0"
31
+ spec.add_dependency "metanorma-utils"
31
32
  spec.add_dependency "mime-types"
32
33
  spec.add_dependency "nokogiri", "~> 1.14"
33
34
  spec.add_dependency "plane1converter", "~> 0.0.1"
34
- spec.add_dependency "plurimath"
35
+ spec.add_dependency "plurimath", "~> 0.5.0"
35
36
  spec.add_dependency "thread_safe"
36
37
  spec.add_dependency "uuidtools"
37
38
 
data/lib/html2doc/base.rb CHANGED
@@ -141,7 +141,7 @@ class Html2Doc
141
141
  docxml.xpath("//*[@id][not(@name)][not(@style = 'mso-element:footnote')]")
142
142
  .each do |x|
143
143
  (x["id"].empty? ||
144
- %w(shapetype v:shapetype shape v:shape).include?(x.name)) and next
144
+ %w(v:shapetype v:shape v:rect v:line v:group).include?(x.name)) and next
145
145
  if x.children.empty? then x.add_child("<a name='#{x['id']}'></a>")
146
146
  else x.children.first.previous = "<a name='#{x['id']}'></a>"
147
147
  end
data/lib/html2doc/math.rb CHANGED
@@ -3,6 +3,20 @@ require "plurimath"
3
3
  require "htmlentities"
4
4
  require "nokogiri"
5
5
  require "plane1converter"
6
+ require "metanorma-utils"
7
+
8
+ module Nokogiri
9
+ module XML
10
+ class Node
11
+ OOXML_NS = "http://schemas.openxmlformats.org/officeDocument/2006/math".freeze
12
+
13
+ def ooxml_xpath(path)
14
+ p = Metanorma::Utils::ns(path).gsub("xmlns:", "m:")
15
+ xpath(p, "m" => OOXML_NS)
16
+ end
17
+ end
18
+ end
19
+ end
6
20
 
7
21
  class Html2Doc
8
22
  def progress_conv(idx, step, total, threshold, msg)
@@ -24,15 +38,26 @@ class Html2Doc
24
38
 
25
39
  # random fixes to MathML input that OOXML needs to render properly
26
40
  def ooxml_cleanup(math, docnamespaces)
27
- math = unwrap_accents(
28
- mathml_preserve_space(
29
- mathml_insert_rows(math, docnamespaces), docnamespaces
30
- ),
31
- )
41
+ #encode_math(
42
+ unwrap_accents(
43
+ mathml_preserve_space(
44
+ mathml_insert_rows(math, docnamespaces), docnamespaces
45
+ ),
46
+ )
47
+ #)
32
48
  math.add_namespace(nil, MATHML_NS)
33
49
  math
34
50
  end
35
51
 
52
+ def encode_math(elem)
53
+ elem.traverse do |e|
54
+ e.text? or next
55
+ e.text.strip.empty? and next
56
+ e.replace(@c.encode(e.text, :hexadecimal))
57
+ end
58
+ elem
59
+ end
60
+
36
61
  def mathml_insert_rows(math, docnamespaces)
37
62
  math.xpath(%w(msup msub msubsup munder mover munderover)
38
63
  .map { |m| ".//xmlns:#{m}" }.join(" | "), docnamespaces).each do |x|
@@ -52,47 +77,57 @@ class Html2Doc
52
77
 
53
78
  HTML_NS = 'xmlns="http://www.w3.org/1999/xhtml"'.freeze
54
79
 
80
+ def wrap_text(elem, wrapper)
81
+ elem.traverse do |e|
82
+ e.text? or next
83
+ e.text.strip.empty? and next
84
+ e.wrap(wrapper)
85
+ end
86
+ end
87
+
55
88
  def unitalic(math)
56
- math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'p']]").each do |x|
57
- x.wrap("<span #{HTML_NS} style='font-style:normal;'></span>")
89
+ math.ooxml_xpath(".//r[rPr[not(m:scr)]/sty[@m:val = 'p']]").each do |x|
90
+ wrap_text(x, "<span #{HTML_NS} style='font-style:normal;'></span>")
58
91
  end
59
- math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'bi']]").each do |x|
60
- x.wrap("<span #{HTML_NS} class='nostem' style='font-weight:bold;'><em></em></span>")
92
+ math.ooxml_xpath(".//r[rPr[not(m:scr)]/sty[@m:val = 'bi']]").each do |x|
93
+ wrap_text(x,
94
+ "<span #{HTML_NS} class='nostem' style='font-weight:bold;'><em></em></span>")
61
95
  end
62
- math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'i']]").each do |x|
63
- x.wrap("<span #{HTML_NS} class='nostem'><em></em></span>")
96
+ math.ooxml_xpath(".//r[rPr[not(m:scr)]/sty[@m:val = 'i']]").each do |x|
97
+ wrap_text(x, "<span #{HTML_NS} class='nostem'><em></em></span>")
64
98
  end
65
- math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'b']]").each do |x|
66
- x.wrap("<span #{HTML_NS} style='font-style:normal;font-weight:bold;'></span>")
99
+ math.ooxml_xpath(".//r[rPr[not(m:scr)]/sty[@m:val = 'b']]").each do |x|
100
+ wrap_text(x,
101
+ "<span #{HTML_NS} style='font-style:normal;font-weight:bold;'></span>")
67
102
  end
68
- math.xpath(".//xmlns:r[xmlns:rPr/xmlns:scr[@m:val = 'monospace']]").each do |x|
103
+ math.ooxml_xpath(".//r[rPr/scr[@m:val = 'monospace']]").each do |x|
69
104
  to_plane1(x, :monospace)
70
105
  end
71
- math.xpath(".//xmlns:r[xmlns:rPr/xmlns:scr[@m:val = 'double-struck']]").each do |x|
106
+ math.ooxml_xpath(".//r[rPr/scr[@m:val = 'double-struck']]").each do |x|
72
107
  to_plane1(x, :doublestruck)
73
108
  end
74
- math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'script']]").each do |x|
109
+ math.ooxml_xpath(".//r[rPr[not(m:sty) or m:sty/@m:val = 'p']/scr[@m:val = 'script']]").each do |x|
75
110
  to_plane1(x, :script)
76
111
  end
77
- math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'script']]").each do |x|
112
+ math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'b']/scr[@m:val = 'script']]").each do |x|
78
113
  to_plane1(x, :scriptbold)
79
114
  end
80
- math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'fraktur']]").each do |x|
115
+ math.ooxml_xpath(".//r[rPr[not(m:sty) or m:sty/@m:val = 'p']/scr[@m:val = 'fraktur']]").each do |x|
81
116
  to_plane1(x, :fraktur)
82
117
  end
83
- math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'fraktur']]").each do |x|
118
+ math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'b']/scr[@m:val = 'fraktur']]").each do |x|
84
119
  to_plane1(x, :frakturbold)
85
120
  end
86
- math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
121
+ math.ooxml_xpath(".//r[rPr[not(m:sty) or m:sty/@m:val = 'p']/scr[@m:val = 'sans-serif']]").each do |x|
87
122
  to_plane1(x, :sans)
88
123
  end
89
- math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
124
+ math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'b']/scr[@m:val = 'sans-serif']]").each do |x|
90
125
  to_plane1(x, :sansbold)
91
126
  end
92
- math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'i']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
127
+ math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'i']/scr[@m:val = 'sans-serif']]").each do |x|
93
128
  to_plane1(x, :sansitalic)
94
129
  end
95
- math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'bi']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
130
+ math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'bi']/scr[@m:val = 'sans-serif']]").each do |x|
96
131
  to_plane1(x, :sansbolditalic)
97
132
  end
98
133
  math
@@ -121,22 +156,26 @@ class Html2Doc
121
156
  # We will end up stripping them out again under Nokogiri 1.11, which correctly
122
157
  # insists on inheriting namespace from parent.
123
158
  def ooml_clean(xml)
124
- xml.to_s
159
+ xml.to_xml(indent: 0)
125
160
  .gsub(/<\?[^>]+>\s*/, "")
126
161
  .gsub(/ xmlns(:[^=]+)?="[^"]+"/, "")
127
- .gsub(%r{<(/)?(?!span)(?!em)([a-z])}, "<\\1m:\\2")
162
+ # .gsub(%r{<(/)?(?!span)(?!em)([a-z])}, "<\\1m:\\2")
128
163
  end
129
164
 
130
165
  def mathml_to_ooml1(xml, docnamespaces)
131
166
  doc = Nokogiri::XML::Document::new
132
167
  doc.root = ooxml_cleanup(xml, docnamespaces)
133
- ooxml = unitalic(esc_space(accent_tr(@xsltemplate.transform(doc))))
168
+ # ooxml = @xsltemplate.transform(doc)
169
+ d = xml.parent["block"] != "false" # display_style
170
+ ooxml = Nokogiri::XML(Plurimath::Math.parse(doc.to_xml(indent: 0),
171
+ :mathml).to_omml)
172
+ ooxml = unitalic(accent_tr(ooxml))
134
173
  ooxml = ooml_clean(uncenter(xml, ooxml))
135
174
  xml.swap(ooxml)
136
175
  end
137
176
 
138
177
  def accent_tr(xml)
139
- xml.xpath(".//*[local-name()='accPr']/*[local-name()='chr']").each do |x|
178
+ xml.ooxml_xpath(".//accPr/chr").each do |x|
140
179
  x["m:val"] &&= accent_tr1(x["m:val"])
141
180
  x["val"] &&= accent_tr1(x["val"])
142
181
  end
@@ -152,28 +191,20 @@ class Html2Doc
152
191
  end
153
192
  end
154
193
 
155
- # escape space as &#x32;; we are removing any spaces generated by
156
- # XML indentation
157
- def esc_space(xml)
158
- xml.traverse do |n|
159
- next unless n.text?
160
-
161
- n = n.text.gsub(/ /, "&#x32;")
162
- end
163
- xml
164
- end
165
-
166
- OOXML_NS = "http://schemas.microsoft.com/office/2004/12/omml".freeze
194
+ OOXML_NS = "http://schemas.openxmlformats.org/officeDocument/2006/math".freeze
167
195
 
168
196
  def math_only_para?(node)
169
197
  x = node.dup
170
198
  x.xpath(".//m:math", "m" => MATHML_NS).each(&:remove)
171
199
  x.xpath(".//m:oMathPara | .//m:oMath", "m" => OOXML_NS).each(&:remove)
200
+ x.xpath(".//m:oMathPara | .//m:oMath").each(&:remove)
201
+ # namespace can go missing during processing
172
202
  x.text.strip.empty?
173
203
  end
174
204
 
175
- def math_block?(ooxml, mathml)
176
- ooxml.name == "oMathPara" || mathml["displaystyle"] == "true"
205
+ def math_block?(_ooxml, mathml)
206
+ # ooxml.name == "oMathPara" || mathml["displaystyle"] == "true"
207
+ mathml["displaystyle"] == "true"
177
208
  end
178
209
 
179
210
  STYLE_BEARING_NODE =
@@ -184,12 +215,24 @@ class Html2Doc
184
215
  # also if ooml has mathPara already, or is in para with only oMath content
185
216
  def uncenter(math, ooxml)
186
217
  alignnode = math.xpath(STYLE_BEARING_NODE).last
187
- ret = ooxml.root.to_xml(indent: 0)
188
- (math_block?(ooxml, math) ||
189
- !alignnode) || !math_only_para?(alignnode) and return ret
218
+ ooxml.document? and ooxml = ooxml.root
219
+ ret = uncenter_unneeded(math, ooxml, alignnode) and return ret
190
220
  dir = "left"
191
221
  alignnode["style"]&.include?("text-align:right") and dir = "right"
192
- "<oMathPara><oMathParaPr><jc " \
193
- "m:val='#{dir}'/></oMathParaPr>#{ret}</oMathPara>"
222
+ ooxml.name == "oMathPara" or
223
+ ooxml.wrap("<m:oMathPara></m:oMathPara>")
224
+ ooxml.elements.first.previous =
225
+ "<m:oMathParaPr><m:jc m:val='#{dir}'/></m:oMathParaPr>"
226
+ ooxml
227
+ end
228
+
229
+ def uncenter_unneeded(math, ooxml, alignnode)
230
+ (math_block?(ooxml, math) || !alignnode) and return ooxml
231
+ if !math_only_para?(alignnode)
232
+ ooxml.name == "oMathPara" and
233
+ ooxml = ooxml.elements.detect { |x| x.name == "oMath" }
234
+ return ooxml
235
+ end
236
+ nil
194
237
  end
195
238
  end
@@ -1,3 +1,3 @@
1
1
  class Html2Doc
2
- VERSION = "1.6.1".freeze
2
+ VERSION = "1.7.0".freeze
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2doc
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.6.1
4
+ version: 1.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-08-12 00:00:00.000000000 Z
11
+ date: 2023-09-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: htmlentities
@@ -38,6 +38,20 @@ dependencies:
38
38
  - - ">="
39
39
  - !ruby/object:Gem::Version
40
40
  version: 3.2.0
41
+ - !ruby/object:Gem::Dependency
42
+ name: metanorma-utils
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
41
55
  - !ruby/object:Gem::Dependency
42
56
  name: mime-types
43
57
  requirement: !ruby/object:Gem::Requirement
@@ -84,16 +98,16 @@ dependencies:
84
98
  name: plurimath
85
99
  requirement: !ruby/object:Gem::Requirement
86
100
  requirements:
87
- - - ">="
101
+ - - "~>"
88
102
  - !ruby/object:Gem::Version
89
- version: '0'
103
+ version: 0.5.0
90
104
  type: :runtime
91
105
  prerelease: false
92
106
  version_requirements: !ruby/object:Gem::Requirement
93
107
  requirements:
94
- - - ">="
108
+ - - "~>"
95
109
  - !ruby/object:Gem::Version
96
- version: '0'
110
+ version: 0.5.0
97
111
  - !ruby/object:Gem::Dependency
98
112
  name: thread_safe
99
113
  requirement: !ruby/object:Gem::Requirement