html2doc 1.6.1 → 1.7.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6b08c4ee810280f83835a9884c17d54a04837195c1d133bedd48f8a103780316
4
- data.tar.gz: f47560825288a3297700d048c3fff8c90ca61ba6bcb2b2e8fef3ac9749d9e094
3
+ metadata.gz: 020bfc8d51718ee0fbc78ba2ce57fe1d306a50e93cb41949e98088228190dee5
4
+ data.tar.gz: 4fbad0486ed1cc59f7b67be1a5c2629ff840063b2bd8f1ffdbe50a7af082cc4e
5
5
  SHA512:
6
- metadata.gz: 5a94da368aa84ae4abcbcf6c3bc401b349811b32b1d8097b82e96b479cd684d545ac66fc9fb6f41367db85bfeb86590a15bbac7af0e989c41a0e1af0e2c79966
7
- data.tar.gz: 4dd0add251285b7c23a82b3b5d709d5421542b96ef0025357672616003d5e31ed35c4c064deb652925666a4a6e33962d6e73d72b137c8cea3592e5d4bb131574
6
+ metadata.gz: 45e4f66e7ebc8591620a04e35362761ec48a20b51f735dd9977f05de6698e6cdbca67bb0ddf1615596d2924b7ee336b6e4b196f0dbccb00132e59ec660fab425
7
+ data.tar.gz: 868e01b43b930657accc0861c917116f78a9342512163803d038a70da8926c401ffe58a486d824a80115283cfc4a327743ab8d04931ce02c720a0f4aef5d8207
data/html2doc.gemspec CHANGED
@@ -28,10 +28,11 @@ Gem::Specification.new do |spec|
28
28
 
29
29
  spec.add_dependency "htmlentities", "~> 4.3.4"
30
30
  spec.add_dependency "image_size", ">= 3.2.0"
31
+ spec.add_dependency "metanorma-utils"
31
32
  spec.add_dependency "mime-types"
32
33
  spec.add_dependency "nokogiri", "~> 1.14"
33
34
  spec.add_dependency "plane1converter", "~> 0.0.1"
34
- spec.add_dependency "plurimath"
35
+ spec.add_dependency "plurimath", "~> 0.5.0"
35
36
  spec.add_dependency "thread_safe"
36
37
  spec.add_dependency "uuidtools"
37
38
 
data/lib/html2doc/base.rb CHANGED
@@ -141,7 +141,7 @@ class Html2Doc
141
141
  docxml.xpath("//*[@id][not(@name)][not(@style = 'mso-element:footnote')]")
142
142
  .each do |x|
143
143
  (x["id"].empty? ||
144
- %w(shapetype v:shapetype shape v:shape).include?(x.name)) and next
144
+ %w(v:shapetype v:shape v:rect v:line v:group).include?(x.name)) and next
145
145
  if x.children.empty? then x.add_child("<a name='#{x['id']}'></a>")
146
146
  else x.children.first.previous = "<a name='#{x['id']}'></a>"
147
147
  end
data/lib/html2doc/math.rb CHANGED
@@ -3,6 +3,20 @@ require "plurimath"
3
3
  require "htmlentities"
4
4
  require "nokogiri"
5
5
  require "plane1converter"
6
+ require "metanorma-utils"
7
+
8
+ module Nokogiri
9
+ module XML
10
+ class Node
11
+ OOXML_NS = "http://schemas.openxmlformats.org/officeDocument/2006/math".freeze
12
+
13
+ def ooxml_xpath(path)
14
+ p = Metanorma::Utils::ns(path).gsub("xmlns:", "m:")
15
+ xpath(p, "m" => OOXML_NS)
16
+ end
17
+ end
18
+ end
19
+ end
6
20
 
7
21
  class Html2Doc
8
22
  def progress_conv(idx, step, total, threshold, msg)
@@ -24,15 +38,26 @@ class Html2Doc
24
38
 
25
39
  # random fixes to MathML input that OOXML needs to render properly
26
40
  def ooxml_cleanup(math, docnamespaces)
27
- math = unwrap_accents(
28
- mathml_preserve_space(
29
- mathml_insert_rows(math, docnamespaces), docnamespaces
30
- ),
31
- )
41
+ #encode_math(
42
+ unwrap_accents(
43
+ mathml_preserve_space(
44
+ mathml_insert_rows(math, docnamespaces), docnamespaces
45
+ ),
46
+ )
47
+ #)
32
48
  math.add_namespace(nil, MATHML_NS)
33
49
  math
34
50
  end
35
51
 
52
+ def encode_math(elem)
53
+ elem.traverse do |e|
54
+ e.text? or next
55
+ e.text.strip.empty? and next
56
+ e.replace(@c.encode(e.text, :hexadecimal))
57
+ end
58
+ elem
59
+ end
60
+
36
61
  def mathml_insert_rows(math, docnamespaces)
37
62
  math.xpath(%w(msup msub msubsup munder mover munderover)
38
63
  .map { |m| ".//xmlns:#{m}" }.join(" | "), docnamespaces).each do |x|
@@ -52,47 +77,57 @@ class Html2Doc
52
77
 
53
78
  HTML_NS = 'xmlns="http://www.w3.org/1999/xhtml"'.freeze
54
79
 
80
+ def wrap_text(elem, wrapper)
81
+ elem.traverse do |e|
82
+ e.text? or next
83
+ e.text.strip.empty? and next
84
+ e.wrap(wrapper)
85
+ end
86
+ end
87
+
55
88
  def unitalic(math)
56
- math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'p']]").each do |x|
57
- x.wrap("<span #{HTML_NS} style='font-style:normal;'></span>")
89
+ math.ooxml_xpath(".//r[rPr[not(m:scr)]/sty[@m:val = 'p']]").each do |x|
90
+ wrap_text(x, "<span #{HTML_NS} style='font-style:normal;'></span>")
58
91
  end
59
- math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'bi']]").each do |x|
60
- x.wrap("<span #{HTML_NS} class='nostem' style='font-weight:bold;'><em></em></span>")
92
+ math.ooxml_xpath(".//r[rPr[not(m:scr)]/sty[@m:val = 'bi']]").each do |x|
93
+ wrap_text(x,
94
+ "<span #{HTML_NS} class='nostem' style='font-weight:bold;'><em></em></span>")
61
95
  end
62
- math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'i']]").each do |x|
63
- x.wrap("<span #{HTML_NS} class='nostem'><em></em></span>")
96
+ math.ooxml_xpath(".//r[rPr[not(m:scr)]/sty[@m:val = 'i']]").each do |x|
97
+ wrap_text(x, "<span #{HTML_NS} class='nostem'><em></em></span>")
64
98
  end
65
- math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'b']]").each do |x|
66
- x.wrap("<span #{HTML_NS} style='font-style:normal;font-weight:bold;'></span>")
99
+ math.ooxml_xpath(".//r[rPr[not(m:scr)]/sty[@m:val = 'b']]").each do |x|
100
+ wrap_text(x,
101
+ "<span #{HTML_NS} style='font-style:normal;font-weight:bold;'></span>")
67
102
  end
68
- math.xpath(".//xmlns:r[xmlns:rPr/xmlns:scr[@m:val = 'monospace']]").each do |x|
103
+ math.ooxml_xpath(".//r[rPr/scr[@m:val = 'monospace']]").each do |x|
69
104
  to_plane1(x, :monospace)
70
105
  end
71
- math.xpath(".//xmlns:r[xmlns:rPr/xmlns:scr[@m:val = 'double-struck']]").each do |x|
106
+ math.ooxml_xpath(".//r[rPr/scr[@m:val = 'double-struck']]").each do |x|
72
107
  to_plane1(x, :doublestruck)
73
108
  end
74
- math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'script']]").each do |x|
109
+ math.ooxml_xpath(".//r[rPr[not(m:sty) or m:sty/@m:val = 'p']/scr[@m:val = 'script']]").each do |x|
75
110
  to_plane1(x, :script)
76
111
  end
77
- math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'script']]").each do |x|
112
+ math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'b']/scr[@m:val = 'script']]").each do |x|
78
113
  to_plane1(x, :scriptbold)
79
114
  end
80
- math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'fraktur']]").each do |x|
115
+ math.ooxml_xpath(".//r[rPr[not(m:sty) or m:sty/@m:val = 'p']/scr[@m:val = 'fraktur']]").each do |x|
81
116
  to_plane1(x, :fraktur)
82
117
  end
83
- math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'fraktur']]").each do |x|
118
+ math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'b']/scr[@m:val = 'fraktur']]").each do |x|
84
119
  to_plane1(x, :frakturbold)
85
120
  end
86
- math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
121
+ math.ooxml_xpath(".//r[rPr[not(m:sty) or m:sty/@m:val = 'p']/scr[@m:val = 'sans-serif']]").each do |x|
87
122
  to_plane1(x, :sans)
88
123
  end
89
- math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
124
+ math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'b']/scr[@m:val = 'sans-serif']]").each do |x|
90
125
  to_plane1(x, :sansbold)
91
126
  end
92
- math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'i']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
127
+ math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'i']/scr[@m:val = 'sans-serif']]").each do |x|
93
128
  to_plane1(x, :sansitalic)
94
129
  end
95
- math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'bi']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
130
+ math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'bi']/scr[@m:val = 'sans-serif']]").each do |x|
96
131
  to_plane1(x, :sansbolditalic)
97
132
  end
98
133
  math
@@ -121,22 +156,26 @@ class Html2Doc
121
156
  # We will end up stripping them out again under Nokogiri 1.11, which correctly
122
157
  # insists on inheriting namespace from parent.
123
158
  def ooml_clean(xml)
124
- xml.to_s
159
+ xml.to_xml(indent: 0)
125
160
  .gsub(/<\?[^>]+>\s*/, "")
126
161
  .gsub(/ xmlns(:[^=]+)?="[^"]+"/, "")
127
- .gsub(%r{<(/)?(?!span)(?!em)([a-z])}, "<\\1m:\\2")
162
+ # .gsub(%r{<(/)?(?!span)(?!em)([a-z])}, "<\\1m:\\2")
128
163
  end
129
164
 
130
165
  def mathml_to_ooml1(xml, docnamespaces)
131
166
  doc = Nokogiri::XML::Document::new
132
167
  doc.root = ooxml_cleanup(xml, docnamespaces)
133
- ooxml = unitalic(esc_space(accent_tr(@xsltemplate.transform(doc))))
168
+ # ooxml = @xsltemplate.transform(doc)
169
+ d = xml.parent["block"] != "false" # display_style
170
+ ooxml = Nokogiri::XML(Plurimath::Math.parse(doc.to_xml(indent: 0),
171
+ :mathml).to_omml)
172
+ ooxml = unitalic(accent_tr(ooxml))
134
173
  ooxml = ooml_clean(uncenter(xml, ooxml))
135
174
  xml.swap(ooxml)
136
175
  end
137
176
 
138
177
  def accent_tr(xml)
139
- xml.xpath(".//*[local-name()='accPr']/*[local-name()='chr']").each do |x|
178
+ xml.ooxml_xpath(".//accPr/chr").each do |x|
140
179
  x["m:val"] &&= accent_tr1(x["m:val"])
141
180
  x["val"] &&= accent_tr1(x["val"])
142
181
  end
@@ -152,28 +191,20 @@ class Html2Doc
152
191
  end
153
192
  end
154
193
 
155
- # escape space as &#x32;; we are removing any spaces generated by
156
- # XML indentation
157
- def esc_space(xml)
158
- xml.traverse do |n|
159
- next unless n.text?
160
-
161
- n = n.text.gsub(/ /, "&#x32;")
162
- end
163
- xml
164
- end
165
-
166
- OOXML_NS = "http://schemas.microsoft.com/office/2004/12/omml".freeze
194
+ OOXML_NS = "http://schemas.openxmlformats.org/officeDocument/2006/math".freeze
167
195
 
168
196
  def math_only_para?(node)
169
197
  x = node.dup
170
198
  x.xpath(".//m:math", "m" => MATHML_NS).each(&:remove)
171
199
  x.xpath(".//m:oMathPara | .//m:oMath", "m" => OOXML_NS).each(&:remove)
200
+ x.xpath(".//m:oMathPara | .//m:oMath").each(&:remove)
201
+ # namespace can go missing during processing
172
202
  x.text.strip.empty?
173
203
  end
174
204
 
175
- def math_block?(ooxml, mathml)
176
- ooxml.name == "oMathPara" || mathml["displaystyle"] == "true"
205
+ def math_block?(_ooxml, mathml)
206
+ # ooxml.name == "oMathPara" || mathml["displaystyle"] == "true"
207
+ mathml["displaystyle"] == "true"
177
208
  end
178
209
 
179
210
  STYLE_BEARING_NODE =
@@ -184,12 +215,24 @@ class Html2Doc
184
215
  # also if ooml has mathPara already, or is in para with only oMath content
185
216
  def uncenter(math, ooxml)
186
217
  alignnode = math.xpath(STYLE_BEARING_NODE).last
187
- ret = ooxml.root.to_xml(indent: 0)
188
- (math_block?(ooxml, math) ||
189
- !alignnode) || !math_only_para?(alignnode) and return ret
218
+ ooxml.document? and ooxml = ooxml.root
219
+ ret = uncenter_unneeded(math, ooxml, alignnode) and return ret
190
220
  dir = "left"
191
221
  alignnode["style"]&.include?("text-align:right") and dir = "right"
192
- "<oMathPara><oMathParaPr><jc " \
193
- "m:val='#{dir}'/></oMathParaPr>#{ret}</oMathPara>"
222
+ ooxml.name == "oMathPara" or
223
+ ooxml.wrap("<m:oMathPara></m:oMathPara>")
224
+ ooxml.elements.first.previous =
225
+ "<m:oMathParaPr><m:jc m:val='#{dir}'/></m:oMathParaPr>"
226
+ ooxml
227
+ end
228
+
229
+ def uncenter_unneeded(math, ooxml, alignnode)
230
+ (math_block?(ooxml, math) || !alignnode) and return ooxml
231
+ if !math_only_para?(alignnode)
232
+ ooxml.name == "oMathPara" and
233
+ ooxml = ooxml.elements.detect { |x| x.name == "oMath" }
234
+ return ooxml
235
+ end
236
+ nil
194
237
  end
195
238
  end
@@ -1,3 +1,3 @@
1
1
  class Html2Doc
2
- VERSION = "1.6.1".freeze
2
+ VERSION = "1.7.0".freeze
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2doc
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.6.1
4
+ version: 1.7.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-08-12 00:00:00.000000000 Z
11
+ date: 2023-09-08 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: htmlentities
@@ -38,6 +38,20 @@ dependencies:
38
38
  - - ">="
39
39
  - !ruby/object:Gem::Version
40
40
  version: 3.2.0
41
+ - !ruby/object:Gem::Dependency
42
+ name: metanorma-utils
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
41
55
  - !ruby/object:Gem::Dependency
42
56
  name: mime-types
43
57
  requirement: !ruby/object:Gem::Requirement
@@ -84,16 +98,16 @@ dependencies:
84
98
  name: plurimath
85
99
  requirement: !ruby/object:Gem::Requirement
86
100
  requirements:
87
- - - ">="
101
+ - - "~>"
88
102
  - !ruby/object:Gem::Version
89
- version: '0'
103
+ version: 0.5.0
90
104
  type: :runtime
91
105
  prerelease: false
92
106
  version_requirements: !ruby/object:Gem::Requirement
93
107
  requirements:
94
- - - ">="
108
+ - - "~>"
95
109
  - !ruby/object:Gem::Version
96
- version: '0'
110
+ version: 0.5.0
97
111
  - !ruby/object:Gem::Dependency
98
112
  name: thread_safe
99
113
  requirement: !ruby/object:Gem::Requirement