html2doc 1.1.0 → 1.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/rake.yml +1 -11
- data/.hound.yml +3 -1
- data/.rubocop.yml +4 -8
- data/Gemfile +2 -2
- data/Rakefile +1 -1
- data/bin/html2doc +1 -2
- data/bin/rspec +1 -1
- data/html2doc.gemspec +8 -9
- data/lib/html2doc/base.rb +48 -46
- data/lib/html2doc/lists.rb +47 -42
- data/lib/html2doc/math.rb +100 -79
- data/lib/html2doc/mime.rb +41 -34
- data/lib/html2doc/notes.rb +42 -36
- data/lib/html2doc/version.rb +1 -1
- data/lib/html2doc.rb +0 -3
- data/spec/html2doc_spec.rb +566 -521
- metadata +42 -42
data/lib/html2doc/math.rb
CHANGED
@@ -9,29 +9,34 @@ module Html2Doc
|
|
9
9
|
Nokogiri::XSLT(File.read(File.join(File.dirname(__FILE__), "mml2omml.xsl"),
|
10
10
|
encoding: "utf-8"))
|
11
11
|
|
12
|
-
def self.asciimath_to_mathml1(
|
13
|
-
|
14
|
-
AsciiMath
|
15
|
-
|
16
|
-
gsub(/<math>/, "<math xmlns='http://www.w3.org/1998/Math/MathML'>")
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
end
|
12
|
+
def self.asciimath_to_mathml1(expr)
|
13
|
+
AsciiMath::MathMLBuilder.new(msword: true).append_expression(
|
14
|
+
AsciiMath.parse(HTMLEntities.new.decode(expr)).ast,
|
15
|
+
).to_s
|
16
|
+
.gsub(/<math>/, "<math xmlns='http://www.w3.org/1998/Math/MathML'>")
|
17
|
+
rescue StandardError => e
|
18
|
+
puts "parsing: #{expr}"
|
19
|
+
puts e.message
|
20
|
+
raise e
|
22
21
|
end
|
23
22
|
|
24
23
|
def self.asciimath_to_mathml(doc, delims)
|
25
24
|
return doc if delims.nil? || delims.size < 2
|
25
|
+
|
26
26
|
m = doc.split(/(#{Regexp.escape(delims[0])}|#{Regexp.escape(delims[1])})/)
|
27
27
|
m.each_slice(4).map.with_index do |(*a), i|
|
28
|
-
i
|
29
|
-
warn "MathML #{i} of #{(m.size / 4).floor}"
|
28
|
+
progress_conv(i, 500, (m.size / 4).floor, 1000, "AsciiMath")
|
30
29
|
a[2].nil? || a[2] = asciimath_to_mathml1(a[2])
|
31
30
|
a.size > 1 ? a[0] + a[2] : a[0]
|
32
31
|
end.join
|
33
32
|
end
|
34
33
|
|
34
|
+
def self.progress_conv(idx, step, total, threshold, msg)
|
35
|
+
return unless (idx % step).zero? && total > threshold && idx.positive?
|
36
|
+
|
37
|
+
warn "#{msg} #{idx} of #{total}"
|
38
|
+
end
|
39
|
+
|
35
40
|
def self.unwrap_accents(doc)
|
36
41
|
doc.xpath("//*[@accent = 'true']").each do |x|
|
37
42
|
x.elements.length > 1 or next
|
@@ -42,106 +47,124 @@ module Html2Doc
|
|
42
47
|
end
|
43
48
|
|
44
49
|
# random fixes to MathML input that OOXML needs to render properly
|
45
|
-
def self.ooxml_cleanup(
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
+
def self.ooxml_cleanup(math, docnamespaces)
|
51
|
+
math = unwrap_accents(
|
52
|
+
mathml_preserve_space(
|
53
|
+
mathml_insert_rows(math, docnamespaces), docnamespaces
|
54
|
+
),
|
55
|
+
)
|
56
|
+
math.add_namespace(nil, "http://www.w3.org/1998/Math/MathML")
|
57
|
+
math
|
50
58
|
end
|
51
59
|
|
52
|
-
def self.mathml_insert_rows(
|
53
|
-
|
54
|
-
map { |m| ".//xmlns:#{m}" }.join(" | "), docnamespaces).each do |x|
|
60
|
+
def self.mathml_insert_rows(math, docnamespaces)
|
61
|
+
math.xpath(%w(msup msub msubsup munder mover munderover)
|
62
|
+
.map { |m| ".//xmlns:#{m}" }.join(" | "), docnamespaces).each do |x|
|
55
63
|
next unless x.next_element && x.next_element != "mrow"
|
64
|
+
|
56
65
|
x.next_element.wrap("<mrow/>")
|
57
66
|
end
|
58
|
-
|
67
|
+
math
|
59
68
|
end
|
60
69
|
|
61
|
-
def self.mathml_preserve_space(
|
62
|
-
|
70
|
+
def self.mathml_preserve_space(math, docnamespaces)
|
71
|
+
math.xpath(".//xmlns:mtext", docnamespaces).each do |x|
|
63
72
|
x.children = x.children.to_xml.gsub(/^\s/, " ").gsub(/\s$/, " ")
|
64
73
|
end
|
65
|
-
|
74
|
+
math
|
66
75
|
end
|
67
76
|
|
68
|
-
|
69
|
-
|
70
|
-
|
77
|
+
HTML_NS = 'xmlns="http://www.w3.org/1999/xhtml"'.freeze
|
78
|
+
|
79
|
+
def self.unitalic(math)
|
80
|
+
math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'p']]").each do |x|
|
81
|
+
x.wrap("<span #{HTML_NS} style='font-style:normal;'></span>")
|
71
82
|
end
|
72
|
-
|
73
|
-
x.wrap("<span class='nostem' style='font-weight:bold;'><em></em></span>")
|
83
|
+
math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'bi']]").each do |x|
|
84
|
+
x.wrap("<span #{HTML_NS} class='nostem' style='font-weight:bold;'><em></em></span>")
|
74
85
|
end
|
75
|
-
|
76
|
-
x.wrap("<span class='nostem'><em></em></span>")
|
86
|
+
math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'i']]").each do |x|
|
87
|
+
x.wrap("<span #{HTML_NS} class='nostem'><em></em></span>")
|
77
88
|
end
|
78
|
-
|
79
|
-
x.wrap("<span style='font-style:normal;font-weight:bold;'></span>")
|
89
|
+
math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'b']]").each do |x|
|
90
|
+
x.wrap("<span #{HTML_NS} style='font-style:normal;font-weight:bold;'></span>")
|
80
91
|
end
|
81
|
-
|
82
|
-
|
92
|
+
math.xpath(".//xmlns:r[xmlns:rPr/xmlns:scr[@m:val = 'monospace']]").each do |x|
|
93
|
+
to_plane1(x, :monospace)
|
83
94
|
end
|
84
|
-
|
85
|
-
|
95
|
+
math.xpath(".//xmlns:r[xmlns:rPr/xmlns:scr[@m:val = 'double-struck']]").each do |x|
|
96
|
+
to_plane1(x, :doublestruck)
|
86
97
|
end
|
87
|
-
|
88
|
-
|
98
|
+
math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'script']]").each do |x|
|
99
|
+
to_plane1(x, :script)
|
89
100
|
end
|
90
|
-
|
91
|
-
|
101
|
+
math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'script']]").each do |x|
|
102
|
+
to_plane1(x, :scriptbold)
|
92
103
|
end
|
93
|
-
|
94
|
-
|
104
|
+
math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'fraktur']]").each do |x|
|
105
|
+
to_plane1(x, :fraktur)
|
95
106
|
end
|
96
|
-
|
97
|
-
|
107
|
+
math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'fraktur']]").each do |x|
|
108
|
+
to_plane1(x, :frakturbold)
|
98
109
|
end
|
99
|
-
|
100
|
-
|
110
|
+
math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
|
111
|
+
to_plane1(x, :sans)
|
101
112
|
end
|
102
|
-
|
103
|
-
|
113
|
+
math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
|
114
|
+
to_plane1(x, :sansbold)
|
104
115
|
end
|
105
|
-
|
106
|
-
|
116
|
+
math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'i']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
|
117
|
+
to_plane1(x, :sansitalic)
|
107
118
|
end
|
108
|
-
|
109
|
-
|
119
|
+
math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'bi']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
|
120
|
+
to_plane1(x, :sansbolditalic)
|
110
121
|
end
|
111
|
-
|
122
|
+
math
|
112
123
|
end
|
113
124
|
|
114
|
-
def self.
|
115
|
-
|
125
|
+
def self.to_plane1(xml, font)
|
126
|
+
xml.traverse do |n|
|
116
127
|
next unless n.text?
|
128
|
+
|
117
129
|
n.replace(Plane1Converter.conv(HTMLEntities.new.decode(n.text), font))
|
118
130
|
end
|
119
|
-
|
131
|
+
xml
|
120
132
|
end
|
121
133
|
|
122
134
|
def self.mathml_to_ooml(docxml)
|
123
135
|
docnamespaces = docxml.collect_namespaces
|
124
136
|
m = docxml.xpath("//*[local-name() = 'math']")
|
125
137
|
m.each_with_index do |x, i|
|
126
|
-
i
|
127
|
-
|
128
|
-
element = ooxml_cleanup(x, docnamespaces)
|
129
|
-
doc = Nokogiri::XML::Document::new()
|
130
|
-
doc.root = element
|
131
|
-
ooxml = (unitalic(esc_space(@xsltemplate.transform(doc)))).to_s.
|
132
|
-
gsub(/<\?[^>]+>\s*/, "").
|
133
|
-
gsub(/ xmlns(:[^=]+)?="[^"]+"/, "").
|
134
|
-
gsub(%r{<(/)?(?!span)(?!em)([a-z])}, "<\\1m:\\2")
|
135
|
-
ooxml = uncenter(x, ooxml)
|
136
|
-
x.swap(ooxml)
|
138
|
+
progress_conv(i, 100, m.size, 500, "Math OOXML")
|
139
|
+
mathml_to_ooml1(x, docnamespaces)
|
137
140
|
end
|
138
141
|
end
|
139
142
|
|
140
|
-
#
|
143
|
+
# We need span and em not to be namespaced. Word can't deal with explicit
|
144
|
+
# namespaces.
|
145
|
+
# We will end up stripping them out again under Nokogiri 1.11, which correctly
|
146
|
+
# insists on inheriting namespace from parent.
|
147
|
+
def self.ooml_clean(xml)
|
148
|
+
xml.to_s
|
149
|
+
.gsub(/<\?[^>]+>\s*/, "")
|
150
|
+
.gsub(/ xmlns(:[^=]+)?="[^"]+"/, "")
|
151
|
+
.gsub(%r{<(/)?(?!span)(?!em)([a-z])}, "<\\1m:\\2")
|
152
|
+
end
|
153
|
+
|
154
|
+
def self.mathml_to_ooml1(xml, docnamespaces)
|
155
|
+
doc = Nokogiri::XML::Document::new
|
156
|
+
doc.root = ooxml_cleanup(xml, docnamespaces)
|
157
|
+
ooxml = ooml_clean(unitalic(esc_space(@xsltemplate.transform(doc))))
|
158
|
+
ooxml = uncenter(xml, ooxml)
|
159
|
+
xml.swap(ooxml)
|
160
|
+
end
|
161
|
+
|
162
|
+
# escape space as 2; we are removing any spaces generated by
|
141
163
|
# XML indentation
|
142
164
|
def self.esc_space(xml)
|
143
165
|
xml.traverse do |n|
|
144
166
|
next unless n.text?
|
167
|
+
|
145
168
|
n = n.text.gsub(/ /, "2")
|
146
169
|
end
|
147
170
|
xml
|
@@ -149,17 +172,15 @@ module Html2Doc
|
|
149
172
|
|
150
173
|
# if oomml has no siblings, by default it is centered; override this with
|
151
174
|
# left/right if parent is so tagged
|
152
|
-
def self.uncenter(
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
"m:val='left'/></m:oMathParaPr>#{ooxml}</m:oMathPara>"
|
160
|
-
elsif alignnode.text.include? ("text-align:right")
|
175
|
+
def self.uncenter(math, ooxml)
|
176
|
+
alignnode = math.at(".//ancestor::*[@style][local-name() = 'p' or "\
|
177
|
+
"local-name() = 'div' or local-name() = 'td']/@style")
|
178
|
+
return ooxml unless alignnode && (math.next == nil && math.previous == nil)
|
179
|
+
|
180
|
+
%w(left right).each do |dir|
|
181
|
+
if alignnode.text.include? ("text-align:#{dir}")
|
161
182
|
ooxml = "<m:oMathPara><m:oMathParaPr><m:jc "\
|
162
|
-
"m:val='
|
183
|
+
"m:val='#{dir}'/></m:oMathParaPr>#{ooxml}</m:oMathPara>"
|
163
184
|
end
|
164
185
|
end
|
165
186
|
ooxml
|
data/lib/html2doc/mime.rb
CHANGED
@@ -7,20 +7,20 @@ require "fileutils"
|
|
7
7
|
module Html2Doc
|
8
8
|
def self.mime_preamble(boundary, filename, result)
|
9
9
|
<<~"PREAMBLE"
|
10
|
-
|
11
|
-
|
10
|
+
MIME-Version: 1.0
|
11
|
+
Content-Type: multipart/related; boundary="#{boundary}"
|
12
12
|
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
13
|
+
--#{boundary}
|
14
|
+
Content-ID: <#{File.basename(filename)}>
|
15
|
+
Content-Disposition: inline; filename="#{File.basename(filename)}"
|
16
|
+
Content-Type: text/html; charset="utf-8"
|
17
17
|
|
18
|
-
|
18
|
+
#{result}
|
19
19
|
|
20
20
|
PREAMBLE
|
21
21
|
end
|
22
22
|
|
23
|
-
def self.mime_attachment(boundary,
|
23
|
+
def self.mime_attachment(boundary, _filename, item, dir)
|
24
24
|
content_type = mime_type(item)
|
25
25
|
text_mode = %w[text application].any? { |p| content_type.start_with? p }
|
26
26
|
|
@@ -29,13 +29,13 @@ module Html2Doc
|
|
29
29
|
|
30
30
|
encoded_file = Base64.strict_encode64(content).gsub(/(.{76})/, "\\1\n")
|
31
31
|
<<~"FILE"
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
32
|
+
--#{boundary}
|
33
|
+
Content-ID: <#{File.basename(item)}>
|
34
|
+
Content-Disposition: inline; filename="#{File.basename(item)}"
|
35
|
+
Content-Transfer-Encoding: base64
|
36
|
+
Content-Type: #{content_type}
|
37
37
|
|
38
|
-
|
38
|
+
#{encoded_file}
|
39
39
|
|
40
40
|
FILE
|
41
41
|
end
|
@@ -43,7 +43,7 @@ module Html2Doc
|
|
43
43
|
def self.mime_type(item)
|
44
44
|
types = MIME::Types.type_for(item)
|
45
45
|
type = types ? types.first.to_s : 'text/plain; charset="utf-8"'
|
46
|
-
type = type
|
46
|
+
type = %(#{type} charset="utf-8") if /^text/.match(type) && types
|
47
47
|
type
|
48
48
|
end
|
49
49
|
|
@@ -59,6 +59,7 @@ module Html2Doc
|
|
59
59
|
Dir.foreach(dir) do |item|
|
60
60
|
next if item == "." || item == ".." || /^\./.match(item) ||
|
61
61
|
item == "filelist.xml"
|
62
|
+
|
62
63
|
mhtml += mime_attachment(boundary, "#{filename}.htm", item, dir)
|
63
64
|
end
|
64
65
|
mhtml += "--#{boundary}--"
|
@@ -69,17 +70,21 @@ module Html2Doc
|
|
69
70
|
mhtml.gsub %r{(<img[^>]*?src=")([^\"']+)(['"])}m do |m|
|
70
71
|
repl = "#{$1}cid:#{File.basename($2)}#{$3}"
|
71
72
|
/^data:|^https?:/.match($2) ? m : repl
|
73
|
+
end.gsub %r{(<v:imagedata[^>]*?src=")([^\"']+)(['"])}m do |m|
|
74
|
+
repl = "#{$1}cid:#{File.basename($2)}#{$3}"
|
75
|
+
/^data:|^https?:/.match($2) ? m : repl
|
72
76
|
end
|
73
77
|
end
|
74
78
|
|
75
79
|
# max width for Word document is 400, max height is 680
|
76
|
-
def self.image_resize(
|
77
|
-
|
78
|
-
s = [
|
79
|
-
s =
|
80
|
-
return [nil, nil] if
|
81
|
-
|
82
|
-
s[
|
80
|
+
def self.image_resize(img, path, maxheight, maxwidth)
|
81
|
+
realsize = ImageSize.path(path).size
|
82
|
+
s = [img["width"].to_i, img["height"].to_i]
|
83
|
+
s = realsize if s[0].zero? && s[1].zero?
|
84
|
+
return [nil, nil] if realsize.nil? || realsize[0].nil? || realsize[1].nil?
|
85
|
+
|
86
|
+
s[1] = s[0] * realsize[1] / realsize[0] if s[1].zero? && !s[0].zero?
|
87
|
+
s[0] = s[1] * realsize[0] / realsize[1] if s[0].zero? && !s[1].zero?
|
83
88
|
s = [(s[0] * maxheight / s[1]).ceil, maxheight] if s[1] > maxheight
|
84
89
|
s = [maxwidth, (s[1] * maxwidth / s[0]).ceil] if s[0] > maxwidth
|
85
90
|
s
|
@@ -92,19 +97,22 @@ module Html2Doc
|
|
92
97
|
end
|
93
98
|
|
94
99
|
def self.warnsvg(src)
|
95
|
-
warn "#{src}: SVG not supported" if /\.svg$/i.match(src)
|
100
|
+
warn "#{src}: SVG not supported" if /\.svg$/i.match?(src)
|
101
|
+
end
|
102
|
+
|
103
|
+
def self.localname(src, localdir)
|
104
|
+
%r{^([A-Z]:)?/}.match?(src) ? src : File.join(localdir, src)
|
96
105
|
end
|
97
106
|
|
98
107
|
# only processes locally stored images
|
99
108
|
def self.image_cleanup(docxml, dir, localdir)
|
100
109
|
docxml.traverse do |i|
|
101
110
|
next unless i.element? && %w(img v:imagedata).include?(i.name)
|
102
|
-
|
103
|
-
next if
|
104
|
-
|
105
|
-
local_filename =
|
106
|
-
|
107
|
-
new_filename = "#{mkuuid}#{File.extname(i["src"])}"
|
111
|
+
next if /^http/.match? i["src"]
|
112
|
+
next if %r{^data:(image|application)/[^;]+;base64}.match? i["src"]
|
113
|
+
|
114
|
+
local_filename = localname(i["src"], localdir)
|
115
|
+
new_filename = "#{mkuuid}#{File.extname(i['src'])}"
|
108
116
|
FileUtils.cp local_filename, File.join(dir, new_filename)
|
109
117
|
i["width"], i["height"] = image_resize(i, local_filename, 680, 400)
|
110
118
|
i["src"] = File.join(File.basename(dir), new_filename)
|
@@ -112,7 +120,7 @@ module Html2Doc
|
|
112
120
|
docxml
|
113
121
|
end
|
114
122
|
|
115
|
-
# do not parse the header through Nokogiri, since it will contain
|
123
|
+
# do not parse the header through Nokogiri, since it will contain
|
116
124
|
# non-XML like <![if !supportFootnotes]>
|
117
125
|
def self.header_image_cleanup(doc, dir, filename, localdir)
|
118
126
|
doc.split(%r{(<img [^>]*>|<v:imagedata [^>]*>)}).each_slice(2).map do |a|
|
@@ -120,15 +128,13 @@ module Html2Doc
|
|
120
128
|
end.join
|
121
129
|
end
|
122
130
|
|
123
|
-
def self.header_image_cleanup1(a, dir,
|
131
|
+
def self.header_image_cleanup1(a, dir, _filename, localdir)
|
124
132
|
if a.size == 2 && !(/ src="https?:/.match a[1]) &&
|
125
133
|
!(%r{ src="data:(image|application)/[^;]+;base64}.match a[1])
|
126
134
|
m = / src=['"](?<src>[^"']+)['"]/.match a[1]
|
127
|
-
#warnsvg(m[:src])
|
128
135
|
m2 = /\.(?<suffix>[a-zA-Z_0-9]+)$/.match m[:src]
|
129
136
|
new_filename = "#{mkuuid}.#{m2[:suffix]}"
|
130
|
-
|
131
|
-
FileUtils.cp old_filename, File.join(dir, new_filename)
|
137
|
+
FileUtils.cp localname(m[:src], localdir), File.join(dir, new_filename)
|
132
138
|
a[1].sub!(%r{ src=['"](?<src>[^"']+)['"]}, " src='cid:#{new_filename}'")
|
133
139
|
end
|
134
140
|
a.join
|
@@ -140,6 +146,7 @@ module Html2Doc
|
|
140
146
|
<o:MainFile HRef="../#{filename}.htm"/>}
|
141
147
|
Dir.entries(dir).sort.each do |item|
|
142
148
|
next if item == "." || item == ".." || /^\./.match(item)
|
149
|
+
|
143
150
|
f.write %{ <o:File HRef="#{item}"/>\n}
|
144
151
|
end
|
145
152
|
f.write("</xml>\n")
|
data/lib/html2doc/notes.rb
CHANGED
@@ -6,6 +6,7 @@ module Html2Doc
|
|
6
6
|
fn = []
|
7
7
|
docxml.xpath("//a").each do |a|
|
8
8
|
next unless process_footnote_link(docxml, a, i, fn)
|
9
|
+
|
9
10
|
i += 1
|
10
11
|
end
|
11
12
|
process_footnote_texts(docxml, fn)
|
@@ -22,13 +23,13 @@ module Html2Doc
|
|
22
23
|
footnote_cleanup(docxml)
|
23
24
|
end
|
24
25
|
|
25
|
-
def self.footnote_div_to_p(
|
26
|
-
if %w{div aside}.include?
|
27
|
-
if
|
28
|
-
|
26
|
+
def self.footnote_div_to_p(elem)
|
27
|
+
if %w{div aside}.include? elem.name
|
28
|
+
if elem.at(".//p")
|
29
|
+
elem.replace(elem.children)
|
29
30
|
else
|
30
|
-
|
31
|
-
|
31
|
+
elem.name = "p"
|
32
|
+
elem["class"] = "MsoFootnoteText"
|
32
33
|
end
|
33
34
|
end
|
34
35
|
end
|
@@ -36,34 +37,39 @@ module Html2Doc
|
|
36
37
|
FN = "<span class='MsoFootnoteReference'>"\
|
37
38
|
"<span style='mso-special-character:footnote'/></span>".freeze
|
38
39
|
|
39
|
-
def self.footnote_container(docxml,
|
40
|
-
ref = docxml&.at("//a[@href='#_ftn#{
|
41
|
-
gsub(/>\n</, "><") || FN
|
40
|
+
def self.footnote_container(docxml, idx)
|
41
|
+
ref = docxml&.at("//a[@href='#_ftn#{idx}']")&.children&.to_xml(indent: 0)
|
42
|
+
&.gsub(/>\n</, "><") || FN
|
42
43
|
<<~DIV
|
43
|
-
<div style='mso-element:footnote' id='ftn#{
|
44
|
-
<a style='mso-footnote-id:ftn#{
|
45
|
-
name='_ftnref#{
|
44
|
+
<div style='mso-element:footnote' id='ftn#{idx}'>
|
45
|
+
<a style='mso-footnote-id:ftn#{idx}' href='#_ftn#{idx}'
|
46
|
+
name='_ftnref#{idx}' title='' id='_ftnref#{idx}'>#{ref.strip}</a></div>
|
46
47
|
DIV
|
47
48
|
end
|
48
49
|
|
49
|
-
def self.process_footnote_link(docxml,
|
50
|
-
return false unless footnote?(
|
51
|
-
|
50
|
+
def self.process_footnote_link(docxml, elem, idx, footnote)
|
51
|
+
return false unless footnote?(elem)
|
52
|
+
|
53
|
+
href = elem["href"].gsub(/^#/, "")
|
52
54
|
note = docxml.at("//*[@name = '#{href}' or @id = '#{href}']")
|
53
55
|
return false if note.nil?
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
56
|
+
|
57
|
+
set_footnote_link_attrs(elem, idx)
|
58
|
+
if elem.at("./span[@class = 'MsoFootnoteReference']")
|
59
|
+
process_footnote_link1(elem)
|
60
|
+
else elem.children = FN
|
61
|
+
end
|
62
|
+
footnote << transform_footnote_text(note)
|
63
|
+
end
|
64
|
+
|
65
|
+
def self.process_footnote_link1(elem)
|
66
|
+
elem.children.each do |c|
|
67
|
+
if c.name == "span" && c["class"] == "MsoFootnoteReference"
|
68
|
+
c.replace(FN)
|
69
|
+
else
|
70
|
+
c.wrap("<span class='MsoFootnoteReference'></span>")
|
62
71
|
end
|
63
|
-
else
|
64
|
-
a.children = FN
|
65
72
|
end
|
66
|
-
fn << transform_footnote_text(note)
|
67
73
|
end
|
68
74
|
|
69
75
|
def self.transform_footnote_text(note)
|
@@ -76,16 +82,16 @@ module Html2Doc
|
|
76
82
|
note.remove
|
77
83
|
end
|
78
84
|
|
79
|
-
def self.footnote?(
|
80
|
-
|
81
|
-
|
85
|
+
def self.footnote?(elem)
|
86
|
+
elem["epub:type"]&.casecmp("footnote")&.zero? ||
|
87
|
+
elem["class"]&.casecmp("footnote")&.zero?
|
82
88
|
end
|
83
89
|
|
84
|
-
def self.set_footnote_link_attrs(
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
90
|
+
def self.set_footnote_link_attrs(elem, idx)
|
91
|
+
elem["style"] = "mso-footnote-id:ftn#{idx}"
|
92
|
+
elem["href"] = "#_ftn#{idx}"
|
93
|
+
elem["name"] = "_ftnref#{idx}"
|
94
|
+
elem["title"] = ""
|
89
95
|
end
|
90
96
|
|
91
97
|
# We expect that the content of the footnote text received is one or
|
@@ -94,8 +100,8 @@ module Html2Doc
|
|
94
100
|
# are present in the HTML, they need to have been cleaned out before
|
95
101
|
# passing to this gem
|
96
102
|
def self.footnote_cleanup(docxml)
|
97
|
-
docxml.xpath('//div[@style="mso-element:footnote"]/a')
|
98
|
-
each do |x|
|
103
|
+
docxml.xpath('//div[@style="mso-element:footnote"]/a')
|
104
|
+
.each do |x|
|
99
105
|
n = x.next_element
|
100
106
|
n&.children&.first&.add_previous_sibling(x.remove)
|
101
107
|
end
|
data/lib/html2doc/version.rb
CHANGED
data/lib/html2doc.rb
CHANGED