html2doc 1.1.0 → 1.1.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/rake.yml +1 -11
- data/.hound.yml +3 -1
- data/.rubocop.yml +4 -8
- data/Gemfile +2 -2
- data/Rakefile +1 -1
- data/bin/html2doc +1 -2
- data/bin/rspec +1 -1
- data/html2doc.gemspec +8 -9
- data/lib/html2doc/base.rb +48 -46
- data/lib/html2doc/lists.rb +47 -42
- data/lib/html2doc/math.rb +100 -79
- data/lib/html2doc/mime.rb +41 -34
- data/lib/html2doc/notes.rb +42 -36
- data/lib/html2doc/version.rb +1 -1
- data/lib/html2doc.rb +0 -3
- data/spec/html2doc_spec.rb +566 -521
- metadata +42 -42
data/lib/html2doc/math.rb
CHANGED
@@ -9,29 +9,34 @@ module Html2Doc
|
|
9
9
|
Nokogiri::XSLT(File.read(File.join(File.dirname(__FILE__), "mml2omml.xsl"),
|
10
10
|
encoding: "utf-8"))
|
11
11
|
|
12
|
-
def self.asciimath_to_mathml1(
|
13
|
-
|
14
|
-
AsciiMath
|
15
|
-
|
16
|
-
gsub(/<math>/, "<math xmlns='http://www.w3.org/1998/Math/MathML'>")
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
end
|
12
|
+
def self.asciimath_to_mathml1(expr)
|
13
|
+
AsciiMath::MathMLBuilder.new(msword: true).append_expression(
|
14
|
+
AsciiMath.parse(HTMLEntities.new.decode(expr)).ast,
|
15
|
+
).to_s
|
16
|
+
.gsub(/<math>/, "<math xmlns='http://www.w3.org/1998/Math/MathML'>")
|
17
|
+
rescue StandardError => e
|
18
|
+
puts "parsing: #{expr}"
|
19
|
+
puts e.message
|
20
|
+
raise e
|
22
21
|
end
|
23
22
|
|
24
23
|
def self.asciimath_to_mathml(doc, delims)
|
25
24
|
return doc if delims.nil? || delims.size < 2
|
25
|
+
|
26
26
|
m = doc.split(/(#{Regexp.escape(delims[0])}|#{Regexp.escape(delims[1])})/)
|
27
27
|
m.each_slice(4).map.with_index do |(*a), i|
|
28
|
-
i
|
29
|
-
warn "MathML #{i} of #{(m.size / 4).floor}"
|
28
|
+
progress_conv(i, 500, (m.size / 4).floor, 1000, "AsciiMath")
|
30
29
|
a[2].nil? || a[2] = asciimath_to_mathml1(a[2])
|
31
30
|
a.size > 1 ? a[0] + a[2] : a[0]
|
32
31
|
end.join
|
33
32
|
end
|
34
33
|
|
34
|
+
def self.progress_conv(idx, step, total, threshold, msg)
|
35
|
+
return unless (idx % step).zero? && total > threshold && idx.positive?
|
36
|
+
|
37
|
+
warn "#{msg} #{idx} of #{total}"
|
38
|
+
end
|
39
|
+
|
35
40
|
def self.unwrap_accents(doc)
|
36
41
|
doc.xpath("//*[@accent = 'true']").each do |x|
|
37
42
|
x.elements.length > 1 or next
|
@@ -42,106 +47,124 @@ module Html2Doc
|
|
42
47
|
end
|
43
48
|
|
44
49
|
# random fixes to MathML input that OOXML needs to render properly
|
45
|
-
def self.ooxml_cleanup(
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
+
def self.ooxml_cleanup(math, docnamespaces)
|
51
|
+
math = unwrap_accents(
|
52
|
+
mathml_preserve_space(
|
53
|
+
mathml_insert_rows(math, docnamespaces), docnamespaces
|
54
|
+
),
|
55
|
+
)
|
56
|
+
math.add_namespace(nil, "http://www.w3.org/1998/Math/MathML")
|
57
|
+
math
|
50
58
|
end
|
51
59
|
|
52
|
-
def self.mathml_insert_rows(
|
53
|
-
|
54
|
-
map { |m| ".//xmlns:#{m}" }.join(" | "), docnamespaces).each do |x|
|
60
|
+
def self.mathml_insert_rows(math, docnamespaces)
|
61
|
+
math.xpath(%w(msup msub msubsup munder mover munderover)
|
62
|
+
.map { |m| ".//xmlns:#{m}" }.join(" | "), docnamespaces).each do |x|
|
55
63
|
next unless x.next_element && x.next_element != "mrow"
|
64
|
+
|
56
65
|
x.next_element.wrap("<mrow/>")
|
57
66
|
end
|
58
|
-
|
67
|
+
math
|
59
68
|
end
|
60
69
|
|
61
|
-
def self.mathml_preserve_space(
|
62
|
-
|
70
|
+
def self.mathml_preserve_space(math, docnamespaces)
|
71
|
+
math.xpath(".//xmlns:mtext", docnamespaces).each do |x|
|
63
72
|
x.children = x.children.to_xml.gsub(/^\s/, " ").gsub(/\s$/, " ")
|
64
73
|
end
|
65
|
-
|
74
|
+
math
|
66
75
|
end
|
67
76
|
|
68
|
-
|
69
|
-
|
70
|
-
|
77
|
+
HTML_NS = 'xmlns="http://www.w3.org/1999/xhtml"'.freeze
|
78
|
+
|
79
|
+
def self.unitalic(math)
|
80
|
+
math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'p']]").each do |x|
|
81
|
+
x.wrap("<span #{HTML_NS} style='font-style:normal;'></span>")
|
71
82
|
end
|
72
|
-
|
73
|
-
x.wrap("<span class='nostem' style='font-weight:bold;'><em></em></span>")
|
83
|
+
math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'bi']]").each do |x|
|
84
|
+
x.wrap("<span #{HTML_NS} class='nostem' style='font-weight:bold;'><em></em></span>")
|
74
85
|
end
|
75
|
-
|
76
|
-
x.wrap("<span class='nostem'><em></em></span>")
|
86
|
+
math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'i']]").each do |x|
|
87
|
+
x.wrap("<span #{HTML_NS} class='nostem'><em></em></span>")
|
77
88
|
end
|
78
|
-
|
79
|
-
x.wrap("<span style='font-style:normal;font-weight:bold;'></span>")
|
89
|
+
math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'b']]").each do |x|
|
90
|
+
x.wrap("<span #{HTML_NS} style='font-style:normal;font-weight:bold;'></span>")
|
80
91
|
end
|
81
|
-
|
82
|
-
|
92
|
+
math.xpath(".//xmlns:r[xmlns:rPr/xmlns:scr[@m:val = 'monospace']]").each do |x|
|
93
|
+
to_plane1(x, :monospace)
|
83
94
|
end
|
84
|
-
|
85
|
-
|
95
|
+
math.xpath(".//xmlns:r[xmlns:rPr/xmlns:scr[@m:val = 'double-struck']]").each do |x|
|
96
|
+
to_plane1(x, :doublestruck)
|
86
97
|
end
|
87
|
-
|
88
|
-
|
98
|
+
math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'script']]").each do |x|
|
99
|
+
to_plane1(x, :script)
|
89
100
|
end
|
90
|
-
|
91
|
-
|
101
|
+
math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'script']]").each do |x|
|
102
|
+
to_plane1(x, :scriptbold)
|
92
103
|
end
|
93
|
-
|
94
|
-
|
104
|
+
math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'fraktur']]").each do |x|
|
105
|
+
to_plane1(x, :fraktur)
|
95
106
|
end
|
96
|
-
|
97
|
-
|
107
|
+
math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'fraktur']]").each do |x|
|
108
|
+
to_plane1(x, :frakturbold)
|
98
109
|
end
|
99
|
-
|
100
|
-
|
110
|
+
math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
|
111
|
+
to_plane1(x, :sans)
|
101
112
|
end
|
102
|
-
|
103
|
-
|
113
|
+
math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
|
114
|
+
to_plane1(x, :sansbold)
|
104
115
|
end
|
105
|
-
|
106
|
-
|
116
|
+
math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'i']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
|
117
|
+
to_plane1(x, :sansitalic)
|
107
118
|
end
|
108
|
-
|
109
|
-
|
119
|
+
math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'bi']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
|
120
|
+
to_plane1(x, :sansbolditalic)
|
110
121
|
end
|
111
|
-
|
122
|
+
math
|
112
123
|
end
|
113
124
|
|
114
|
-
def self.
|
115
|
-
|
125
|
+
def self.to_plane1(xml, font)
|
126
|
+
xml.traverse do |n|
|
116
127
|
next unless n.text?
|
128
|
+
|
117
129
|
n.replace(Plane1Converter.conv(HTMLEntities.new.decode(n.text), font))
|
118
130
|
end
|
119
|
-
|
131
|
+
xml
|
120
132
|
end
|
121
133
|
|
122
134
|
def self.mathml_to_ooml(docxml)
|
123
135
|
docnamespaces = docxml.collect_namespaces
|
124
136
|
m = docxml.xpath("//*[local-name() = 'math']")
|
125
137
|
m.each_with_index do |x, i|
|
126
|
-
i
|
127
|
-
|
128
|
-
element = ooxml_cleanup(x, docnamespaces)
|
129
|
-
doc = Nokogiri::XML::Document::new()
|
130
|
-
doc.root = element
|
131
|
-
ooxml = (unitalic(esc_space(@xsltemplate.transform(doc)))).to_s.
|
132
|
-
gsub(/<\?[^>]+>\s*/, "").
|
133
|
-
gsub(/ xmlns(:[^=]+)?="[^"]+"/, "").
|
134
|
-
gsub(%r{<(/)?(?!span)(?!em)([a-z])}, "<\\1m:\\2")
|
135
|
-
ooxml = uncenter(x, ooxml)
|
136
|
-
x.swap(ooxml)
|
138
|
+
progress_conv(i, 100, m.size, 500, "Math OOXML")
|
139
|
+
mathml_to_ooml1(x, docnamespaces)
|
137
140
|
end
|
138
141
|
end
|
139
142
|
|
140
|
-
#
|
143
|
+
# We need span and em not to be namespaced. Word can't deal with explicit
|
144
|
+
# namespaces.
|
145
|
+
# We will end up stripping them out again under Nokogiri 1.11, which correctly
|
146
|
+
# insists on inheriting namespace from parent.
|
147
|
+
def self.ooml_clean(xml)
|
148
|
+
xml.to_s
|
149
|
+
.gsub(/<\?[^>]+>\s*/, "")
|
150
|
+
.gsub(/ xmlns(:[^=]+)?="[^"]+"/, "")
|
151
|
+
.gsub(%r{<(/)?(?!span)(?!em)([a-z])}, "<\\1m:\\2")
|
152
|
+
end
|
153
|
+
|
154
|
+
def self.mathml_to_ooml1(xml, docnamespaces)
|
155
|
+
doc = Nokogiri::XML::Document::new
|
156
|
+
doc.root = ooxml_cleanup(xml, docnamespaces)
|
157
|
+
ooxml = ooml_clean(unitalic(esc_space(@xsltemplate.transform(doc))))
|
158
|
+
ooxml = uncenter(xml, ooxml)
|
159
|
+
xml.swap(ooxml)
|
160
|
+
end
|
161
|
+
|
162
|
+
# escape space as 2; we are removing any spaces generated by
|
141
163
|
# XML indentation
|
142
164
|
def self.esc_space(xml)
|
143
165
|
xml.traverse do |n|
|
144
166
|
next unless n.text?
|
167
|
+
|
145
168
|
n = n.text.gsub(/ /, "2")
|
146
169
|
end
|
147
170
|
xml
|
@@ -149,17 +172,15 @@ module Html2Doc
|
|
149
172
|
|
150
173
|
# if oomml has no siblings, by default it is centered; override this with
|
151
174
|
# left/right if parent is so tagged
|
152
|
-
def self.uncenter(
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
"m:val='left'/></m:oMathParaPr>#{ooxml}</m:oMathPara>"
|
160
|
-
elsif alignnode.text.include? ("text-align:right")
|
175
|
+
def self.uncenter(math, ooxml)
|
176
|
+
alignnode = math.at(".//ancestor::*[@style][local-name() = 'p' or "\
|
177
|
+
"local-name() = 'div' or local-name() = 'td']/@style")
|
178
|
+
return ooxml unless alignnode && (math.next == nil && math.previous == nil)
|
179
|
+
|
180
|
+
%w(left right).each do |dir|
|
181
|
+
if alignnode.text.include? ("text-align:#{dir}")
|
161
182
|
ooxml = "<m:oMathPara><m:oMathParaPr><m:jc "\
|
162
|
-
"m:val='
|
183
|
+
"m:val='#{dir}'/></m:oMathParaPr>#{ooxml}</m:oMathPara>"
|
163
184
|
end
|
164
185
|
end
|
165
186
|
ooxml
|
data/lib/html2doc/mime.rb
CHANGED
@@ -7,20 +7,20 @@ require "fileutils"
|
|
7
7
|
module Html2Doc
|
8
8
|
def self.mime_preamble(boundary, filename, result)
|
9
9
|
<<~"PREAMBLE"
|
10
|
-
|
11
|
-
|
10
|
+
MIME-Version: 1.0
|
11
|
+
Content-Type: multipart/related; boundary="#{boundary}"
|
12
12
|
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
13
|
+
--#{boundary}
|
14
|
+
Content-ID: <#{File.basename(filename)}>
|
15
|
+
Content-Disposition: inline; filename="#{File.basename(filename)}"
|
16
|
+
Content-Type: text/html; charset="utf-8"
|
17
17
|
|
18
|
-
|
18
|
+
#{result}
|
19
19
|
|
20
20
|
PREAMBLE
|
21
21
|
end
|
22
22
|
|
23
|
-
def self.mime_attachment(boundary,
|
23
|
+
def self.mime_attachment(boundary, _filename, item, dir)
|
24
24
|
content_type = mime_type(item)
|
25
25
|
text_mode = %w[text application].any? { |p| content_type.start_with? p }
|
26
26
|
|
@@ -29,13 +29,13 @@ module Html2Doc
|
|
29
29
|
|
30
30
|
encoded_file = Base64.strict_encode64(content).gsub(/(.{76})/, "\\1\n")
|
31
31
|
<<~"FILE"
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
32
|
+
--#{boundary}
|
33
|
+
Content-ID: <#{File.basename(item)}>
|
34
|
+
Content-Disposition: inline; filename="#{File.basename(item)}"
|
35
|
+
Content-Transfer-Encoding: base64
|
36
|
+
Content-Type: #{content_type}
|
37
37
|
|
38
|
-
|
38
|
+
#{encoded_file}
|
39
39
|
|
40
40
|
FILE
|
41
41
|
end
|
@@ -43,7 +43,7 @@ module Html2Doc
|
|
43
43
|
def self.mime_type(item)
|
44
44
|
types = MIME::Types.type_for(item)
|
45
45
|
type = types ? types.first.to_s : 'text/plain; charset="utf-8"'
|
46
|
-
type = type
|
46
|
+
type = %(#{type} charset="utf-8") if /^text/.match(type) && types
|
47
47
|
type
|
48
48
|
end
|
49
49
|
|
@@ -59,6 +59,7 @@ module Html2Doc
|
|
59
59
|
Dir.foreach(dir) do |item|
|
60
60
|
next if item == "." || item == ".." || /^\./.match(item) ||
|
61
61
|
item == "filelist.xml"
|
62
|
+
|
62
63
|
mhtml += mime_attachment(boundary, "#{filename}.htm", item, dir)
|
63
64
|
end
|
64
65
|
mhtml += "--#{boundary}--"
|
@@ -69,17 +70,21 @@ module Html2Doc
|
|
69
70
|
mhtml.gsub %r{(<img[^>]*?src=")([^\"']+)(['"])}m do |m|
|
70
71
|
repl = "#{$1}cid:#{File.basename($2)}#{$3}"
|
71
72
|
/^data:|^https?:/.match($2) ? m : repl
|
73
|
+
end.gsub %r{(<v:imagedata[^>]*?src=")([^\"']+)(['"])}m do |m|
|
74
|
+
repl = "#{$1}cid:#{File.basename($2)}#{$3}"
|
75
|
+
/^data:|^https?:/.match($2) ? m : repl
|
72
76
|
end
|
73
77
|
end
|
74
78
|
|
75
79
|
# max width for Word document is 400, max height is 680
|
76
|
-
def self.image_resize(
|
77
|
-
|
78
|
-
s = [
|
79
|
-
s =
|
80
|
-
return [nil, nil] if
|
81
|
-
|
82
|
-
s[
|
80
|
+
def self.image_resize(img, path, maxheight, maxwidth)
|
81
|
+
realsize = ImageSize.path(path).size
|
82
|
+
s = [img["width"].to_i, img["height"].to_i]
|
83
|
+
s = realsize if s[0].zero? && s[1].zero?
|
84
|
+
return [nil, nil] if realsize.nil? || realsize[0].nil? || realsize[1].nil?
|
85
|
+
|
86
|
+
s[1] = s[0] * realsize[1] / realsize[0] if s[1].zero? && !s[0].zero?
|
87
|
+
s[0] = s[1] * realsize[0] / realsize[1] if s[0].zero? && !s[1].zero?
|
83
88
|
s = [(s[0] * maxheight / s[1]).ceil, maxheight] if s[1] > maxheight
|
84
89
|
s = [maxwidth, (s[1] * maxwidth / s[0]).ceil] if s[0] > maxwidth
|
85
90
|
s
|
@@ -92,19 +97,22 @@ module Html2Doc
|
|
92
97
|
end
|
93
98
|
|
94
99
|
def self.warnsvg(src)
|
95
|
-
warn "#{src}: SVG not supported" if /\.svg$/i.match(src)
|
100
|
+
warn "#{src}: SVG not supported" if /\.svg$/i.match?(src)
|
101
|
+
end
|
102
|
+
|
103
|
+
def self.localname(src, localdir)
|
104
|
+
%r{^([A-Z]:)?/}.match?(src) ? src : File.join(localdir, src)
|
96
105
|
end
|
97
106
|
|
98
107
|
# only processes locally stored images
|
99
108
|
def self.image_cleanup(docxml, dir, localdir)
|
100
109
|
docxml.traverse do |i|
|
101
110
|
next unless i.element? && %w(img v:imagedata).include?(i.name)
|
102
|
-
|
103
|
-
next if
|
104
|
-
|
105
|
-
local_filename =
|
106
|
-
|
107
|
-
new_filename = "#{mkuuid}#{File.extname(i["src"])}"
|
111
|
+
next if /^http/.match? i["src"]
|
112
|
+
next if %r{^data:(image|application)/[^;]+;base64}.match? i["src"]
|
113
|
+
|
114
|
+
local_filename = localname(i["src"], localdir)
|
115
|
+
new_filename = "#{mkuuid}#{File.extname(i['src'])}"
|
108
116
|
FileUtils.cp local_filename, File.join(dir, new_filename)
|
109
117
|
i["width"], i["height"] = image_resize(i, local_filename, 680, 400)
|
110
118
|
i["src"] = File.join(File.basename(dir), new_filename)
|
@@ -112,7 +120,7 @@ module Html2Doc
|
|
112
120
|
docxml
|
113
121
|
end
|
114
122
|
|
115
|
-
# do not parse the header through Nokogiri, since it will contain
|
123
|
+
# do not parse the header through Nokogiri, since it will contain
|
116
124
|
# non-XML like <![if !supportFootnotes]>
|
117
125
|
def self.header_image_cleanup(doc, dir, filename, localdir)
|
118
126
|
doc.split(%r{(<img [^>]*>|<v:imagedata [^>]*>)}).each_slice(2).map do |a|
|
@@ -120,15 +128,13 @@ module Html2Doc
|
|
120
128
|
end.join
|
121
129
|
end
|
122
130
|
|
123
|
-
def self.header_image_cleanup1(a, dir,
|
131
|
+
def self.header_image_cleanup1(a, dir, _filename, localdir)
|
124
132
|
if a.size == 2 && !(/ src="https?:/.match a[1]) &&
|
125
133
|
!(%r{ src="data:(image|application)/[^;]+;base64}.match a[1])
|
126
134
|
m = / src=['"](?<src>[^"']+)['"]/.match a[1]
|
127
|
-
#warnsvg(m[:src])
|
128
135
|
m2 = /\.(?<suffix>[a-zA-Z_0-9]+)$/.match m[:src]
|
129
136
|
new_filename = "#{mkuuid}.#{m2[:suffix]}"
|
130
|
-
|
131
|
-
FileUtils.cp old_filename, File.join(dir, new_filename)
|
137
|
+
FileUtils.cp localname(m[:src], localdir), File.join(dir, new_filename)
|
132
138
|
a[1].sub!(%r{ src=['"](?<src>[^"']+)['"]}, " src='cid:#{new_filename}'")
|
133
139
|
end
|
134
140
|
a.join
|
@@ -140,6 +146,7 @@ module Html2Doc
|
|
140
146
|
<o:MainFile HRef="../#{filename}.htm"/>}
|
141
147
|
Dir.entries(dir).sort.each do |item|
|
142
148
|
next if item == "." || item == ".." || /^\./.match(item)
|
149
|
+
|
143
150
|
f.write %{ <o:File HRef="#{item}"/>\n}
|
144
151
|
end
|
145
152
|
f.write("</xml>\n")
|
data/lib/html2doc/notes.rb
CHANGED
@@ -6,6 +6,7 @@ module Html2Doc
|
|
6
6
|
fn = []
|
7
7
|
docxml.xpath("//a").each do |a|
|
8
8
|
next unless process_footnote_link(docxml, a, i, fn)
|
9
|
+
|
9
10
|
i += 1
|
10
11
|
end
|
11
12
|
process_footnote_texts(docxml, fn)
|
@@ -22,13 +23,13 @@ module Html2Doc
|
|
22
23
|
footnote_cleanup(docxml)
|
23
24
|
end
|
24
25
|
|
25
|
-
def self.footnote_div_to_p(
|
26
|
-
if %w{div aside}.include?
|
27
|
-
if
|
28
|
-
|
26
|
+
def self.footnote_div_to_p(elem)
|
27
|
+
if %w{div aside}.include? elem.name
|
28
|
+
if elem.at(".//p")
|
29
|
+
elem.replace(elem.children)
|
29
30
|
else
|
30
|
-
|
31
|
-
|
31
|
+
elem.name = "p"
|
32
|
+
elem["class"] = "MsoFootnoteText"
|
32
33
|
end
|
33
34
|
end
|
34
35
|
end
|
@@ -36,34 +37,39 @@ module Html2Doc
|
|
36
37
|
FN = "<span class='MsoFootnoteReference'>"\
|
37
38
|
"<span style='mso-special-character:footnote'/></span>".freeze
|
38
39
|
|
39
|
-
def self.footnote_container(docxml,
|
40
|
-
ref = docxml&.at("//a[@href='#_ftn#{
|
41
|
-
gsub(/>\n</, "><") || FN
|
40
|
+
def self.footnote_container(docxml, idx)
|
41
|
+
ref = docxml&.at("//a[@href='#_ftn#{idx}']")&.children&.to_xml(indent: 0)
|
42
|
+
&.gsub(/>\n</, "><") || FN
|
42
43
|
<<~DIV
|
43
|
-
<div style='mso-element:footnote' id='ftn#{
|
44
|
-
<a style='mso-footnote-id:ftn#{
|
45
|
-
name='_ftnref#{
|
44
|
+
<div style='mso-element:footnote' id='ftn#{idx}'>
|
45
|
+
<a style='mso-footnote-id:ftn#{idx}' href='#_ftn#{idx}'
|
46
|
+
name='_ftnref#{idx}' title='' id='_ftnref#{idx}'>#{ref.strip}</a></div>
|
46
47
|
DIV
|
47
48
|
end
|
48
49
|
|
49
|
-
def self.process_footnote_link(docxml,
|
50
|
-
return false unless footnote?(
|
51
|
-
|
50
|
+
def self.process_footnote_link(docxml, elem, idx, footnote)
|
51
|
+
return false unless footnote?(elem)
|
52
|
+
|
53
|
+
href = elem["href"].gsub(/^#/, "")
|
52
54
|
note = docxml.at("//*[@name = '#{href}' or @id = '#{href}']")
|
53
55
|
return false if note.nil?
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
56
|
+
|
57
|
+
set_footnote_link_attrs(elem, idx)
|
58
|
+
if elem.at("./span[@class = 'MsoFootnoteReference']")
|
59
|
+
process_footnote_link1(elem)
|
60
|
+
else elem.children = FN
|
61
|
+
end
|
62
|
+
footnote << transform_footnote_text(note)
|
63
|
+
end
|
64
|
+
|
65
|
+
def self.process_footnote_link1(elem)
|
66
|
+
elem.children.each do |c|
|
67
|
+
if c.name == "span" && c["class"] == "MsoFootnoteReference"
|
68
|
+
c.replace(FN)
|
69
|
+
else
|
70
|
+
c.wrap("<span class='MsoFootnoteReference'></span>")
|
62
71
|
end
|
63
|
-
else
|
64
|
-
a.children = FN
|
65
72
|
end
|
66
|
-
fn << transform_footnote_text(note)
|
67
73
|
end
|
68
74
|
|
69
75
|
def self.transform_footnote_text(note)
|
@@ -76,16 +82,16 @@ module Html2Doc
|
|
76
82
|
note.remove
|
77
83
|
end
|
78
84
|
|
79
|
-
def self.footnote?(
|
80
|
-
|
81
|
-
|
85
|
+
def self.footnote?(elem)
|
86
|
+
elem["epub:type"]&.casecmp("footnote")&.zero? ||
|
87
|
+
elem["class"]&.casecmp("footnote")&.zero?
|
82
88
|
end
|
83
89
|
|
84
|
-
def self.set_footnote_link_attrs(
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
90
|
+
def self.set_footnote_link_attrs(elem, idx)
|
91
|
+
elem["style"] = "mso-footnote-id:ftn#{idx}"
|
92
|
+
elem["href"] = "#_ftn#{idx}"
|
93
|
+
elem["name"] = "_ftnref#{idx}"
|
94
|
+
elem["title"] = ""
|
89
95
|
end
|
90
96
|
|
91
97
|
# We expect that the content of the footnote text received is one or
|
@@ -94,8 +100,8 @@ module Html2Doc
|
|
94
100
|
# are present in the HTML, they need to have been cleaned out before
|
95
101
|
# passing to this gem
|
96
102
|
def self.footnote_cleanup(docxml)
|
97
|
-
docxml.xpath('//div[@style="mso-element:footnote"]/a')
|
98
|
-
each do |x|
|
103
|
+
docxml.xpath('//div[@style="mso-element:footnote"]/a')
|
104
|
+
.each do |x|
|
99
105
|
n = x.next_element
|
100
106
|
n&.children&.first&.add_previous_sibling(x.remove)
|
101
107
|
end
|
data/lib/html2doc/version.rb
CHANGED
data/lib/html2doc.rb
CHANGED