html2doc 1.0.7 → 1.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/rake.yml +6 -37
- data/.gitignore +2 -0
- data/.hound.yml +3 -1
- data/.rubocop.yml +7 -7
- data/Gemfile +2 -2
- data/Rakefile +1 -1
- data/bin/html2doc +1 -2
- data/bin/rspec +1 -1
- data/html2doc.gemspec +8 -9
- data/lib/html2doc.rb +0 -3
- data/lib/html2doc/base.rb +58 -47
- data/lib/html2doc/lists.rb +47 -42
- data/lib/html2doc/math.rb +100 -73
- data/lib/html2doc/mime.rb +53 -37
- data/lib/html2doc/notes.rb +42 -36
- data/lib/html2doc/version.rb +1 -1
- data/spec/html2doc_spec.rb +575 -517
- metadata +44 -46
- data/.rubocop.ribose.yml +0 -65
- data/.rubocop.tb.yml +0 -650
data/lib/html2doc/math.rb
CHANGED
@@ -9,23 +9,34 @@ module Html2Doc
|
|
9
9
|
Nokogiri::XSLT(File.read(File.join(File.dirname(__FILE__), "mml2omml.xsl"),
|
10
10
|
encoding: "utf-8"))
|
11
11
|
|
12
|
-
def self.asciimath_to_mathml1(
|
13
|
-
AsciiMath::MathMLBuilder.new(:
|
14
|
-
AsciiMath.parse(HTMLEntities.new.decode(
|
15
|
-
|
12
|
+
def self.asciimath_to_mathml1(expr)
|
13
|
+
AsciiMath::MathMLBuilder.new(msword: true).append_expression(
|
14
|
+
AsciiMath.parse(HTMLEntities.new.decode(expr)).ast,
|
15
|
+
).to_s
|
16
|
+
.gsub(/<math>/, "<math xmlns='http://www.w3.org/1998/Math/MathML'>")
|
17
|
+
rescue StandardError => e
|
18
|
+
puts "parsing: #{expr}"
|
19
|
+
puts e.message
|
20
|
+
raise e
|
16
21
|
end
|
17
22
|
|
18
23
|
def self.asciimath_to_mathml(doc, delims)
|
19
24
|
return doc if delims.nil? || delims.size < 2
|
25
|
+
|
20
26
|
m = doc.split(/(#{Regexp.escape(delims[0])}|#{Regexp.escape(delims[1])})/)
|
21
27
|
m.each_slice(4).map.with_index do |(*a), i|
|
22
|
-
i
|
23
|
-
warn "MathML #{i} of #{(m.size / 4).floor}"
|
28
|
+
progress_conv(i, 500, (m.size / 4).floor, 1000, "AsciiMath")
|
24
29
|
a[2].nil? || a[2] = asciimath_to_mathml1(a[2])
|
25
30
|
a.size > 1 ? a[0] + a[2] : a[0]
|
26
31
|
end.join
|
27
32
|
end
|
28
33
|
|
34
|
+
def self.progress_conv(idx, step, total, threshold, msg)
|
35
|
+
return unless (idx % step).zero? && total > threshold && idx.positive?
|
36
|
+
|
37
|
+
warn "#{msg} #{idx} of #{total}"
|
38
|
+
end
|
39
|
+
|
29
40
|
def self.unwrap_accents(doc)
|
30
41
|
doc.xpath("//*[@accent = 'true']").each do |x|
|
31
42
|
x.elements.length > 1 or next
|
@@ -36,106 +47,124 @@ module Html2Doc
|
|
36
47
|
end
|
37
48
|
|
38
49
|
# random fixes to MathML input that OOXML needs to render properly
|
39
|
-
def self.ooxml_cleanup(
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
50
|
+
def self.ooxml_cleanup(math, docnamespaces)
|
51
|
+
math = unwrap_accents(
|
52
|
+
mathml_preserve_space(
|
53
|
+
mathml_insert_rows(math, docnamespaces), docnamespaces
|
54
|
+
),
|
55
|
+
)
|
56
|
+
math.add_namespace(nil, "http://www.w3.org/1998/Math/MathML")
|
57
|
+
math
|
44
58
|
end
|
45
59
|
|
46
|
-
def self.mathml_insert_rows(
|
47
|
-
|
48
|
-
map { |m| ".//xmlns:#{m}" }.join(" | "), docnamespaces).each do |x|
|
60
|
+
def self.mathml_insert_rows(math, docnamespaces)
|
61
|
+
math.xpath(%w(msup msub msubsup munder mover munderover)
|
62
|
+
.map { |m| ".//xmlns:#{m}" }.join(" | "), docnamespaces).each do |x|
|
49
63
|
next unless x.next_element && x.next_element != "mrow"
|
64
|
+
|
50
65
|
x.next_element.wrap("<mrow/>")
|
51
66
|
end
|
52
|
-
|
67
|
+
math
|
53
68
|
end
|
54
69
|
|
55
|
-
def self.mathml_preserve_space(
|
56
|
-
|
70
|
+
def self.mathml_preserve_space(math, docnamespaces)
|
71
|
+
math.xpath(".//xmlns:mtext", docnamespaces).each do |x|
|
57
72
|
x.children = x.children.to_xml.gsub(/^\s/, " ").gsub(/\s$/, " ")
|
58
73
|
end
|
59
|
-
|
74
|
+
math
|
60
75
|
end
|
61
76
|
|
62
|
-
|
63
|
-
|
64
|
-
|
77
|
+
HTML_NS = 'xmlns="http://www.w3.org/1999/xhtml"'.freeze
|
78
|
+
|
79
|
+
def self.unitalic(math)
|
80
|
+
math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'p']]").each do |x|
|
81
|
+
x.wrap("<span #{HTML_NS} style='font-style:normal;'></span>")
|
65
82
|
end
|
66
|
-
|
67
|
-
x.wrap("<span class='nostem' style='font-weight:bold;'><em></em></span>")
|
83
|
+
math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'bi']]").each do |x|
|
84
|
+
x.wrap("<span #{HTML_NS} class='nostem' style='font-weight:bold;'><em></em></span>")
|
68
85
|
end
|
69
|
-
|
70
|
-
x.wrap("<span class='nostem'><em></em></span>")
|
86
|
+
math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'i']]").each do |x|
|
87
|
+
x.wrap("<span #{HTML_NS} class='nostem'><em></em></span>")
|
71
88
|
end
|
72
|
-
|
73
|
-
x.wrap("<span style='font-style:normal;font-weight:bold;'></span>")
|
89
|
+
math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'b']]").each do |x|
|
90
|
+
x.wrap("<span #{HTML_NS} style='font-style:normal;font-weight:bold;'></span>")
|
74
91
|
end
|
75
|
-
|
76
|
-
|
92
|
+
math.xpath(".//xmlns:r[xmlns:rPr/xmlns:scr[@m:val = 'monospace']]").each do |x|
|
93
|
+
to_plane1(x, :monospace)
|
77
94
|
end
|
78
|
-
|
79
|
-
|
95
|
+
math.xpath(".//xmlns:r[xmlns:rPr/xmlns:scr[@m:val = 'double-struck']]").each do |x|
|
96
|
+
to_plane1(x, :doublestruck)
|
80
97
|
end
|
81
|
-
|
82
|
-
|
98
|
+
math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'script']]").each do |x|
|
99
|
+
to_plane1(x, :script)
|
83
100
|
end
|
84
|
-
|
85
|
-
|
101
|
+
math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'script']]").each do |x|
|
102
|
+
to_plane1(x, :scriptbold)
|
86
103
|
end
|
87
|
-
|
88
|
-
|
104
|
+
math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'fraktur']]").each do |x|
|
105
|
+
to_plane1(x, :fraktur)
|
89
106
|
end
|
90
|
-
|
91
|
-
|
107
|
+
math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'fraktur']]").each do |x|
|
108
|
+
to_plane1(x, :frakturbold)
|
92
109
|
end
|
93
|
-
|
94
|
-
|
110
|
+
math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
|
111
|
+
to_plane1(x, :sans)
|
95
112
|
end
|
96
|
-
|
97
|
-
|
113
|
+
math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
|
114
|
+
to_plane1(x, :sansbold)
|
98
115
|
end
|
99
|
-
|
100
|
-
|
116
|
+
math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'i']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
|
117
|
+
to_plane1(x, :sansitalic)
|
101
118
|
end
|
102
|
-
|
103
|
-
|
119
|
+
math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'bi']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
|
120
|
+
to_plane1(x, :sansbolditalic)
|
104
121
|
end
|
105
|
-
|
122
|
+
math
|
106
123
|
end
|
107
124
|
|
108
|
-
def self.
|
109
|
-
|
125
|
+
def self.to_plane1(xml, font)
|
126
|
+
xml.traverse do |n|
|
110
127
|
next unless n.text?
|
128
|
+
|
111
129
|
n.replace(Plane1Converter.conv(HTMLEntities.new.decode(n.text), font))
|
112
130
|
end
|
113
|
-
|
131
|
+
xml
|
114
132
|
end
|
115
133
|
|
116
134
|
def self.mathml_to_ooml(docxml)
|
117
135
|
docnamespaces = docxml.collect_namespaces
|
118
136
|
m = docxml.xpath("//*[local-name() = 'math']")
|
119
137
|
m.each_with_index do |x, i|
|
120
|
-
i
|
121
|
-
|
122
|
-
element = ooxml_cleanup(x, docnamespaces)
|
123
|
-
doc = Nokogiri::XML::Document::new()
|
124
|
-
doc.root = element
|
125
|
-
ooxml = (unitalic(esc_space(@xsltemplate.transform(doc)))).to_s.
|
126
|
-
gsub(/<\?[^>]+>\s*/, "").
|
127
|
-
gsub(/ xmlns(:[^=]+)?="[^"]+"/, "").
|
128
|
-
gsub(%r{<(/)?(?!span)(?!em)([a-z])}, "<\\1m:\\2")
|
129
|
-
ooxml = uncenter(x, ooxml)
|
130
|
-
x.swap(ooxml)
|
138
|
+
progress_conv(i, 100, m.size, 500, "Math OOXML")
|
139
|
+
mathml_to_ooml1(x, docnamespaces)
|
131
140
|
end
|
132
141
|
end
|
133
142
|
|
134
|
-
#
|
143
|
+
# We need span and em not to be namespaced. Word can't deal with explicit
|
144
|
+
# namespaces.
|
145
|
+
# We will end up stripping them out again under Nokogiri 1.11, which correctly
|
146
|
+
# insists on inheriting namespace from parent.
|
147
|
+
def self.ooml_clean(xml)
|
148
|
+
xml.to_s
|
149
|
+
.gsub(/<\?[^>]+>\s*/, "")
|
150
|
+
.gsub(/ xmlns(:[^=]+)?="[^"]+"/, "")
|
151
|
+
.gsub(%r{<(/)?(?!span)(?!em)([a-z])}, "<\\1m:\\2")
|
152
|
+
end
|
153
|
+
|
154
|
+
def self.mathml_to_ooml1(xml, docnamespaces)
|
155
|
+
doc = Nokogiri::XML::Document::new
|
156
|
+
doc.root = ooxml_cleanup(xml, docnamespaces)
|
157
|
+
ooxml = ooml_clean(unitalic(esc_space(@xsltemplate.transform(doc))))
|
158
|
+
ooxml = uncenter(xml, ooxml)
|
159
|
+
xml.swap(ooxml)
|
160
|
+
end
|
161
|
+
|
162
|
+
# escape space as 2; we are removing any spaces generated by
|
135
163
|
# XML indentation
|
136
164
|
def self.esc_space(xml)
|
137
165
|
xml.traverse do |n|
|
138
166
|
next unless n.text?
|
167
|
+
|
139
168
|
n = n.text.gsub(/ /, "2")
|
140
169
|
end
|
141
170
|
xml
|
@@ -143,17 +172,15 @@ module Html2Doc
|
|
143
172
|
|
144
173
|
# if oomml has no siblings, by default it is centered; override this with
|
145
174
|
# left/right if parent is so tagged
|
146
|
-
def self.uncenter(
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
"m:val='left'/></m:oMathParaPr>#{ooxml}</m:oMathPara>"
|
154
|
-
elsif alignnode.text.include? ("text-align:right")
|
175
|
+
def self.uncenter(math, ooxml)
|
176
|
+
alignnode = math.at(".//ancestor::*[@style][local-name() = 'p' or "\
|
177
|
+
"local-name() = 'div' or local-name() = 'td']/@style")
|
178
|
+
return ooxml unless alignnode && (math.next == nil && math.previous == nil)
|
179
|
+
|
180
|
+
%w(left right).each do |dir|
|
181
|
+
if alignnode.text.include? ("text-align:#{dir}")
|
155
182
|
ooxml = "<m:oMathPara><m:oMathParaPr><m:jc "\
|
156
|
-
"m:val='
|
183
|
+
"m:val='#{dir}'/></m:oMathParaPr>#{ooxml}</m:oMathPara>"
|
157
184
|
end
|
158
185
|
end
|
159
186
|
ooxml
|
data/lib/html2doc/mime.rb
CHANGED
@@ -7,19 +7,20 @@ require "fileutils"
|
|
7
7
|
module Html2Doc
|
8
8
|
def self.mime_preamble(boundary, filename, result)
|
9
9
|
<<~"PREAMBLE"
|
10
|
-
|
11
|
-
|
10
|
+
MIME-Version: 1.0
|
11
|
+
Content-Type: multipart/related; boundary="#{boundary}"
|
12
12
|
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
--#{boundary}
|
14
|
+
Content-ID: <#{File.basename(filename)}>
|
15
|
+
Content-Disposition: inline; filename="#{File.basename(filename)}"
|
16
|
+
Content-Type: text/html; charset="utf-8"
|
16
17
|
|
17
|
-
|
18
|
+
#{result}
|
18
19
|
|
19
20
|
PREAMBLE
|
20
21
|
end
|
21
22
|
|
22
|
-
def self.mime_attachment(boundary,
|
23
|
+
def self.mime_attachment(boundary, _filename, item, dir)
|
23
24
|
content_type = mime_type(item)
|
24
25
|
text_mode = %w[text application].any? { |p| content_type.start_with? p }
|
25
26
|
|
@@ -28,12 +29,13 @@ module Html2Doc
|
|
28
29
|
|
29
30
|
encoded_file = Base64.strict_encode64(content).gsub(/(.{76})/, "\\1\n")
|
30
31
|
<<~"FILE"
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
32
|
+
--#{boundary}
|
33
|
+
Content-ID: <#{File.basename(item)}>
|
34
|
+
Content-Disposition: inline; filename="#{File.basename(item)}"
|
35
|
+
Content-Transfer-Encoding: base64
|
36
|
+
Content-Type: #{content_type}
|
35
37
|
|
36
|
-
|
38
|
+
#{encoded_file}
|
37
39
|
|
38
40
|
FILE
|
39
41
|
end
|
@@ -41,7 +43,7 @@ module Html2Doc
|
|
41
43
|
def self.mime_type(item)
|
42
44
|
types = MIME::Types.type_for(item)
|
43
45
|
type = types ? types.first.to_s : 'text/plain; charset="utf-8"'
|
44
|
-
type = type
|
46
|
+
type = %(#{type} charset="utf-8") if /^text/.match(type) && types
|
45
47
|
type
|
46
48
|
end
|
47
49
|
|
@@ -52,25 +54,37 @@ module Html2Doc
|
|
52
54
|
|
53
55
|
def self.mime_package(result, filename, dir)
|
54
56
|
boundary = mime_boundary
|
55
|
-
mhtml = mime_preamble(boundary, filename, result)
|
56
|
-
mhtml += mime_attachment(boundary, filename, "filelist.xml", dir)
|
57
|
+
mhtml = mime_preamble(boundary, "#{filename}.htm", result)
|
58
|
+
mhtml += mime_attachment(boundary, "#{filename}.htm", "filelist.xml", dir)
|
57
59
|
Dir.foreach(dir) do |item|
|
58
60
|
next if item == "." || item == ".." || /^\./.match(item) ||
|
59
61
|
item == "filelist.xml"
|
60
|
-
|
62
|
+
|
63
|
+
mhtml += mime_attachment(boundary, "#{filename}.htm", item, dir)
|
61
64
|
end
|
62
65
|
mhtml += "--#{boundary}--"
|
63
|
-
File.open("#{filename}.doc", "w:UTF-8") { |f| f.write mhtml }
|
66
|
+
File.open("#{filename}.doc", "w:UTF-8") { |f| f.write contentid(mhtml) }
|
67
|
+
end
|
68
|
+
|
69
|
+
def self.contentid(mhtml)
|
70
|
+
mhtml.gsub %r{(<img[^>]*?src=")([^\"']+)(['"])}m do |m|
|
71
|
+
repl = "#{$1}cid:#{File.basename($2)}#{$3}"
|
72
|
+
/^data:|^https?:/.match($2) ? m : repl
|
73
|
+
end.gsub %r{(<v:imagedata[^>]*?src=")([^\"']+)(['"])}m do |m|
|
74
|
+
repl = "#{$1}cid:#{File.basename($2)}#{$3}"
|
75
|
+
/^data:|^https?:/.match($2) ? m : repl
|
76
|
+
end
|
64
77
|
end
|
65
78
|
|
66
79
|
# max width for Word document is 400, max height is 680
|
67
|
-
def self.image_resize(
|
68
|
-
|
69
|
-
s = [
|
70
|
-
s =
|
71
|
-
return [nil, nil] if
|
72
|
-
|
73
|
-
s[
|
80
|
+
def self.image_resize(img, path, maxheight, maxwidth)
|
81
|
+
realsize = ImageSize.path(path).size
|
82
|
+
s = [img["width"].to_i, img["height"].to_i]
|
83
|
+
s = realsize if s[0].zero? && s[1].zero?
|
84
|
+
return [nil, nil] if realsize.nil? || realsize[0].nil? || realsize[1].nil?
|
85
|
+
|
86
|
+
s[1] = s[0] * realsize[1] / realsize[0] if s[1].zero? && !s[0].zero?
|
87
|
+
s[0] = s[1] * realsize[0] / realsize[1] if s[0].zero? && !s[1].zero?
|
74
88
|
s = [(s[0] * maxheight / s[1]).ceil, maxheight] if s[1] > maxheight
|
75
89
|
s = [maxwidth, (s[1] * maxwidth / s[0]).ceil] if s[0] > maxwidth
|
76
90
|
s
|
@@ -83,19 +97,22 @@ module Html2Doc
|
|
83
97
|
end
|
84
98
|
|
85
99
|
def self.warnsvg(src)
|
86
|
-
warn "#{src}: SVG not supported" if /\.svg$/i.match(src)
|
100
|
+
warn "#{src}: SVG not supported" if /\.svg$/i.match?(src)
|
101
|
+
end
|
102
|
+
|
103
|
+
def self.localname(src, localdir)
|
104
|
+
%r{^([A-Z]:)?/}.match?(src) ? src : File.join(localdir, src)
|
87
105
|
end
|
88
106
|
|
89
107
|
# only processes locally stored images
|
90
108
|
def self.image_cleanup(docxml, dir, localdir)
|
91
109
|
docxml.traverse do |i|
|
92
110
|
next unless i.element? && %w(img v:imagedata).include?(i.name)
|
93
|
-
|
94
|
-
next if
|
95
|
-
|
96
|
-
local_filename =
|
97
|
-
|
98
|
-
new_filename = "#{mkuuid}#{File.extname(i["src"])}"
|
111
|
+
next if /^http/.match? i["src"]
|
112
|
+
next if %r{^data:(image|application)/[^;]+;base64}.match? i["src"]
|
113
|
+
|
114
|
+
local_filename = localname(i["src"], localdir)
|
115
|
+
new_filename = "#{mkuuid}#{File.extname(i['src'])}"
|
99
116
|
FileUtils.cp local_filename, File.join(dir, new_filename)
|
100
117
|
i["width"], i["height"] = image_resize(i, local_filename, 680, 400)
|
101
118
|
i["src"] = File.join(File.basename(dir), new_filename)
|
@@ -103,7 +120,7 @@ module Html2Doc
|
|
103
120
|
docxml
|
104
121
|
end
|
105
122
|
|
106
|
-
# do not parse the header through Nokogiri, since it will contain
|
123
|
+
# do not parse the header through Nokogiri, since it will contain
|
107
124
|
# non-XML like <![if !supportFootnotes]>
|
108
125
|
def self.header_image_cleanup(doc, dir, filename, localdir)
|
109
126
|
doc.split(%r{(<img [^>]*>|<v:imagedata [^>]*>)}).each_slice(2).map do |a|
|
@@ -111,16 +128,14 @@ module Html2Doc
|
|
111
128
|
end.join
|
112
129
|
end
|
113
130
|
|
114
|
-
def self.header_image_cleanup1(a, dir,
|
131
|
+
def self.header_image_cleanup1(a, dir, _filename, localdir)
|
115
132
|
if a.size == 2 && !(/ src="https?:/.match a[1]) &&
|
116
133
|
!(%r{ src="data:(image|application)/[^;]+;base64}.match a[1])
|
117
134
|
m = / src=['"](?<src>[^"']+)['"]/.match a[1]
|
118
|
-
#warnsvg(m[:src])
|
119
135
|
m2 = /\.(?<suffix>[a-zA-Z_0-9]+)$/.match m[:src]
|
120
136
|
new_filename = "#{mkuuid}.#{m2[:suffix]}"
|
121
|
-
|
122
|
-
|
123
|
-
a[1].sub!(%r{ src=['"](?<src>[^"']+)['"]}, " src='file:///C:/Doc/#{filename}_files/#{new_filename}'")
|
137
|
+
FileUtils.cp localname(m[:src], localdir), File.join(dir, new_filename)
|
138
|
+
a[1].sub!(%r{ src=['"](?<src>[^"']+)['"]}, " src='cid:#{new_filename}'")
|
124
139
|
end
|
125
140
|
a.join
|
126
141
|
end
|
@@ -131,6 +146,7 @@ module Html2Doc
|
|
131
146
|
<o:MainFile HRef="../#{filename}.htm"/>}
|
132
147
|
Dir.entries(dir).sort.each do |item|
|
133
148
|
next if item == "." || item == ".." || /^\./.match(item)
|
149
|
+
|
134
150
|
f.write %{ <o:File HRef="#{item}"/>\n}
|
135
151
|
end
|
136
152
|
f.write("</xml>\n")
|
data/lib/html2doc/notes.rb
CHANGED
@@ -6,6 +6,7 @@ module Html2Doc
|
|
6
6
|
fn = []
|
7
7
|
docxml.xpath("//a").each do |a|
|
8
8
|
next unless process_footnote_link(docxml, a, i, fn)
|
9
|
+
|
9
10
|
i += 1
|
10
11
|
end
|
11
12
|
process_footnote_texts(docxml, fn)
|
@@ -22,13 +23,13 @@ module Html2Doc
|
|
22
23
|
footnote_cleanup(docxml)
|
23
24
|
end
|
24
25
|
|
25
|
-
def self.footnote_div_to_p(
|
26
|
-
if %w{div aside}.include?
|
27
|
-
if
|
28
|
-
|
26
|
+
def self.footnote_div_to_p(elem)
|
27
|
+
if %w{div aside}.include? elem.name
|
28
|
+
if elem.at(".//p")
|
29
|
+
elem.replace(elem.children)
|
29
30
|
else
|
30
|
-
|
31
|
-
|
31
|
+
elem.name = "p"
|
32
|
+
elem["class"] = "MsoFootnoteText"
|
32
33
|
end
|
33
34
|
end
|
34
35
|
end
|
@@ -36,34 +37,39 @@ module Html2Doc
|
|
36
37
|
FN = "<span class='MsoFootnoteReference'>"\
|
37
38
|
"<span style='mso-special-character:footnote'/></span>".freeze
|
38
39
|
|
39
|
-
def self.footnote_container(docxml,
|
40
|
-
ref = docxml&.at("//a[@href='#_ftn#{
|
41
|
-
gsub(/>\n</, "><") || FN
|
40
|
+
def self.footnote_container(docxml, idx)
|
41
|
+
ref = docxml&.at("//a[@href='#_ftn#{idx}']")&.children&.to_xml(indent: 0)
|
42
|
+
&.gsub(/>\n</, "><") || FN
|
42
43
|
<<~DIV
|
43
|
-
<div style='mso-element:footnote' id='ftn#{
|
44
|
-
<a style='mso-footnote-id:ftn#{
|
45
|
-
name='_ftnref#{
|
44
|
+
<div style='mso-element:footnote' id='ftn#{idx}'>
|
45
|
+
<a style='mso-footnote-id:ftn#{idx}' href='#_ftn#{idx}'
|
46
|
+
name='_ftnref#{idx}' title='' id='_ftnref#{idx}'>#{ref.strip}</a></div>
|
46
47
|
DIV
|
47
48
|
end
|
48
49
|
|
49
|
-
def self.process_footnote_link(docxml,
|
50
|
-
return false unless footnote?(
|
51
|
-
|
50
|
+
def self.process_footnote_link(docxml, elem, idx, footnote)
|
51
|
+
return false unless footnote?(elem)
|
52
|
+
|
53
|
+
href = elem["href"].gsub(/^#/, "")
|
52
54
|
note = docxml.at("//*[@name = '#{href}' or @id = '#{href}']")
|
53
55
|
return false if note.nil?
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
56
|
+
|
57
|
+
set_footnote_link_attrs(elem, idx)
|
58
|
+
if elem.at("./span[@class = 'MsoFootnoteReference']")
|
59
|
+
process_footnote_link1(elem)
|
60
|
+
else elem.children = FN
|
61
|
+
end
|
62
|
+
footnote << transform_footnote_text(note)
|
63
|
+
end
|
64
|
+
|
65
|
+
def self.process_footnote_link1(elem)
|
66
|
+
elem.children.each do |c|
|
67
|
+
if c.name == "span" && c["class"] == "MsoFootnoteReference"
|
68
|
+
c.replace(FN)
|
69
|
+
else
|
70
|
+
c.wrap("<span class='MsoFootnoteReference'></span>")
|
62
71
|
end
|
63
|
-
else
|
64
|
-
a.children = FN
|
65
72
|
end
|
66
|
-
fn << transform_footnote_text(note)
|
67
73
|
end
|
68
74
|
|
69
75
|
def self.transform_footnote_text(note)
|
@@ -76,16 +82,16 @@ module Html2Doc
|
|
76
82
|
note.remove
|
77
83
|
end
|
78
84
|
|
79
|
-
def self.footnote?(
|
80
|
-
|
81
|
-
|
85
|
+
def self.footnote?(elem)
|
86
|
+
elem["epub:type"]&.casecmp("footnote")&.zero? ||
|
87
|
+
elem["class"]&.casecmp("footnote")&.zero?
|
82
88
|
end
|
83
89
|
|
84
|
-
def self.set_footnote_link_attrs(
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
90
|
+
def self.set_footnote_link_attrs(elem, idx)
|
91
|
+
elem["style"] = "mso-footnote-id:ftn#{idx}"
|
92
|
+
elem["href"] = "#_ftn#{idx}"
|
93
|
+
elem["name"] = "_ftnref#{idx}"
|
94
|
+
elem["title"] = ""
|
89
95
|
end
|
90
96
|
|
91
97
|
# We expect that the content of the footnote text received is one or
|
@@ -94,8 +100,8 @@ module Html2Doc
|
|
94
100
|
# are present in the HTML, they need to have been cleaned out before
|
95
101
|
# passing to this gem
|
96
102
|
def self.footnote_cleanup(docxml)
|
97
|
-
docxml.xpath('//div[@style="mso-element:footnote"]/a')
|
98
|
-
each do |x|
|
103
|
+
docxml.xpath('//div[@style="mso-element:footnote"]/a')
|
104
|
+
.each do |x|
|
99
105
|
n = x.next_element
|
100
106
|
n&.children&.first&.add_previous_sibling(x.remove)
|
101
107
|
end
|