html2doc 1.3.0.1 → 1.4.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.adoc +2 -2
- data/bin/html2doc +1 -2
- data/lib/html2doc/base.rb +55 -41
- data/lib/html2doc/lists.rb +8 -8
- data/lib/html2doc/math.rb +35 -22
- data/lib/html2doc/mime.rb +20 -19
- data/lib/html2doc/notes.rb +11 -11
- data/lib/html2doc/version.rb +2 -2
- data/spec/html2doc_spec.rb +99 -83
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 39f218409dbbaa66345a38c3ac768bde9def9ffffa4fb40b388366994c28cba3
|
4
|
+
data.tar.gz: c3eb9ec0b62796ca8cd165c787f03a752e3ba92ff01f4fcedac7c47099e3b6f9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5de663e28833714b38e902ecb78f567f1e6734ab7998c12f37aec432a44694b8f5e8f868b861a443716b1a88d05c60e92fc298b2f9d608e6370c74d0ad0170f6
|
7
|
+
data.tar.gz: 5e508c5589940b6b50cb16de23d8de07717d82807308e4980ae8206a219778a0ef3ae835afd87d021f835ffd218675e557cea0fb396fc85707e6df9502cab118
|
data/README.adoc
CHANGED
@@ -58,14 +58,14 @@ There are two other Microsoft Word vendors in the Ruby ecosystem.
|
|
58
58
|
--
|
59
59
|
require "html2doc"
|
60
60
|
|
61
|
-
Html2Doc.
|
61
|
+
Html2Doc.new(filename: filename, imagedir: imagedir, stylesheet: stylesheet, header_file: header_filename, dir: dir, asciimathdelims: asciimathdelims, liststyles: liststyles).process(result)
|
62
62
|
--
|
63
63
|
|
64
64
|
result:: is the Html document to be converted into Word, as a string.
|
65
65
|
filename:: is the name the document is to be saved as, without a file suffix
|
66
66
|
imagedir:: base directory for local image file names in source XML
|
67
67
|
stylesheet:: is the full path filename of the CSS stylesheet for Microsoft Word-specific styles. If this is not provided, the program will use the default stylesheet included in the gem, `lib/html2doc/wordstyle.css`. The stylesheet provided must match this stylesheet; you can obtain one by saving a Word document with your desired styles to HTML, and extracting the style definitions from the HTML document header.
|
68
|
-
|
68
|
+
header_file:: is the filename of the HTML document containing header and footer for the document, as well as footnote/endnote separators; if there is none, use nil. To generate your own such document, save a Word document with headers/footers and/or footnote/endnote separators as an HTML document; the `header.html` will be in the `{filename}.fld` folder generated along with the HTML. A sample file is available at https://github.com/metanorma/metanorma-iso/blob/master/lib/asciidoctor/iso/word/header.html
|
69
69
|
dir:: is the folder that any ancillary files (images, headers, filelist) are to be saved to. If not provided, it will be created as `{filename}_files`. Anything in the directory will be attached to the Word document; so this folder should only contain the images that accompany the document. (If the images are elsewhere on the local drive, the gem will copy them into the folder. External URL images are left alone, and are not downloaded.)
|
70
70
|
asciimathdelims:: are the AsciiMath delimiters used in the text (an array of an opening and a closing delimiter). If none are provided, no AsciiMath conversion is attempted.
|
71
71
|
liststyles:: a hash of list style labels in Word CSS, which are used to define the behaviour of list item labels (e.g. _i)_ vs _i._). The gem recognises the hash keys `ul`, `ol`. So if the appearance of an ordered list's item labels in the supplied stylesheet is governed by style `@list l1` (e.g. `@list l1:level1 {mso-level-text:"%1\)";}` appears in the stylesheet), call the method with `liststyles:{ol: "l1"}`. The lists that the `ul` and `ol` list styles are applied to are assumed not to have any CSS class. If there are any additional hash keys, they are assumed to be classes applied to the topmost ordered or unordered list; e.g. `liststyles:{steps: "l5"}` means that any list with class `steps` at the topmost level has the list style `l5` recursively applied to it. Any top-level lists without a class named in liststyles will be treated like lists with no CSS class.
|
data/bin/html2doc
CHANGED
data/lib/html2doc/base.rb
CHANGED
@@ -4,27 +4,41 @@ require "htmlentities"
|
|
4
4
|
require "nokogiri"
|
5
5
|
require "fileutils"
|
6
6
|
|
7
|
-
|
8
|
-
def
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
7
|
+
class Html2Doc
|
8
|
+
def initialize(hash)
|
9
|
+
@filename = hash[:filename]
|
10
|
+
@dir = hash[:dir]
|
11
|
+
@dir1 = create_dir(@filename, @dir)
|
12
|
+
@header_file = hash[:header_file]
|
13
|
+
@asciimathdelims = hash[:asciimathdelims]
|
14
|
+
@imagedir = hash[:imagedir]
|
15
|
+
@debug = hash[:debug]
|
16
|
+
@liststyles = hash[:liststyles]
|
17
|
+
@stylesheet = hash[:stylesheet]
|
18
|
+
@xsltemplate =
|
19
|
+
Nokogiri::XSLT(File.read(File.join(File.dirname(__FILE__), "mml2omml.xsl"),
|
20
|
+
encoding: "utf-8"))
|
21
|
+
end
|
22
|
+
|
23
|
+
def process(result)
|
24
|
+
result = process_html(result)
|
25
|
+
process_header(@header_file)
|
26
|
+
generate_filelist(@filename, @dir1)
|
27
|
+
File.open("#{@filename}.htm", "w:UTF-8") { |f| f.write(result) }
|
28
|
+
mime_package result, @filename, @dir1
|
29
|
+
rm_temp_files(@filename, @dir, @dir1) unless @debug
|
30
|
+
end
|
31
|
+
|
32
|
+
def process_header(headerfile)
|
19
33
|
return if headerfile.nil?
|
20
34
|
|
21
35
|
doc = File.read(headerfile, encoding: "utf-8")
|
22
|
-
doc = header_image_cleanup(doc,
|
23
|
-
File.dirname(
|
24
|
-
File.open("#{
|
36
|
+
doc = header_image_cleanup(doc, @dir1, @filename,
|
37
|
+
File.dirname(@filename))
|
38
|
+
File.open("#{@dir1}/header.html", "w:UTF-8") { |f| f.write(doc) }
|
25
39
|
end
|
26
40
|
|
27
|
-
def
|
41
|
+
def clear_dir(dir)
|
28
42
|
Dir.foreach(dir) do |f|
|
29
43
|
fn = File.join(dir, f)
|
30
44
|
File.delete(fn) if f != "." && f != ".."
|
@@ -32,30 +46,30 @@ module Html2Doc
|
|
32
46
|
dir
|
33
47
|
end
|
34
48
|
|
35
|
-
def
|
49
|
+
def create_dir(filename, dir)
|
36
50
|
dir and return clear_dir(dir)
|
37
51
|
dir = "#{filename}_files"
|
38
52
|
Dir.mkdir(dir) unless File.exists?(dir)
|
39
53
|
clear_dir(dir)
|
40
54
|
end
|
41
55
|
|
42
|
-
def
|
43
|
-
docxml = to_xhtml(asciimath_to_mathml(result,
|
44
|
-
define_head(cleanup(docxml
|
56
|
+
def process_html(result)
|
57
|
+
docxml = to_xhtml(asciimath_to_mathml(result, @asciimathdelims))
|
58
|
+
define_head(cleanup(docxml))
|
45
59
|
msword_fix(from_xhtml(docxml))
|
46
60
|
end
|
47
61
|
|
48
|
-
def
|
62
|
+
def rm_temp_files(filename, dir, dir1)
|
49
63
|
FileUtils.rm "#{filename}.htm"
|
50
64
|
FileUtils.rm_f "#{dir1}/header.html"
|
51
65
|
FileUtils.rm_r dir1 unless dir
|
52
66
|
end
|
53
67
|
|
54
|
-
def
|
68
|
+
def cleanup(docxml)
|
55
69
|
namespace(docxml.root)
|
56
|
-
image_cleanup(docxml,
|
70
|
+
image_cleanup(docxml, @dir1, @imagedir)
|
57
71
|
mathml_to_ooml(docxml)
|
58
|
-
lists(docxml,
|
72
|
+
lists(docxml, @liststyles)
|
59
73
|
footnotes(docxml)
|
60
74
|
bookmarks(docxml)
|
61
75
|
msonormal(docxml)
|
@@ -70,7 +84,7 @@ module Html2Doc
|
|
70
84
|
<body> </body> </html>
|
71
85
|
HERE
|
72
86
|
|
73
|
-
def
|
87
|
+
def to_xhtml(xml)
|
74
88
|
xml.gsub!(/<\?xml[^>]*>/, "")
|
75
89
|
unless /<!DOCTYPE /.match? xml
|
76
90
|
xml = '<!DOCTYPE html SYSTEM
|
@@ -85,7 +99,7 @@ module Html2Doc
|
|
85
99
|
<!DOCTYPE html SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
86
100
|
DOCTYPE
|
87
101
|
|
88
|
-
def
|
102
|
+
def from_xhtml(xml)
|
89
103
|
xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
|
90
104
|
.sub(DOCTYPE, "").gsub(%{ />}, "/>")
|
91
105
|
.gsub(/<!-- MSWORD-COMMENT (.+?) -->/, "<!--[\\1]>")
|
@@ -93,7 +107,7 @@ module Html2Doc
|
|
93
107
|
.gsub("\n-->\n", "\n-->\n")
|
94
108
|
end
|
95
109
|
|
96
|
-
def
|
110
|
+
def msword_fix(doc)
|
97
111
|
# brain damage in MSWord parser
|
98
112
|
doc.gsub!(%r{<w:DoNotOptimizeForBrowser></w:DoNotOptimizeForBrowser>},
|
99
113
|
"<w:DoNotOptimizeForBrowser/>")
|
@@ -133,7 +147,7 @@ module Html2Doc
|
|
133
147
|
<meta http-equiv='Content-Type' content="text/html; charset=utf-8"/>
|
134
148
|
XML
|
135
149
|
|
136
|
-
def
|
150
|
+
def define_head1(docxml, _dir)
|
137
151
|
docxml.xpath("//*[local-name() = 'head']").each do |h|
|
138
152
|
h.children.first.add_previous_sibling <<~XML
|
139
153
|
#{PRINT_VIEW}
|
@@ -142,7 +156,7 @@ module Html2Doc
|
|
142
156
|
end
|
143
157
|
end
|
144
158
|
|
145
|
-
def
|
159
|
+
def filename_substitute(head, header_filename)
|
146
160
|
return if header_filename.nil?
|
147
161
|
|
148
162
|
head.xpath(".//*[local-name() = 'style']").each do |s|
|
@@ -153,30 +167,30 @@ module Html2Doc
|
|
153
167
|
end
|
154
168
|
end
|
155
169
|
|
156
|
-
def
|
170
|
+
def stylesheet(_filename, _header_filename, cssname)
|
157
171
|
(cssname.nil? || cssname.empty?) and
|
158
172
|
cssname = File.join(File.dirname(__FILE__), "wordstyle.css")
|
159
173
|
stylesheet = File.read(cssname, encoding: "UTF-8")
|
160
174
|
xml = Nokogiri::XML("<style/>")
|
161
|
-
#s = Nokogiri::XML::CDATA.new(xml, "\n#{stylesheet}\n")
|
162
|
-
#xml.children.first << Nokogiri::XML::Comment.new(xml, s)
|
175
|
+
# s = Nokogiri::XML::CDATA.new(xml, "\n#{stylesheet}\n")
|
176
|
+
# xml.children.first << Nokogiri::XML::Comment.new(xml, s)
|
163
177
|
xml.children.first << Nokogiri::XML::CDATA
|
164
178
|
.new(xml, "\n<!--\n#{stylesheet}\n-->\n")
|
165
179
|
|
166
180
|
xml.root.to_s
|
167
181
|
end
|
168
182
|
|
169
|
-
def
|
183
|
+
def define_head(docxml)
|
170
184
|
title = docxml.at("//*[local-name() = 'head']/*[local-name() = 'title']")
|
171
185
|
head = docxml.at("//*[local-name() = 'head']")
|
172
|
-
css = stylesheet(
|
186
|
+
css = stylesheet(@filename, @header_file, @stylesheet)
|
173
187
|
add_stylesheet(head, title, css)
|
174
|
-
filename_substitute(head,
|
175
|
-
define_head1(docxml,
|
188
|
+
filename_substitute(head, @header_file)
|
189
|
+
define_head1(docxml, @dir1)
|
176
190
|
rootnamespace(docxml.root)
|
177
191
|
end
|
178
192
|
|
179
|
-
def
|
193
|
+
def add_stylesheet(head, title, css)
|
180
194
|
if head.children.empty?
|
181
195
|
head.add_child css
|
182
196
|
elsif title.nil?
|
@@ -186,7 +200,7 @@ module Html2Doc
|
|
186
200
|
end
|
187
201
|
end
|
188
202
|
|
189
|
-
def
|
203
|
+
def namespace(root)
|
190
204
|
{
|
191
205
|
o: "urn:schemas-microsoft-com:office:office",
|
192
206
|
w: "urn:schemas-microsoft-com:office:word",
|
@@ -195,11 +209,11 @@ module Html2Doc
|
|
195
209
|
}.each { |k, v| root.add_namespace_definition(k.to_s, v) }
|
196
210
|
end
|
197
211
|
|
198
|
-
def
|
212
|
+
def rootnamespace(root)
|
199
213
|
root.add_namespace(nil, "http://www.w3.org/TR/REC-html40")
|
200
214
|
end
|
201
215
|
|
202
|
-
def
|
216
|
+
def bookmarks(docxml)
|
203
217
|
docxml.xpath("//*[@id][not(@name)][not(@style = 'mso-element:footnote')]")
|
204
218
|
.each do |x|
|
205
219
|
next if x["id"].empty? ||
|
@@ -212,7 +226,7 @@ module Html2Doc
|
|
212
226
|
end
|
213
227
|
end
|
214
228
|
|
215
|
-
def
|
229
|
+
def msonormal(docxml)
|
216
230
|
docxml.xpath("//*[local-name() = 'p'][not(self::*[@class])]").each do |p|
|
217
231
|
p["class"] = "MsoNormal"
|
218
232
|
end
|
data/lib/html2doc/lists.rb
CHANGED
@@ -3,8 +3,8 @@ require "asciimath"
|
|
3
3
|
require "htmlentities"
|
4
4
|
require "nokogiri"
|
5
5
|
|
6
|
-
|
7
|
-
def
|
6
|
+
class Html2Doc
|
7
|
+
def style_list(elem, level, liststyle, listnumber)
|
8
8
|
return unless liststyle
|
9
9
|
|
10
10
|
if elem["style"]
|
@@ -15,7 +15,7 @@ module Html2Doc
|
|
15
15
|
elem["style"] += "mso-list:#{liststyle} level#{level} lfo#{listnumber};"
|
16
16
|
end
|
17
17
|
|
18
|
-
def
|
18
|
+
def list_add1(elem, liststyles, listtype, level)
|
19
19
|
if %i[ul ol].include? listtype
|
20
20
|
list_add(elem.xpath(".//ul") - elem.xpath(".//ul//ul | .//ol//ul"),
|
21
21
|
liststyles, :ul, level + 1)
|
@@ -29,7 +29,7 @@ module Html2Doc
|
|
29
29
|
end
|
30
30
|
end
|
31
31
|
|
32
|
-
def
|
32
|
+
def list_add(xpath, liststyles, listtype, level)
|
33
33
|
xpath.each_with_index do |l, _i|
|
34
34
|
@listnumber += 1 if level == 1
|
35
35
|
l["seen"] = true if level == 1
|
@@ -46,7 +46,7 @@ module Html2Doc
|
|
46
46
|
end
|
47
47
|
end
|
48
48
|
|
49
|
-
def
|
49
|
+
def list2para(list)
|
50
50
|
return if list.xpath("./li").empty?
|
51
51
|
|
52
52
|
list.xpath("./li").first["class"] ||= "MsoListParagraphCxSpFirst"
|
@@ -63,7 +63,7 @@ module Html2Doc
|
|
63
63
|
|
64
64
|
TOPLIST = "[not(ancestor::ul) and not(ancestor::ol)]".freeze
|
65
65
|
|
66
|
-
def
|
66
|
+
def lists1(docxml, liststyles, style)
|
67
67
|
case style
|
68
68
|
when :ul then list_add(docxml.xpath("//ul[not(@class)]#{TOPLIST}"),
|
69
69
|
liststyles, :ul, 1)
|
@@ -76,7 +76,7 @@ module Html2Doc
|
|
76
76
|
end
|
77
77
|
end
|
78
78
|
|
79
|
-
def
|
79
|
+
def lists_unstyled(docxml, liststyles)
|
80
80
|
liststyles.has_key?(:ul) and
|
81
81
|
list_add(docxml.xpath("//ul#{TOPLIST}[not(@seen)]"),
|
82
82
|
liststyles, :ul, 1)
|
@@ -88,7 +88,7 @@ module Html2Doc
|
|
88
88
|
end
|
89
89
|
end
|
90
90
|
|
91
|
-
def
|
91
|
+
def lists(docxml, liststyles)
|
92
92
|
return if liststyles.nil?
|
93
93
|
|
94
94
|
@listnumber = 0
|
data/lib/html2doc/math.rb
CHANGED
@@ -4,12 +4,8 @@ require "htmlentities"
|
|
4
4
|
require "nokogiri"
|
5
5
|
require "plane1converter"
|
6
6
|
|
7
|
-
|
8
|
-
|
9
|
-
Nokogiri::XSLT(File.read(File.join(File.dirname(__FILE__), "mml2omml.xsl"),
|
10
|
-
encoding: "utf-8"))
|
11
|
-
|
12
|
-
def self.asciimath_to_mathml1(expr)
|
7
|
+
class Html2Doc
|
8
|
+
def asciimath_to_mathml1(expr)
|
13
9
|
AsciiMath::MathMLBuilder.new(msword: true).append_expression(
|
14
10
|
AsciiMath.parse(HTMLEntities.new.decode(expr)).ast,
|
15
11
|
).to_s
|
@@ -20,7 +16,7 @@ module Html2Doc
|
|
20
16
|
raise e
|
21
17
|
end
|
22
18
|
|
23
|
-
def
|
19
|
+
def asciimath_to_mathml(doc, delims)
|
24
20
|
return doc if delims.nil? || delims.size < 2
|
25
21
|
|
26
22
|
m = doc.split(/(#{Regexp.escape(delims[0])}|#{Regexp.escape(delims[1])})/)
|
@@ -31,13 +27,13 @@ module Html2Doc
|
|
31
27
|
end.join
|
32
28
|
end
|
33
29
|
|
34
|
-
def
|
30
|
+
def progress_conv(idx, step, total, threshold, msg)
|
35
31
|
return unless (idx % step).zero? && total > threshold && idx.positive?
|
36
32
|
|
37
33
|
warn "#{msg} #{idx} of #{total}"
|
38
34
|
end
|
39
35
|
|
40
|
-
def
|
36
|
+
def unwrap_accents(doc)
|
41
37
|
doc.xpath("//*[@accent = 'true']").each do |x|
|
42
38
|
x.elements.length > 1 or next
|
43
39
|
x.elements[1].name == "mrow" and
|
@@ -47,7 +43,7 @@ module Html2Doc
|
|
47
43
|
end
|
48
44
|
|
49
45
|
# random fixes to MathML input that OOXML needs to render properly
|
50
|
-
def
|
46
|
+
def ooxml_cleanup(math, docnamespaces)
|
51
47
|
math = unwrap_accents(
|
52
48
|
mathml_preserve_space(
|
53
49
|
mathml_insert_rows(math, docnamespaces), docnamespaces
|
@@ -57,7 +53,7 @@ module Html2Doc
|
|
57
53
|
math
|
58
54
|
end
|
59
55
|
|
60
|
-
def
|
56
|
+
def mathml_insert_rows(math, docnamespaces)
|
61
57
|
math.xpath(%w(msup msub msubsup munder mover munderover)
|
62
58
|
.map { |m| ".//xmlns:#{m}" }.join(" | "), docnamespaces).each do |x|
|
63
59
|
next unless x.next_element && x.next_element != "mrow"
|
@@ -67,7 +63,7 @@ module Html2Doc
|
|
67
63
|
math
|
68
64
|
end
|
69
65
|
|
70
|
-
def
|
66
|
+
def mathml_preserve_space(math, docnamespaces)
|
71
67
|
math.xpath(".//xmlns:mtext", docnamespaces).each do |x|
|
72
68
|
x.children = x.children.to_xml.gsub(/^\s/, " ").gsub(/\s$/, " ")
|
73
69
|
end
|
@@ -76,7 +72,7 @@ module Html2Doc
|
|
76
72
|
|
77
73
|
HTML_NS = 'xmlns="http://www.w3.org/1999/xhtml"'.freeze
|
78
74
|
|
79
|
-
def
|
75
|
+
def unitalic(math)
|
80
76
|
math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'p']]").each do |x|
|
81
77
|
x.wrap("<span #{HTML_NS} style='font-style:normal;'></span>")
|
82
78
|
end
|
@@ -122,7 +118,7 @@ module Html2Doc
|
|
122
118
|
math
|
123
119
|
end
|
124
120
|
|
125
|
-
def
|
121
|
+
def to_plane1(xml, font)
|
126
122
|
xml.traverse do |n|
|
127
123
|
next unless n.text?
|
128
124
|
|
@@ -131,7 +127,7 @@ module Html2Doc
|
|
131
127
|
xml
|
132
128
|
end
|
133
129
|
|
134
|
-
def
|
130
|
+
def mathml_to_ooml(docxml)
|
135
131
|
docnamespaces = docxml.collect_namespaces
|
136
132
|
m = docxml.xpath("//*[local-name() = 'math']")
|
137
133
|
m.each_with_index do |x, i|
|
@@ -140,28 +136,45 @@ module Html2Doc
|
|
140
136
|
end
|
141
137
|
end
|
142
138
|
|
143
|
-
# We need span and em not to be namespaced. Word can't deal with explicit
|
139
|
+
# We need span and em not to be namespaced. Word can't deal with explicit
|
144
140
|
# namespaces.
|
145
141
|
# We will end up stripping them out again under Nokogiri 1.11, which correctly
|
146
142
|
# insists on inheriting namespace from parent.
|
147
|
-
def
|
143
|
+
def ooml_clean(xml)
|
148
144
|
xml.to_s
|
149
145
|
.gsub(/<\?[^>]+>\s*/, "")
|
150
146
|
.gsub(/ xmlns(:[^=]+)?="[^"]+"/, "")
|
151
147
|
.gsub(%r{<(/)?(?!span)(?!em)([a-z])}, "<\\1m:\\2")
|
152
148
|
end
|
153
149
|
|
154
|
-
def
|
150
|
+
def mathml_to_ooml1(xml, docnamespaces)
|
155
151
|
doc = Nokogiri::XML::Document::new
|
156
152
|
doc.root = ooxml_cleanup(xml, docnamespaces)
|
157
|
-
|
153
|
+
ooxml = ooml_clean(unitalic(esc_space(accent_tr(@xsltemplate.transform(doc)))))
|
158
154
|
ooxml = uncenter(xml, ooxml)
|
159
155
|
xml.swap(ooxml)
|
160
156
|
end
|
161
157
|
|
158
|
+
def accent_tr(xml)
|
159
|
+
xml.xpath(".//*[local-name()='accPr']/*[local-name()='chr']").each do |x|
|
160
|
+
x["m:val"] &&= accent_tr1(x["m:val"])
|
161
|
+
x["val"] &&= accent_tr1(x["val"])
|
162
|
+
end
|
163
|
+
xml
|
164
|
+
end
|
165
|
+
|
166
|
+
def accent_tr1(accent)
|
167
|
+
case accent
|
168
|
+
when "\u2192" then "\u20D7"
|
169
|
+
when "^" then "\u0302"
|
170
|
+
when "~" then "\u0303"
|
171
|
+
else accent
|
172
|
+
end
|
173
|
+
end
|
174
|
+
|
162
175
|
# escape space as &#x32;; we are removing any spaces generated by
|
163
176
|
# XML indentation
|
164
|
-
def
|
177
|
+
def esc_space(xml)
|
165
178
|
xml.traverse do |n|
|
166
179
|
next unless n.text?
|
167
180
|
|
@@ -172,7 +185,7 @@ module Html2Doc
|
|
172
185
|
|
173
186
|
# if oomml has no siblings, by default it is centered; override this with
|
174
187
|
# left/right if parent is so tagged
|
175
|
-
def
|
188
|
+
def uncenter(math, ooxml)
|
176
189
|
alignnode = math.at(".//ancestor::*[@style][local-name() = 'p' or "\
|
177
190
|
"local-name() = 'div' or local-name() = 'td']/@style")
|
178
191
|
return ooxml unless alignnode && (math.next == nil && math.previous == nil)
|
@@ -180,7 +193,7 @@ module Html2Doc
|
|
180
193
|
%w(left right).each do |dir|
|
181
194
|
if alignnode.text.include? ("text-align:#{dir}")
|
182
195
|
ooxml = "<m:oMathPara><m:oMathParaPr><m:jc "\
|
183
|
-
|
196
|
+
"m:val='#{dir}'/></m:oMathParaPr>#{ooxml}</m:oMathPara>"
|
184
197
|
end
|
185
198
|
end
|
186
199
|
ooxml
|
data/lib/html2doc/mime.rb
CHANGED
@@ -4,8 +4,8 @@ require "mime/types"
|
|
4
4
|
require "image_size"
|
5
5
|
require "fileutils"
|
6
6
|
|
7
|
-
|
8
|
-
def
|
7
|
+
class Html2Doc
|
8
|
+
def mime_preamble(boundary, filename, result)
|
9
9
|
<<~"PREAMBLE"
|
10
10
|
MIME-Version: 1.0
|
11
11
|
Content-Type: multipart/related; boundary="#{boundary}"
|
@@ -20,7 +20,7 @@ module Html2Doc
|
|
20
20
|
PREAMBLE
|
21
21
|
end
|
22
22
|
|
23
|
-
def
|
23
|
+
def mime_attachment(boundary, _filename, item, dir)
|
24
24
|
content_type = mime_type(item)
|
25
25
|
text_mode = %w[text application].any? { |p| content_type.start_with? p }
|
26
26
|
|
@@ -40,19 +40,19 @@ module Html2Doc
|
|
40
40
|
FILE
|
41
41
|
end
|
42
42
|
|
43
|
-
def
|
43
|
+
def mime_type(item)
|
44
44
|
types = MIME::Types.type_for(item)
|
45
45
|
type = types ? types.first.to_s : 'text/plain; charset="utf-8"'
|
46
46
|
type = %(#{type} charset="utf-8") if /^text/.match(type) && types
|
47
47
|
type
|
48
48
|
end
|
49
49
|
|
50
|
-
def
|
50
|
+
def mime_boundary
|
51
51
|
salt = UUIDTools::UUID.random_create.to_s.gsub(/-/, ".")[0..17]
|
52
52
|
"----=_NextPart_#{salt}"
|
53
53
|
end
|
54
54
|
|
55
|
-
def
|
55
|
+
def mime_package(result, filename, dir)
|
56
56
|
boundary = mime_boundary
|
57
57
|
mhtml = mime_preamble(boundary, "#{filename}.htm", result)
|
58
58
|
mhtml += mime_attachment(boundary, "#{filename}.htm", "filelist.xml", dir)
|
@@ -66,7 +66,7 @@ module Html2Doc
|
|
66
66
|
File.open("#{filename}.doc", "w:UTF-8") { |f| f.write contentid(mhtml) }
|
67
67
|
end
|
68
68
|
|
69
|
-
def
|
69
|
+
def contentid(mhtml)
|
70
70
|
mhtml.gsub %r{(<img[^>]*?src=")([^\"']+)(['"])}m do |m|
|
71
71
|
repl = "#{$1}cid:#{File.basename($2)}#{$3}"
|
72
72
|
/^data:|^https?:/.match($2) ? m : repl
|
@@ -77,7 +77,7 @@ module Html2Doc
|
|
77
77
|
end
|
78
78
|
|
79
79
|
# max width for Word document is 400, max height is 680
|
80
|
-
def
|
80
|
+
def image_resize(img, path, maxheight, maxwidth)
|
81
81
|
realsize = ImageSize.path(path).size
|
82
82
|
s = [img["width"].to_i, img["height"].to_i]
|
83
83
|
s = realsize if s[0].zero? && s[1].zero?
|
@@ -92,27 +92,28 @@ module Html2Doc
|
|
92
92
|
|
93
93
|
IMAGE_PATH = "//*[local-name() = 'img' or local-name() = 'imagedata']".freeze
|
94
94
|
|
95
|
-
def
|
95
|
+
def mkuuid
|
96
96
|
UUIDTools::UUID.random_create.to_s
|
97
97
|
end
|
98
98
|
|
99
|
-
def
|
99
|
+
def warnsvg(src)
|
100
100
|
warn "#{src}: SVG not supported" if /\.svg$/i.match?(src)
|
101
101
|
end
|
102
102
|
|
103
|
-
def
|
103
|
+
def localname(src, localdir)
|
104
104
|
%r{^([A-Z]:)?/}.match?(src) ? src : File.join(localdir, src)
|
105
105
|
end
|
106
106
|
|
107
107
|
# only processes locally stored images
|
108
|
-
def
|
108
|
+
def image_cleanup(docxml, dir, localdir)
|
109
109
|
docxml.traverse do |i|
|
110
|
+
src = i["src"]
|
110
111
|
next unless i.element? && %w(img v:imagedata).include?(i.name)
|
111
|
-
next if /^http/.match?
|
112
|
-
next if %r{^data:(image|application)/[^;]+;base64}.match?
|
112
|
+
next if src.nil? || src.empty? || /^http/.match?(src)
|
113
|
+
next if %r{^data:(image|application)/[^;]+;base64}.match? src
|
113
114
|
|
114
|
-
local_filename = localname(
|
115
|
-
new_filename = "#{mkuuid}#{File.extname(
|
115
|
+
local_filename = localname(src, localdir)
|
116
|
+
new_filename = "#{mkuuid}#{File.extname(src)}"
|
116
117
|
FileUtils.cp local_filename, File.join(dir, new_filename)
|
117
118
|
i["width"], i["height"] = image_resize(i, local_filename, 680, 400)
|
118
119
|
i["src"] = File.join(File.basename(dir), new_filename)
|
@@ -122,13 +123,13 @@ module Html2Doc
|
|
122
123
|
|
123
124
|
# do not parse the header through Nokogiri, since it will contain
|
124
125
|
# non-XML like <![if !supportFootnotes]>
|
125
|
-
def
|
126
|
+
def header_image_cleanup(doc, dir, filename, localdir)
|
126
127
|
doc.split(%r{(<img [^>]*>|<v:imagedata [^>]*>)}).each_slice(2).map do |a|
|
127
128
|
header_image_cleanup1(a, dir, filename, localdir)
|
128
129
|
end.join
|
129
130
|
end
|
130
131
|
|
131
|
-
def
|
132
|
+
def header_image_cleanup1(a, dir, _filename, localdir)
|
132
133
|
if a.size == 2 && !(/ src="https?:/.match a[1]) &&
|
133
134
|
!(%r{ src="data:(image|application)/[^;]+;base64}.match a[1])
|
134
135
|
m = / src=['"](?<src>[^"']+)['"]/.match a[1]
|
@@ -140,7 +141,7 @@ module Html2Doc
|
|
140
141
|
a.join
|
141
142
|
end
|
142
143
|
|
143
|
-
def
|
144
|
+
def generate_filelist(filename, dir)
|
144
145
|
File.open(File.join(dir, "filelist.xml"), "w") do |f|
|
145
146
|
f.write %{<xml xmlns:o="urn:schemas-microsoft-com:office:office">
|
146
147
|
<o:MainFile HRef="../#{filename}.htm"/>}
|
data/lib/html2doc/notes.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
require "uuidtools"
|
2
2
|
|
3
|
-
|
4
|
-
def
|
3
|
+
class Html2Doc
|
4
|
+
def footnotes(docxml)
|
5
5
|
i = 1
|
6
6
|
fn = []
|
7
7
|
docxml.xpath("//a").each do |a|
|
@@ -12,7 +12,7 @@ module Html2Doc
|
|
12
12
|
process_footnote_texts(docxml, fn)
|
13
13
|
end
|
14
14
|
|
15
|
-
def
|
15
|
+
def process_footnote_texts(docxml, footnotes)
|
16
16
|
body = docxml.at("//body")
|
17
17
|
list = body.add_child("<div style='mso-element:footnote-list'/>")
|
18
18
|
footnotes.each_with_index do |f, i|
|
@@ -23,7 +23,7 @@ module Html2Doc
|
|
23
23
|
footnote_cleanup(docxml)
|
24
24
|
end
|
25
25
|
|
26
|
-
def
|
26
|
+
def footnote_div_to_p(elem)
|
27
27
|
if %w{div aside}.include? elem.name
|
28
28
|
if elem.at(".//p")
|
29
29
|
elem.replace(elem.children)
|
@@ -37,7 +37,7 @@ module Html2Doc
|
|
37
37
|
FN = "<span class='MsoFootnoteReference'>"\
|
38
38
|
"<span style='mso-special-character:footnote'/></span>".freeze
|
39
39
|
|
40
|
-
def
|
40
|
+
def footnote_container(docxml, idx)
|
41
41
|
ref = docxml&.at("//a[@href='#_ftn#{idx}']")&.children&.to_xml(indent: 0)
|
42
42
|
&.gsub(/>\n</, "><") || FN
|
43
43
|
<<~DIV
|
@@ -47,7 +47,7 @@ module Html2Doc
|
|
47
47
|
DIV
|
48
48
|
end
|
49
49
|
|
50
|
-
def
|
50
|
+
def process_footnote_link(docxml, elem, idx, footnote)
|
51
51
|
return false unless footnote?(elem)
|
52
52
|
|
53
53
|
href = elem["href"].gsub(/^#/, "")
|
@@ -62,7 +62,7 @@ module Html2Doc
|
|
62
62
|
footnote << transform_footnote_text(note)
|
63
63
|
end
|
64
64
|
|
65
|
-
def
|
65
|
+
def process_footnote_link1(elem)
|
66
66
|
elem.children.each do |c|
|
67
67
|
if c.name == "span" && c["class"] == "MsoFootnoteReference"
|
68
68
|
c.replace(FN)
|
@@ -72,7 +72,7 @@ module Html2Doc
|
|
72
72
|
end
|
73
73
|
end
|
74
74
|
|
75
|
-
def
|
75
|
+
def transform_footnote_text(note)
|
76
76
|
note["id"] = ""
|
77
77
|
note.xpath(".//div").each { |div| div.replace(div.children) }
|
78
78
|
note.xpath(".//aside | .//p").each do |p|
|
@@ -82,12 +82,12 @@ module Html2Doc
|
|
82
82
|
note.remove
|
83
83
|
end
|
84
84
|
|
85
|
-
def
|
85
|
+
def footnote?(elem)
|
86
86
|
elem["epub:type"]&.casecmp("footnote")&.zero? ||
|
87
87
|
elem["class"]&.casecmp("footnote")&.zero?
|
88
88
|
end
|
89
89
|
|
90
|
-
def
|
90
|
+
def set_footnote_link_attrs(elem, idx)
|
91
91
|
elem["style"] = "mso-footnote-id:ftn#{idx}"
|
92
92
|
elem["href"] = "#_ftn#{idx}"
|
93
93
|
elem["name"] = "_ftnref#{idx}"
|
@@ -99,7 +99,7 @@ module Html2Doc
|
|
99
99
|
# to p). We do not expect any <a name> or links back to text; if they
|
100
100
|
# are present in the HTML, they need to have been cleaned out before
|
101
101
|
# passing to this gem
|
102
|
-
def
|
102
|
+
def footnote_cleanup(docxml)
|
103
103
|
docxml.xpath('//div[@style="mso-element:footnote"]/a')
|
104
104
|
.each do |x|
|
105
105
|
n = x.next_element
|
data/lib/html2doc/version.rb
CHANGED
@@ -1,3 +1,3 @@
|
|
1
|
-
|
2
|
-
VERSION = "1.
|
1
|
+
class Html2Doc
|
2
|
+
VERSION = "1.4.0.1".freeze
|
3
3
|
end
|
data/spec/html2doc_spec.rb
CHANGED
@@ -76,7 +76,7 @@ WORD_FTR1 = <<~FTR.freeze
|
|
76
76
|
Content-ID: <filelist.xml>
|
77
77
|
Content-Disposition: inline; filename="filelist.xml"
|
78
78
|
Content-Transfer-Encoding: base64
|
79
|
-
Content-Type: #{Html2Doc
|
79
|
+
Content-Type: #{Html2Doc.new({}).mime_type('filelist.xml')}
|
80
80
|
|
81
81
|
PHhtbCB4bWxuczpvPSJ1cm46c2NoZW1hcy1taWNyb3NvZnQtY29tOm9mZmljZTpvZmZpY2UiPgog
|
82
82
|
ICAgICAgIDxvOk1haW5GaWxlIEhSZWY9Ii4uL3Rlc3QuaHRtIi8+ICA8bzpGaWxlIEhSZWY9ImZp
|
@@ -90,7 +90,7 @@ WORD_FTR2 = <<~FTR.freeze
|
|
90
90
|
Content-ID: <filelist.xml>
|
91
91
|
Content-Disposition: inline; filename="filelist.xml"
|
92
92
|
Content-Transfer-Encoding: base64
|
93
|
-
Content-Type: #{Html2Doc
|
93
|
+
Content-Type: #{Html2Doc.new({}).mime_type('filelist.xml')}
|
94
94
|
PHhtbCB4bWxuczpvPSJ1cm46c2NoZW1hcy1taWNyb3NvZnQtY29tOm9mZmljZTpvZmZpY2UiPgog
|
95
95
|
ICAgICAgIDxvOk1haW5GaWxlIEhSZWY9Ii4uL3Rlc3QuaHRtIi8+ICA8bzpGaWxlIEhSZWY9ImZp
|
96
96
|
bGVsaXN0LnhtbCIvPgogIDxvOkZpbGUgSFJlZj0iaGVhZGVyLmh0bWwiLz4KPC94bWw+Cg==
|
@@ -102,7 +102,7 @@ WORD_FTR3 = <<~FTR.freeze
|
|
102
102
|
Content-ID: <filelist.xml>
|
103
103
|
Content-Disposition: inline; filename="filelist.xml"
|
104
104
|
Content-Transfer-Encoding: base64
|
105
|
-
Content-Type: #{Html2Doc
|
105
|
+
Content-Type: #{Html2Doc.new({}).mime_type('filelist.xml')}
|
106
106
|
|
107
107
|
PHhtbCB4bWxuczpvPSJ1cm46c2NoZW1hcy1taWNyb3NvZnQtY29tOm9mZmljZTpvZmZpY2UiPgog
|
108
108
|
ICAgICAgIDxvOk1haW5GaWxlIEhSZWY9Ii4uL3Rlc3QuaHRtIi8+ICA8bzpGaWxlIEhSZWY9IjFh
|
@@ -278,18 +278,18 @@ RSpec.describe Html2Doc do
|
|
278
278
|
end
|
279
279
|
|
280
280
|
it "preserves Word HTML directives" do
|
281
|
-
Html2Doc.process(html_input(%[A<!--[if gte mso 9]>X<![endif]-->B])
|
281
|
+
Html2Doc.new(filename: "test").process(html_input(%[A<!--[if gte mso 9]>X<![endif]-->B]))
|
282
282
|
expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
|
283
283
|
.to match_fuzzy(<<~OUTPUT)
|
284
284
|
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
285
285
|
#{word_body(%{A<!--[if gte mso 9]>X<![endif]-->B},
|
286
|
-
|
286
|
+
'<div style="mso-element:footnote-list"/>')}
|
287
287
|
#{WORD_FTR1}
|
288
288
|
OUTPUT
|
289
289
|
end
|
290
290
|
|
291
291
|
it "processes a blank document" do
|
292
|
-
Html2Doc.
|
292
|
+
Html2Doc.new(filename: "test").process(html_input(""))
|
293
293
|
expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
|
294
294
|
.to match_fuzzy(<<~OUTPUT)
|
295
295
|
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
@@ -299,15 +299,15 @@ RSpec.describe Html2Doc do
|
|
299
299
|
|
300
300
|
it "removes any temp files" do
|
301
301
|
File.delete("test.doc")
|
302
|
-
Html2Doc.
|
302
|
+
Html2Doc.new(filename: "test").process(html_input(""))
|
303
303
|
expect(File.exist?("test.doc")).to be true
|
304
304
|
expect(File.exist?("test.htm")).to be false
|
305
305
|
expect(File.exist?("test_files")).to be false
|
306
306
|
end
|
307
307
|
|
308
308
|
it "processes a stylesheet in an HTML document with a title" do
|
309
|
-
Html2Doc.
|
310
|
-
|
309
|
+
Html2Doc.new(filename: "test", stylesheet: "lib/html2doc/wordstyle.css")
|
310
|
+
.process(html_input(""))
|
311
311
|
expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
|
312
312
|
.to match_fuzzy(<<~OUTPUT)
|
313
313
|
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
@@ -316,9 +316,11 @@ RSpec.describe Html2Doc do
|
|
316
316
|
end
|
317
317
|
|
318
318
|
it "processes a stylesheet in an HTML document without a title" do
|
319
|
-
Html2Doc.
|
320
|
-
|
321
|
-
|
319
|
+
Html2Doc.new(filename: "test",
|
320
|
+
stylesheet: "lib/html2doc/wordstyle.css")
|
321
|
+
.process(html_input_no_title(""))
|
322
|
+
expect(guid_clean(File.read("test.doc",
|
323
|
+
encoding: "utf-8")))
|
322
324
|
.to match_fuzzy(<<~OUTPUT)
|
323
325
|
#{WORD_HDR.sub('<title>blank</title>', '')}
|
324
326
|
#{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
@@ -327,12 +329,14 @@ RSpec.describe Html2Doc do
|
|
327
329
|
end
|
328
330
|
|
329
331
|
it "processes a stylesheet in an HTML document with an empty head" do
|
330
|
-
Html2Doc.
|
331
|
-
|
332
|
+
Html2Doc.new(filename: "test",
|
333
|
+
stylesheet: "lib/html2doc/wordstyle.css")
|
334
|
+
.process(html_input_empty_head(""))
|
332
335
|
word_hdr_end = WORD_HDR_END
|
333
336
|
.sub(%(<meta name="Originator" content="Me"/>\n), "")
|
334
337
|
.sub("</style>\n</head>", "</style></head>")
|
335
|
-
expect(guid_clean(File.read("test.doc",
|
338
|
+
expect(guid_clean(File.read("test.doc",
|
339
|
+
encoding: "utf-8")))
|
336
340
|
.to match_fuzzy(<<~OUTPUT)
|
337
341
|
#{WORD_HDR.sub('<title>blank</title>', '')}
|
338
342
|
#{DEFAULT_STYLESHEET}
|
@@ -342,8 +346,9 @@ RSpec.describe Html2Doc do
|
|
342
346
|
end
|
343
347
|
|
344
348
|
it "processes a header" do
|
345
|
-
Html2Doc.
|
346
|
-
|
349
|
+
Html2Doc.new(filename: "test",
|
350
|
+
header_file: "spec/header.html")
|
351
|
+
.process(html_input(""))
|
347
352
|
html = guid_clean(File.read("test.doc", encoding: "utf-8"))
|
348
353
|
hdr = Base64.decode64(
|
349
354
|
html
|
@@ -365,8 +370,9 @@ RSpec.describe Html2Doc do
|
|
365
370
|
end
|
366
371
|
|
367
372
|
it "processes a header with an image" do
|
368
|
-
Html2Doc.
|
369
|
-
|
373
|
+
Html2Doc.new(filename: "test",
|
374
|
+
header_file: "spec/header_img.html")
|
375
|
+
.process(html_input(""))
|
370
376
|
doc = guid_clean(File.read("test.doc", encoding: "utf-8"))
|
371
377
|
expect(doc).to match(%r{Content-Type: image/png})
|
372
378
|
expect(doc).to match(%r{iVBORw0KGgoAAAANSUhEUgAAA5cAAAN7CAYAAADRE24cAAAgAElEQVR4XuydB5gUxdaGC65gTogB})
|
@@ -381,8 +387,9 @@ RSpec.describe Html2Doc do
|
|
381
387
|
"19160-6.png"))),
|
382
388
|
)
|
383
389
|
end
|
384
|
-
Html2Doc.
|
385
|
-
|
390
|
+
Html2Doc.new(filename: "test",
|
391
|
+
header_file: "spec/header_img1.html")
|
392
|
+
.process(html_input(""))
|
386
393
|
doc = guid_clean(File.read("test.doc", encoding: "utf-8"))
|
387
394
|
expect(doc).to match(%r{Content-Type: image/png})
|
388
395
|
expect(doc).to match(%r{iVBORw0KGgoAAAANSUhEUgAAA5cAAAN7CAYAAADRE24cAAAgAElEQVR4XuydB5gUxdaGC65gTogB})
|
@@ -391,7 +398,7 @@ RSpec.describe Html2Doc do
|
|
391
398
|
it "processes a populated document" do
|
392
399
|
simple_body = "<h1>Hello word!</h1>
|
393
400
|
<div>This is a very simple document</div>"
|
394
|
-
Html2Doc.
|
401
|
+
Html2Doc.new(filename: "test").process(html_input(simple_body))
|
395
402
|
expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
|
396
403
|
.to match_fuzzy(<<~OUTPUT)
|
397
404
|
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
@@ -401,9 +408,11 @@ RSpec.describe Html2Doc do
|
|
401
408
|
end
|
402
409
|
|
403
410
|
it "processes AsciiMath" do
|
404
|
-
Html2Doc.
|
405
|
-
|
406
|
-
|
411
|
+
Html2Doc.new(filename: "test",
|
412
|
+
asciimathdelims: ["{{", "}}"])
|
413
|
+
.process(html_input(%[<div>{{sum_(i=1)^n i^3=((n(n+1))/2)^2 text("integer"))}}</div>]))
|
414
|
+
expect(guid_clean(File.read("test.doc",
|
415
|
+
encoding: "utf-8")))
|
407
416
|
.to match_fuzzy(<<~OUTPUT)
|
408
417
|
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
409
418
|
#{word_body(%{
|
@@ -416,8 +425,8 @@ RSpec.describe Html2Doc do
|
|
416
425
|
end
|
417
426
|
|
418
427
|
it "processes mstyle" do
|
419
|
-
Html2Doc.
|
420
|
-
|
428
|
+
Html2Doc.new(filename: "test", asciimathdelims: ["{{", "}}"])
|
429
|
+
.process(html_input(%[<div>{{bb (-log_2 (p_u)) bb "BB" bbb "BBB" cc "CC" bcc "BCC" tt "TT" fr "FR" bfr "BFR" sf "SF" bsf "BSFα" sfi "SFI" sfbi "SFBIα" bii "BII" ii "II"}}</div>]))
|
421
430
|
expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
|
422
431
|
.to match_fuzzy(<<~OUTPUT)
|
423
432
|
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
@@ -431,8 +440,8 @@ RSpec.describe Html2Doc do
|
|
431
440
|
end
|
432
441
|
|
433
442
|
it "processes spaces in AsciiMath" do
|
434
|
-
Html2Doc.
|
435
|
-
|
443
|
+
Html2Doc.new(filename: "test", asciimathdelims: ["{{", "}}"])
|
444
|
+
.process(html_input(%[<div>{{text " integer ")}}</div>]))
|
436
445
|
expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
|
437
446
|
.to match_fuzzy(<<~OUTPUT)
|
438
447
|
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
@@ -446,10 +455,10 @@ RSpec.describe Html2Doc do
|
|
446
455
|
end
|
447
456
|
|
448
457
|
it "processes spaces in MathML mtext" do
|
449
|
-
Html2Doc.
|
458
|
+
Html2Doc.new(filename: "test", asciimathdelims: ["{{", "}}"])
|
459
|
+
.process(html_input("<div><math xmlns='http://www.w3.org/1998/Math/MathML'>
|
450
460
|
<mrow><mi>H</mi><mtext> original </mtext><mi>J</mi></mrow>
|
451
|
-
</math></div>")
|
452
|
-
filename: "test", asciimathdelims: ["{{", "}}"])
|
461
|
+
</math></div>"))
|
453
462
|
expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
|
454
463
|
.to match_fuzzy(<<~OUTPUT)
|
455
464
|
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
@@ -461,15 +470,16 @@ RSpec.describe Html2Doc do
|
|
461
470
|
OUTPUT
|
462
471
|
end
|
463
472
|
|
464
|
-
it "unwraps accent in MathML" do
|
465
|
-
Html2Doc.
|
473
|
+
it "unwraps and converts accent in MathML" do
|
474
|
+
Html2Doc.new(filename: "test", asciimathdelims: ["{{", "}}"])
|
475
|
+
.process(html_input("<div><math xmlns='http://www.w3.org/1998/Math/MathML'>
|
466
476
|
<mover accent='true'><mrow><mi>p</mi></mrow><mrow><mo>^</mo></mrow></mover>
|
467
|
-
</math></div>")
|
477
|
+
</math></div>"))
|
468
478
|
expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
|
469
479
|
.to match_fuzzy(<<~OUTPUT)
|
470
480
|
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
471
481
|
#{word_body('<div><m:oMath>
|
472
|
-
<m:acc><m:accPr><m:chr m:val="
|
482
|
+
<m:acc><m:accPr><m:chr m:val="̂"></m:chr></m:accPr><m:e><m:r><m:t>p</m:t></m:r></m:e></m:acc>
|
473
483
|
</m:oMath>
|
474
484
|
</div>', '<div style="mso-element:footnote-list"/>')}
|
475
485
|
#{WORD_FTR1}
|
@@ -477,8 +487,8 @@ RSpec.describe Html2Doc do
|
|
477
487
|
end
|
478
488
|
|
479
489
|
it "left-aligns AsciiMath" do
|
480
|
-
Html2Doc.
|
481
|
-
|
490
|
+
Html2Doc.new(filename: "test", asciimathdelims: ["{{", "}}"])
|
491
|
+
.process(html_input("<div style='text-align:left;'>{{sum_(i=1)^n i^3=((n(n+1))/2)^2}}</div>"))
|
482
492
|
expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
|
483
493
|
.to match_fuzzy(<<~OUTPUT)
|
484
494
|
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
@@ -492,9 +502,11 @@ RSpec.describe Html2Doc do
|
|
492
502
|
end
|
493
503
|
|
494
504
|
it "right-aligns AsciiMath" do
|
495
|
-
Html2Doc.
|
496
|
-
|
497
|
-
|
505
|
+
Html2Doc.new(filename: "test",
|
506
|
+
asciimathdelims: ["{{", "}}"])
|
507
|
+
.process(html_input("<div style='text-align:right;'>{{sum_(i=1)^n i^3=((n(n+1))/2)^2}}</div>"))
|
508
|
+
expect(guid_clean(File.read("test.doc",
|
509
|
+
encoding: "utf-8")))
|
498
510
|
.to match_fuzzy(<<~OUTPUT)
|
499
511
|
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
500
512
|
#{word_body(%{
|
@@ -509,21 +521,21 @@ RSpec.describe Html2Doc do
|
|
509
521
|
it "raises error in processing of broken AsciiMath" do
|
510
522
|
begin
|
511
523
|
expect do
|
512
|
-
Html2Doc.
|
513
|
-
|
524
|
+
Html2Doc.new(filename: "test", asciimathdelims: ["{{", "}}"])
|
525
|
+
.process(html_input(%[<div style='text-align:right;'>{{u_c = 6.6"unitsml(kHz)}}</div>]))
|
514
526
|
end.to output('parsing: u_c = 6.6"unitsml(kHz)').to_stderr
|
515
527
|
rescue StandardError
|
516
528
|
end
|
517
529
|
expect do
|
518
|
-
Html2Doc.
|
519
|
-
|
530
|
+
Html2Doc.new(filename: "test", asciimathdelims: ["{{", "}}"])
|
531
|
+
.process(html_input(%[<div style='text-align:right;'>{{u_c = 6.6"unitsml(kHz)}}</div>]))
|
520
532
|
end.to raise_error(StandardError)
|
521
533
|
end
|
522
534
|
|
523
535
|
it "wraps msup after munderover in MathML" do
|
524
|
-
Html2Doc.
|
525
|
-
<
|
526
|
-
|
536
|
+
Html2Doc.new(filename: "test", asciimathdelims: ["{{", "}}"])
|
537
|
+
.process(html_input("<div><math xmlns='http://www.w3.org/1998/Math/MathML'>
|
538
|
+
<munderover><mo>∑</mo><mrow><mi>i</mi><mo>=</mo><mn>0</mn></mrow><mrow><mi>n</mi></mrow></munderover><msup><mn>2</mn><mrow><mi>i</mi></mrow></msup></math></div>"))
|
527
539
|
expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
|
528
540
|
.to match_fuzzy(<<~OUTPUT)
|
529
541
|
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
@@ -537,7 +549,7 @@ RSpec.describe Html2Doc do
|
|
537
549
|
it "processes tabs" do
|
538
550
|
simple_body = "<h1>Hello word!</h1>
|
539
551
|
<div>This is a very &tab; simple document</div>"
|
540
|
-
Html2Doc.
|
552
|
+
Html2Doc.new(filename: "test").process(html_input(simple_body))
|
541
553
|
expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
|
542
554
|
.to match_fuzzy(<<~OUTPUT)
|
543
555
|
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
@@ -550,7 +562,7 @@ RSpec.describe Html2Doc do
|
|
550
562
|
simple_body = '<h1>Hello word!</h1>
|
551
563
|
<p>This is a very simple document</p>
|
552
564
|
<p class="x">This style stays</p>'
|
553
|
-
Html2Doc.
|
565
|
+
Html2Doc.new(filename: "test").process(html_input(simple_body))
|
554
566
|
expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
|
555
567
|
.to match_fuzzy(<<~OUTPUT)
|
556
568
|
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
@@ -565,7 +577,7 @@ RSpec.describe Html2Doc do
|
|
565
577
|
<li>This is a very simple document</li>
|
566
578
|
<li class="x">This style stays</li>
|
567
579
|
</ul>'
|
568
|
-
Html2Doc.
|
580
|
+
Html2Doc.new(filename: "test").process(html_input(simple_body))
|
569
581
|
expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
|
570
582
|
.to match_fuzzy(<<~OUTPUT)
|
571
583
|
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
@@ -576,8 +588,8 @@ RSpec.describe Html2Doc do
|
|
576
588
|
|
577
589
|
it "resizes images for height, in a file in a subdirectory" do
|
578
590
|
simple_body = '<img src="19160-6.png">'
|
579
|
-
Html2Doc.
|
580
|
-
|
591
|
+
Html2Doc.new(filename: "spec/test", imagedir: "spec")
|
592
|
+
.process(html_input(simple_body))
|
581
593
|
testdoc = File.read("spec/test.doc", encoding: "utf-8")
|
582
594
|
expect(testdoc).to match(%r{Content-Type: image/png})
|
583
595
|
expect(image_clean(guid_clean(testdoc))).to match_fuzzy(<<~OUTPUT)
|
@@ -589,7 +601,8 @@ RSpec.describe Html2Doc do
|
|
589
601
|
|
590
602
|
it "resizes images for width" do
|
591
603
|
simple_body = '<img src="spec/19160-7.gif">'
|
592
|
-
Html2Doc.
|
604
|
+
Html2Doc.new(filename: "test", imagedir: ".")
|
605
|
+
.process(html_input(simple_body))
|
593
606
|
testdoc = File.read("test.doc", encoding: "utf-8")
|
594
607
|
expect(testdoc).to match(%r{Content-Type: image/gif})
|
595
608
|
expect(image_clean(guid_clean(testdoc))).to match_fuzzy(<<~OUTPUT)
|
@@ -601,7 +614,8 @@ RSpec.describe Html2Doc do
|
|
601
614
|
|
602
615
|
it "resizes images for height" do
|
603
616
|
simple_body = '<img src="spec/19160-8.jpg">'
|
604
|
-
Html2Doc.
|
617
|
+
Html2Doc.new(filename: "test", imagedir: ".")
|
618
|
+
.process(html_input(simple_body))
|
605
619
|
testdoc = File.read("test.doc", encoding: "utf-8")
|
606
620
|
expect(testdoc).to match(%r{Content-Type: image/jpeg})
|
607
621
|
expect(image_clean(guid_clean(testdoc))).to match_fuzzy(<<~OUTPUT)
|
@@ -613,48 +627,49 @@ RSpec.describe Html2Doc do
|
|
613
627
|
|
614
628
|
it "resizes images with missing or auto sizes" do
|
615
629
|
image = { "src" => "spec/19160-8.jpg" }
|
616
|
-
expect(Html2Doc.image_resize(image, "spec/19160-8.jpg", 100, 100))
|
630
|
+
expect(Html2Doc.new({}).image_resize(image, "spec/19160-8.jpg", 100, 100))
|
617
631
|
.to eq [30, 100]
|
618
632
|
image["width"] = "20"
|
619
|
-
expect(Html2Doc.image_resize(image, "spec/19160-8.jpg", 100, 100))
|
633
|
+
expect(Html2Doc.new({}).image_resize(image, "spec/19160-8.jpg", 100, 100))
|
620
634
|
.to eq [20, 65]
|
621
635
|
image.delete("width")
|
622
636
|
image["height"] = "50"
|
623
|
-
expect(Html2Doc.image_resize(image, "spec/19160-8.jpg", 100, 100))
|
637
|
+
expect(Html2Doc.new({}).image_resize(image, "spec/19160-8.jpg", 100, 100))
|
624
638
|
.to eq [15, 50]
|
625
639
|
image.delete("height")
|
626
640
|
image["width"] = "500"
|
627
|
-
expect(Html2Doc.image_resize(image, "spec/19160-8.jpg", 100, 100))
|
641
|
+
expect(Html2Doc.new({}).image_resize(image, "spec/19160-8.jpg", 100, 100))
|
628
642
|
.to eq [30, 100]
|
629
643
|
image.delete("width")
|
630
644
|
image["height"] = "500"
|
631
|
-
expect(Html2Doc.image_resize(image, "spec/19160-8.jpg", 100, 100))
|
645
|
+
expect(Html2Doc.new({}).image_resize(image, "spec/19160-8.jpg", 100, 100))
|
632
646
|
.to eq [30, 100]
|
633
647
|
image["width"] = "20"
|
634
648
|
image["height"] = "auto"
|
635
|
-
expect(Html2Doc.image_resize(image, "spec/19160-8.jpg", 100, 100))
|
649
|
+
expect(Html2Doc.new({}).image_resize(image, "spec/19160-8.jpg", 100, 100))
|
636
650
|
.to eq [20, 65]
|
637
651
|
image["width"] = "auto"
|
638
652
|
image["height"] = "50"
|
639
|
-
expect(Html2Doc.image_resize(image, "spec/19160-8.jpg", 100, 100))
|
653
|
+
expect(Html2Doc.new({}).image_resize(image, "spec/19160-8.jpg", 100, 100))
|
640
654
|
.to eq [15, 50]
|
641
655
|
image["width"] = "500"
|
642
656
|
image["height"] = "auto"
|
643
|
-
expect(Html2Doc.image_resize(image, "spec/19160-8.jpg", 100, 100))
|
657
|
+
expect(Html2Doc.new({}).image_resize(image, "spec/19160-8.jpg", 100, 100))
|
644
658
|
.to eq [30, 100]
|
645
659
|
image["width"] = "auto"
|
646
660
|
image["height"] = "500"
|
647
|
-
expect(Html2Doc.image_resize(image, "spec/19160-8.jpg", 100, 100))
|
661
|
+
expect(Html2Doc.new({}).image_resize(image, "spec/19160-8.jpg", 100, 100))
|
648
662
|
.to eq [30, 100]
|
649
663
|
image["width"] = "auto"
|
650
664
|
image["height"] = "auto"
|
651
|
-
expect(Html2Doc.image_resize(image, "spec/19160-8.jpg", 100, 100))
|
665
|
+
expect(Html2Doc.new({}).image_resize(image, "spec/19160-8.jpg", 100, 100))
|
652
666
|
.to eq [30, 100]
|
653
667
|
end
|
654
668
|
|
655
669
|
it "does not move images if they are external URLs" do
|
656
670
|
simple_body = '<img src="https://example.com/19160-6.png">'
|
657
|
-
Html2Doc.
|
671
|
+
Html2Doc.new(filename: "test", imagedir: ".")
|
672
|
+
.process(html_input(simple_body))
|
658
673
|
testdoc = File.read("test.doc", encoding: "utf-8")
|
659
674
|
expect(image_clean(guid_clean(testdoc))).to match_fuzzy(<<~OUTPUT)
|
660
675
|
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
@@ -665,8 +680,8 @@ RSpec.describe Html2Doc do
|
|
665
680
|
|
666
681
|
it "deals with absolute image locations" do
|
667
682
|
simple_body = %{<img src="#{__dir__}/19160-6.png">}
|
668
|
-
Html2Doc.
|
669
|
-
|
683
|
+
Html2Doc.new(filename: "spec/test", imagedir: ".")
|
684
|
+
.process(html_input(simple_body))
|
670
685
|
testdoc = File.read("spec/test.doc", encoding: "utf-8")
|
671
686
|
expect(testdoc).to match(%r{Content-Type: image/png})
|
672
687
|
expect(image_clean(guid_clean(testdoc))).to match_fuzzy(<<~OUTPUT)
|
@@ -687,7 +702,7 @@ RSpec.describe Html2Doc do
|
|
687
702
|
document<a epub:type="footnote" href="#a1">1</a> allegedly<a epub:type="footnote" href="#a2">2</a></div>
|
688
703
|
<aside id="a1">Footnote</aside>
|
689
704
|
<aside id="a2">Other Footnote</aside>'
|
690
|
-
Html2Doc.
|
705
|
+
Html2Doc.new(filename: "test").process(html_input(simple_body))
|
691
706
|
expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
|
692
707
|
.to match_fuzzy(<<~OUTPUT)
|
693
708
|
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
@@ -707,7 +722,7 @@ RSpec.describe Html2Doc do
|
|
707
722
|
document<a class="footnote" href="#a1">1</a> allegedly<a class="footnote" href="#a2">2</a></div>
|
708
723
|
<aside id="a1">Footnote</aside>
|
709
724
|
<aside id="a2">Other Footnote</aside>'
|
710
|
-
Html2Doc.
|
725
|
+
Html2Doc.new(filename: "test").process(html_input(simple_body))
|
711
726
|
expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
|
712
727
|
.to match_fuzzy(<<~OUTPUT)
|
713
728
|
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
@@ -727,7 +742,7 @@ RSpec.describe Html2Doc do
|
|
727
742
|
document<a class="footnote" href="#a1">(<span class="MsoFootnoteReference">1</span>)</a> allegedly<a class="footnote" href="#a2">2</a></div>
|
728
743
|
<aside id="a1">Footnote</aside>
|
729
744
|
<aside id="a2">Other Footnote</aside>'
|
730
|
-
Html2Doc.
|
745
|
+
Html2Doc.new(filename: "test").process(html_input(simple_body))
|
731
746
|
expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
|
732
747
|
.to match_fuzzy(<<~OUTPUT)
|
733
748
|
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
@@ -747,7 +762,7 @@ RSpec.describe Html2Doc do
|
|
747
762
|
document<a class="footnote" href="#a1">1</a> allegedly<a class="footnote" href="#a2">2</a></div>
|
748
763
|
<aside id="a1"><p>Footnote</p></aside>
|
749
764
|
<div id="a2"><p>Other Footnote</p></div>'
|
750
|
-
Html2Doc.
|
765
|
+
Html2Doc.new(filename: "test").process(html_input(simple_body))
|
751
766
|
expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
|
752
767
|
.to match_fuzzy(<<~OUTPUT)
|
753
768
|
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
@@ -767,8 +782,8 @@ RSpec.describe Html2Doc do
|
|
767
782
|
<div><ul id="0">
|
768
783
|
<li><div><p><ol id="1"><li><ul id="2"><li><p><ol id="3"><li><ol id="4"><li>A</li><li><p>B</p><p>B2</p></li><li>C</li></ol></li></ol></p></li></ul></li></ol></p></div></li><div><ul id="5"><li>C</li></ul></div>
|
769
784
|
BODY
|
770
|
-
Html2Doc.
|
771
|
-
|
785
|
+
Html2Doc.new(filename: "test", liststyles: { ul: "l1", ol: "l2" })
|
786
|
+
.process(html_input(simple_body))
|
772
787
|
expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
|
773
788
|
.to match_fuzzy(<<~OUTPUT)
|
774
789
|
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
@@ -786,8 +801,8 @@ RSpec.describe Html2Doc do
|
|
786
801
|
<ol id="1"><li><div><p><ol id="2"><li><ul id="3"><li><p><ol id="4"><li><ol id="5"><li>A</li></ol></li></ol></p></li></ul></li></ol></p></div></li></ol>
|
787
802
|
<ol id="6"><li><div><p><ol id="7"><li><ul id="8"><li><p><ol id="9"><li><ol id="10"><li>A</li></ol></li></ol></p></li></ul></li></ol></p></div></li></ol></div>
|
788
803
|
BODY
|
789
|
-
Html2Doc.
|
790
|
-
|
804
|
+
Html2Doc.new(filename: "test", liststyles: { ul: "l1", ol: "l2" })
|
805
|
+
.process(html_input(simple_body))
|
791
806
|
expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
|
792
807
|
.to match_fuzzy(<<~OUTPUT)
|
793
808
|
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
@@ -808,9 +823,10 @@ RSpec.describe Html2Doc do
|
|
808
823
|
<div><ul class="other" id="10">
|
809
824
|
<li><div><p><ol id="11"><li><ul id="12"><li><p><ol id="13"><li><ol id="14"><li>A</li><li><p>B</p><p>B2</p></li><li>C</li></ol></li></ol></p></li></ul></li></ol></p></div></li></ul></div>
|
810
825
|
BODY
|
811
|
-
Html2Doc.
|
812
|
-
|
813
|
-
|
826
|
+
Html2Doc.new(filename: "test",
|
827
|
+
liststyles: { ul: "l1", ol: "l2",
|
828
|
+
steps: "l3" })
|
829
|
+
.process(html_input(simple_body))
|
814
830
|
expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
|
815
831
|
.to match_fuzzy(<<~OUTPUT)
|
816
832
|
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
@@ -832,8 +848,8 @@ RSpec.describe Html2Doc do
|
|
832
848
|
<p id="b"/>
|
833
849
|
</div>
|
834
850
|
BODY
|
835
|
-
Html2Doc.
|
836
|
-
|
851
|
+
Html2Doc.new(filename: "test", liststyles: { ul: "l1", ol: "l2" })
|
852
|
+
.process(html_input(simple_body))
|
837
853
|
expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
|
838
854
|
.to match_fuzzy(<<~OUTPUT)
|
839
855
|
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
@@ -848,8 +864,8 @@ RSpec.describe Html2Doc do
|
|
848
864
|
|
849
865
|
it "test image base64 image encoding" do
|
850
866
|
simple_body = '<img src="19160-6.png">'
|
851
|
-
Html2Doc.
|
852
|
-
|
867
|
+
Html2Doc.new(filename: "spec/test", debug: true, imagedir: "spec")
|
868
|
+
.process(html_input(simple_body))
|
853
869
|
testdoc = File.read("spec/test.doc", encoding: "utf-8")
|
854
870
|
base64_image = testdoc[/image\/png\n\n(.*?)\n\n----/m, 1].gsub!("\n", "")
|
855
871
|
base64_image_basename = testdoc[%r{Content-ID: <([0-9a-z\-]+)\.png}m, 1]
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2doc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.4.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-05-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: asciimath
|
@@ -334,7 +334,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
334
334
|
- !ruby/object:Gem::Version
|
335
335
|
version: '0'
|
336
336
|
requirements: []
|
337
|
-
rubygems_version: 3.
|
337
|
+
rubygems_version: 3.3.9
|
338
338
|
signing_key:
|
339
339
|
specification_version: 4
|
340
340
|
summary: Convert HTML document to Microsoft Word document
|