html2doc 0.6.1 → 0.6.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitattributes +2 -0
- data/Gemfile +6 -0
- data/README.adoc +3 -3
- data/bin/rspec +18 -0
- data/html2doc.gemspec +1 -1
- data/lib/html2doc/base.rb +56 -22
- data/lib/html2doc/mime.rb +4 -2
- data/lib/html2doc/notes.rb +1 -2
- data/lib/html2doc/version.rb +1 -1
- data/spec/19160-6.png +0 -0
- data/spec/19160-7.gif +0 -0
- data/spec/header.html +184 -0
- data/spec/html2doc_spec.rb +401 -4
- data/spec/spec_helper.rb +6 -0
- metadata +21 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 74ea5fa5a0e4221f38ded5491536c3b7fbeeb51b
|
4
|
+
data.tar.gz: 6cfa24874e5afe45854c7c86061b1d3a9c1cb80a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d2aa3ea91ba1fe76ee540c85b0cb944ef3cc9eca4dd12f7884028fad0e41cdbc17b8e1ca17c6d110a73cb65e0dc214b8815f6296464c8a20399b00e28016717f
|
7
|
+
data.tar.gz: efa78fc0e9d6533ed21bd9e564cdad6175c128510417ccf43d02a429a3c397bdec8f50987548ce9c2db78204ddb641fdc2828b9f1c29e02cc90868c8f5a1e7fa
|
data/.gitattributes
ADDED
data/Gemfile
CHANGED
data/README.adoc
CHANGED
@@ -29,9 +29,9 @@ Work to be done:
|
|
29
29
|
|
30
30
|
== Constraints
|
31
31
|
|
32
|
-
This generates .doc documents.
|
32
|
+
This generates .doc documents. Future versions will upgrade the output to docx.
|
33
33
|
|
34
|
-
|
34
|
+
There are two other Microsoft Word vendors in the Ruby ecosystem. https://github.com/jetruby/puredocx generates Word documents from a Ruby struct as a DSL, rather than converting a preexisting HTML document. That constrains its coverage to what is explicitly catered for in the DSL. https://github.com/MuhammetDilmac/Html2Docx is a much simpler wrapper around HTML: it does not do any of the added functionality described above (image resizing, converting footnotes, AsciiMath and MathML), though it does already generate docx.
|
35
35
|
|
36
36
|
== Usage
|
37
37
|
|
@@ -46,7 +46,7 @@ result:: is the Html document to be converted into Word, as a string.
|
|
46
46
|
filename:: is the name the document is to be saved as, without a file suffix
|
47
47
|
stylesheet:: is the full path filename of the CSS stylesheet for Microsoft Word-specific styles. If this is not provided (`nil`), the program will use the default stylesheet included in the gem, `lib/html2doc/wordstyle.css`. The stylesheet provided must match this stylesheet; you can obtain one by saving a Word document with your desired styles to HTML, and extracting the style definitions from the HTML document header.
|
48
48
|
header_filename:: is the filename of the HTML document containing header and footer for the document, as well as footnote/endnote separators; if there is none, use nil. To generate your own such document, save a Word document with headers/footers and/or footnote/endnote separators as an HTML document; the `header.html` will be in the `{filename}.fld` folder generated along with the HTML. A sample file is available at https://github.com/riboseinc/asciidoctor-iso/blob/master/lib/asciidoctor/iso/word/header.html
|
49
|
-
dir:: is the
|
49
|
+
dir:: is the folder that any ancillary files (images, headers, filelist) are to be saved to. If not provided (`nil`), it will be created as `{filename}_files`. Anything in the directory will be attached to the Word document; so this folder should only contain the images that accompany the document. (If the images are elsewhere on the local drive, the gem will move them into the folder.)
|
50
50
|
asciimathdelims:: are the AsciiMath delimiters used in the text. If none are provided, no AsciiMath conversion is attempted.
|
51
51
|
|
52
52
|
Note that the local CSS stylesheet file contains a variable `FILENAME` for the location of footnote/endnote separators and headers/footers, which are provided in the header HTML file. The gem replaces `FILENAME` with the file name that the document will be saved as. If you supply your own stylesheet and also wish to use separators or headers/footers, you will likewise need to replace the document name mentioned in your stylesheet with a `FILENAME` string.
|
data/bin/rspec
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'rspec' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
require "pathname"
|
10
|
+
ENV["BUNDLE_GEMFILE"] ||= File.expand_path(
|
11
|
+
"../../Gemfile", Pathname.new(__FILE__).realpath
|
12
|
+
)
|
13
|
+
|
14
|
+
require "rubygems"
|
15
|
+
require "bundler/setup"
|
16
|
+
|
17
|
+
load Gem.bin_path("rspec-core", "rspec")
|
18
|
+
|
data/html2doc.gemspec
CHANGED
@@ -10,7 +10,6 @@ Gem::Specification.new do |spec|
|
|
10
10
|
spec.email = ["open.source@ribose.com"]
|
11
11
|
|
12
12
|
spec.summary = "Convert HTML document to Microsoft Word document"
|
13
|
-
"in AsciiDoc."
|
14
13
|
spec.description = <<~DESCRIPTION
|
15
14
|
Convert HTML document to Microsoft Word document.
|
16
15
|
|
@@ -45,4 +44,5 @@ Gem::Specification.new do |spec|
|
|
45
44
|
spec.add_development_dependency "rubocop", "~> 0.50"
|
46
45
|
spec.add_development_dependency "simplecov", "~> 0.15"
|
47
46
|
spec.add_development_dependency "timecop", "~> 0.9"
|
47
|
+
spec.add_development_dependency "rspec-match_fuzzy"
|
48
48
|
end
|
data/lib/html2doc/base.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require "uuidtools"
|
2
2
|
require "asciimath"
|
3
|
+
require "image_size"
|
3
4
|
require "nokogiri"
|
4
5
|
require "xml/xslt"
|
5
6
|
require "pp"
|
@@ -8,27 +9,36 @@ module Html2Doc
|
|
8
9
|
@xslt = XML::XSLT.new
|
9
10
|
@xslt.xsl = File.read(File.join(File.dirname(__FILE__), "mathml2omml.xsl"))
|
10
11
|
|
11
|
-
def self.process(result, filename, stylesheet, header_file, dir,
|
12
|
+
def self.process(result, filename, stylesheet, header_file, dir = nil,
|
12
13
|
asciimathdelims = nil)
|
14
|
+
dir1 = create_dir(filename, dir)
|
13
15
|
result = process_html(result, filename, stylesheet, header_file,
|
14
|
-
|
15
|
-
system "cp #{header_file} #{
|
16
|
-
generate_filelist(filename,
|
16
|
+
dir1, asciimathdelims)
|
17
|
+
system "cp #{header_file} #{dir1}/header.html" unless header_file.nil?
|
18
|
+
generate_filelist(filename, dir1)
|
17
19
|
File.open("#{filename}.htm", "w") { |f| f.write(result) }
|
18
|
-
mime_package result, filename,
|
19
|
-
rm_temp_files(filename, dir)
|
20
|
+
mime_package result, filename, dir1
|
21
|
+
rm_temp_files(filename, dir, dir1)
|
22
|
+
end
|
23
|
+
|
24
|
+
def self.create_dir(filename, dir)
|
25
|
+
return dir if dir
|
26
|
+
dir = "#{filename}_files"
|
27
|
+
Dir.mkdir(dir) unless File.exists?(dir)
|
28
|
+
dir
|
20
29
|
end
|
21
30
|
|
22
31
|
def self.process_html(result, filename, stylesheet, header_file, dir,
|
23
32
|
asciimathdelims)
|
24
|
-
docxml = Nokogiri::XML(asciimath_to_mathml(result, asciimathdelims))
|
33
|
+
# docxml = Nokogiri::XML(asciimath_to_mathml(result, asciimathdelims))
|
34
|
+
docxml = to_xhtml(asciimath_to_mathml(result, asciimathdelims))
|
25
35
|
define_head(cleanup(docxml, dir), dir, filename, stylesheet, header_file)
|
26
|
-
msword_fix(docxml
|
36
|
+
msword_fix(from_xhtml(docxml))
|
27
37
|
end
|
28
38
|
|
29
|
-
def self.rm_temp_files(filename,
|
39
|
+
def self.rm_temp_files(filename, dir, dir1)
|
30
40
|
system "rm #{filename}.htm"
|
31
|
-
system "rm -r #{
|
41
|
+
system "rm -r #{dir1}" unless dir
|
32
42
|
end
|
33
43
|
|
34
44
|
def self.cleanup(docxml, dir)
|
@@ -58,26 +68,44 @@ module Html2Doc
|
|
58
68
|
end
|
59
69
|
end
|
60
70
|
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
71
|
+
NOKOHEAD = <<~HERE.freeze
|
72
|
+
<!DOCTYPE html SYSTEM
|
73
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
74
|
+
<html xmlns="http://www.w3.org/1999/xhtml">
|
75
|
+
<head> <title></title> <meta charset="UTF-8" /> </head>
|
76
|
+
<body> </body> </html>
|
77
|
+
HERE
|
78
|
+
|
79
|
+
def self.to_xhtml(xml)
|
80
|
+
xml.gsub!(/<\?xml[^>]*>/, "")
|
81
|
+
unless /<!DOCTYPE /.match? xml
|
82
|
+
xml = '<!DOCTYPE html SYSTEM
|
83
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' + xml
|
67
84
|
end
|
68
|
-
|
85
|
+
Nokogiri::XML.parse(xml)
|
86
|
+
end
|
87
|
+
|
88
|
+
DOCTYPE = <<~"DOCTYPE".freeze
|
89
|
+
<!DOCTYPE html SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
90
|
+
DOCTYPE
|
91
|
+
|
92
|
+
def self.from_xhtml(xml)
|
93
|
+
xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "").
|
94
|
+
sub(DOCTYPE, "").
|
95
|
+
gsub(%{ />}, "/>")
|
69
96
|
end
|
70
97
|
|
71
98
|
def self.msword_fix(r)
|
72
99
|
# brain damage in MSWord parser
|
73
100
|
r.gsub!(%r{<span style="mso-special-character:footnote"/>},
|
74
101
|
'<span style="mso-special-character:footnote"></span>')
|
102
|
+
r.gsub!(%r{<div style="mso-element:footnote-list"></div>},
|
103
|
+
'<div style="mso-element:footnote-list"/>')
|
75
104
|
r.gsub!(%r{(<a style="mso-comment-reference:[^>/]+)/>}, "\\1></a>")
|
76
105
|
r.gsub!(%r{<link rel="File-List"}, "<link rel=File-List")
|
77
106
|
r.gsub!(%r{<meta http-equiv="Content-Type"},
|
78
107
|
"<meta http-equiv=Content-Type")
|
79
|
-
r.gsub!(%r{&tab;|&tab;},
|
80
|
-
'<span style="mso-tab-count:1">  </span>')
|
108
|
+
r.gsub!(%r{&tab;|&tab;}, '<span style="mso-tab-count:1">  </span>')
|
81
109
|
r
|
82
110
|
end
|
83
111
|
|
@@ -152,13 +180,19 @@ module Html2Doc
|
|
152
180
|
title = docxml.at("//*[local-name() = 'head']/*[local-name() = 'title']")
|
153
181
|
head = docxml.at("//*[local-name() = 'head']")
|
154
182
|
css = stylesheet(filename, header_file, cssname)
|
155
|
-
|
183
|
+
add_stylesheet(head, title, css)
|
184
|
+
define_head1(docxml, dir)
|
185
|
+
namespace(docxml.root)
|
186
|
+
end
|
187
|
+
|
188
|
+
def self.add_stylesheet(head, title, css)
|
189
|
+
if head.children.empty?
|
190
|
+
head.add_child css
|
191
|
+
elsif title.nil?
|
156
192
|
head.children.first.add_previous_sibling css
|
157
193
|
else
|
158
194
|
title.add_next_sibling css
|
159
195
|
end
|
160
|
-
define_head1(docxml, dir)
|
161
|
-
namespace(docxml.root)
|
162
196
|
end
|
163
197
|
|
164
198
|
def self.namespace(root)
|
data/lib/html2doc/mime.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require "uuidtools"
|
2
|
-
require "
|
2
|
+
require "base64"
|
3
|
+
require "mime/types"
|
3
4
|
|
4
5
|
module Html2Doc
|
5
6
|
def self.mime_preamble(boundary, filename, result)
|
@@ -46,8 +47,9 @@ module Html2Doc
|
|
46
47
|
def self.mime_package(result, filename, dir)
|
47
48
|
boundary = mime_boundary
|
48
49
|
mhtml = mime_preamble(boundary, filename, result)
|
50
|
+
mhtml += mime_attachment(boundary, filename, "filelist.xml", dir)
|
49
51
|
Dir.foreach(dir) do |item|
|
50
|
-
next if item == "." || item == ".." || /^\./.match(item)
|
52
|
+
next if item == "." || item == ".." || /^\./.match(item) || item == "filelist.xml"
|
51
53
|
mhtml += mime_attachment(boundary, filename, item, dir)
|
52
54
|
end
|
53
55
|
mhtml += "--#{boundary}--"
|
data/lib/html2doc/notes.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
require "uuidtools"
|
2
|
-
require "nokogiri"
|
3
2
|
|
4
3
|
module Html2Doc
|
5
4
|
def self.footnotes(docxml)
|
@@ -37,7 +36,7 @@ module Html2Doc
|
|
37
36
|
def self.footnote_container(i)
|
38
37
|
<<~DIV
|
39
38
|
<div style='mso-element:footnote' id='ftn#{i}'>
|
40
|
-
<a style='mso-footnote-id:ftn#{i}' href
|
39
|
+
<a style='mso-footnote-id:ftn#{i}' href='#_ftn#{i}'
|
41
40
|
name='_ftnref#{i}' title='' id='_ftnref#{i}'><span
|
42
41
|
class='MsoFootnoteReference'><span
|
43
42
|
style='mso-special-character:footnote'></span></span></div>
|
data/lib/html2doc/version.rb
CHANGED
data/spec/19160-6.png
ADDED
Binary file
|
data/spec/19160-7.gif
ADDED
Binary file
|
data/spec/header.html
ADDED
@@ -0,0 +1,184 @@
|
|
1
|
+
<html xmlns:v="urn:schemas-microsoft-com:vml"
|
2
|
+
xmlns:o="urn:schemas-microsoft-com:office:office"
|
3
|
+
xmlns:w="urn:schemas-microsoft-com:office:word"
|
4
|
+
xmlns:m="http://schemas.microsoft.com/office/2004/12/omml"
|
5
|
+
xmlns:mv="http://macVmlSchemaUri" xmlns="http://www.w3.org/TR/REC-html40">
|
6
|
+
|
7
|
+
<head>
|
8
|
+
<meta name=Title content="">
|
9
|
+
<meta name=Keywords content="">
|
10
|
+
<meta http-equiv=Content-Type content="text/html; charset=utf-8">
|
11
|
+
<meta name=ProgId content=Word.Document>
|
12
|
+
<meta name=Generator content="Microsoft Word 15">
|
13
|
+
<meta name=Originator content="Microsoft Word 15">
|
14
|
+
<link id=Main-File rel=Main-File href="FILENAME.html">
|
15
|
+
<!--[if gte mso 9]><xml>
|
16
|
+
<o:shapedefaults v:ext="edit" spidmax="2049"/>
|
17
|
+
</xml><![endif]-->
|
18
|
+
</head>
|
19
|
+
|
20
|
+
<body lang=EN link=blue vlink="#954F72">
|
21
|
+
|
22
|
+
<div style='mso-element:footnote-separator' id=fs>
|
23
|
+
|
24
|
+
<p class=MsoNormal style='margin-bottom:0cm;margin-bottom:.0001pt;line-height:
|
25
|
+
normal'><span lang=EN-GB><span style='mso-special-character:footnote-separator'><![if !supportFootnotes]>
|
26
|
+
|
27
|
+
<hr align=left size=1 width="33%">
|
28
|
+
|
29
|
+
<![endif]></span></span></p>
|
30
|
+
|
31
|
+
</div>
|
32
|
+
|
33
|
+
<div style='mso-element:footnote-continuation-separator' id=fcs>
|
34
|
+
|
35
|
+
<p class=MsoNormal style='margin-bottom:0cm;margin-bottom:.0001pt;line-height:
|
36
|
+
normal'><span lang=EN-GB><span style='mso-special-character:footnote-continuation-separator'><![if !supportFootnotes]>
|
37
|
+
|
38
|
+
<hr align=left size=1>
|
39
|
+
|
40
|
+
<![endif]></span></span></p>
|
41
|
+
|
42
|
+
</div>
|
43
|
+
|
44
|
+
<div style='mso-element:endnote-separator' id=es>
|
45
|
+
|
46
|
+
<p class=MsoNormal style='margin-bottom:0cm;margin-bottom:.0001pt;line-height:
|
47
|
+
normal'><span lang=EN-GB><span style='mso-special-character:footnote-separator'><![if !supportFootnotes]>
|
48
|
+
|
49
|
+
<hr align=left size=1 width="33%">
|
50
|
+
|
51
|
+
<![endif]></span></span></p>
|
52
|
+
|
53
|
+
</div>
|
54
|
+
|
55
|
+
<div style='mso-element:endnote-continuation-separator' id=ecs>
|
56
|
+
|
57
|
+
<p class=MsoNormal style='margin-bottom:0cm;margin-bottom:.0001pt;line-height:
|
58
|
+
normal'><span lang=EN-GB><span style='mso-special-character:footnote-continuation-separator'><![if !supportFootnotes]>
|
59
|
+
|
60
|
+
<hr align=left size=1>
|
61
|
+
|
62
|
+
<![endif]></span></span></p>
|
63
|
+
|
64
|
+
</div>
|
65
|
+
|
66
|
+
<div style='mso-element:header' id=eh1>
|
67
|
+
|
68
|
+
<p class=MsoHeader align=left style='text-align:left;line-height:12.0pt;
|
69
|
+
mso-line-height-rule:exactly'><span lang=EN-GB>ISO/IEC CD 17301-1:2016(E)</span></p>
|
70
|
+
|
71
|
+
</div>
|
72
|
+
|
73
|
+
<div style='mso-element:header' id=h1>
|
74
|
+
|
75
|
+
<p class=MsoHeader style='margin-bottom:18.0pt'><span lang=EN-GB
|
76
|
+
style='font-size:10.0pt;mso-bidi-font-size:11.0pt;font-weight:normal'>©
|
77
|
+
ISO/IEC 2016 – All rights reserved</span><span lang=EN-GB
|
78
|
+
style='font-weight:normal'><o:p></o:p></span></p>
|
79
|
+
|
80
|
+
</div>
|
81
|
+
|
82
|
+
<div style='mso-element:footer' id=ef1>
|
83
|
+
|
84
|
+
<p class=MsoFooter style='margin-top:12.0pt;line-height:12.0pt;mso-line-height-rule:
|
85
|
+
exactly'><!--[if supportFields]><b style='mso-bidi-font-weight:normal'><span
|
86
|
+
lang=EN-GB style='font-size:10.0pt;mso-bidi-font-size:11.0pt'><span
|
87
|
+
style='mso-element:field-begin'></span><span
|
88
|
+
style='mso-spacerun:yes'> </span>PAGE<span style='mso-spacerun:yes'>
|
89
|
+
</span>\* MERGEFORMAT <span style='mso-element:field-separator'></span></span></b><![endif]--><b
|
90
|
+
style='mso-bidi-font-weight:normal'><span lang=EN-GB style='font-size:10.0pt;
|
91
|
+
mso-bidi-font-size:11.0pt'><span style='mso-no-proof:yes'>2</span></span></b><!--[if supportFields]><b
|
92
|
+
style='mso-bidi-font-weight:normal'><span lang=EN-GB style='font-size:10.0pt;
|
93
|
+
mso-bidi-font-size:11.0pt'><span style='mso-element:field-end'></span></span></b><![endif]--><span
|
94
|
+
lang=EN-GB style='font-size:10.0pt;mso-bidi-font-size:11.0pt'><span
|
95
|
+
style='mso-tab-count:1'> </span>©
|
96
|
+
ISO/IEC 2016 – All rights reserved<o:p></o:p></span></p>
|
97
|
+
|
98
|
+
</div>
|
99
|
+
|
100
|
+
<div style='mso-element:header' id=eh2>
|
101
|
+
|
102
|
+
<p class=MsoHeader align=left style='text-align:left;line-height:12.0pt;
|
103
|
+
mso-line-height-rule:exactly'><span lang=EN-GB>ISO/IEC CD 17301-1:2016(E)</span></p>
|
104
|
+
|
105
|
+
</div>
|
106
|
+
|
107
|
+
<div style='mso-element:header' id=h2>
|
108
|
+
|
109
|
+
<p class=MsoHeader align=right style='text-align:right;line-height:12.0pt;
|
110
|
+
mso-line-height-rule:exactly'><span lang=EN-GB>ISO/IEC CD 17301-1:2016(E)</span></p>
|
111
|
+
|
112
|
+
</div>
|
113
|
+
|
114
|
+
<div style='mso-element:footer' id=ef2>
|
115
|
+
|
116
|
+
<p class=MsoFooter style='line-height:12.0pt;mso-line-height-rule:exactly'><!--[if supportFields]><span
|
117
|
+
lang=EN-GB style='font-size:10.0pt;mso-bidi-font-size:11.0pt'><span
|
118
|
+
style='mso-element:field-begin'></span><span
|
119
|
+
style='mso-spacerun:yes'> </span>PAGE<span style='mso-spacerun:yes'>
|
120
|
+
</span>\* MERGEFORMAT <span style='mso-element:field-separator'></span></span><![endif]--><span
|
121
|
+
lang=EN-GB style='font-size:10.0pt;mso-bidi-font-size:11.0pt'><span
|
122
|
+
style='mso-no-proof:yes'>ii</span></span><!--[if supportFields]><span
|
123
|
+
lang=EN-GB style='font-size:10.0pt;mso-bidi-font-size:11.0pt'><span
|
124
|
+
style='mso-element:field-end'></span></span><![endif]--><span lang=EN-GB
|
125
|
+
style='font-size:10.0pt;mso-bidi-font-size:11.0pt'><span style='mso-tab-count:
|
126
|
+
1'> </span>©
|
127
|
+
ISO/IEC 2016 – All rights reserved<o:p></o:p></span></p>
|
128
|
+
|
129
|
+
</div>
|
130
|
+
|
131
|
+
<div style='mso-element:footer' id=f2>
|
132
|
+
|
133
|
+
<p class=MsoFooter style='line-height:12.0pt'><span lang=EN-GB
|
134
|
+
style='font-size:10.0pt;mso-bidi-font-size:11.0pt'>© ISO/IEC 2016 – All
|
135
|
+
rights reserved<span style='mso-tab-count:1'> </span></span><!--[if supportFields]><span
|
136
|
+
lang=EN-GB style='font-size:10.0pt;mso-bidi-font-size:11.0pt'><span
|
137
|
+
style='mso-element:field-begin'></span> PAGE<span style='mso-spacerun:yes'>
|
138
|
+
</span>\* MERGEFORMAT <span style='mso-element:field-separator'></span></span><![endif]--><span
|
139
|
+
lang=EN-GB style='font-size:10.0pt;mso-bidi-font-size:11.0pt'><span
|
140
|
+
style='mso-no-proof:yes'>iii</span></span><!--[if supportFields]><span
|
141
|
+
lang=EN-GB style='font-size:10.0pt;mso-bidi-font-size:11.0pt'><span
|
142
|
+
style='mso-element:field-end'></span></span><![endif]--><span lang=EN-GB
|
143
|
+
style='font-size:10.0pt;mso-bidi-font-size:11.0pt'><o:p></o:p></span></p>
|
144
|
+
|
145
|
+
</div>
|
146
|
+
|
147
|
+
<div style='mso-element:footer' id=ef3>
|
148
|
+
|
149
|
+
<p class=MsoFooter style='margin-top:12.0pt;line-height:12.0pt;mso-line-height-rule:
|
150
|
+
exactly'><!--[if supportFields]><b style='mso-bidi-font-weight:normal'><span
|
151
|
+
lang=EN-GB style='font-size:10.0pt;mso-bidi-font-size:11.0pt'><span
|
152
|
+
style='mso-element:field-begin'></span><span
|
153
|
+
style='mso-spacerun:yes'> </span>PAGE<span style='mso-spacerun:yes'>
|
154
|
+
</span>\* MERGEFORMAT <span style='mso-element:field-separator'></span></span></b><![endif]--><b
|
155
|
+
style='mso-bidi-font-weight:normal'><span lang=EN-GB style='font-size:10.0pt;
|
156
|
+
mso-bidi-font-size:11.0pt'><span style='mso-no-proof:yes'>2</span></span></b><!--[if supportFields]><b
|
157
|
+
style='mso-bidi-font-weight:normal'><span lang=EN-GB style='font-size:10.0pt;
|
158
|
+
mso-bidi-font-size:11.0pt'><span style='mso-element:field-end'></span></span></b><![endif]--><span
|
159
|
+
lang=EN-GB style='font-size:10.0pt;mso-bidi-font-size:11.0pt'><span
|
160
|
+
style='mso-tab-count:1'> </span>©
|
161
|
+
ISO/IEC 2016 – All rights reserved<o:p></o:p></span></p>
|
162
|
+
|
163
|
+
</div>
|
164
|
+
|
165
|
+
<div style='mso-element:footer' id=f3>
|
166
|
+
|
167
|
+
<p class=MsoFooter style='line-height:12.0pt'><span lang=EN-GB
|
168
|
+
style='font-size:10.0pt;mso-bidi-font-size:11.0pt'>© ISO/IEC 2016 – All
|
169
|
+
rights reserved<span style='mso-tab-count:1'> </span></span><!--[if supportFields]><b
|
170
|
+
style='mso-bidi-font-weight:normal'><span lang=EN-GB style='font-size:10.0pt;
|
171
|
+
mso-bidi-font-size:11.0pt'><span style='mso-element:field-begin'></span>
|
172
|
+
PAGE<span style='mso-spacerun:yes'> </span>\* MERGEFORMAT <span
|
173
|
+
style='mso-element:field-separator'></span></span></b><![endif]--><b
|
174
|
+
style='mso-bidi-font-weight:normal'><span lang=EN-GB style='font-size:10.0pt;
|
175
|
+
mso-bidi-font-size:11.0pt'><span style='mso-no-proof:yes'>3</span></span></b><!--[if supportFields]><b
|
176
|
+
style='mso-bidi-font-weight:normal'><span lang=EN-GB style='font-size:10.0pt;
|
177
|
+
mso-bidi-font-size:11.0pt'><span style='mso-element:field-end'></span></span></b><![endif]--><span
|
178
|
+
lang=EN-GB style='font-size:10.0pt;mso-bidi-font-size:11.0pt'><o:p></o:p></span></p>
|
179
|
+
|
180
|
+
</div>
|
181
|
+
|
182
|
+
</body>
|
183
|
+
|
184
|
+
</html>
|
data/spec/html2doc_spec.rb
CHANGED
@@ -1,9 +1,406 @@
|
|
1
|
-
|
1
|
+
def html_input(x)
|
2
|
+
<<~HTML
|
3
|
+
<html><head><title>blank</title>
|
4
|
+
<meta name="Originator" content="Me"/>
|
5
|
+
</head>
|
6
|
+
<body>
|
7
|
+
#{x}
|
8
|
+
</body></html>
|
9
|
+
HTML
|
10
|
+
end
|
11
|
+
|
12
|
+
def html_input_no_title(x)
|
13
|
+
<<~HTML
|
14
|
+
<html><head>
|
15
|
+
<meta name="Originator" content="Me"/>
|
16
|
+
</head>
|
17
|
+
<body>
|
18
|
+
#{x}
|
19
|
+
</body></html>
|
20
|
+
HTML
|
21
|
+
end
|
22
|
+
|
23
|
+
def html_input_empty_head(x)
|
24
|
+
<<~HTML
|
25
|
+
<html><head></head>
|
26
|
+
<body>
|
27
|
+
#{x}
|
28
|
+
</body></html>
|
29
|
+
HTML
|
30
|
+
end
|
31
|
+
|
32
|
+
WORD_HDR = <<~HDR
|
33
|
+
MIME-Version: 1.0
|
34
|
+
Content-Type: multipart/related; boundary="----=_NextPart_"
|
35
|
+
|
36
|
+
------=_NextPart_
|
37
|
+
Content-Location: file:///C:/Doc/test.htm
|
38
|
+
Content-Type: text/html; charset="utf-8"
|
39
|
+
|
40
|
+
<?xml version="1.0"?>
|
41
|
+
<html xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40"><head><!--[if gte mso 9]>
|
42
|
+
<xml>
|
43
|
+
<w:WordDocument>
|
44
|
+
<w:View>Print</w:View>
|
45
|
+
<w:Zoom>100</w:Zoom>
|
46
|
+
<w:DoNotOptimizeForBrowser/>
|
47
|
+
</w:WordDocument>
|
48
|
+
</xml>
|
49
|
+
<![endif]-->
|
50
|
+
<meta http-equiv=Content-Type content="text/html; charset=utf-8"/>
|
51
|
+
|
52
|
+
<link rel=File-List href="test_files/filelist.xml"/>
|
53
|
+
<title>blank</title><style><![CDATA[
|
54
|
+
<!--
|
55
|
+
HDR
|
56
|
+
|
57
|
+
WORD_HDR_END = <<~HDR
|
58
|
+
-->
|
59
|
+
]]></style>
|
60
|
+
<meta name="Originator" content="Me"/>
|
61
|
+
</head>
|
62
|
+
HDR
|
63
|
+
|
64
|
+
def word_body(x, fn)
|
65
|
+
<<~BODY
|
66
|
+
<body>
|
67
|
+
#{x}
|
68
|
+
#{fn}</body></html>
|
69
|
+
BODY
|
70
|
+
end
|
71
|
+
|
72
|
+
WORD_FTR1 = <<~FTR
|
73
|
+
------=_NextPart_
|
74
|
+
Content-Location: file:///C:/Doc/test_files/filelist.xml
|
75
|
+
Content-Transfer-Encoding: base64
|
76
|
+
Content-Type: application/xml
|
77
|
+
|
78
|
+
PHhtbCB4bWxuczpvPSJ1cm46c2NoZW1hcy1taWNyb3NvZnQtY29tOm9mZmljZTpvZmZpY2UiPgog
|
79
|
+
ICAgICAgIDxvOk1haW5GaWxlIEhSZWY9Ii4uL3Rlc3QuaHRtIi8+ICA8bzpGaWxlIEhSZWY9ImZp
|
80
|
+
bGVsaXN0LnhtbCIvPgo8L3htbD4K
|
81
|
+
|
82
|
+
------=_NextPart_--
|
83
|
+
FTR
|
84
|
+
|
85
|
+
WORD_FTR2 = <<~FTR
|
86
|
+
------=_NextPart_
|
87
|
+
Content-Location: file:///C:/Doc/test_files/filelist.xml
|
88
|
+
Content-Transfer-Encoding: base64
|
89
|
+
Content-Type: application/xml
|
90
|
+
PHhtbCB4bWxuczpvPSJ1cm46c2NoZW1hcy1taWNyb3NvZnQtY29tOm9mZmljZTpvZmZpY2UiPgog
|
91
|
+
ICAgICAgIDxvOk1haW5GaWxlIEhSZWY9Ii4uL3Rlc3QuaHRtIi8+ICA8bzpGaWxlIEhSZWY9ImZp
|
92
|
+
bGVsaXN0LnhtbCIvPgogIDxvOkZpbGUgSFJlZj0iaGVhZGVyLmh0bWwiLz4KPC94bWw+Cg==
|
93
|
+
------=_NextPart_
|
94
|
+
Content-Location: file:///C:/Doc/test_files/header.html
|
95
|
+
Content-Transfer-Encoding: base64
|
96
|
+
Content-Type: text/html charset="utf-8"
|
97
|
+
PGh0bWwgeG1sbnM6dj0idXJuOnNjaGVtYXMtbWljcm9zb2Z0LWNvbTp2bWwiDQp4bWxuczpvPSJ1
|
98
|
+
cm46c2NoZW1hcy1taWNyb3NvZnQtY29tOm9mZmljZTpvZmZpY2UiDQp4bWxuczp3PSJ1cm46c2No
|
99
|
+
ZW1hcy1taWNyb3NvZnQtY29tOm9mZmljZTp3b3JkIg0KeG1sbnM6bT0iaHR0cDovL3NjaGVtYXMu
|
100
|
+
bWljcm9zb2Z0LmNvbS9vZmZpY2UvMjAwNC8xMi9vbW1sIg0KeG1sbnM9Imh0dHA6Ly93d3cudzMu
|
101
|
+
b3JnL1RSL1JFQy1odG1sNDAiPg0KDQo8aGVhZD4NCjxtZXRhIGh0dHAtZXF1aXY9Q29udGVudC1U
|
102
|
+
eXBlIGNvbnRlbnQ9InRleHQvaHRtbDsgY2hhcnNldD11dGYtOCI+DQo8bWV0YSBuYW1lPVByb2dJ
|
103
|
+
ZCBjb250ZW50PVdvcmQuRG9jdW1lbnQ+DQo8bWV0YSBuYW1lPUdlbmVyYXRvciBjb250ZW50PSJN
|
104
|
+
aWNyb3NvZnQgV29yZCAxNSI+DQo8bWV0YSBuYW1lPU9yaWdpbmF0b3IgY29udGVudD0iTWljcm9z
|
105
|
+
b2Z0IFdvcmQgMTUiPg0KPGxpbmsgaWQ9TWFpbi1GaWxlIHJlbD1NYWluLUZpbGUgaHJlZj0iLi4v
|
106
|
+
cmljZS5nYi5odG1sIj4NCjwhLS1baWYgZ3RlIG1zbyA5XT48eG1sPg0KIDxvOnNoYXBlZGVmYXVs
|
107
|
+
dHMgdjpleHQ9ImVkaXQiIHNwaWRtYXg9IjIwNDkiLz4NCjwveG1sPjwhW2VuZGlmXS0tPg0KPC9o
|
108
|
+
ZWFkPg0KDQo8Ym9keSBsYW5nPVpIIGxpbms9Ymx1ZSB2bGluaz1wdXJwbGU+DQoNCjxkaXYgc3R5
|
109
|
+
bGU9J21zby1lbGVtZW50OmZvb3Rub3RlLXNlcGFyYXRvcicgaWQ9ZnM+DQoNCjxwIGNsYXNzPU1z
|
110
|
+
b05vcm1hbD48c3BhbiBsYW5nPUVOLVVTPjxzcGFuIHN0eWxlPSdtc28tc3BlY2lhbC1jaGFyYWN0
|
111
|
+
ZXI6Zm9vdG5vdGUtc2VwYXJhdG9yJz48IVtpZiAhc3VwcG9ydEZvb3Rub3Rlc10+DQoNCjxociBh
|
112
|
+
bGlnbj1sZWZ0IHNpemU9MSB3aWR0aD0iMzMlIj4NCg0KPCFbZW5kaWZdPjwvc3Bhbj48L3NwYW4+
|
113
|
+
PC9wPg0KDQo8L2Rpdj4NCg0KPGRpdiBzdHlsZT0nbXNvLWVsZW1lbnQ6Zm9vdG5vdGUtY29udGlu
|
114
|
+
dWF0aW9uLXNlcGFyYXRvcicgaWQ9ZmNzPg0KDQo8cCBjbGFzcz1Nc29Ob3JtYWw+PHNwYW4gbGFu
|
115
|
+
Zz1FTi1VUz48c3BhbiBzdHlsZT0nbXNvLXNwZWNpYWwtY2hhcmFjdGVyOmZvb3Rub3RlLWNvbnRp
|
116
|
+
bnVhdGlvbi1zZXBhcmF0b3InPjwhW2lmICFzdXBwb3J0Rm9vdG5vdGVzXT4NCg0KPGhyIGFsaWdu
|
117
|
+
PWxlZnQgc2l6ZT0xPg0KDQo8IVtlbmRpZl0+PC9zcGFuPjwvc3Bhbj48L3A+DQoNCjwvZGl2Pg0K
|
118
|
+
DQo8ZGl2IHN0eWxlPSdtc28tZWxlbWVudDplbmRub3RlLXNlcGFyYXRvcicgaWQ9ZXM+DQoNCjxw
|
119
|
+
IGNsYXNzPU1zb05vcm1hbD48c3BhbiBsYW5nPUVOLVVTPjxzcGFuIHN0eWxlPSdtc28tc3BlY2lh
|
120
|
+
bC1jaGFyYWN0ZXI6Zm9vdG5vdGUtc2VwYXJhdG9yJz48IVtpZiAhc3VwcG9ydEZvb3Rub3Rlc10+
|
121
|
+
DQoNCjxociBhbGlnbj1sZWZ0IHNpemU9MSB3aWR0aD0iMzMlIj4NCg0KPCFbZW5kaWZdPjwvc3Bh
|
122
|
+
bj48L3NwYW4+PC9wPg0KDQo8L2Rpdj4NCg0KPGRpdiBzdHlsZT0nbXNvLWVsZW1lbnQ6ZW5kbm90
|
123
|
+
ZS1jb250aW51YXRpb24tc2VwYXJhdG9yJyBpZD1lY3M+DQoNCjxwIGNsYXNzPU1zb05vcm1hbD48
|
124
|
+
c3BhbiBsYW5nPUVOLVVTPjxzcGFuIHN0eWxlPSdtc28tc3BlY2lhbC1jaGFyYWN0ZXI6Zm9vdG5v
|
125
|
+
dGUtY29udGludWF0aW9uLXNlcGFyYXRvcic+PCFbaWYgIXN1cHBvcnRGb290bm90ZXNdPg0KDQo8
|
126
|
+
aHIgYWxpZ249bGVmdCBzaXplPTE+DQoNCjwhW2VuZGlmXT48L3NwYW4+PC9zcGFuPjwvcD4NCg0K
|
127
|
+
PC9kaXY+DQoNCjxkaXYgc3R5bGU9J21zby1lbGVtZW50OmhlYWRlcicgaWQ9aDI+DQoNCjxwIGNs
|
128
|
+
YXNzPU1zb0hlYWRlcj48c3BhbiBsYW5nPUVOLVVTPkRCMTEvQ0QgMTczMDEtMTwvc3Bhbj48c3Bh
|
129
|
+
biBsYW5nPUVOLVVTDQpzdHlsZT0nZm9udC1mYW1pbHk6IlRpbWVzIE5ldyBSb21hbiIsc2VyaWY7
|
130
|
+
bXNvLWFzY2lpLWZvbnQtZmFtaWx5OlNpbUhlaSc+4oCUPC9zcGFuPjxzcGFuDQpsYW5nPUVOLVVT
|
131
|
+
PjIwMTY8L3NwYW4+PC9wPg0KDQo8L2Rpdj4NCg0KPGRpdiBzdHlsZT0nbXNvLWVsZW1lbnQ6Zm9v
|
132
|
+
dGVyJyBpZD1mMj4NCg0KPHAgY2xhc3M9TXNvRm9vdGVyPjwhLS1baWYgc3VwcG9ydEZpZWxkc10+
|
133
|
+
PHNwYW4gbGFuZz1FTi1VUz48c3BhbiBzdHlsZT0nbXNvLWVsZW1lbnQ6DQpmaWVsZC1iZWdpbic+
|
134
|
+
PC9zcGFuPjxzcGFuIHN0eWxlPSdtc28tc3BhY2VydW46eWVzJz7CoDwvc3Bhbj5QQUdFPHNwYW4N
|
135
|
+
CnN0eWxlPSdtc28tc3BhY2VydW46eWVzJz7CoCA8L3NwYW4+XCogTUVSR0VGT1JNQVQgPHNwYW4g
|
136
|
+
c3R5bGU9J21zby1lbGVtZW50OmZpZWxkLXNlcGFyYXRvcic+PC9zcGFuPjwvc3Bhbj48IVtlbmRp
|
137
|
+
Zl0tLT48c3Bhbg0KbGFuZz1lbCBzdHlsZT0nbXNvLWFuc2ktbGFuZ3VhZ2U6IzA0MDA7bXNvLWZh
|
138
|
+
cmVhc3QtbGFuZ3VhZ2U6IzA0MDA7bXNvLW5vLXByb29mOg0KeWVzJz40Mjwvc3Bhbj48IS0tW2lm
|
139
|
+
IHN1cHBvcnRGaWVsZHNdPjxzcGFuIGxhbmc9RU4tVVM+PHNwYW4gc3R5bGU9J21zby1lbGVtZW50
|
140
|
+
Og0KZmllbGQtZW5kJz48L3NwYW4+PC9zcGFuPjwhW2VuZGlmXS0tPjwvcD4NCg0KPC9kaXY+DQoN
|
141
|
+
CjwvYm9keT4NCg0KPC9odG1sPg0K
|
142
|
+
|
143
|
+
------=_NextPart_--
|
144
|
+
FTR
|
145
|
+
|
146
|
+
WORD_FTR3 = <<~FTR
|
147
|
+
------=_NextPart_
|
148
|
+
Content-Location: file:///C:/Doc/test_files/609e8807-c2d0-450c-b60b-d995a0f8dcaf.png
|
149
|
+
Content-Transfer-Encoding: base64
|
150
|
+
Content-Type: image/png
|
151
|
+
FTR
|
152
|
+
|
153
|
+
WORD_FTR3 = <<~FTR
|
154
|
+
------=_NextPart_
|
155
|
+
Content-Location: file:///C:/Doc/test_files/filelist.xml
|
156
|
+
Content-Transfer-Encoding: base64
|
157
|
+
Content-Type: application/xml
|
158
|
+
|
159
|
+
PHhtbCB4bWxuczpvPSJ1cm46c2NoZW1hcy1taWNyb3NvZnQtY29tOm9mZmljZTpvZmZpY2UiPgog
|
160
|
+
ICAgICAgIDxvOk1haW5GaWxlIEhSZWY9Ii4uL3Rlc3QuaHRtIi8+ICA8bzpGaWxlIEhSZWY9IjFh
|
161
|
+
YzIwNjVmLTAzZjAtNGM3YS1iOWE2LTkyZTgyMDU5MWJmMC5wbmciLz4KICA8bzpGaWxlIEhSZWY9
|
162
|
+
ImZpbGVsaXN0LnhtbCIvPgo8L3htbD4K
|
163
|
+
------=_NextPart_
|
164
|
+
Content-Location: file:///C:/Doc/test_files/cb7b0d19-891e-4634-815a-570d019d454c.png
|
165
|
+
Content-Transfer-Encoding: base64
|
166
|
+
Content-Type: image/png
|
167
|
+
------=_NextPart_--
|
168
|
+
FTR
|
169
|
+
|
170
|
+
DEFAULT_STYLESHEET = File.read("lib/html2doc/wordstyle.css", encoding: "utf-8").freeze
|
171
|
+
|
172
|
+
def guid_clean(x)
|
173
|
+
x.gsub(/NextPart_[0-9a-f.]+/, "NextPart_")
|
174
|
+
end
|
175
|
+
|
176
|
+
# Normalises the image-dependent parts of a document so that output containing
# generated image file names can be compared with a fixture. The substitutions
# are applied in order, each on the result of the previous one:
#
# 1-3. hex/dash file names ending in .png, .gif, .jpeg or .jpg become
#      image.png, image.gif or image.jpg;
# 4.   a MIME part whose Content-Location is image.png/image.gif is collapsed
#      to its boundary;
# 5.   "Content-Type: image/..." up to and including the next boundary is
#      removed;
# 6.   text from "ICAgICAg" up to the next run of five dashes is removed;
# 7-8. whitespace around "</img>" and between "</body>" and "</html>" is
#      removed.
#
# Not idempotent: the final "e" of an already normalised "image.png" is itself
# a hex digit, so a second pass would match "e.png" again.
#
# NOTE(review): pattern 4 ends in "\s-----=_NextPart_" (five dashes) while the
# boundaries are written with six, so it does not appear able to match a real
# boundary. Confirm whether a dash is missing before changing it.
#
# @param x [String] document text
# @return [String] normalised copy of +x+
def image_clean(x)
  substitutions = [
    [%r{[0-9a-f-]+\.png}, "image.png"],
    [%r{[0-9a-f-]+\.gif}, "image.gif"],
    [%r{[0-9a-f-]+\.(jpeg|jpg)}, "image.jpg"],
    [%r{------=_NextPart_\s+Content-Location: file:///C:/Doc/test_files/image\.(png|gif).*?\s-----=_NextPart_}m, "------=_NextPart_"],
    [%r{Content-Type: image/(png|gif|jpeg)[^-]*------=_NextPart_-?-?}m, ""],
    [%r{ICAgICAg[^-]*-----}m, "-----"],
    [%r{\s*</img>\s*}m, "</img>"],
    [%r{</body>\s*</html>}m, "</body></html>"],
  ]
  substitutions.inject(x) do |text, (pattern, replacement)|
    text.gsub(pattern, replacement)
  end
end
|
186
|
+
|
187
|
+
RSpec.describe Html2Doc do
|
2
188
|
# Smoke check: the gem's version constant is set.
it "has a version number" do
  expect(Html2Doc::VERSION).not_to be(nil)
end
|
5
191
|
|
6
|
-
it "
|
7
|
-
|
192
|
+
# Converts an empty body with every optional argument nil and compares the
# resulting test.doc (boundary suffixes stripped) with the default header,
# the default stylesheet, an empty body and the first footer fixture.
it "processes a blank document" do
  Html2Doc.process(html_input(""), "test", nil, nil, nil, nil)
  actual = guid_clean(File.read("test.doc", encoding: "utf-8"))
  expected = <<~OUTPUT
    #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
    #{word_body("", '<div style="mso-element:footnote-list"/>')} #{WORD_FTR1}
  OUTPUT
  expect(actual).to match_fuzzy(expected)
end
|
200
|
+
|
201
|
+
# After a conversion only test.doc should remain: the intermediate test.htm
# file and the test_files directory must be gone.
it "removes any temp files" do
  # File.delete raises Errno::ENOENT for a missing path. Delete only when an
  # earlier example has left test.doc behind, so this example also passes when
  # run on its own or in a different order.
  File.delete("test.doc") if File.exist?("test.doc")
  Html2Doc.process(html_input(""), "test", nil, nil, nil, nil)
  expect(File.exist?("test.doc")).to be true
  expect(File.exist?("test.htm")).to be false
  expect(File.exist?("test_files")).to be false
end
|
208
|
+
|
209
|
+
# Passes the stock stylesheet explicitly for a document that has a title; the
# expected output is the same as for the default-stylesheet case.
it "processes a stylesheet in an HTML document with a title" do
  Html2Doc.process(html_input(""), "test", "lib/html2doc/wordstyle.css", nil, nil, nil)
  actual = guid_clean(File.read("test.doc", encoding: "utf-8"))
  expected = <<~OUTPUT
    #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
    #{word_body("", '<div style="mso-element:footnote-list"/>')} #{WORD_FTR1}
  OUTPUT
  expect(actual).to match_fuzzy(expected)
end
|
217
|
+
|
218
|
+
# Same as the titled case, but the input has no <title>, so the title element
# is removed from the expected header.
it "processes a stylesheet in an HTML document without a title" do
  Html2Doc.process(html_input_no_title(""), "test", "lib/html2doc/wordstyle.css", nil, nil, nil)
  actual = guid_clean(File.read("test.doc", encoding: "utf-8"))
  untitled_header = WORD_HDR.sub("<title>blank</title>", "")
  expected = <<~OUTPUT
    #{untitled_header}
    #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
    #{word_body("", '<div style="mso-element:footnote-list"/>')} #{WORD_FTR1}
  OUTPUT
  expect(actual).to match_fuzzy(expected)
end
|
227
|
+
|
228
|
+
# Input with an empty <head>: the expected header has neither the title nor
# the Originator meta line, and </style> is immediately followed by </head>.
it "processes a stylesheet in an HTML document with an empty head" do
  Html2Doc.process(html_input_empty_head(""), "test", "lib/html2doc/wordstyle.css", nil, nil, nil)
  actual = guid_clean(File.read("test.doc", encoding: "utf-8"))
  untitled_header = WORD_HDR.sub("<title>blank</title>", "")
  originator_line = '<meta name="Originator" content="Me"/>' + "\n"
  header_end = WORD_HDR_END.sub(originator_line, "").
    sub("</style>\n</head>", "</style></head>")
  expected = <<~OUTPUT
    #{untitled_header}
    #{DEFAULT_STYLESHEET}
    #{header_end}
    #{word_body("", '<div style="mso-element:footnote-list"/>')} #{WORD_FTR1}
  OUTPUT
  expect(actual).to match_fuzzy(expected)
end
|
238
|
+
|
239
|
+
# Supplies a header file as the fourth argument. The expected stylesheet has
# FILENAME replaced by the output name, and the footer fixture is WORD_FTR2.
#
# NOTE(review): other fixtures in this file are referenced as "spec/...",
# while this one is a bare "header.html". Confirm it resolves from the
# directory the specs are run in.
it "processes a header" do
  Html2Doc.process(html_input(""), "test", nil, "header.html", nil, nil)
  actual = guid_clean(File.read("test.doc", encoding: "utf-8"))
  stylesheet = DEFAULT_STYLESHEET.gsub(/FILENAME/, "test")
  expected = <<~OUTPUT
    #{WORD_HDR} #{stylesheet}
    #{WORD_HDR_END} #{word_body("", '<div style="mso-element:footnote-list"/>')} #{WORD_FTR2}
  OUTPUT
  expect(actual).to match_fuzzy(expected)
end
|
247
|
+
|
248
|
+
# A heading and a div are expected to appear unchanged in the Word body.
it "processes a populated document" do
  simple_body = "<h1>Hello word!</h1>
<div>This is a very simple document</div>"
  Html2Doc.process(html_input(simple_body), "test", nil, nil, nil, nil)
  actual = guid_clean(File.read("test.doc", encoding: "utf-8"))
  expected = <<~OUTPUT
    #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
    #{word_body(simple_body, '<div style="mso-element:footnote-list"/>')}
    #{WORD_FTR1}
  OUTPUT
  expect(actual).to match_fuzzy(expected)
end
|
259
|
+
|
260
|
+
# AsciiMath between the "{{" and "}}" delimiters (sixth argument) is expected
# to be replaced by the equivalent m:oMath markup.
it "processes AsciiMath" do
  Html2Doc.process(html_input("<div>{{sum_(i=1)^n i^3=((n(n+1))/2)^2}}</div>"), "test", nil, nil, nil, ["{{", "}}"])
  actual = guid_clean(File.read("test.doc", encoding: "utf-8"))
  math_body = '<div><m:oMath><m:nary><m:naryPr><m:chr m:val="∑"></m:chr><m:limLoc m:val="undOvr"></m:limLoc><m:grow m:val="1"></m:grow><m:subHide m:val="off"></m:subHide><m:supHide m:val="off"></m:supHide></m:naryPr><m:sub><m:r><m:t>i=1</m:t></m:r></m:sub><m:sup><m:r><m:t>n</m:t></m:r></m:sup><m:e></m:e></m:nary><m:sSup><m:e><m:r><m:t>i</m:t></m:r></m:e><m:sup><m:r><m:t>3</m:t></m:r></m:sup></m:sSup><m:r><m:t>=</m:t></m:r><m:sSup><m:e><m:r><m:t>(</m:t></m:r><m:f><m:fPr><m:type m:val="bar"></m:type></m:fPr><m:num><m:r><m:t>n</m:t></m:r><m:r><m:t>(n+1)</m:t></m:r></m:num><m:den><m:r><m:t>2</m:t></m:r></m:den></m:f><m:r><m:t>)</m:t></m:r></m:e><m:sup><m:r><m:t>2</m:t></m:r></m:sup></m:sSup></m:oMath>
</div>'
  expected = <<~OUTPUT
    #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
    #{word_body(math_body, '<div style="mso-element:footnote-list"/>')}
    #{WORD_FTR1}
  OUTPUT
  expect(actual).to match_fuzzy(expected)
end
|
270
|
+
|
271
|
+
# The "&tab;" placeholder is expected to become a span with mso-tab-count:1.
it "processes tabs" do
  simple_body = "<h1>Hello word!</h1>
<div>This is a very &tab; simple document</div>"
  Html2Doc.process(html_input(simple_body), "test", nil, nil, nil, nil)
  actual = guid_clean(File.read("test.doc", encoding: "utf-8"))
  tabbed_body = simple_body.gsub(/\&tab;/, %[<span style="mso-tab-count:1">  </span>])
  expected = <<~OUTPUT
    #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
    #{word_body(tabbed_body, '<div style="mso-element:footnote-list"/>')}
    #{WORD_FTR1}
  OUTPUT
  expect(actual).to match_fuzzy(expected)
end
|
282
|
+
|
283
|
+
# A <p> without a class is expected to gain class="MsoNormal"; a <p> that
# already has a class keeps it.
it "makes unstyled paragraphs be MsoNormal" do
  simple_body = '<h1>Hello word!</h1>
<p>This is a very simple document</p>
<p class="x">This style stays</p>'
  Html2Doc.process(html_input(simple_body), "test", nil, nil, nil, nil)
  actual = guid_clean(File.read("test.doc", encoding: "utf-8"))
  styled_body = simple_body.gsub(/<p>/, %[<p class="MsoNormal">])
  expected = <<~OUTPUT
    #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
    #{word_body(styled_body, '<div style="mso-element:footnote-list"/>')}
    #{WORD_FTR1}
  OUTPUT
  expect(actual).to match_fuzzy(expected)
end
|
295
|
+
|
296
|
+
# An <li> without a class is expected to gain class="MsoNormal"; an <li> that
# already has a class keeps it.
it "makes unstyled list entries be MsoNormal" do
  simple_body = '<h1>Hello word!</h1>
<ul>
<li>This is a very simple document</li>
<li class="x">This style stays</li>
</ul>'
  Html2Doc.process(html_input(simple_body), "test", nil, nil, nil, nil)
  actual = guid_clean(File.read("test.doc", encoding: "utf-8"))
  styled_body = simple_body.gsub(/<li>/, %[<li class="MsoNormal">])
  expected = <<~OUTPUT
    #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
    #{word_body(styled_body, '<div style="mso-element:footnote-list"/>')}
    #{WORD_FTR1}
  OUTPUT
  expect(actual).to match_fuzzy(expected)
end
|
310
|
+
|
311
|
+
# Embeds a PNG and checks two things: the raw document carries an image/png
# part, and after normalising boundary and image names the <img> element has
# the expected width and height attributes.
it "resizes images for height" do
  simple_body = '<img src="spec/19160-6.png">'
  Html2Doc.process(html_input(simple_body), "test", nil, nil, nil, nil)
  testdoc = File.read("test.doc", encoding: "utf-8")
  expect(testdoc).to match(%r{Content-Type: image/png})

  cleaned = image_clean(guid_clean(testdoc))
  resized_img = '<img src="test_files/cb7b0d19-891e-4634-815a-570d019d454c.png" width="400" height="387"></img>'
  expected = <<~OUTPUT
    #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
    #{image_clean(word_body(resized_img, '<div style="mso-element:footnote-list"/>'))}
    #{image_clean(WORD_FTR3)}
  OUTPUT
  expect(cleaned).to match_fuzzy(expected)
end
|
322
|
+
|
323
|
+
# Embeds a GIF and checks two things: the raw document carries an image/gif
# part, and after normalisation the <img> element has the expected width and
# height. The footer fixture names a PNG, so its normalised file name is
# switched to image.gif.
it "resizes images for width" do
  simple_body = '<img src="spec/19160-7.gif">'
  Html2Doc.process(html_input(simple_body), "test", nil, nil, nil, nil)
  testdoc = File.read("test.doc", encoding: "utf-8")
  expect(testdoc).to match(%r{Content-Type: image/gif})

  cleaned = image_clean(guid_clean(testdoc))
  resized_img = '<img src="test_files/cb7b0d19-891e-4634-815a-570d019d454c.gif" width="400" height="118"></img>'
  gif_footer = image_clean(WORD_FTR3).gsub(/image\.png/, "image.gif")
  expected = <<~OUTPUT
    #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
    #{image_clean(word_body(resized_img, '<div style="mso-element:footnote-list"/>'))}
    #{gif_footer}
  OUTPUT
  expect(cleaned).to match_fuzzy(expected)
end
|
334
|
+
|
335
|
+
# Embeds a JPEG and checks two things: the raw document carries an image/jpeg
# part, and after normalisation the <img> element has the expected width and
# height. The footer fixture names a PNG, so its normalised file name is
# switched to image.jpg.
#
# The description used to be "resizes images for height", identical to the PNG
# example in this group; it is now unique so reports and `-e` filtering can
# tell the two apart.
#
# NOTE(review): spec/19160-8.jpg is not among the spec fixtures listed for this
# release (only 19160-6.png and 19160-7.gif are). Confirm the file is shipped.
it "resizes JPEG images for height" do
  simple_body = '<img src="spec/19160-8.jpg">'
  Html2Doc.process(html_input(simple_body), "test", nil, nil, nil, nil)
  testdoc = File.read("test.doc", encoding: "utf-8")
  expect(testdoc).to match(%r{Content-Type: image/jpeg})
  expect(image_clean(guid_clean(testdoc))).to match_fuzzy(<<~OUTPUT)
    #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
    #{image_clean(word_body('<img src="test_files/cb7b0d19-891e-4634-815a-570d019d454c.jpg" width="208" height="680"></img>', '<div style="mso-element:footnote-list"/>'))}
    #{image_clean(WORD_FTR3).gsub(/image\.png/, "image.jpg")}
  OUTPUT
end
|
346
|
+
|
347
|
+
# Links marked epub:type="footnote" are expected to become Word footnote
# references, and the asides they point to are expected to move into the
# footnote list as MsoFootnoteText paragraphs.
it "processes epub:type footnotes" do
  simple_body = '<div>This is a very simple
document<a epub:type="footnote" href="#a1">1</a> allegedly<a epub:type="footnote" href="#a2">2</a></div>
<aside id="a1">Footnote</aside>
<aside id="a2">Other Footnote</aside>'
  Html2Doc.process(html_input(simple_body), "test", nil, nil, nil, nil)
  actual = guid_clean(File.read("test.doc", encoding: "utf-8"))
  body_html = '<div>This is a very simple
document<a epub:type="footnote" href="#_ftn1" style="mso-footnote-id:ftn1" name="_ftnref1" title="" id="_ftnref1"><span class="MsoFootnoteReference"><span style="mso-special-character:footnote"></span></span></a> allegedly<a epub:type="footnote" href="#_ftn2" style="mso-footnote-id:ftn2" name="_ftnref2" title="" id="_ftnref2"><span class="MsoFootnoteReference"><span style="mso-special-character:footnote"></span></span></a></div>'
  footnotes_html = '<div style="mso-element:footnote-list"><div style="mso-element:footnote" id="ftn1">
<p id="" class="MsoFootnoteText"><a style="mso-footnote-id:ftn1" href="#_ftn1" name="_ftnref1" title="" id="_ftnref1"><span class="MsoFootnoteReference"><span style="mso-special-character:footnote"></span></span></a>Footnote</p></div>
<div style="mso-element:footnote" id="ftn2">
<p id="" class="MsoFootnoteText"><a style="mso-footnote-id:ftn2" href="#_ftn2" name="_ftnref2" title="" id="_ftnref2"><span class="MsoFootnoteReference"><span style="mso-special-character:footnote"></span></span></a>Other Footnote</p></div>
</div>'
  expected = <<~OUTPUT
    #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
    #{word_body(body_html, footnotes_html)}
    #{WORD_FTR1}
  OUTPUT
  expect(actual).to match_fuzzy(expected)
end
|
366
|
+
|
367
|
+
# Links marked class="footnote" are expected to be handled like the epub:type
# ones: rewritten as Word footnote references, with the target asides moved
# into the footnote list.
it "processes class footnotes" do
  simple_body = '<div>This is a very simple
document<a class="footnote" href="#a1">1</a> allegedly<a class="footnote" href="#a2">2</a></div>
<aside id="a1">Footnote</aside>
<aside id="a2">Other Footnote</aside>'
  Html2Doc.process(html_input(simple_body), "test", nil, nil, nil, nil)
  actual = guid_clean(File.read("test.doc", encoding: "utf-8"))
  body_html = '<div>This is a very simple
document<a class="footnote" href="#_ftn1" style="mso-footnote-id:ftn1" name="_ftnref1" title="" id="_ftnref1"><span class="MsoFootnoteReference"><span style="mso-special-character:footnote"></span></span></a> allegedly<a class="footnote" href="#_ftn2" style="mso-footnote-id:ftn2" name="_ftnref2" title="" id="_ftnref2"><span class="MsoFootnoteReference"><span style="mso-special-character:footnote"></span></span></a></div>'
  footnotes_html = '<div style="mso-element:footnote-list"><div style="mso-element:footnote" id="ftn1">
<p id="" class="MsoFootnoteText"><a style="mso-footnote-id:ftn1" href="#_ftn1" name="_ftnref1" title="" id="_ftnref1"><span class="MsoFootnoteReference"><span style="mso-special-character:footnote"></span></span></a>Footnote</p></div>
<div style="mso-element:footnote" id="ftn2">
<p id="" class="MsoFootnoteText"><a style="mso-footnote-id:ftn2" href="#_ftn2" name="_ftnref2" title="" id="_ftnref2"><span class="MsoFootnoteReference"><span style="mso-special-character:footnote"></span></span></a>Other Footnote</p></div>
</div>'
  expected = <<~OUTPUT
    #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
    #{word_body(body_html, footnotes_html)}
    #{WORD_FTR1}
  OUTPUT
  expect(actual).to match_fuzzy(expected)
end
|
386
|
+
|
387
|
+
# When a footnote target (an aside or a div) already wraps its text in <p>,
# that paragraph is expected to be reused as the MsoFootnoteText paragraph;
# unlike the bare-text cases, the expected <p> has no empty id attribute.
it "extracts paragraphs from footnotes" do
  simple_body = '<div>This is a very simple
document<a class="footnote" href="#a1">1</a> allegedly<a class="footnote" href="#a2">2</a></div>
<aside id="a1"><p>Footnote</p></aside>
<div id="a2"><p>Other Footnote</p></div>'
  Html2Doc.process(html_input(simple_body), "test", nil, nil, nil, nil)
  actual = guid_clean(File.read("test.doc", encoding: "utf-8"))
  body_html = '<div>This is a very simple
document<a class="footnote" href="#_ftn1" style="mso-footnote-id:ftn1" name="_ftnref1" title="" id="_ftnref1"><span class="MsoFootnoteReference"><span style="mso-special-character:footnote"></span></span></a> allegedly<a class="footnote" href="#_ftn2" style="mso-footnote-id:ftn2" name="_ftnref2" title="" id="_ftnref2"><span class="MsoFootnoteReference"><span style="mso-special-character:footnote"></span></span></a></div>'
  footnotes_html = '<div style="mso-element:footnote-list"><div style="mso-element:footnote" id="ftn1">
<p class="MsoFootnoteText"><a style="mso-footnote-id:ftn1" href="#_ftn1" name="_ftnref1" title="" id="_ftnref1"><span class="MsoFootnoteReference"><span style="mso-special-character:footnote"></span></span></a>Footnote</p></div>
<div style="mso-element:footnote" id="ftn2">
<p class="MsoFootnoteText"><a style="mso-footnote-id:ftn2" href="#_ftn2" name="_ftnref2" title="" id="_ftnref2"><span class="MsoFootnoteReference"><span style="mso-special-character:footnote"></span></span></a>Other Footnote</p></div>
</div>'
  expected = <<~OUTPUT
    #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
    #{word_body(body_html, footnotes_html)}
    #{WORD_FTR1}
  OUTPUT
  expect(actual).to match_fuzzy(expected)
end
|
9
406
|
end
|
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2doc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.
|
4
|
+
version: 0.6.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-02-
|
11
|
+
date: 2018-02-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: htmlentities
|
@@ -262,6 +262,20 @@ dependencies:
|
|
262
262
|
- - "~>"
|
263
263
|
- !ruby/object:Gem::Version
|
264
264
|
version: '0.9'
|
265
|
+
- !ruby/object:Gem::Dependency
|
266
|
+
name: rspec-match_fuzzy
|
267
|
+
requirement: !ruby/object:Gem::Requirement
|
268
|
+
requirements:
|
269
|
+
- - ">="
|
270
|
+
- !ruby/object:Gem::Version
|
271
|
+
version: '0'
|
272
|
+
type: :development
|
273
|
+
prerelease: false
|
274
|
+
version_requirements: !ruby/object:Gem::Requirement
|
275
|
+
requirements:
|
276
|
+
- - ">="
|
277
|
+
- !ruby/object:Gem::Version
|
278
|
+
version: '0'
|
265
279
|
description: |
|
266
280
|
Convert HTML document to Microsoft Word document.
|
267
281
|
|
@@ -272,6 +286,7 @@ executables: []
|
|
272
286
|
extensions: []
|
273
287
|
extra_rdoc_files: []
|
274
288
|
files:
|
289
|
+
- ".gitattributes"
|
275
290
|
- ".gitignore"
|
276
291
|
- ".hound.yml"
|
277
292
|
- ".oss-guides.rubocop.yml"
|
@@ -285,6 +300,7 @@ files:
|
|
285
300
|
- README.adoc
|
286
301
|
- Rakefile
|
287
302
|
- bin/console
|
303
|
+
- bin/rspec
|
288
304
|
- bin/setup
|
289
305
|
- html2doc.gemspec
|
290
306
|
- lib/html2doc.rb
|
@@ -294,6 +310,8 @@ files:
|
|
294
310
|
- lib/html2doc/notes.rb
|
295
311
|
- lib/html2doc/version.rb
|
296
312
|
- lib/html2doc/wordstyle.css
|
313
|
+
- spec/19160-6.png
|
314
|
+
- spec/19160-7.gif
|
297
315
|
- spec/examples/header.html
|
298
316
|
- spec/examples/rice.doc
|
299
317
|
- spec/examples/rice.html
|
@@ -303,6 +321,7 @@ files:
|
|
303
321
|
- spec/examples/rice_images/rice_image3_1.png
|
304
322
|
- spec/examples/rice_images/rice_image3_2.png
|
305
323
|
- spec/examples/rice_images/rice_image3_3.png
|
324
|
+
- spec/header.html
|
306
325
|
- spec/html2doc_spec.rb
|
307
326
|
- spec/spec_helper.rb
|
308
327
|
homepage: https://github.com/riboseinc/html2doc
|