html2doc 0.6.1 → 0.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitattributes +2 -0
- data/Gemfile +6 -0
- data/README.adoc +3 -3
- data/bin/rspec +18 -0
- data/html2doc.gemspec +1 -1
- data/lib/html2doc/base.rb +56 -22
- data/lib/html2doc/mime.rb +4 -2
- data/lib/html2doc/notes.rb +1 -2
- data/lib/html2doc/version.rb +1 -1
- data/spec/19160-6.png +0 -0
- data/spec/19160-7.gif +0 -0
- data/spec/header.html +184 -0
- data/spec/html2doc_spec.rb +401 -4
- data/spec/spec_helper.rb +6 -0
- metadata +21 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 74ea5fa5a0e4221f38ded5491536c3b7fbeeb51b
|
4
|
+
data.tar.gz: 6cfa24874e5afe45854c7c86061b1d3a9c1cb80a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d2aa3ea91ba1fe76ee540c85b0cb944ef3cc9eca4dd12f7884028fad0e41cdbc17b8e1ca17c6d110a73cb65e0dc214b8815f6296464c8a20399b00e28016717f
|
7
|
+
data.tar.gz: efa78fc0e9d6533ed21bd9e564cdad6175c128510417ccf43d02a429a3c397bdec8f50987548ce9c2db78204ddb641fdc2828b9f1c29e02cc90868c8f5a1e7fa
|
data/.gitattributes
ADDED
data/Gemfile
CHANGED
data/README.adoc
CHANGED
@@ -29,9 +29,9 @@ Work to be done:
|
|
29
29
|
|
30
30
|
== Constraints
|
31
31
|
|
32
|
-
This generates .doc documents.
|
32
|
+
This generates .doc documents. Future versions will upgrade the output to docx.
|
33
33
|
|
34
|
-
|
34
|
+
There are two other Microsoft Word vendors in the Ruby ecosystem. https://github.com/jetruby/puredocx generates Word documents from a ruby struct as a DSL, rather than converting a preexisting html document. That constrains its coverage to what is explicitly catered for in the DSL. https://github.com/MuhammetDilmac/Html2Docx is a much simpler wrapper around html: it does not do any of the added functionality described above (image resizing, converting footnotes, AsciiMath and MathML), though it does already generate docx.
|
35
35
|
|
36
36
|
== Usage
|
37
37
|
|
@@ -46,7 +46,7 @@ result:: is the Html document to be converted into Word, as a string.
|
|
46
46
|
filename:: is the name the document is to be saved as, without a file suffix
|
47
47
|
stylesheet:: is the full path filename of the CSS stylesheet for Microsoft Word-specific styles. If this is not provided (`nil`), the program will use the default stylesheet included in the gem, `lib/html2doc/wordstyle.css`. The stylesheet provided must match this stylesheet; you can obtain one by saving a Word document with your desired styles to HTML, and extracting the style definitions from the HTML document header.
|
48
48
|
header_filename:: is the filename of the HTML document containing header and footer for the document, as well as footnote/endnote separators; if there is none, use nil. To generate your own such document, save a Word document with headers/footers and/or footnote/endnote separators as an HTML document; the `header.html` will be in the `{filename}.fld` folder generated along with the HTML. A sample file is available at https://github.com/riboseinc/asciidoctor-iso/blob/master/lib/asciidoctor/iso/word/header.html
|
49
|
-
dir:: is the
|
49
|
+
dir:: is the folder that any ancillary files (images, headers, filelist) are to be saved to. If not provided (`nil`), it will be created as `{filename}_files`. Anything in the directory will be attached to the Word document; so this folder should only contain the images that accompany the document. (If the images are elsewhere on the local drive, the gem will move them into the folder.)
|
50
50
|
asciimathdelims:: are the AsciiMath delimiters used in the text. If none are provided, no AsciiMath conversion is attempted.
|
51
51
|
|
52
52
|
Note that the local CSS stylesheet file contains a variable `FILENAME` for the location of footnote/endnote separators and headers/footers, which are provided in the header HTML file. The gem replaces `FILENAME` with the file name that the document will be saved as. If you supply your own stylesheet and also wish to use separators or headers/footers, you will likewise need to replace the document name mentioned in your stylesheet with a `FILENAME` string.
|
data/bin/rspec
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# This file was generated by Bundler.
|
4
|
+
#
|
5
|
+
# The application 'rspec' is installed as part of a gem, and
|
6
|
+
# this file is here to facilitate running it.
|
7
|
+
#
|
8
|
+
|
9
|
+
require "pathname"
|
10
|
+
ENV["BUNDLE_GEMFILE"] ||= File.expand_path(
|
11
|
+
"../../Gemfile", Pathname.new(__FILE__).realpath
|
12
|
+
)
|
13
|
+
|
14
|
+
require "rubygems"
|
15
|
+
require "bundler/setup"
|
16
|
+
|
17
|
+
load Gem.bin_path("rspec-core", "rspec")
|
18
|
+
|
data/html2doc.gemspec
CHANGED
@@ -10,7 +10,6 @@ Gem::Specification.new do |spec|
|
|
10
10
|
spec.email = ["open.source@ribose.com"]
|
11
11
|
|
12
12
|
spec.summary = "Convert HTML document to Microsoft Word document"
|
13
|
-
"in AsciiDoc."
|
14
13
|
spec.description = <<~DESCRIPTION
|
15
14
|
Convert HTML document to Microsoft Word document.
|
16
15
|
|
@@ -45,4 +44,5 @@ Gem::Specification.new do |spec|
|
|
45
44
|
spec.add_development_dependency "rubocop", "~> 0.50"
|
46
45
|
spec.add_development_dependency "simplecov", "~> 0.15"
|
47
46
|
spec.add_development_dependency "timecop", "~> 0.9"
|
47
|
+
spec.add_development_dependency "rspec-match_fuzzy"
|
48
48
|
end
|
data/lib/html2doc/base.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require "uuidtools"
|
2
2
|
require "asciimath"
|
3
|
+
require "image_size"
|
3
4
|
require "nokogiri"
|
4
5
|
require "xml/xslt"
|
5
6
|
require "pp"
|
@@ -8,27 +9,36 @@ module Html2Doc
|
|
8
9
|
@xslt = XML::XSLT.new
|
9
10
|
@xslt.xsl = File.read(File.join(File.dirname(__FILE__), "mathml2omml.xsl"))
|
10
11
|
|
11
|
-
def self.process(result, filename, stylesheet, header_file, dir,
|
12
|
+
def self.process(result, filename, stylesheet, header_file, dir = nil,
|
12
13
|
asciimathdelims = nil)
|
14
|
+
dir1 = create_dir(filename, dir)
|
13
15
|
result = process_html(result, filename, stylesheet, header_file,
|
14
|
-
|
15
|
-
system "cp #{header_file} #{
|
16
|
-
generate_filelist(filename,
|
16
|
+
dir1, asciimathdelims)
|
17
|
+
system "cp #{header_file} #{dir1}/header.html" unless header_file.nil?
|
18
|
+
generate_filelist(filename, dir1)
|
17
19
|
File.open("#{filename}.htm", "w") { |f| f.write(result) }
|
18
|
-
mime_package result, filename,
|
19
|
-
rm_temp_files(filename, dir)
|
20
|
+
mime_package result, filename, dir1
|
21
|
+
rm_temp_files(filename, dir, dir1)
|
22
|
+
end
|
23
|
+
|
24
|
+
def self.create_dir(filename, dir)
|
25
|
+
return dir if dir
|
26
|
+
dir = "#{filename}_files"
|
27
|
+
Dir.mkdir(dir) unless File.exists?(dir)
|
28
|
+
dir
|
20
29
|
end
|
21
30
|
|
22
31
|
def self.process_html(result, filename, stylesheet, header_file, dir,
|
23
32
|
asciimathdelims)
|
24
|
-
docxml = Nokogiri::XML(asciimath_to_mathml(result, asciimathdelims))
|
33
|
+
# docxml = Nokogiri::XML(asciimath_to_mathml(result, asciimathdelims))
|
34
|
+
docxml = to_xhtml(asciimath_to_mathml(result, asciimathdelims))
|
25
35
|
define_head(cleanup(docxml, dir), dir, filename, stylesheet, header_file)
|
26
|
-
msword_fix(docxml
|
36
|
+
msword_fix(from_xhtml(docxml))
|
27
37
|
end
|
28
38
|
|
29
|
-
def self.rm_temp_files(filename,
|
39
|
+
def self.rm_temp_files(filename, dir, dir1)
|
30
40
|
system "rm #{filename}.htm"
|
31
|
-
system "rm -r #{
|
41
|
+
system "rm -r #{dir1}" unless dir
|
32
42
|
end
|
33
43
|
|
34
44
|
def self.cleanup(docxml, dir)
|
@@ -58,26 +68,44 @@ module Html2Doc
|
|
58
68
|
end
|
59
69
|
end
|
60
70
|
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
71
|
+
NOKOHEAD = <<~HERE.freeze
|
72
|
+
<!DOCTYPE html SYSTEM
|
73
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
74
|
+
<html xmlns="http://www.w3.org/1999/xhtml">
|
75
|
+
<head> <title></title> <meta charset="UTF-8" /> </head>
|
76
|
+
<body> </body> </html>
|
77
|
+
HERE
|
78
|
+
|
79
|
+
def self.to_xhtml(xml)
|
80
|
+
xml.gsub!(/<\?xml[^>]*>/, "")
|
81
|
+
unless /<!DOCTYPE /.match? xml
|
82
|
+
xml = '<!DOCTYPE html SYSTEM
|
83
|
+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' + xml
|
67
84
|
end
|
68
|
-
|
85
|
+
Nokogiri::XML.parse(xml)
|
86
|
+
end
|
87
|
+
|
88
|
+
DOCTYPE = <<~"DOCTYPE".freeze
|
89
|
+
<!DOCTYPE html SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
90
|
+
DOCTYPE
|
91
|
+
|
92
|
+
def self.from_xhtml(xml)
|
93
|
+
xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "").
|
94
|
+
sub(DOCTYPE, "").
|
95
|
+
gsub(%{ />}, "/>")
|
69
96
|
end
|
70
97
|
|
71
98
|
def self.msword_fix(r)
|
72
99
|
# brain damage in MSWord parser
|
73
100
|
r.gsub!(%r{<span style="mso-special-character:footnote"/>},
|
74
101
|
'<span style="mso-special-character:footnote"></span>')
|
102
|
+
r.gsub!(%r{<div style="mso-element:footnote-list"></div>},
|
103
|
+
'<div style="mso-element:footnote-list"/>')
|
75
104
|
r.gsub!(%r{(<a style="mso-comment-reference:[^>/]+)/>}, "\\1></a>")
|
76
105
|
r.gsub!(%r{<link rel="File-List"}, "<link rel=File-List")
|
77
106
|
r.gsub!(%r{<meta http-equiv="Content-Type"},
|
78
107
|
"<meta http-equiv=Content-Type")
|
79
|
-
r.gsub!(%r{&tab;|&tab;},
|
80
|
-
'<span style="mso-tab-count:1">  </span>')
|
108
|
+
r.gsub!(%r{&tab;|&tab;}, '<span style="mso-tab-count:1">  </span>')
|
81
109
|
r
|
82
110
|
end
|
83
111
|
|
@@ -152,13 +180,19 @@ module Html2Doc
|
|
152
180
|
title = docxml.at("//*[local-name() = 'head']/*[local-name() = 'title']")
|
153
181
|
head = docxml.at("//*[local-name() = 'head']")
|
154
182
|
css = stylesheet(filename, header_file, cssname)
|
155
|
-
|
183
|
+
add_stylesheet(head, title, css)
|
184
|
+
define_head1(docxml, dir)
|
185
|
+
namespace(docxml.root)
|
186
|
+
end
|
187
|
+
|
188
|
+
def self.add_stylesheet(head, title, css)
|
189
|
+
if head.children.empty?
|
190
|
+
head.add_child css
|
191
|
+
elsif title.nil?
|
156
192
|
head.children.first.add_previous_sibling css
|
157
193
|
else
|
158
194
|
title.add_next_sibling css
|
159
195
|
end
|
160
|
-
define_head1(docxml, dir)
|
161
|
-
namespace(docxml.root)
|
162
196
|
end
|
163
197
|
|
164
198
|
def self.namespace(root)
|
data/lib/html2doc/mime.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require "uuidtools"
|
2
|
-
require "
|
2
|
+
require "base64"
|
3
|
+
require "mime/types"
|
3
4
|
|
4
5
|
module Html2Doc
|
5
6
|
def self.mime_preamble(boundary, filename, result)
|
@@ -46,8 +47,9 @@ module Html2Doc
|
|
46
47
|
def self.mime_package(result, filename, dir)
|
47
48
|
boundary = mime_boundary
|
48
49
|
mhtml = mime_preamble(boundary, filename, result)
|
50
|
+
mhtml += mime_attachment(boundary, filename, "filelist.xml", dir)
|
49
51
|
Dir.foreach(dir) do |item|
|
50
|
-
next if item == "." || item == ".." || /^\./.match(item)
|
52
|
+
next if item == "." || item == ".." || /^\./.match(item) || item == "filelist.xml"
|
51
53
|
mhtml += mime_attachment(boundary, filename, item, dir)
|
52
54
|
end
|
53
55
|
mhtml += "--#{boundary}--"
|
data/lib/html2doc/notes.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
require "uuidtools"
|
2
|
-
require "nokogiri"
|
3
2
|
|
4
3
|
module Html2Doc
|
5
4
|
def self.footnotes(docxml)
|
@@ -37,7 +36,7 @@ module Html2Doc
|
|
37
36
|
def self.footnote_container(i)
|
38
37
|
<<~DIV
|
39
38
|
<div style='mso-element:footnote' id='ftn#{i}'>
|
40
|
-
<a style='mso-footnote-id:ftn#{i}' href
|
39
|
+
<a style='mso-footnote-id:ftn#{i}' href='#_ftn#{i}'
|
41
40
|
name='_ftnref#{i}' title='' id='_ftnref#{i}'><span
|
42
41
|
class='MsoFootnoteReference'><span
|
43
42
|
style='mso-special-character:footnote'></span></span></div>
|
data/lib/html2doc/version.rb
CHANGED
data/spec/19160-6.png
ADDED
Binary file
|
data/spec/19160-7.gif
ADDED
Binary file
|
data/spec/header.html
ADDED
@@ -0,0 +1,184 @@
|
|
1
|
+
<html xmlns:v="urn:schemas-microsoft-com:vml"
|
2
|
+
xmlns:o="urn:schemas-microsoft-com:office:office"
|
3
|
+
xmlns:w="urn:schemas-microsoft-com:office:word"
|
4
|
+
xmlns:m="http://schemas.microsoft.com/office/2004/12/omml"
|
5
|
+
xmlns:mv="http://macVmlSchemaUri" xmlns="http://www.w3.org/TR/REC-html40">
|
6
|
+
|
7
|
+
<head>
|
8
|
+
<meta name=Title content="">
|
9
|
+
<meta name=Keywords content="">
|
10
|
+
<meta http-equiv=Content-Type content="text/html; charset=utf-8">
|
11
|
+
<meta name=ProgId content=Word.Document>
|
12
|
+
<meta name=Generator content="Microsoft Word 15">
|
13
|
+
<meta name=Originator content="Microsoft Word 15">
|
14
|
+
<link id=Main-File rel=Main-File href="FILENAME.html">
|
15
|
+
<!--[if gte mso 9]><xml>
|
16
|
+
<o:shapedefaults v:ext="edit" spidmax="2049"/>
|
17
|
+
</xml><![endif]-->
|
18
|
+
</head>
|
19
|
+
|
20
|
+
<body lang=EN link=blue vlink="#954F72">
|
21
|
+
|
22
|
+
<div style='mso-element:footnote-separator' id=fs>
|
23
|
+
|
24
|
+
<p class=MsoNormal style='margin-bottom:0cm;margin-bottom:.0001pt;line-height:
|
25
|
+
normal'><span lang=EN-GB><span style='mso-special-character:footnote-separator'><![if !supportFootnotes]>
|
26
|
+
|
27
|
+
<hr align=left size=1 width="33%">
|
28
|
+
|
29
|
+
<![endif]></span></span></p>
|
30
|
+
|
31
|
+
</div>
|
32
|
+
|
33
|
+
<div style='mso-element:footnote-continuation-separator' id=fcs>
|
34
|
+
|
35
|
+
<p class=MsoNormal style='margin-bottom:0cm;margin-bottom:.0001pt;line-height:
|
36
|
+
normal'><span lang=EN-GB><span style='mso-special-character:footnote-continuation-separator'><![if !supportFootnotes]>
|
37
|
+
|
38
|
+
<hr align=left size=1>
|
39
|
+
|
40
|
+
<![endif]></span></span></p>
|
41
|
+
|
42
|
+
</div>
|
43
|
+
|
44
|
+
<div style='mso-element:endnote-separator' id=es>
|
45
|
+
|
46
|
+
<p class=MsoNormal style='margin-bottom:0cm;margin-bottom:.0001pt;line-height:
|
47
|
+
normal'><span lang=EN-GB><span style='mso-special-character:footnote-separator'><![if !supportFootnotes]>
|
48
|
+
|
49
|
+
<hr align=left size=1 width="33%">
|
50
|
+
|
51
|
+
<![endif]></span></span></p>
|
52
|
+
|
53
|
+
</div>
|
54
|
+
|
55
|
+
<div style='mso-element:endnote-continuation-separator' id=ecs>
|
56
|
+
|
57
|
+
<p class=MsoNormal style='margin-bottom:0cm;margin-bottom:.0001pt;line-height:
|
58
|
+
normal'><span lang=EN-GB><span style='mso-special-character:footnote-continuation-separator'><![if !supportFootnotes]>
|
59
|
+
|
60
|
+
<hr align=left size=1>
|
61
|
+
|
62
|
+
<![endif]></span></span></p>
|
63
|
+
|
64
|
+
</div>
|
65
|
+
|
66
|
+
<div style='mso-element:header' id=eh1>
|
67
|
+
|
68
|
+
<p class=MsoHeader align=left style='text-align:left;line-height:12.0pt;
|
69
|
+
mso-line-height-rule:exactly'><span lang=EN-GB>ISO/IEC CD 17301-1:2016(E)</span></p>
|
70
|
+
|
71
|
+
</div>
|
72
|
+
|
73
|
+
<div style='mso-element:header' id=h1>
|
74
|
+
|
75
|
+
<p class=MsoHeader style='margin-bottom:18.0pt'><span lang=EN-GB
|
76
|
+
style='font-size:10.0pt;mso-bidi-font-size:11.0pt;font-weight:normal'>©
|
77
|
+
ISO/IEC 2016 – All rights reserved</span><span lang=EN-GB
|
78
|
+
style='font-weight:normal'><o:p></o:p></span></p>
|
79
|
+
|
80
|
+
</div>
|
81
|
+
|
82
|
+
<div style='mso-element:footer' id=ef1>
|
83
|
+
|
84
|
+
<p class=MsoFooter style='margin-top:12.0pt;line-height:12.0pt;mso-line-height-rule:
|
85
|
+
exactly'><!--[if supportFields]><b style='mso-bidi-font-weight:normal'><span
|
86
|
+
lang=EN-GB style='font-size:10.0pt;mso-bidi-font-size:11.0pt'><span
|
87
|
+
style='mso-element:field-begin'></span><span
|
88
|
+
style='mso-spacerun:yes'> </span>PAGE<span style='mso-spacerun:yes'>
|
89
|
+
</span>\* MERGEFORMAT <span style='mso-element:field-separator'></span></span></b><![endif]--><b
|
90
|
+
style='mso-bidi-font-weight:normal'><span lang=EN-GB style='font-size:10.0pt;
|
91
|
+
mso-bidi-font-size:11.0pt'><span style='mso-no-proof:yes'>2</span></span></b><!--[if supportFields]><b
|
92
|
+
style='mso-bidi-font-weight:normal'><span lang=EN-GB style='font-size:10.0pt;
|
93
|
+
mso-bidi-font-size:11.0pt'><span style='mso-element:field-end'></span></span></b><![endif]--><span
|
94
|
+
lang=EN-GB style='font-size:10.0pt;mso-bidi-font-size:11.0pt'><span
|
95
|
+
style='mso-tab-count:1'> </span>©
|
96
|
+
ISO/IEC 2016 – All rights reserved<o:p></o:p></span></p>
|
97
|
+
|
98
|
+
</div>
|
99
|
+
|
100
|
+
<div style='mso-element:header' id=eh2>
|
101
|
+
|
102
|
+
<p class=MsoHeader align=left style='text-align:left;line-height:12.0pt;
|
103
|
+
mso-line-height-rule:exactly'><span lang=EN-GB>ISO/IEC CD 17301-1:2016(E)</span></p>
|
104
|
+
|
105
|
+
</div>
|
106
|
+
|
107
|
+
<div style='mso-element:header' id=h2>
|
108
|
+
|
109
|
+
<p class=MsoHeader align=right style='text-align:right;line-height:12.0pt;
|
110
|
+
mso-line-height-rule:exactly'><span lang=EN-GB>ISO/IEC CD 17301-1:2016(E)</span></p>
|
111
|
+
|
112
|
+
</div>
|
113
|
+
|
114
|
+
<div style='mso-element:footer' id=ef2>
|
115
|
+
|
116
|
+
<p class=MsoFooter style='line-height:12.0pt;mso-line-height-rule:exactly'><!--[if supportFields]><span
|
117
|
+
lang=EN-GB style='font-size:10.0pt;mso-bidi-font-size:11.0pt'><span
|
118
|
+
style='mso-element:field-begin'></span><span
|
119
|
+
style='mso-spacerun:yes'> </span>PAGE<span style='mso-spacerun:yes'>
|
120
|
+
</span>\* MERGEFORMAT <span style='mso-element:field-separator'></span></span><![endif]--><span
|
121
|
+
lang=EN-GB style='font-size:10.0pt;mso-bidi-font-size:11.0pt'><span
|
122
|
+
style='mso-no-proof:yes'>ii</span></span><!--[if supportFields]><span
|
123
|
+
lang=EN-GB style='font-size:10.0pt;mso-bidi-font-size:11.0pt'><span
|
124
|
+
style='mso-element:field-end'></span></span><![endif]--><span lang=EN-GB
|
125
|
+
style='font-size:10.0pt;mso-bidi-font-size:11.0pt'><span style='mso-tab-count:
|
126
|
+
1'> </span>©
|
127
|
+
ISO/IEC 2016 – All rights reserved<o:p></o:p></span></p>
|
128
|
+
|
129
|
+
</div>
|
130
|
+
|
131
|
+
<div style='mso-element:footer' id=f2>
|
132
|
+
|
133
|
+
<p class=MsoFooter style='line-height:12.0pt'><span lang=EN-GB
|
134
|
+
style='font-size:10.0pt;mso-bidi-font-size:11.0pt'>© ISO/IEC 2016 – All
|
135
|
+
rights reserved<span style='mso-tab-count:1'> </span></span><!--[if supportFields]><span
|
136
|
+
lang=EN-GB style='font-size:10.0pt;mso-bidi-font-size:11.0pt'><span
|
137
|
+
style='mso-element:field-begin'></span> PAGE<span style='mso-spacerun:yes'>
|
138
|
+
</span>\* MERGEFORMAT <span style='mso-element:field-separator'></span></span><![endif]--><span
|
139
|
+
lang=EN-GB style='font-size:10.0pt;mso-bidi-font-size:11.0pt'><span
|
140
|
+
style='mso-no-proof:yes'>iii</span></span><!--[if supportFields]><span
|
141
|
+
lang=EN-GB style='font-size:10.0pt;mso-bidi-font-size:11.0pt'><span
|
142
|
+
style='mso-element:field-end'></span></span><![endif]--><span lang=EN-GB
|
143
|
+
style='font-size:10.0pt;mso-bidi-font-size:11.0pt'><o:p></o:p></span></p>
|
144
|
+
|
145
|
+
</div>
|
146
|
+
|
147
|
+
<div style='mso-element:footer' id=ef3>
|
148
|
+
|
149
|
+
<p class=MsoFooter style='margin-top:12.0pt;line-height:12.0pt;mso-line-height-rule:
|
150
|
+
exactly'><!--[if supportFields]><b style='mso-bidi-font-weight:normal'><span
|
151
|
+
lang=EN-GB style='font-size:10.0pt;mso-bidi-font-size:11.0pt'><span
|
152
|
+
style='mso-element:field-begin'></span><span
|
153
|
+
style='mso-spacerun:yes'> </span>PAGE<span style='mso-spacerun:yes'>
|
154
|
+
</span>\* MERGEFORMAT <span style='mso-element:field-separator'></span></span></b><![endif]--><b
|
155
|
+
style='mso-bidi-font-weight:normal'><span lang=EN-GB style='font-size:10.0pt;
|
156
|
+
mso-bidi-font-size:11.0pt'><span style='mso-no-proof:yes'>2</span></span></b><!--[if supportFields]><b
|
157
|
+
style='mso-bidi-font-weight:normal'><span lang=EN-GB style='font-size:10.0pt;
|
158
|
+
mso-bidi-font-size:11.0pt'><span style='mso-element:field-end'></span></span></b><![endif]--><span
|
159
|
+
lang=EN-GB style='font-size:10.0pt;mso-bidi-font-size:11.0pt'><span
|
160
|
+
style='mso-tab-count:1'> </span>©
|
161
|
+
ISO/IEC 2016 – All rights reserved<o:p></o:p></span></p>
|
162
|
+
|
163
|
+
</div>
|
164
|
+
|
165
|
+
<div style='mso-element:footer' id=f3>
|
166
|
+
|
167
|
+
<p class=MsoFooter style='line-height:12.0pt'><span lang=EN-GB
|
168
|
+
style='font-size:10.0pt;mso-bidi-font-size:11.0pt'>© ISO/IEC 2016 – All
|
169
|
+
rights reserved<span style='mso-tab-count:1'> </span></span><!--[if supportFields]><b
|
170
|
+
style='mso-bidi-font-weight:normal'><span lang=EN-GB style='font-size:10.0pt;
|
171
|
+
mso-bidi-font-size:11.0pt'><span style='mso-element:field-begin'></span>
|
172
|
+
PAGE<span style='mso-spacerun:yes'> </span>\* MERGEFORMAT <span
|
173
|
+
style='mso-element:field-separator'></span></span></b><![endif]--><b
|
174
|
+
style='mso-bidi-font-weight:normal'><span lang=EN-GB style='font-size:10.0pt;
|
175
|
+
mso-bidi-font-size:11.0pt'><span style='mso-no-proof:yes'>3</span></span></b><!--[if supportFields]><b
|
176
|
+
style='mso-bidi-font-weight:normal'><span lang=EN-GB style='font-size:10.0pt;
|
177
|
+
mso-bidi-font-size:11.0pt'><span style='mso-element:field-end'></span></span></b><![endif]--><span
|
178
|
+
lang=EN-GB style='font-size:10.0pt;mso-bidi-font-size:11.0pt'><o:p></o:p></span></p>
|
179
|
+
|
180
|
+
</div>
|
181
|
+
|
182
|
+
</body>
|
183
|
+
|
184
|
+
</html>
|
data/spec/html2doc_spec.rb
CHANGED
@@ -1,9 +1,406 @@
|
|
1
|
-
|
1
|
+
def html_input(x)
|
2
|
+
<<~HTML
|
3
|
+
<html><head><title>blank</title>
|
4
|
+
<meta name="Originator" content="Me"/>
|
5
|
+
</head>
|
6
|
+
<body>
|
7
|
+
#{x}
|
8
|
+
</body></html>
|
9
|
+
HTML
|
10
|
+
end
|
11
|
+
|
12
|
+
def html_input_no_title(x)
|
13
|
+
<<~HTML
|
14
|
+
<html><head>
|
15
|
+
<meta name="Originator" content="Me"/>
|
16
|
+
</head>
|
17
|
+
<body>
|
18
|
+
#{x}
|
19
|
+
</body></html>
|
20
|
+
HTML
|
21
|
+
end
|
22
|
+
|
23
|
+
def html_input_empty_head(x)
|
24
|
+
<<~HTML
|
25
|
+
<html><head></head>
|
26
|
+
<body>
|
27
|
+
#{x}
|
28
|
+
</body></html>
|
29
|
+
HTML
|
30
|
+
end
|
31
|
+
|
32
|
+
WORD_HDR = <<~HDR
|
33
|
+
MIME-Version: 1.0
|
34
|
+
Content-Type: multipart/related; boundary="----=_NextPart_"
|
35
|
+
|
36
|
+
------=_NextPart_
|
37
|
+
Content-Location: file:///C:/Doc/test.htm
|
38
|
+
Content-Type: text/html; charset="utf-8"
|
39
|
+
|
40
|
+
<?xml version="1.0"?>
|
41
|
+
<html xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40"><head><!--[if gte mso 9]>
|
42
|
+
<xml>
|
43
|
+
<w:WordDocument>
|
44
|
+
<w:View>Print</w:View>
|
45
|
+
<w:Zoom>100</w:Zoom>
|
46
|
+
<w:DoNotOptimizeForBrowser/>
|
47
|
+
</w:WordDocument>
|
48
|
+
</xml>
|
49
|
+
<![endif]-->
|
50
|
+
<meta http-equiv=Content-Type content="text/html; charset=utf-8"/>
|
51
|
+
|
52
|
+
<link rel=File-List href="test_files/filelist.xml"/>
|
53
|
+
<title>blank</title><style><![CDATA[
|
54
|
+
<!--
|
55
|
+
HDR
|
56
|
+
|
57
|
+
WORD_HDR_END = <<~HDR
|
58
|
+
-->
|
59
|
+
]]></style>
|
60
|
+
<meta name="Originator" content="Me"/>
|
61
|
+
</head>
|
62
|
+
HDR
|
63
|
+
|
64
|
+
def word_body(x, fn)
|
65
|
+
<<~BODY
|
66
|
+
<body>
|
67
|
+
#{x}
|
68
|
+
#{fn}</body></html>
|
69
|
+
BODY
|
70
|
+
end
|
71
|
+
|
72
|
+
WORD_FTR1 = <<~FTR
|
73
|
+
------=_NextPart_
|
74
|
+
Content-Location: file:///C:/Doc/test_files/filelist.xml
|
75
|
+
Content-Transfer-Encoding: base64
|
76
|
+
Content-Type: application/xml
|
77
|
+
|
78
|
+
PHhtbCB4bWxuczpvPSJ1cm46c2NoZW1hcy1taWNyb3NvZnQtY29tOm9mZmljZTpvZmZpY2UiPgog
|
79
|
+
ICAgICAgIDxvOk1haW5GaWxlIEhSZWY9Ii4uL3Rlc3QuaHRtIi8+ICA8bzpGaWxlIEhSZWY9ImZp
|
80
|
+
bGVsaXN0LnhtbCIvPgo8L3htbD4K
|
81
|
+
|
82
|
+
------=_NextPart_--
|
83
|
+
FTR
|
84
|
+
|
85
|
+
WORD_FTR2 = <<~FTR
|
86
|
+
------=_NextPart_
|
87
|
+
Content-Location: file:///C:/Doc/test_files/filelist.xml
|
88
|
+
Content-Transfer-Encoding: base64
|
89
|
+
Content-Type: application/xml
|
90
|
+
PHhtbCB4bWxuczpvPSJ1cm46c2NoZW1hcy1taWNyb3NvZnQtY29tOm9mZmljZTpvZmZpY2UiPgog
|
91
|
+
ICAgICAgIDxvOk1haW5GaWxlIEhSZWY9Ii4uL3Rlc3QuaHRtIi8+ICA8bzpGaWxlIEhSZWY9ImZp
|
92
|
+
bGVsaXN0LnhtbCIvPgogIDxvOkZpbGUgSFJlZj0iaGVhZGVyLmh0bWwiLz4KPC94bWw+Cg==
|
93
|
+
------=_NextPart_
|
94
|
+
Content-Location: file:///C:/Doc/test_files/header.html
|
95
|
+
Content-Transfer-Encoding: base64
|
96
|
+
Content-Type: text/html charset="utf-8"
|
97
|
+
PGh0bWwgeG1sbnM6dj0idXJuOnNjaGVtYXMtbWljcm9zb2Z0LWNvbTp2bWwiDQp4bWxuczpvPSJ1
|
98
|
+
cm46c2NoZW1hcy1taWNyb3NvZnQtY29tOm9mZmljZTpvZmZpY2UiDQp4bWxuczp3PSJ1cm46c2No
|
99
|
+
ZW1hcy1taWNyb3NvZnQtY29tOm9mZmljZTp3b3JkIg0KeG1sbnM6bT0iaHR0cDovL3NjaGVtYXMu
|
100
|
+
bWljcm9zb2Z0LmNvbS9vZmZpY2UvMjAwNC8xMi9vbW1sIg0KeG1sbnM9Imh0dHA6Ly93d3cudzMu
|
101
|
+
b3JnL1RSL1JFQy1odG1sNDAiPg0KDQo8aGVhZD4NCjxtZXRhIGh0dHAtZXF1aXY9Q29udGVudC1U
|
102
|
+
eXBlIGNvbnRlbnQ9InRleHQvaHRtbDsgY2hhcnNldD11dGYtOCI+DQo8bWV0YSBuYW1lPVByb2dJ
|
103
|
+
ZCBjb250ZW50PVdvcmQuRG9jdW1lbnQ+DQo8bWV0YSBuYW1lPUdlbmVyYXRvciBjb250ZW50PSJN
|
104
|
+
aWNyb3NvZnQgV29yZCAxNSI+DQo8bWV0YSBuYW1lPU9yaWdpbmF0b3IgY29udGVudD0iTWljcm9z
|
105
|
+
b2Z0IFdvcmQgMTUiPg0KPGxpbmsgaWQ9TWFpbi1GaWxlIHJlbD1NYWluLUZpbGUgaHJlZj0iLi4v
|
106
|
+
cmljZS5nYi5odG1sIj4NCjwhLS1baWYgZ3RlIG1zbyA5XT48eG1sPg0KIDxvOnNoYXBlZGVmYXVs
|
107
|
+
dHMgdjpleHQ9ImVkaXQiIHNwaWRtYXg9IjIwNDkiLz4NCjwveG1sPjwhW2VuZGlmXS0tPg0KPC9o
|
108
|
+
ZWFkPg0KDQo8Ym9keSBsYW5nPVpIIGxpbms9Ymx1ZSB2bGluaz1wdXJwbGU+DQoNCjxkaXYgc3R5
|
109
|
+
bGU9J21zby1lbGVtZW50OmZvb3Rub3RlLXNlcGFyYXRvcicgaWQ9ZnM+DQoNCjxwIGNsYXNzPU1z
|
110
|
+
b05vcm1hbD48c3BhbiBsYW5nPUVOLVVTPjxzcGFuIHN0eWxlPSdtc28tc3BlY2lhbC1jaGFyYWN0
|
111
|
+
ZXI6Zm9vdG5vdGUtc2VwYXJhdG9yJz48IVtpZiAhc3VwcG9ydEZvb3Rub3Rlc10+DQoNCjxociBh
|
112
|
+
bGlnbj1sZWZ0IHNpemU9MSB3aWR0aD0iMzMlIj4NCg0KPCFbZW5kaWZdPjwvc3Bhbj48L3NwYW4+
|
113
|
+
PC9wPg0KDQo8L2Rpdj4NCg0KPGRpdiBzdHlsZT0nbXNvLWVsZW1lbnQ6Zm9vdG5vdGUtY29udGlu
|
114
|
+
dWF0aW9uLXNlcGFyYXRvcicgaWQ9ZmNzPg0KDQo8cCBjbGFzcz1Nc29Ob3JtYWw+PHNwYW4gbGFu
|
115
|
+
Zz1FTi1VUz48c3BhbiBzdHlsZT0nbXNvLXNwZWNpYWwtY2hhcmFjdGVyOmZvb3Rub3RlLWNvbnRp
|
116
|
+
bnVhdGlvbi1zZXBhcmF0b3InPjwhW2lmICFzdXBwb3J0Rm9vdG5vdGVzXT4NCg0KPGhyIGFsaWdu
|
117
|
+
PWxlZnQgc2l6ZT0xPg0KDQo8IVtlbmRpZl0+PC9zcGFuPjwvc3Bhbj48L3A+DQoNCjwvZGl2Pg0K
|
118
|
+
DQo8ZGl2IHN0eWxlPSdtc28tZWxlbWVudDplbmRub3RlLXNlcGFyYXRvcicgaWQ9ZXM+DQoNCjxw
|
119
|
+
IGNsYXNzPU1zb05vcm1hbD48c3BhbiBsYW5nPUVOLVVTPjxzcGFuIHN0eWxlPSdtc28tc3BlY2lh
|
120
|
+
bC1jaGFyYWN0ZXI6Zm9vdG5vdGUtc2VwYXJhdG9yJz48IVtpZiAhc3VwcG9ydEZvb3Rub3Rlc10+
|
121
|
+
DQoNCjxociBhbGlnbj1sZWZ0IHNpemU9MSB3aWR0aD0iMzMlIj4NCg0KPCFbZW5kaWZdPjwvc3Bh
|
122
|
+
bj48L3NwYW4+PC9wPg0KDQo8L2Rpdj4NCg0KPGRpdiBzdHlsZT0nbXNvLWVsZW1lbnQ6ZW5kbm90
|
123
|
+
ZS1jb250aW51YXRpb24tc2VwYXJhdG9yJyBpZD1lY3M+DQoNCjxwIGNsYXNzPU1zb05vcm1hbD48
|
124
|
+
c3BhbiBsYW5nPUVOLVVTPjxzcGFuIHN0eWxlPSdtc28tc3BlY2lhbC1jaGFyYWN0ZXI6Zm9vdG5v
|
125
|
+
dGUtY29udGludWF0aW9uLXNlcGFyYXRvcic+PCFbaWYgIXN1cHBvcnRGb290bm90ZXNdPg0KDQo8
|
126
|
+
aHIgYWxpZ249bGVmdCBzaXplPTE+DQoNCjwhW2VuZGlmXT48L3NwYW4+PC9zcGFuPjwvcD4NCg0K
|
127
|
+
PC9kaXY+DQoNCjxkaXYgc3R5bGU9J21zby1lbGVtZW50OmhlYWRlcicgaWQ9aDI+DQoNCjxwIGNs
|
128
|
+
YXNzPU1zb0hlYWRlcj48c3BhbiBsYW5nPUVOLVVTPkRCMTEvQ0QgMTczMDEtMTwvc3Bhbj48c3Bh
|
129
|
+
biBsYW5nPUVOLVVTDQpzdHlsZT0nZm9udC1mYW1pbHk6IlRpbWVzIE5ldyBSb21hbiIsc2VyaWY7
|
130
|
+
bXNvLWFzY2lpLWZvbnQtZmFtaWx5OlNpbUhlaSc+4oCUPC9zcGFuPjxzcGFuDQpsYW5nPUVOLVVT
|
131
|
+
PjIwMTY8L3NwYW4+PC9wPg0KDQo8L2Rpdj4NCg0KPGRpdiBzdHlsZT0nbXNvLWVsZW1lbnQ6Zm9v
|
132
|
+
dGVyJyBpZD1mMj4NCg0KPHAgY2xhc3M9TXNvRm9vdGVyPjwhLS1baWYgc3VwcG9ydEZpZWxkc10+
|
133
|
+
PHNwYW4gbGFuZz1FTi1VUz48c3BhbiBzdHlsZT0nbXNvLWVsZW1lbnQ6DQpmaWVsZC1iZWdpbic+
|
134
|
+
PC9zcGFuPjxzcGFuIHN0eWxlPSdtc28tc3BhY2VydW46eWVzJz7CoDwvc3Bhbj5QQUdFPHNwYW4N
|
135
|
+
CnN0eWxlPSdtc28tc3BhY2VydW46eWVzJz7CoCA8L3NwYW4+XCogTUVSR0VGT1JNQVQgPHNwYW4g
|
136
|
+
c3R5bGU9J21zby1lbGVtZW50OmZpZWxkLXNlcGFyYXRvcic+PC9zcGFuPjwvc3Bhbj48IVtlbmRp
|
137
|
+
Zl0tLT48c3Bhbg0KbGFuZz1lbCBzdHlsZT0nbXNvLWFuc2ktbGFuZ3VhZ2U6IzA0MDA7bXNvLWZh
|
138
|
+
cmVhc3QtbGFuZ3VhZ2U6IzA0MDA7bXNvLW5vLXByb29mOg0KeWVzJz40Mjwvc3Bhbj48IS0tW2lm
|
139
|
+
IHN1cHBvcnRGaWVsZHNdPjxzcGFuIGxhbmc9RU4tVVM+PHNwYW4gc3R5bGU9J21zby1lbGVtZW50
|
140
|
+
Og0KZmllbGQtZW5kJz48L3NwYW4+PC9zcGFuPjwhW2VuZGlmXS0tPjwvcD4NCg0KPC9kaXY+DQoN
|
141
|
+
CjwvYm9keT4NCg0KPC9odG1sPg0K
|
142
|
+
|
143
|
+
------=_NextPart_--
|
144
|
+
FTR
|
145
|
+
|
146
|
+
WORD_FTR3 = <<~FTR
|
147
|
+
------=_NextPart_
|
148
|
+
Content-Location: file:///C:/Doc/test_files/609e8807-c2d0-450c-b60b-d995a0f8dcaf.png
|
149
|
+
Content-Transfer-Encoding: base64
|
150
|
+
Content-Type: image/png
|
151
|
+
FTR
|
152
|
+
|
153
|
+
WORD_FTR3 = <<~FTR
|
154
|
+
------=_NextPart_
|
155
|
+
Content-Location: file:///C:/Doc/test_files/filelist.xml
|
156
|
+
Content-Transfer-Encoding: base64
|
157
|
+
Content-Type: application/xml
|
158
|
+
|
159
|
+
PHhtbCB4bWxuczpvPSJ1cm46c2NoZW1hcy1taWNyb3NvZnQtY29tOm9mZmljZTpvZmZpY2UiPgog
|
160
|
+
ICAgICAgIDxvOk1haW5GaWxlIEhSZWY9Ii4uL3Rlc3QuaHRtIi8+ICA8bzpGaWxlIEhSZWY9IjFh
|
161
|
+
YzIwNjVmLTAzZjAtNGM3YS1iOWE2LTkyZTgyMDU5MWJmMC5wbmciLz4KICA8bzpGaWxlIEhSZWY9
|
162
|
+
ImZpbGVsaXN0LnhtbCIvPgo8L3htbD4K
|
163
|
+
------=_NextPart_
|
164
|
+
Content-Location: file:///C:/Doc/test_files/cb7b0d19-891e-4634-815a-570d019d454c.png
|
165
|
+
Content-Transfer-Encoding: base64
|
166
|
+
Content-Type: image/png
|
167
|
+
------=_NextPart_--
|
168
|
+
FTR
|
169
|
+
|
170
|
+
DEFAULT_STYLESHEET = File.read("lib/html2doc/wordstyle.css", encoding: "utf-8").freeze
|
171
|
+
|
172
|
+
def guid_clean(x)
|
173
|
+
x.gsub(/NextPart_[0-9a-f.]+/, "NextPart_")
|
174
|
+
end
|
175
|
+
|
176
|
+
def image_clean(x)
|
177
|
+
x.gsub(%r{[0-9a-f-]+\.png}, "image.png").
|
178
|
+
gsub(%r{[0-9a-f-]+\.gif}, "image.gif").
|
179
|
+
gsub(%r{[0-9a-f-]+\.(jpeg|jpg)}, "image.jpg").
|
180
|
+
gsub(%r{------=_NextPart_\s+Content-Location: file:///C:/Doc/test_files/image\.(png|gif).*?\s-----=_NextPart_}m, "------=_NextPart_").
|
181
|
+
gsub(%r{Content-Type: image/(png|gif|jpeg)[^-]*------=_NextPart_-?-?}m, "").
|
182
|
+
gsub(%r{ICAgICAg[^-]*-----}m, "-----").
|
183
|
+
gsub(%r{\s*</img>\s*}m, "</img>").
|
184
|
+
gsub(%r{</body>\s*</html>}m, "</body></html>")
|
185
|
+
end
|
186
|
+
|
187
|
+
RSpec.describe Html2Doc do
|
2
188
|
it "has a version number" do
|
3
|
-
expect(
|
189
|
+
expect(Html2Doc::VERSION).not_to be nil
|
4
190
|
end
|
5
191
|
|
6
|
-
it "
|
7
|
-
|
192
|
+
it "processes a blank document" do
|
193
|
+
Html2Doc.process(html_input(""), "test", nil, nil, nil, nil)
|
194
|
+
expect(guid_clean(File.read("test.doc", encoding: "utf-8"))).
|
195
|
+
to match_fuzzy(<<~OUTPUT)
|
196
|
+
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
197
|
+
#{word_body("", '<div style="mso-element:footnote-list"/>')} #{WORD_FTR1}
|
198
|
+
OUTPUT
|
8
199
|
end
|
200
|
+
|
201
|
+
it "removes any temp files" do
|
202
|
+
File.delete("test.doc")
|
203
|
+
Html2Doc.process(html_input(""), "test", nil, nil, nil, nil)
|
204
|
+
expect(File.exist?("test.doc")).to be true
|
205
|
+
expect(File.exist?("test.htm")).to be false
|
206
|
+
expect(File.exist?("test_files")).to be false
|
207
|
+
end
|
208
|
+
|
209
|
+
it "processes a stylesheet in an HTML document with a title" do
|
210
|
+
Html2Doc.process(html_input(""), "test", "lib/html2doc/wordstyle.css", nil, nil, nil)
|
211
|
+
expect(guid_clean(File.read("test.doc", encoding: "utf-8"))).
|
212
|
+
to match_fuzzy(<<~OUTPUT)
|
213
|
+
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
214
|
+
#{word_body("", '<div style="mso-element:footnote-list"/>')} #{WORD_FTR1}
|
215
|
+
OUTPUT
|
216
|
+
end
|
217
|
+
|
218
|
+
it "processes a stylesheet in an HTML document without a title" do
|
219
|
+
Html2Doc.process(html_input_no_title(""), "test", "lib/html2doc/wordstyle.css", nil, nil, nil)
|
220
|
+
expect(guid_clean(File.read("test.doc", encoding: "utf-8"))).
|
221
|
+
to match_fuzzy(<<~OUTPUT)
|
222
|
+
#{WORD_HDR.sub("<title>blank</title>", "")}
|
223
|
+
#{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
224
|
+
#{word_body("", '<div style="mso-element:footnote-list"/>')} #{WORD_FTR1}
|
225
|
+
OUTPUT
|
226
|
+
end
|
227
|
+
|
228
|
+
it "processes a stylesheet in an HTML document with an empty head" do
|
229
|
+
Html2Doc.process(html_input_empty_head(""), "test", "lib/html2doc/wordstyle.css", nil, nil, nil)
|
230
|
+
expect(guid_clean(File.read("test.doc", encoding: "utf-8"))).
|
231
|
+
to match_fuzzy(<<~OUTPUT)
|
232
|
+
#{WORD_HDR.sub("<title>blank</title>", "")}
|
233
|
+
#{DEFAULT_STYLESHEET}
|
234
|
+
#{WORD_HDR_END.sub('<meta name="Originator" content="Me"/>'+"\n", "").sub("</style>\n</head>", "</style></head>")}
|
235
|
+
#{word_body("", '<div style="mso-element:footnote-list"/>')} #{WORD_FTR1}
|
236
|
+
OUTPUT
|
237
|
+
end
|
238
|
+
|
239
|
+
it "processes a header" do
|
240
|
+
Html2Doc.process(html_input(""), "test", nil, "header.html", nil, nil)
|
241
|
+
expect(guid_clean(File.read("test.doc", encoding: "utf-8"))).
|
242
|
+
to match_fuzzy(<<~OUTPUT)
|
243
|
+
#{WORD_HDR} #{DEFAULT_STYLESHEET.gsub(/FILENAME/, "test")}
|
244
|
+
#{WORD_HDR_END} #{word_body("", '<div style="mso-element:footnote-list"/>')} #{WORD_FTR2}
|
245
|
+
OUTPUT
|
246
|
+
end
|
247
|
+
|
248
|
+
it "processes a populated document" do
|
249
|
+
simple_body = "<h1>Hello word!</h1>
|
250
|
+
<div>This is a very simple document</div>"
|
251
|
+
Html2Doc.process(html_input(simple_body), "test", nil, nil, nil, nil)
|
252
|
+
expect(guid_clean(File.read("test.doc", encoding: "utf-8"))).
|
253
|
+
to match_fuzzy(<<~OUTPUT)
|
254
|
+
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
255
|
+
#{word_body(simple_body, '<div style="mso-element:footnote-list"/>')}
|
256
|
+
#{WORD_FTR1}
|
257
|
+
OUTPUT
|
258
|
+
end
|
259
|
+
|
260
|
+
it "processes AsciiMath" do
|
261
|
+
Html2Doc.process(html_input("<div>{{sum_(i=1)^n i^3=((n(n+1))/2)^2}}</div>"), "test", nil, nil, nil, ["{{", "}}"])
|
262
|
+
expect(guid_clean(File.read("test.doc", encoding: "utf-8"))).
|
263
|
+
to match_fuzzy(<<~OUTPUT)
|
264
|
+
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
265
|
+
#{word_body('<div><m:oMath><m:nary><m:naryPr><m:chr m:val="∑"></m:chr><m:limLoc m:val="undOvr"></m:limLoc><m:grow m:val="1"></m:grow><m:subHide m:val="off"></m:subHide><m:supHide m:val="off"></m:supHide></m:naryPr><m:sub><m:r><m:t>i=1</m:t></m:r></m:sub><m:sup><m:r><m:t>n</m:t></m:r></m:sup><m:e></m:e></m:nary><m:sSup><m:e><m:r><m:t>i</m:t></m:r></m:e><m:sup><m:r><m:t>3</m:t></m:r></m:sup></m:sSup><m:r><m:t>=</m:t></m:r><m:sSup><m:e><m:r><m:t>(</m:t></m:r><m:f><m:fPr><m:type m:val="bar"></m:type></m:fPr><m:num><m:r><m:t>n</m:t></m:r><m:r><m:t>(n+1)</m:t></m:r></m:num><m:den><m:r><m:t>2</m:t></m:r></m:den></m:f><m:r><m:t>)</m:t></m:r></m:e><m:sup><m:r><m:t>2</m:t></m:r></m:sup></m:sSup></m:oMath>
|
266
|
+
</div>', '<div style="mso-element:footnote-list"/>')}
|
267
|
+
#{WORD_FTR1}
|
268
|
+
OUTPUT
|
269
|
+
end
|
270
|
+
|
271
|
+
it "processes tabs" do
|
272
|
+
simple_body = "<h1>Hello word!</h1>
|
273
|
+
<div>This is a very &tab; simple document</div>"
|
274
|
+
Html2Doc.process(html_input(simple_body), "test", nil, nil, nil, nil)
|
275
|
+
expect(guid_clean(File.read("test.doc", encoding: "utf-8"))).
|
276
|
+
to match_fuzzy(<<~OUTPUT)
|
277
|
+
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
278
|
+
#{word_body(simple_body.gsub(/\&tab;/, %[<span style="mso-tab-count:1">  </span>]), '<div style="mso-element:footnote-list"/>')}
|
279
|
+
#{WORD_FTR1}
|
280
|
+
OUTPUT
|
281
|
+
end
|
282
|
+
|
283
|
+
it "makes unstyled paragraphs be MsoNormal" do
|
284
|
+
simple_body = '<h1>Hello word!</h1>
|
285
|
+
<p>This is a very simple document</p>
|
286
|
+
<p class="x">This style stays</p>'
|
287
|
+
Html2Doc.process(html_input(simple_body), "test", nil, nil, nil, nil)
|
288
|
+
expect(guid_clean(File.read("test.doc", encoding: "utf-8"))).
|
289
|
+
to match_fuzzy(<<~OUTPUT)
|
290
|
+
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
291
|
+
#{word_body(simple_body.gsub(/<p>/, %[<p class="MsoNormal">]), '<div style="mso-element:footnote-list"/>')}
|
292
|
+
#{WORD_FTR1}
|
293
|
+
OUTPUT
|
294
|
+
end
|
295
|
+
|
296
|
+
it "makes unstyled list entries be MsoNormal" do
|
297
|
+
simple_body = '<h1>Hello word!</h1>
|
298
|
+
<ul>
|
299
|
+
<li>This is a very simple document</li>
|
300
|
+
<li class="x">This style stays</li>
|
301
|
+
</ul>'
|
302
|
+
Html2Doc.process(html_input(simple_body), "test", nil, nil, nil, nil)
|
303
|
+
expect(guid_clean(File.read("test.doc", encoding: "utf-8"))).
|
304
|
+
to match_fuzzy(<<~OUTPUT)
|
305
|
+
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
306
|
+
#{word_body(simple_body.gsub(/<li>/, %[<li class="MsoNormal">]), '<div style="mso-element:footnote-list"/>')}
|
307
|
+
#{WORD_FTR1}
|
308
|
+
OUTPUT
|
309
|
+
end
|
310
|
+
|
311
|
+
it "resizes images for height" do
|
312
|
+
simple_body = '<img src="spec/19160-6.png">'
|
313
|
+
Html2Doc.process(html_input(simple_body), "test", nil, nil, nil, nil)
|
314
|
+
testdoc = File.read("test.doc", encoding: "utf-8")
|
315
|
+
expect(testdoc).to match(%r{Content-Type: image/png})
|
316
|
+
expect(image_clean(guid_clean(testdoc))).to match_fuzzy(<<~OUTPUT)
|
317
|
+
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
318
|
+
#{image_clean(word_body('<img src="test_files/cb7b0d19-891e-4634-815a-570d019d454c.png" width="400" height="387"></img>', '<div style="mso-element:footnote-list"/>'))}
|
319
|
+
#{image_clean(WORD_FTR3)}
|
320
|
+
OUTPUT
|
321
|
+
end
|
322
|
+
|
323
|
+
it "resizes images for width" do
|
324
|
+
simple_body = '<img src="spec/19160-7.gif">'
|
325
|
+
Html2Doc.process(html_input(simple_body), "test", nil, nil, nil, nil)
|
326
|
+
testdoc = File.read("test.doc", encoding: "utf-8")
|
327
|
+
expect(testdoc).to match(%r{Content-Type: image/gif})
|
328
|
+
expect(image_clean(guid_clean(testdoc))).to match_fuzzy(<<~OUTPUT)
|
329
|
+
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
330
|
+
#{image_clean(word_body('<img src="test_files/cb7b0d19-891e-4634-815a-570d019d454c.gif" width="400" height="118"></img>', '<div style="mso-element:footnote-list"/>'))}
|
331
|
+
#{image_clean(WORD_FTR3).gsub(/image\.png/, "image.gif")}
|
332
|
+
OUTPUT
|
333
|
+
end
|
334
|
+
|
335
|
+
it "resizes images for height" do
|
336
|
+
simple_body = '<img src="spec/19160-8.jpg">'
|
337
|
+
Html2Doc.process(html_input(simple_body), "test", nil, nil, nil, nil)
|
338
|
+
testdoc = File.read("test.doc", encoding: "utf-8")
|
339
|
+
expect(testdoc).to match(%r{Content-Type: image/jpeg})
|
340
|
+
expect(image_clean(guid_clean(testdoc))).to match_fuzzy(<<~OUTPUT)
|
341
|
+
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
342
|
+
#{image_clean(word_body('<img src="test_files/cb7b0d19-891e-4634-815a-570d019d454c.jpg" width="208" height="680"></img>', '<div style="mso-element:footnote-list"/>'))}
|
343
|
+
#{image_clean(WORD_FTR3).gsub(/image\.png/, "image.jpg")}
|
344
|
+
OUTPUT
|
345
|
+
end
|
346
|
+
|
347
|
+
it "processes epub:type footnotes" do
|
348
|
+
simple_body = '<div>This is a very simple
|
349
|
+
document<a epub:type="footnote" href="#a1">1</a> allegedly<a epub:type="footnote" href="#a2">2</a></div>
|
350
|
+
<aside id="a1">Footnote</aside>
|
351
|
+
<aside id="a2">Other Footnote</aside>'
|
352
|
+
Html2Doc.process(html_input(simple_body), "test", nil, nil, nil, nil)
|
353
|
+
expect(guid_clean(File.read("test.doc", encoding: "utf-8"))).
|
354
|
+
to match_fuzzy(<<~OUTPUT)
|
355
|
+
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
356
|
+
#{word_body('<div>This is a very simple
|
357
|
+
document<a epub:type="footnote" href="#_ftn1" style="mso-footnote-id:ftn1" name="_ftnref1" title="" id="_ftnref1"><span class="MsoFootnoteReference"><span style="mso-special-character:footnote"></span></span></a> allegedly<a epub:type="footnote" href="#_ftn2" style="mso-footnote-id:ftn2" name="_ftnref2" title="" id="_ftnref2"><span class="MsoFootnoteReference"><span style="mso-special-character:footnote"></span></span></a></div>',
|
358
|
+
'<div style="mso-element:footnote-list"><div style="mso-element:footnote" id="ftn1">
|
359
|
+
<p id="" class="MsoFootnoteText"><a style="mso-footnote-id:ftn1" href="#_ftn1" name="_ftnref1" title="" id="_ftnref1"><span class="MsoFootnoteReference"><span style="mso-special-character:footnote"></span></span></a>Footnote</p></div>
|
360
|
+
<div style="mso-element:footnote" id="ftn2">
|
361
|
+
<p id="" class="MsoFootnoteText"><a style="mso-footnote-id:ftn2" href="#_ftn2" name="_ftnref2" title="" id="_ftnref2"><span class="MsoFootnoteReference"><span style="mso-special-character:footnote"></span></span></a>Other Footnote</p></div>
|
362
|
+
</div>')}
|
363
|
+
#{WORD_FTR1}
|
364
|
+
OUTPUT
|
365
|
+
end
|
366
|
+
|
367
|
+
it "processes class footnotes" do
|
368
|
+
simple_body = '<div>This is a very simple
|
369
|
+
document<a class="footnote" href="#a1">1</a> allegedly<a class="footnote" href="#a2">2</a></div>
|
370
|
+
<aside id="a1">Footnote</aside>
|
371
|
+
<aside id="a2">Other Footnote</aside>'
|
372
|
+
Html2Doc.process(html_input(simple_body), "test", nil, nil, nil, nil)
|
373
|
+
expect(guid_clean(File.read("test.doc", encoding: "utf-8"))).
|
374
|
+
to match_fuzzy(<<~OUTPUT)
|
375
|
+
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
376
|
+
#{word_body('<div>This is a very simple
|
377
|
+
document<a class="footnote" href="#_ftn1" style="mso-footnote-id:ftn1" name="_ftnref1" title="" id="_ftnref1"><span class="MsoFootnoteReference"><span style="mso-special-character:footnote"></span></span></a> allegedly<a class="footnote" href="#_ftn2" style="mso-footnote-id:ftn2" name="_ftnref2" title="" id="_ftnref2"><span class="MsoFootnoteReference"><span style="mso-special-character:footnote"></span></span></a></div>',
|
378
|
+
'<div style="mso-element:footnote-list"><div style="mso-element:footnote" id="ftn1">
|
379
|
+
<p id="" class="MsoFootnoteText"><a style="mso-footnote-id:ftn1" href="#_ftn1" name="_ftnref1" title="" id="_ftnref1"><span class="MsoFootnoteReference"><span style="mso-special-character:footnote"></span></span></a>Footnote</p></div>
|
380
|
+
<div style="mso-element:footnote" id="ftn2">
|
381
|
+
<p id="" class="MsoFootnoteText"><a style="mso-footnote-id:ftn2" href="#_ftn2" name="_ftnref2" title="" id="_ftnref2"><span class="MsoFootnoteReference"><span style="mso-special-character:footnote"></span></span></a>Other Footnote</p></div>
|
382
|
+
</div>')}
|
383
|
+
#{WORD_FTR1}
|
384
|
+
OUTPUT
|
385
|
+
end
|
386
|
+
|
387
|
+
it "extracts paragraphs from footnotes" do
|
388
|
+
simple_body = '<div>This is a very simple
|
389
|
+
document<a class="footnote" href="#a1">1</a> allegedly<a class="footnote" href="#a2">2</a></div>
|
390
|
+
<aside id="a1"><p>Footnote</p></aside>
|
391
|
+
<div id="a2"><p>Other Footnote</p></div>'
|
392
|
+
Html2Doc.process(html_input(simple_body), "test", nil, nil, nil, nil)
|
393
|
+
expect(guid_clean(File.read("test.doc", encoding: "utf-8"))).
|
394
|
+
to match_fuzzy(<<~OUTPUT)
|
395
|
+
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
396
|
+
#{word_body('<div>This is a very simple
|
397
|
+
document<a class="footnote" href="#_ftn1" style="mso-footnote-id:ftn1" name="_ftnref1" title="" id="_ftnref1"><span class="MsoFootnoteReference"><span style="mso-special-character:footnote"></span></span></a> allegedly<a class="footnote" href="#_ftn2" style="mso-footnote-id:ftn2" name="_ftnref2" title="" id="_ftnref2"><span class="MsoFootnoteReference"><span style="mso-special-character:footnote"></span></span></a></div>',
|
398
|
+
'<div style="mso-element:footnote-list"><div style="mso-element:footnote" id="ftn1">
|
399
|
+
<p class="MsoFootnoteText"><a style="mso-footnote-id:ftn1" href="#_ftn1" name="_ftnref1" title="" id="_ftnref1"><span class="MsoFootnoteReference"><span style="mso-special-character:footnote"></span></span></a>Footnote</p></div>
|
400
|
+
<div style="mso-element:footnote" id="ftn2">
|
401
|
+
<p class="MsoFootnoteText"><a style="mso-footnote-id:ftn2" href="#_ftn2" name="_ftnref2" title="" id="_ftnref2"><span class="MsoFootnoteReference"><span style="mso-special-character:footnote"></span></span></a>Other Footnote</p></div>
|
402
|
+
</div>')}
|
403
|
+
#{WORD_FTR1}
|
404
|
+
OUTPUT
|
405
|
+
end
|
9
406
|
end
|
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2doc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.6.
|
4
|
+
version: 0.6.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-02-
|
11
|
+
date: 2018-02-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: htmlentities
|
@@ -262,6 +262,20 @@ dependencies:
|
|
262
262
|
- - "~>"
|
263
263
|
- !ruby/object:Gem::Version
|
264
264
|
version: '0.9'
|
265
|
+
- !ruby/object:Gem::Dependency
|
266
|
+
name: rspec-match_fuzzy
|
267
|
+
requirement: !ruby/object:Gem::Requirement
|
268
|
+
requirements:
|
269
|
+
- - ">="
|
270
|
+
- !ruby/object:Gem::Version
|
271
|
+
version: '0'
|
272
|
+
type: :development
|
273
|
+
prerelease: false
|
274
|
+
version_requirements: !ruby/object:Gem::Requirement
|
275
|
+
requirements:
|
276
|
+
- - ">="
|
277
|
+
- !ruby/object:Gem::Version
|
278
|
+
version: '0'
|
265
279
|
description: |
|
266
280
|
Convert HTML document to Microsoft Word document.
|
267
281
|
|
@@ -272,6 +286,7 @@ executables: []
|
|
272
286
|
extensions: []
|
273
287
|
extra_rdoc_files: []
|
274
288
|
files:
|
289
|
+
- ".gitattributes"
|
275
290
|
- ".gitignore"
|
276
291
|
- ".hound.yml"
|
277
292
|
- ".oss-guides.rubocop.yml"
|
@@ -285,6 +300,7 @@ files:
|
|
285
300
|
- README.adoc
|
286
301
|
- Rakefile
|
287
302
|
- bin/console
|
303
|
+
- bin/rspec
|
288
304
|
- bin/setup
|
289
305
|
- html2doc.gemspec
|
290
306
|
- lib/html2doc.rb
|
@@ -294,6 +310,8 @@ files:
|
|
294
310
|
- lib/html2doc/notes.rb
|
295
311
|
- lib/html2doc/version.rb
|
296
312
|
- lib/html2doc/wordstyle.css
|
313
|
+
- spec/19160-6.png
|
314
|
+
- spec/19160-7.gif
|
297
315
|
- spec/examples/header.html
|
298
316
|
- spec/examples/rice.doc
|
299
317
|
- spec/examples/rice.html
|
@@ -303,6 +321,7 @@ files:
|
|
303
321
|
- spec/examples/rice_images/rice_image3_1.png
|
304
322
|
- spec/examples/rice_images/rice_image3_2.png
|
305
323
|
- spec/examples/rice_images/rice_image3_3.png
|
324
|
+
- spec/header.html
|
306
325
|
- spec/html2doc_spec.rb
|
307
326
|
- spec/spec_helper.rb
|
308
327
|
homepage: https://github.com/riboseinc/html2doc
|