html2doc 0.0.1 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.adoc +7 -4
- data/html2doc.gemspec +3 -1
- data/lib/html2doc.rb +1 -0
- data/lib/html2doc/base.rb +110 -91
- data/lib/html2doc/mime.rb +0 -141
- data/lib/html2doc/version.rb +1 -1
- data/lib/html2doc/wordstyle.css +6 -0
- metadata +31 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 878408b54af45f8693aade94aee06047dcd450a3
|
4
|
+
data.tar.gz: 66775cf77b38dc25490da74ac84a5dd5ade68650
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 04baa6214e38eb83f7bd687d42b4bc4db9a28ac80e9c78944dc0ec1150e8c3d9f3fb0ccbaf4df00040b17362405c7dc28765335c01a6d2e5a1e47314232c5b01
|
7
|
+
data.tar.gz: 0ad65befcc98b15e89bd6c94a8b18a54f709fbf01185b786dd1675a287473a4bb8ad05149647e383c130fcefaafc66048711a438eb557fe80fbd3fc08fded629
|
data/README.adoc
CHANGED
@@ -8,8 +8,10 @@ This work is driven by the Word document generation procedure documented in http
|
|
8
8
|
|
9
9
|
The gem currently does the following:
|
10
10
|
|
11
|
+
* Convert any AsciiMath and MathML to Word's native mathematical formatting language.
|
11
12
|
* Resize any images in the HTML file to fit within the maximum page size. (Word will otherwise crash on reading the document.)
|
12
13
|
* Generate a filelist.xml listing of all files to be bundled into the Word document.
|
14
|
+
* Assign the class `MsoNormal` to any paragraphs that do not have a class, so that they can be treated as Normal Style when editing the Word document.
|
13
15
|
* Inject Microsoft Word-specific CSS into the HTML document. The CSS file used is at `lib/html2doc/wordstyle.css`, and can be customised. (This generic CSS can be overridden by CSS already in the HTML document, since the generic CSS is injected at the top of the document.)
|
14
16
|
* Bundle up the images, the HTML file of the document proper, and the `header.html` file representing header/footer information, into a MIME file, and save that file to disk (so that Microsoft Word can deal with it as a Word file.)
|
15
17
|
|
@@ -19,7 +21,6 @@ Work being tracked at https://github.com/riboseinc/asciidoctor-iso/issues/47:
|
|
19
21
|
|
20
22
|
* Render footnotes
|
21
23
|
* Render (editorial) comments
|
22
|
-
* Render MathML, AsciiMath
|
23
24
|
|
24
25
|
== Constraints
|
25
26
|
|
@@ -33,12 +34,14 @@ TO DO: compare with https://github.com/MuhammetDilmac/Html2Docx (much simpler, b
|
|
33
34
|
--
|
34
35
|
require "html2doc"
|
35
36
|
|
36
|
-
Html2Doc.process(result, filename, header_filename, dir)
|
37
|
+
Html2Doc.process(result, filename, stylesheet, header_filename, dir, asciimathdelims = nil)
|
37
38
|
--
|
38
39
|
|
39
40
|
result:: is the Html document to be converted into Word, as a string.
|
40
41
|
filename:: is the name the document is to be saved as, without a file suffix
|
41
|
-
|
42
|
+
stylesheet:: is the full path filename of the CSS stylesheet for Microsoft Word-specific styles. If this is not provided (`nil`), the program will use the default stylesheet included in the gem, `lib/html2doc/wordstyle.css`. The stylesheet provided must match this stylesheet; you can obtain one by saving a Word document with your desired styles to HTML, and extracting the style definitions from the HTML document header.
|
43
|
+
header_filename:: is the filename of the HTML document containing header and footer for the document, as well as footnote/endnote separators; if there is none, use nil. To generate your own such document, save a Word document with headers/footers and/or footnote/endnote separators as an HTML document; the `header.html` will be in the `{filename}.fld` folder generated along with the HTML. A sample file is available at https://github.com/riboseinc/asciidoctor-iso/blob/master/lib/asciidoctor/iso/word/header.html
|
42
44
|
dir:: is the directory the document is to be saved to
|
45
|
+
asciimathdelims:: are the AsciiMath delimiters used in the text. If none are provided, no AsciiMath conversion is attempted.
|
43
46
|
|
44
|
-
Note that the local CSS file contains a variable `FILENAME` for the location of footnote/endnote separators and headers/footers, which are provided in the header HTML file. The gem replaces `FILENAME` with the file nane that the document will be saved as.
|
47
|
+
Note that the local CSS stylesheet file contains a variable `FILENAME` for the location of footnote/endnote separators and headers/footers, which are provided in the header HTML file. The gem replaces `FILENAME` with the file name that the document will be saved as. If you supply your own stylesheet and also wish to use separators or headers/footers, you will likewise need to replace the document name mentioned in your stylesheet with a `FILENAME` string.
|
data/html2doc.gemspec
CHANGED
@@ -10,7 +10,7 @@ Gem::Specification.new do |spec|
|
|
10
10
|
spec.email = ["open.source@ribose.com"]
|
11
11
|
|
12
12
|
spec.summary = "Convert HTML document to Microsoft Word document"
|
13
|
-
|
13
|
+
"in AsciiDoc."
|
14
14
|
spec.description = <<~DESCRIPTION
|
15
15
|
Convert HTML document to Microsoft Word document.
|
16
16
|
|
@@ -32,6 +32,8 @@ Gem::Specification.new do |spec|
|
|
32
32
|
spec.add_dependency "nokogiri", "~> 1.8.1"
|
33
33
|
spec.add_dependency "thread_safe"
|
34
34
|
spec.add_dependency "uuidtools"
|
35
|
+
spec.add_dependency "ruby-xslt"
|
36
|
+
spec.add_dependency "asciimath"
|
35
37
|
|
36
38
|
spec.add_development_dependency "bundler", "~> 1.15"
|
37
39
|
spec.add_development_dependency "byebug", "~> 9.1"
|
data/lib/html2doc.rb
CHANGED
data/lib/html2doc/base.rb
CHANGED
@@ -1,34 +1,82 @@
|
|
1
1
|
require "uuidtools"
|
2
|
+
require "asciimath"
|
2
3
|
require "nokogiri"
|
4
|
+
require "xml/xslt"
|
5
|
+
require "pp"
|
3
6
|
|
4
7
|
module Html2Doc
|
5
|
-
|
6
|
-
|
8
|
+
@xslt = XML::XSLT.new
|
9
|
+
@xslt.xsl = File.read(File.join(File.dirname(__FILE__), "mathml2omml.xsl"))
|
10
|
+
|
11
|
+
def self.process(result, filename, stylesheet, header_file, dir,
|
12
|
+
asciimathdelims = nil)
|
13
|
+
result = process_html(result, filename, stylesheet, header_file,
|
14
|
+
dir, asciimathdelims)
|
15
|
+
system "cp #{header_file} #{dir}/header.html" unless header_file.nil?
|
16
|
+
generate_filelist(filename, dir)
|
17
|
+
File.open("#{filename}.htm", "w") { |f| f.write(result) }
|
18
|
+
mime_package result, filename, dir
|
19
|
+
rm_temp_files(filename, dir)
|
20
|
+
end
|
21
|
+
|
22
|
+
def self.process_html(result, filename, stylesheet, header_file, dir, asciimathdelims)
|
23
|
+
docxml = Nokogiri::XML(asciimath_to_mathml(result, asciimathdelims))
|
24
|
+
define_head(cleanup(docxml, dir), dir, filename, stylesheet, header_file)
|
25
|
+
result = msword_fix(docxml.to_xml)
|
26
|
+
end
|
27
|
+
|
28
|
+
def self.rm_temp_files(filename, dir)
|
29
|
+
system "rm #{filename}.htm"
|
30
|
+
system "rm -r #{filename}_files"
|
31
|
+
end
|
32
|
+
|
33
|
+
def self.cleanup(docxml, dir)
|
34
|
+
image_cleanup(docxml, dir)
|
35
|
+
mathml_to_ooml(docxml)
|
36
|
+
msonormal(docxml)
|
37
|
+
docxml
|
38
|
+
end
|
39
|
+
|
40
|
+
def self.asciimath_to_mathml(doc, delims)
|
41
|
+
return doc if delims.nil? || delims.size < 2
|
42
|
+
doc.split(/(#{delims[0]}|#{delims[1]})/).each_slice(4).map do |a|
|
43
|
+
a[2].nil? || a[2] = AsciiMath.parse(a[2]).to_mathml.
|
44
|
+
gsub(/<math>/, "<math xmlns='http://www.w3.org/1998/Math/MathML'>")
|
45
|
+
a.size > 1 ? a[0] + a[2] : a[0]
|
46
|
+
end.join
|
47
|
+
end
|
48
|
+
|
49
|
+
def self.mathml_to_ooml(docxml)
|
50
|
+
docxml.xpath("//*[local-name() = 'math']").each do |m|
|
51
|
+
@xslt.xml = m.to_s.gsub(/<math>/,
|
52
|
+
"<math xmlns='http://www.w3.org/1998/Math/MathML'>")
|
53
|
+
ooml = @xslt.serve.gsub(/<\?[^>]+>\s*/, "").
|
54
|
+
gsub(/ xmlns:[^=]+="[^"]+"/, "")# .gsub(%r{(</?)}, "\\1m:")
|
55
|
+
m.swap(ooml)
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
# preserve HTML escapes
|
60
|
+
def self.xhtml(result)
|
7
61
|
unless /<!DOCTYPE html/.match? result
|
8
62
|
result.gsub!(/<\?xml version="1.0"\?>/, "")
|
9
63
|
result = "<!DOCTYPE html SYSTEM " +
|
10
64
|
"'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd'>" + result
|
11
65
|
end
|
12
|
-
|
13
|
-
image_cleanup(docxml, dir)
|
14
|
-
define_head(docxml, dir, filename, header_file)
|
15
|
-
result = self.msword_fix(docxml.to_xml)
|
16
|
-
system "cp #{header_file} #{dir}/header.html" unless header_file.nil?
|
17
|
-
generate_filelist(filename, dir)
|
18
|
-
File.open("#{filename}.htm", "w") { |f| f.write(result) }
|
19
|
-
mime_package result, filename, dir
|
66
|
+
result
|
20
67
|
end
|
21
68
|
|
22
69
|
def self.msword_fix(r)
|
23
70
|
# brain damage in MSWord parser
|
24
|
-
r.gsub(%r{<span style="mso-special-character:footnote"/>},
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
71
|
+
r.gsub!(%r{<span style="mso-special-character:footnote"/>},
|
72
|
+
'<span style="mso-special-character:footnote"></span>')
|
73
|
+
r.gsub!(%r{(<a style="mso-comment-reference:[^>/]+)/>}, "\\1></a>")
|
74
|
+
r.gsub!(%r{<link rel="File-List"}, "<link rel=File-List")
|
75
|
+
r.gsub!(%r{<meta http-equiv="Content-Type"},
|
76
|
+
"<meta http-equiv=Content-Type")
|
77
|
+
r.gsub!(%r{&tab;|&tab;},
|
78
|
+
'<span style="mso-tab-count:1">  </span>')
|
79
|
+
r
|
32
80
|
end
|
33
81
|
|
34
82
|
def self.image_resize(orig_filename)
|
@@ -52,11 +100,8 @@ module Html2Doc
|
|
52
100
|
new_full_filename = File.join(dir, "#{uuid}.#{matched[:suffix]}")
|
53
101
|
# presupposes that the image source is local
|
54
102
|
system "cp #{i['src']} #{new_full_filename}"
|
55
|
-
# image_size = image_resize(i["src"])
|
56
103
|
i["width"], i["height"] = image_resize(i["src"])
|
57
104
|
i["src"] = new_full_filename
|
58
|
-
#i["height"] = image_size[1]
|
59
|
-
#i["width"] = image_size[0]
|
60
105
|
end
|
61
106
|
docxml
|
62
107
|
end
|
@@ -64,102 +109,67 @@ module Html2Doc
|
|
64
109
|
def self.define_head1(docxml, dir)
|
65
110
|
docxml.xpath("//*[local-name() = 'head']").each do |h|
|
66
111
|
h.children.first.add_previous_sibling <<~XML
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
112
|
+
<!--[if gte mso 9]>
|
113
|
+
<xml>
|
114
|
+
<w:WordDocument>
|
115
|
+
<w:View>Print</w:View>
|
116
|
+
<w:Zoom>100</w:Zoom>
|
117
|
+
<w:DoNotOptimizeForBrowser/>
|
118
|
+
</w:WordDocument>
|
119
|
+
</xml>
|
120
|
+
<![endif]-->
|
121
|
+
<meta http-equiv=Content-Type content="text/html; charset=utf-8"/>
|
122
|
+
<link rel="File-List" href="#{dir}/filelist.xml"/>
|
78
123
|
XML
|
79
124
|
end
|
80
125
|
end
|
81
126
|
|
82
|
-
def self.stylesheet
|
83
|
-
fn = File.join(File.dirname(__FILE__), "wordstyle.css")
|
84
|
-
stylesheet = File.read(fn, encoding: "UTF-8")
|
127
|
+
def self.filename_substitute(stylesheet, header_filename, filename)
|
85
128
|
if header_filename.nil?
|
86
129
|
stylesheet.gsub!(/\n[^\n]*FILENAME[^\n]*i\n/, "\n")
|
87
130
|
else
|
88
131
|
stylesheet.gsub!(/FILENAME/, filename)
|
89
132
|
end
|
133
|
+
stylesheet
|
134
|
+
end
|
135
|
+
|
136
|
+
def self.stylesheet(filename, header_filename, fn)
|
137
|
+
(fn.nil? || fn.empty?) &&
|
138
|
+
fn = File.join(File.dirname(__FILE__), "wordstyle.css")
|
139
|
+
stylesheet = File.read(fn, encoding: "UTF-8")
|
140
|
+
stylesheet = filename_substitute(stylesheet, header_filename, filename)
|
90
141
|
xml = Nokogiri::XML("<style/>")
|
91
142
|
xml.children.first << Nokogiri::XML::Comment.new(xml, "\n#{stylesheet}\n")
|
92
143
|
xml.root.to_s
|
93
144
|
end
|
94
145
|
|
95
|
-
def self.define_head(docxml, dir, filename, header_file)
|
146
|
+
def self.define_head(docxml, dir, filename, cssname, header_file)
|
96
147
|
title = docxml.at("//*[local-name() = 'head']/*[local-name() = 'title']")
|
97
148
|
head = docxml.at("//*[local-name() = 'head']")
|
149
|
+
css = stylesheet(filename, header_file, cssname)
|
98
150
|
if title.nil?
|
99
|
-
head.children.first.add_previous_sibling
|
151
|
+
head.children.first.add_previous_sibling css
|
100
152
|
else
|
101
|
-
title.add_next_sibling
|
153
|
+
title.add_next_sibling css
|
102
154
|
end
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
def self.mime_preamble(boundary, filename, result)
|
107
|
-
<<~"PREAMBLE"
|
108
|
-
MIME-Version: 1.0
|
109
|
-
Content-Type: multipart/related; boundary="#{boundary}"
|
110
|
-
|
111
|
-
--#{boundary}
|
112
|
-
Content-Location: file:///C:/Doc/#{filename}.htm
|
113
|
-
Content-Type: text/html; charset="utf-8"
|
114
|
-
|
115
|
-
#{result}
|
116
|
-
|
117
|
-
PREAMBLE
|
155
|
+
define_head1(docxml, dir)
|
156
|
+
namespace(docxml.root)
|
118
157
|
end
|
119
158
|
|
120
|
-
def self.
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
Content-Transfer-Encoding: base64
|
128
|
-
Content-Type: #{mime_type(item)}
|
129
|
-
|
130
|
-
#{encoded_file}
|
131
|
-
|
132
|
-
FILE
|
133
|
-
end
|
134
|
-
|
135
|
-
def self.mime_type(item)
|
136
|
-
types = MIME::Types.type_for(item)
|
137
|
-
type = types ? types.first.to_s : 'text/plain; charset="utf-8"'
|
138
|
-
type = type + ' charset="utf-8"' if /^text/.match?(type) && types
|
139
|
-
type
|
140
|
-
end
|
141
|
-
|
142
|
-
def self.mime_boundary
|
143
|
-
salt = UUIDTools::UUID.random_create.to_s.gsub(/-/, ".")[0..17]
|
144
|
-
"----=_NextPart_#{salt}"
|
145
|
-
end
|
146
|
-
|
147
|
-
def self.mime_package(result, filename, dir)
|
148
|
-
boundary = mime_boundary
|
149
|
-
mhtml = mime_preamble(boundary, filename, result)
|
150
|
-
Dir.foreach(dir) do |item|
|
151
|
-
next if item == "." || item == ".." || /^\./.match(item)
|
152
|
-
mhtml += mime_attachment(boundary, filename, item, dir)
|
153
|
-
end
|
154
|
-
mhtml += "--#{boundary}--"
|
155
|
-
File.open("#{filename}.doc", "w") { |f| f.write mhtml }
|
159
|
+
def self.namespace(root)
|
160
|
+
{
|
161
|
+
o: "urn:schemas-microsoft-com:office:office",
|
162
|
+
w: "urn:schemas-microsoft-com:office:word",
|
163
|
+
m: "http://schemas.microsoft.com/office/2004/12/omml",
|
164
|
+
}.each { |k, v| root.add_namespace_definition(k.to_s, v) }
|
165
|
+
root.add_namespace(nil, "http://www.w3.org/TR/REC-html40")
|
156
166
|
end
|
157
167
|
|
158
168
|
def self.generate_filelist(filename, dir)
|
159
169
|
File.open(File.join(dir, "filelist.xml"), "w") do |f|
|
160
170
|
f.write(<<~"XML")
|
161
|
-
|
162
|
-
|
171
|
+
<xml xmlns:o="urn:schemas-microsoft-com:office:office">
|
172
|
+
<o:MainFile HRef="../#{filename}.htm"/>
|
163
173
|
XML
|
164
174
|
Dir.foreach(dir) do |item|
|
165
175
|
next if item == "." || item == ".." || /^\./.match(item)
|
@@ -168,4 +178,13 @@ module Html2Doc
|
|
168
178
|
f.write("</xml>\n")
|
169
179
|
end
|
170
180
|
end
|
181
|
+
|
182
|
+
def self.msonormal(docxml)
|
183
|
+
docxml.xpath("//*[local-name() = 'p'][not(self::*[@class])]").each do |p|
|
184
|
+
p["class"] = "MsoNormal"
|
185
|
+
end
|
186
|
+
docxml.xpath("//*[local-name() = 'li'][not(self::*[@class])]").each do |p|
|
187
|
+
p["class"] = "MsoNormal"
|
188
|
+
end
|
189
|
+
end
|
171
190
|
end
|
data/lib/html2doc/mime.rb
CHANGED
@@ -2,124 +2,6 @@ require "uuidtools"
|
|
2
2
|
require "nokogiri"
|
3
3
|
|
4
4
|
module Html2Doc
|
5
|
-
def self.process(result, filename, header_file, dir)
|
6
|
-
docxml = Nokogiri::XML(xhtml(result))
|
7
|
-
cleanup(docxml, dir)
|
8
|
-
define_head(docxml, dir, filename, header_file)
|
9
|
-
result = self.msword_fix(docxml.to_xml)
|
10
|
-
system "cp #{header_file} #{dir}/header.html" unless header_file.nil?
|
11
|
-
generate_filelist(filename, dir)
|
12
|
-
File.open("#{filename}.htm", "w") { |f| f.write(result) }
|
13
|
-
mime_package result, filename, dir
|
14
|
-
end
|
15
|
-
|
16
|
-
def self.cleanup(docxml, dir)
|
17
|
-
image_cleanup(docxml, dir)
|
18
|
-
msonormal(docxml)
|
19
|
-
end
|
20
|
-
|
21
|
-
# preserve HTML escapes
|
22
|
-
def self.xhtml(result)
|
23
|
-
unless /<!DOCTYPE html/.match? result
|
24
|
-
result.gsub!(/<\?xml version="1.0"\?>/, "")
|
25
|
-
result = "<!DOCTYPE html SYSTEM " +
|
26
|
-
"'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd'>" + result
|
27
|
-
end
|
28
|
-
result
|
29
|
-
end
|
30
|
-
|
31
|
-
def self.msword_fix(r)
|
32
|
-
# brain damage in MSWord parser
|
33
|
-
r.gsub!(%r{<span style="mso-special-character:footnote"/>},
|
34
|
-
'<span style="mso-special-character:footnote"></span>')
|
35
|
-
r.gsub!(%r{(<a style="mso-comment-reference:[^>/]+)/>}, "\\1></a>")
|
36
|
-
r.gsub!(%r{<link rel="File-List"}, "<link rel=File-List")
|
37
|
-
r.gsub!(%r{<meta http-equiv="Content-Type"},
|
38
|
-
"<meta http-equiv=Content-Type")
|
39
|
-
r.gsub!(%r{&tab;|&tab;},
|
40
|
-
'<span style="mso-tab-count:1">  </span>')
|
41
|
-
r
|
42
|
-
end
|
43
|
-
|
44
|
-
def self.image_resize(orig_filename)
|
45
|
-
image_size = ImageSize.path(orig_filename).size
|
46
|
-
# max width for Word document is 400, max height is 680
|
47
|
-
if image_size[0] > 400
|
48
|
-
image_size[1] = (image_size[1] * 400 / image_size[0]).ceil
|
49
|
-
image_size[0] = 400
|
50
|
-
end
|
51
|
-
if image_size[1] > 680
|
52
|
-
image_size[0] = (image_size[0] * 680 / image_size[1]).ceil
|
53
|
-
image_size[1] = 680
|
54
|
-
end
|
55
|
-
image_size
|
56
|
-
end
|
57
|
-
|
58
|
-
def self.image_cleanup(docxml, dir)
|
59
|
-
docxml.xpath("//*[local-name() = 'img']").each do |i|
|
60
|
-
matched = /\.(?<suffix>\S+)$/.match i["src"]
|
61
|
-
uuid = UUIDTools::UUID.random_create.to_s
|
62
|
-
new_full_filename = File.join(dir, "#{uuid}.#{matched[:suffix]}")
|
63
|
-
# presupposes that the image source is local
|
64
|
-
system "cp #{i['src']} #{new_full_filename}"
|
65
|
-
i["width"], i["height"] = image_resize(i["src"])
|
66
|
-
i["src"] = new_full_filename
|
67
|
-
end
|
68
|
-
docxml
|
69
|
-
end
|
70
|
-
|
71
|
-
def self.define_head1(docxml, dir)
|
72
|
-
docxml.xpath("//*[local-name() = 'head']").each do |h|
|
73
|
-
h.children.first.add_previous_sibling <<~XML
|
74
|
-
<!--[if gte mso 9]>
|
75
|
-
<xml>
|
76
|
-
<w:WordDocument>
|
77
|
-
<w:View>Print</w:View>
|
78
|
-
<w:Zoom>100</w:Zoom>
|
79
|
-
<w:DoNotOptimizeForBrowser/>
|
80
|
-
</w:WordDocument>
|
81
|
-
</xml>
|
82
|
-
<![endif]-->
|
83
|
-
<meta http-equiv=Content-Type content="text/html; charset=utf-8"/>
|
84
|
-
<link rel="File-List" href="#{dir}/filelist.xml"/>
|
85
|
-
XML
|
86
|
-
end
|
87
|
-
end
|
88
|
-
|
89
|
-
def self.stylesheet(filename, header_filename)
|
90
|
-
fn = File.join(File.dirname(__FILE__), "wordstyle.css")
|
91
|
-
stylesheet = File.read(fn, encoding: "UTF-8")
|
92
|
-
if header_filename.nil?
|
93
|
-
stylesheet.gsub!(/\n[^\n]*FILENAME[^\n]*i\n/, "\n")
|
94
|
-
else
|
95
|
-
stylesheet.gsub!(/FILENAME/, filename)
|
96
|
-
end
|
97
|
-
xml = Nokogiri::XML("<style/>")
|
98
|
-
xml.children.first << Nokogiri::XML::Comment.new(xml, "\n#{stylesheet}\n")
|
99
|
-
xml.root.to_s
|
100
|
-
end
|
101
|
-
|
102
|
-
def self.define_head(docxml, dir, filename, header_file)
|
103
|
-
title = docxml.at("//*[local-name() = 'head']/*[local-name() = 'title']")
|
104
|
-
head = docxml.at("//*[local-name() = 'head']")
|
105
|
-
if title.nil?
|
106
|
-
head.children.first.add_previous_sibling stylesheet(filename, header_file)
|
107
|
-
else
|
108
|
-
title.add_next_sibling stylesheet(filename, header_file)
|
109
|
-
end
|
110
|
-
define_head1(docxml, dir)
|
111
|
-
namespace(docxml.root)
|
112
|
-
end
|
113
|
-
|
114
|
-
def self.namespace(root)
|
115
|
-
{
|
116
|
-
o: "urn:schemas-microsoft-com:office:office",
|
117
|
-
w: "urn:schemas-microsoft-com:office:word",
|
118
|
-
m: "http://schemas.microsoft.com/office/2004/12/omml",
|
119
|
-
}.each { |k, v| root.add_namespace_definition(k.to_s, v) }
|
120
|
-
root.add_namespace(nil, "http://www.w3.org/TR/REC-html40")
|
121
|
-
end
|
122
|
-
|
123
5
|
def self.mime_preamble(boundary, filename, result)
|
124
6
|
<<~"PREAMBLE"
|
125
7
|
MIME-Version: 1.0
|
@@ -171,27 +53,4 @@ module Html2Doc
|
|
171
53
|
mhtml += "--#{boundary}--"
|
172
54
|
File.open("#{filename}.doc", "w") { |f| f.write mhtml }
|
173
55
|
end
|
174
|
-
|
175
|
-
def self.generate_filelist(filename, dir)
|
176
|
-
File.open(File.join(dir, "filelist.xml"), "w") do |f|
|
177
|
-
f.write(<<~"XML")
|
178
|
-
<xml xmlns:o="urn:schemas-microsoft-com:office:office">
|
179
|
-
<o:MainFile HRef="../#{filename}.htm"/>
|
180
|
-
XML
|
181
|
-
Dir.foreach(dir) do |item|
|
182
|
-
next if item == "." || item == ".." || /^\./.match(item)
|
183
|
-
f.write %{ <o:File HRef="#{item}"/>\n}
|
184
|
-
end
|
185
|
-
f.write("</xml>\n")
|
186
|
-
end
|
187
|
-
end
|
188
|
-
|
189
|
-
def self.msonormal(docxml)
|
190
|
-
docxml.xpath("//*[local-name() = 'p'][not(self::*[@class])]").each do |p|
|
191
|
-
p["class"] = "MsoNormal"
|
192
|
-
end
|
193
|
-
docxml.xpath("//*[local-name() = 'li'][not(self::*[@class])]").each do |p|
|
194
|
-
p["class"] = "MsoNormal"
|
195
|
-
end
|
196
|
-
end
|
197
56
|
end
|
data/lib/html2doc/version.rb
CHANGED
data/lib/html2doc/wordstyle.css
CHANGED
@@ -990,3 +990,9 @@ table.MsoNormalTable
|
|
990
990
|
mso-pagination:widow-orphan;
|
991
991
|
font-size:10.0pt;
|
992
992
|
font-family:"Cambria",serif;}
|
993
|
+
br.section
|
994
|
+
{page-break-before:always;
|
995
|
+
mso-break-type:section-break;}
|
996
|
+
br.pagebreak
|
997
|
+
{page-break-before:always;
|
998
|
+
mso-special-character:line-break;}
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2doc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-
|
11
|
+
date: 2018-02-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: htmlentities
|
@@ -94,6 +94,34 @@ dependencies:
|
|
94
94
|
- - ">="
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: ruby-xslt
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :runtime
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
111
|
+
- !ruby/object:Gem::Dependency
|
112
|
+
name: asciimath
|
113
|
+
requirement: !ruby/object:Gem::Requirement
|
114
|
+
requirements:
|
115
|
+
- - ">="
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
type: :runtime
|
119
|
+
prerelease: false
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
requirements:
|
122
|
+
- - ">="
|
123
|
+
- !ruby/object:Gem::Version
|
124
|
+
version: '0'
|
97
125
|
- !ruby/object:Gem::Dependency
|
98
126
|
name: bundler
|
99
127
|
requirement: !ruby/object:Gem::Requirement
|
@@ -287,7 +315,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
287
315
|
version: '0'
|
288
316
|
requirements: []
|
289
317
|
rubyforge_project:
|
290
|
-
rubygems_version: 2.6.
|
318
|
+
rubygems_version: 2.6.12
|
291
319
|
signing_key:
|
292
320
|
specification_version: 4
|
293
321
|
summary: Convert HTML document to Microsoft Word document
|