html2doc 1.1.4 → 1.3.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/rake.yml +3 -20
- data/README.adoc +2 -1
- data/html2doc.gemspec +1 -1
- data/lib/html2doc/base.rb +20 -11
- data/lib/html2doc/version.rb +1 -1
- data/spec/html2doc_spec.rb +22 -9
- metadata +8 -8
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6f471eed9c61de156ee7aa0dd279d21335fd193dda721727197c2dc508a7bf56
|
4
|
+
data.tar.gz: 79365ba28433486f8d6442aa939729af2456c570baccbb88b9ddb835d1f94172
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ccdfd5dc149461e651cf9a0ca22504052e76f3980c75498b05f1720451694acc41c8a91c16f92a2611e553a9d93b9caa482c93b47cdbd95288767588895a22ac
|
7
|
+
data.tar.gz: 14b4833c7d5fd9f179b9101b240acca1f2f2e8e40ef6521f0bd244d6a5f526255a2a34928960403b28575685e1d6e29d84156577a54fa13bedea1048eb9eeafb
|
data/.github/workflows/rake.yml
CHANGED
@@ -10,23 +10,6 @@ on:
|
|
10
10
|
|
11
11
|
jobs:
|
12
12
|
rake:
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
strategy:
|
17
|
-
fail-fast: false
|
18
|
-
matrix:
|
19
|
-
ruby: [ '3.0', '2.7', '2.6', '2.5' ]
|
20
|
-
os: [ ubuntu-latest, windows-latest, macos-latest ]
|
21
|
-
experimental: [ false ]
|
22
|
-
steps:
|
23
|
-
- uses: actions/checkout@v2
|
24
|
-
with:
|
25
|
-
submodules: true
|
26
|
-
|
27
|
-
- uses: ruby/setup-ruby@v1
|
28
|
-
with:
|
29
|
-
ruby-version: ${{ matrix.ruby }}
|
30
|
-
bundler-cache: true
|
31
|
-
|
32
|
-
- run: bundle exec rake
|
13
|
+
uses: metanorma/metanorma-build-scripts/.github/workflows/generic-rake.yml@main
|
14
|
+
secrets:
|
15
|
+
pat_token: ${{ secrets.METANORMA_CI_PAT_TOKEN }}
|
data/README.adoc
CHANGED
@@ -58,11 +58,12 @@ There are two other Microsoft Word vendors in the Ruby ecosystem.
|
|
58
58
|
--
|
59
59
|
require "html2doc"
|
60
60
|
|
61
|
-
Html2Doc.process(result, filename: filename, stylesheet: stylesheet, header_filename: header_filename, dir: dir, asciimathdelims: asciimathdelims, liststyles: liststyles)
|
61
|
+
Html2Doc.process(result, filename: filename, imagedir: imagedir, stylesheet: stylesheet, header_filename: header_filename, dir: dir, asciimathdelims: asciimathdelims, liststyles: liststyles)
|
62
62
|
--
|
63
63
|
|
64
64
|
result:: is the Html document to be converted into Word, as a string.
|
65
65
|
filename:: is the name the document is to be saved as, without a file suffix
|
66
|
+
imagedir:: is the base directory for local image file names in the source XML
|
66
67
|
stylesheet:: is the full path filename of the CSS stylesheet for Microsoft Word-specific styles. If this is not provided, the program will use the default stylesheet included in the gem, `lib/html2doc/wordstyle.css`. The stylesheet provided must match this stylesheet; you can obtain one by saving a Word document with your desired styles to HTML, and extracting the style definitions from the HTML document header.
|
67
68
|
header_filename:: is the filename of the HTML document containing header and footer for the document, as well as footnote/endnote separators; if there is none, use nil. To generate your own such document, save a Word document with headers/footers and/or footnote/endnote separators as an HTML document; the `header.html` will be in the `{filename}.fld` folder generated along with the HTML. A sample file is available at https://github.com/metanorma/metanorma-iso/blob/master/lib/asciidoctor/iso/word/header.html
|
68
69
|
dir:: is the folder that any ancillary files (images, headers, filelist) are to be saved to. If not provided, it will be created as `{filename}_files`. Anything in the directory will be attached to the Word document; so this folder should only contain the images that accompany the document. (If the images are elsewhere on the local drive, the gem will move them into the folder. External URL images are left alone, and are not downloaded.)
|
data/html2doc.gemspec
CHANGED
@@ -33,7 +33,7 @@ Gem::Specification.new do |spec|
|
|
33
33
|
spec.add_dependency "thread_safe"
|
34
34
|
spec.add_dependency "uuidtools"
|
35
35
|
|
36
|
-
spec.add_development_dependency "
|
36
|
+
spec.add_development_dependency "debug"
|
37
37
|
spec.add_development_dependency "equivalent-xml", "~> 0.6"
|
38
38
|
spec.add_development_dependency "guard", "~> 2.14"
|
39
39
|
spec.add_development_dependency "guard-rspec", "~> 4.7"
|
data/lib/html2doc/base.rb
CHANGED
@@ -53,7 +53,7 @@ module Html2Doc
|
|
53
53
|
|
54
54
|
def self.cleanup(docxml, hash)
|
55
55
|
namespace(docxml.root)
|
56
|
-
image_cleanup(docxml, hash[:dir1],
|
56
|
+
image_cleanup(docxml, hash[:dir1], hash[:imagedir])
|
57
57
|
mathml_to_ooml(docxml)
|
58
58
|
lists(docxml, hash[:liststyles])
|
59
59
|
footnotes(docxml)
|
@@ -76,6 +76,8 @@ module Html2Doc
|
|
76
76
|
xml = '<!DOCTYPE html SYSTEM
|
77
77
|
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' + xml
|
78
78
|
end
|
79
|
+
xml = xml.gsub(/<!--\s*\[([^\]]+)\]>/, "<!-- MSWORD-COMMENT \\1 -->")
|
80
|
+
.gsub(/<!\s*\[endif\]\s*-->/, "<!-- MSWORD-COMMENT-END -->")
|
79
81
|
Nokogiri::XML.parse(xml)
|
80
82
|
end
|
81
83
|
|
@@ -85,12 +87,16 @@ module Html2Doc
|
|
85
87
|
|
86
88
|
def self.from_xhtml(xml)
|
87
89
|
xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
|
88
|
-
.sub(DOCTYPE, "")
|
89
|
-
.gsub(
|
90
|
+
.sub(DOCTYPE, "").gsub(%{ />}, "/>")
|
91
|
+
.gsub(/<!-- MSWORD-COMMENT (.+?) -->/, "<!--[\\1]>")
|
92
|
+
.gsub(/<!-- MSWORD-COMMENT-END -->/, "<![endif]-->")
|
93
|
+
.gsub("\n-->\n", "\n-->\n")
|
90
94
|
end
|
91
95
|
|
92
96
|
def self.msword_fix(doc)
|
93
97
|
# brain damage in MSWord parser
|
98
|
+
doc.gsub!(%r{<w:DoNotOptimizeForBrowser></w:DoNotOptimizeForBrowser>},
|
99
|
+
"<w:DoNotOptimizeForBrowser/>")
|
94
100
|
doc.gsub!(%r{<span style="mso-special-character:footnote"/>},
|
95
101
|
'<span style="mso-special-character:footnote"></span>')
|
96
102
|
doc.gsub!(%r{<div style="mso-element:footnote-list"></div>},
|
@@ -116,7 +122,7 @@ module Html2Doc
|
|
116
122
|
end
|
117
123
|
|
118
124
|
PRINT_VIEW = <<~XML.freeze
|
119
|
-
|
125
|
+
|
120
126
|
<xml>
|
121
127
|
<w:WordDocument>
|
122
128
|
<w:View>Print</w:View>
|
@@ -124,8 +130,7 @@ module Html2Doc
|
|
124
130
|
<w:DoNotOptimizeForBrowser/>
|
125
131
|
</w:WordDocument>
|
126
132
|
</xml>
|
127
|
-
|
128
|
-
<meta http-equiv=Content-Type content="text/html; charset=utf-8"/>
|
133
|
+
<meta http-equiv='Content-Type' content="text/html; charset=utf-8"/>
|
129
134
|
XML
|
130
135
|
|
131
136
|
def self.define_head1(docxml, _dir)
|
@@ -148,12 +153,16 @@ module Html2Doc
|
|
148
153
|
end
|
149
154
|
end
|
150
155
|
|
151
|
-
def self.stylesheet(_filename, _header_filename,
|
152
|
-
(
|
153
|
-
|
154
|
-
stylesheet = File.read(
|
156
|
+
def self.stylesheet(_filename, _header_filename, cssname)
|
157
|
+
(cssname.nil? || cssname.empty?) and
|
158
|
+
cssname = File.join(File.dirname(__FILE__), "wordstyle.css")
|
159
|
+
stylesheet = File.read(cssname, encoding: "UTF-8")
|
155
160
|
xml = Nokogiri::XML("<style/>")
|
156
|
-
|
161
|
+
#s = Nokogiri::XML::CDATA.new(xml, "\n#{stylesheet}\n")
|
162
|
+
#xml.children.first << Nokogiri::XML::Comment.new(xml, s)
|
163
|
+
xml.children.first << Nokogiri::XML::CDATA
|
164
|
+
.new(xml, "\n<!--\n#{stylesheet}\n-->\n")
|
165
|
+
|
157
166
|
xml.root.to_s
|
158
167
|
end
|
159
168
|
|
data/lib/html2doc/version.rb
CHANGED
data/spec/html2doc_spec.rb
CHANGED
@@ -41,7 +41,7 @@ WORD_HDR = <<~HDR.freeze
|
|
41
41
|
Content-Type: text/html; charset="utf-8"
|
42
42
|
|
43
43
|
<?xml version="1.0"?>
|
44
|
-
<html xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40"><head
|
44
|
+
<html xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40"><head>
|
45
45
|
<xml>
|
46
46
|
<w:WordDocument>
|
47
47
|
<w:View>Print</w:View>
|
@@ -49,7 +49,6 @@ WORD_HDR = <<~HDR.freeze
|
|
49
49
|
<w:DoNotOptimizeForBrowser/>
|
50
50
|
</w:WordDocument>
|
51
51
|
</xml>
|
52
|
-
<![endif]-->
|
53
52
|
<meta http-equiv=Content-Type content="text/html; charset=utf-8"/>
|
54
53
|
|
55
54
|
<link rel=File-List href="cid:filelist.xml"/>
|
@@ -278,6 +277,17 @@ RSpec.describe Html2Doc do
|
|
278
277
|
expect(Html2Doc::VERSION).not_to be nil
|
279
278
|
end
|
280
279
|
|
280
|
+
it "preserves Word HTML directives" do
|
281
|
+
Html2Doc.process(html_input(%[A<!--[if gte mso 9]>X<![endif]-->B]), filename: "test")
|
282
|
+
expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
|
283
|
+
.to match_fuzzy(<<~OUTPUT)
|
284
|
+
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
285
|
+
#{word_body(%{A<!--[if gte mso 9]>X<![endif]-->B},
|
286
|
+
'<div style="mso-element:footnote-list"/>')}
|
287
|
+
#{WORD_FTR1}
|
288
|
+
OUTPUT
|
289
|
+
end
|
290
|
+
|
281
291
|
it "processes a blank document" do
|
282
292
|
Html2Doc.process(html_input(""), filename: "test")
|
283
293
|
expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
|
@@ -367,7 +377,8 @@ RSpec.describe Html2Doc do
|
|
367
377
|
File.open("spec/header_img1.html", "w:UTF-8") do |f|
|
368
378
|
f.write(
|
369
379
|
doc.sub(%r{spec/19160-6.png},
|
370
|
-
File.expand_path(File.join(File.dirname(__FILE__),
|
380
|
+
File.expand_path(File.join(File.dirname(__FILE__),
|
381
|
+
"19160-6.png"))),
|
371
382
|
)
|
372
383
|
end
|
373
384
|
Html2Doc.process(html_input(""),
|
@@ -565,7 +576,8 @@ RSpec.describe Html2Doc do
|
|
565
576
|
|
566
577
|
it "resizes images for height, in a file in a subdirectory" do
|
567
578
|
simple_body = '<img src="19160-6.png">'
|
568
|
-
Html2Doc.process(html_input(simple_body), filename: "spec/test"
|
579
|
+
Html2Doc.process(html_input(simple_body), filename: "spec/test",
|
580
|
+
imagedir: "spec")
|
569
581
|
testdoc = File.read("spec/test.doc", encoding: "utf-8")
|
570
582
|
expect(testdoc).to match(%r{Content-Type: image/png})
|
571
583
|
expect(image_clean(guid_clean(testdoc))).to match_fuzzy(<<~OUTPUT)
|
@@ -577,7 +589,7 @@ RSpec.describe Html2Doc do
|
|
577
589
|
|
578
590
|
it "resizes images for width" do
|
579
591
|
simple_body = '<img src="spec/19160-7.gif">'
|
580
|
-
Html2Doc.process(html_input(simple_body), filename: "test")
|
592
|
+
Html2Doc.process(html_input(simple_body), filename: "test", imagedir: ".")
|
581
593
|
testdoc = File.read("test.doc", encoding: "utf-8")
|
582
594
|
expect(testdoc).to match(%r{Content-Type: image/gif})
|
583
595
|
expect(image_clean(guid_clean(testdoc))).to match_fuzzy(<<~OUTPUT)
|
@@ -589,7 +601,7 @@ RSpec.describe Html2Doc do
|
|
589
601
|
|
590
602
|
it "resizes images for height" do
|
591
603
|
simple_body = '<img src="spec/19160-8.jpg">'
|
592
|
-
Html2Doc.process(html_input(simple_body), filename: "test")
|
604
|
+
Html2Doc.process(html_input(simple_body), filename: "test", imagedir: ".")
|
593
605
|
testdoc = File.read("test.doc", encoding: "utf-8")
|
594
606
|
expect(testdoc).to match(%r{Content-Type: image/jpeg})
|
595
607
|
expect(image_clean(guid_clean(testdoc))).to match_fuzzy(<<~OUTPUT)
|
@@ -642,7 +654,7 @@ RSpec.describe Html2Doc do
|
|
642
654
|
|
643
655
|
it "does not move images if they are external URLs" do
|
644
656
|
simple_body = '<img src="https://example.com/19160-6.png">'
|
645
|
-
Html2Doc.process(html_input(simple_body), filename: "test")
|
657
|
+
Html2Doc.process(html_input(simple_body), filename: "test", imagedir: ".")
|
646
658
|
testdoc = File.read("test.doc", encoding: "utf-8")
|
647
659
|
expect(image_clean(guid_clean(testdoc))).to match_fuzzy(<<~OUTPUT)
|
648
660
|
#{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
|
@@ -653,7 +665,8 @@ RSpec.describe Html2Doc do
|
|
653
665
|
|
654
666
|
it "deals with absolute image locations" do
|
655
667
|
simple_body = %{<img src="#{__dir__}/19160-6.png">}
|
656
|
-
Html2Doc.process(html_input(simple_body), filename: "spec/test"
|
668
|
+
Html2Doc.process(html_input(simple_body), filename: "spec/test",
|
669
|
+
imagedir: ".")
|
657
670
|
testdoc = File.read("spec/test.doc", encoding: "utf-8")
|
658
671
|
expect(testdoc).to match(%r{Content-Type: image/png})
|
659
672
|
expect(image_clean(guid_clean(testdoc))).to match_fuzzy(<<~OUTPUT)
|
@@ -836,7 +849,7 @@ RSpec.describe Html2Doc do
|
|
836
849
|
it "test image base64 image encoding" do
|
837
850
|
simple_body = '<img src="19160-6.png">'
|
838
851
|
Html2Doc.process(html_input(simple_body),
|
839
|
-
filename: "spec/test", debug: true)
|
852
|
+
filename: "spec/test", debug: true, imagedir: "spec")
|
840
853
|
testdoc = File.read("spec/test.doc", encoding: "utf-8")
|
841
854
|
base64_image = testdoc[/image\/png\n\n(.*?)\n\n----/m, 1].gsub!("\n", "")
|
842
855
|
base64_image_basename = testdoc[%r{Content-ID: <([0-9a-z\-]+)\.png}m, 1]
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: html2doc
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1
|
4
|
+
version: 1.3.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-01-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: asciimath
|
@@ -123,19 +123,19 @@ dependencies:
|
|
123
123
|
- !ruby/object:Gem::Version
|
124
124
|
version: '0'
|
125
125
|
- !ruby/object:Gem::Dependency
|
126
|
-
name:
|
126
|
+
name: debug
|
127
127
|
requirement: !ruby/object:Gem::Requirement
|
128
128
|
requirements:
|
129
|
-
- - "
|
129
|
+
- - ">="
|
130
130
|
- !ruby/object:Gem::Version
|
131
|
-
version: '
|
131
|
+
version: '0'
|
132
132
|
type: :development
|
133
133
|
prerelease: false
|
134
134
|
version_requirements: !ruby/object:Gem::Requirement
|
135
135
|
requirements:
|
136
|
-
- - "
|
136
|
+
- - ">="
|
137
137
|
- !ruby/object:Gem::Version
|
138
|
-
version: '
|
138
|
+
version: '0'
|
139
139
|
- !ruby/object:Gem::Dependency
|
140
140
|
name: equivalent-xml
|
141
141
|
requirement: !ruby/object:Gem::Requirement
|
@@ -334,7 +334,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
334
334
|
- !ruby/object:Gem::Version
|
335
335
|
version: '0'
|
336
336
|
requirements: []
|
337
|
-
rubygems_version: 3.
|
337
|
+
rubygems_version: 3.2.32
|
338
338
|
signing_key:
|
339
339
|
specification_version: 4
|
340
340
|
summary: Convert HTML document to Microsoft Word document
|