html2doc 1.1.4 → 1.3.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b9ed3f5d01d7910a104f86dfe54090ffc3ddf56730f5885293801b3848b24735
4
- data.tar.gz: 98428b2016bba38f17cb66226e2fb8d96a28c6ad28bd47a3bc0b998ea1c81228
3
+ metadata.gz: 6f471eed9c61de156ee7aa0dd279d21335fd193dda721727197c2dc508a7bf56
4
+ data.tar.gz: 79365ba28433486f8d6442aa939729af2456c570baccbb88b9ddb835d1f94172
5
5
  SHA512:
6
- metadata.gz: ede857348aa47a2f09df5c0c1929056251729b358815130ed6c7823f14e9a49cbb1439d43eb45104cb6be2104f47b4dda15b156680dfefd687c4d6439e162c89
7
- data.tar.gz: 4027da3d313f7efb834efc96666d6aedfa509d3b2fc7335b367259833a0050e29b13da92e40514b2afee76b9f84420b81951d1fb9d577643a077643823dcf23c
6
+ metadata.gz: ccdfd5dc149461e651cf9a0ca22504052e76f3980c75498b05f1720451694acc41c8a91c16f92a2611e553a9d93b9caa482c93b47cdbd95288767588895a22ac
7
+ data.tar.gz: 14b4833c7d5fd9f179b9101b240acca1f2f2e8e40ef6521f0bd244d6a5f526255a2a34928960403b28575685e1d6e29d84156577a54fa13bedea1048eb9eeafb
@@ -10,23 +10,6 @@ on:
10
10
 
11
11
  jobs:
12
12
  rake:
13
- name: Test on Ruby ${{ matrix.ruby }} ${{ matrix.os }}
14
- runs-on: ${{ matrix.os }}
15
- continue-on-error: ${{ matrix.experimental }}
16
- strategy:
17
- fail-fast: false
18
- matrix:
19
- ruby: [ '3.0', '2.7', '2.6', '2.5' ]
20
- os: [ ubuntu-latest, windows-latest, macos-latest ]
21
- experimental: [ false ]
22
- steps:
23
- - uses: actions/checkout@v2
24
- with:
25
- submodules: true
26
-
27
- - uses: ruby/setup-ruby@v1
28
- with:
29
- ruby-version: ${{ matrix.ruby }}
30
- bundler-cache: true
31
-
32
- - run: bundle exec rake
13
+ uses: metanorma/metanorma-build-scripts/.github/workflows/generic-rake.yml@main
14
+ secrets:
15
+ pat_token: ${{ secrets.METANORMA_CI_PAT_TOKEN }}
data/README.adoc CHANGED
@@ -58,11 +58,12 @@ There are two other Microsoft Word vendors in the Ruby ecosystem.
58
58
  --
59
59
  require "html2doc"
60
60
 
61
- Html2Doc.process(result, filename: filename, stylesheet: stylesheet, header_filename: header_filename, dir: dir, asciimathdelims: asciimathdelims, liststyles: liststyles)
61
+ Html2Doc.process(result, filename: filename, imagedir: imagedir, stylesheet: stylesheet, header_filename: header_filename, dir: dir, asciimathdelims: asciimathdelims, liststyles: liststyles)
62
62
  --
63
63
 
64
64
  result:: is the Html document to be converted into Word, as a string.
65
65
  filename:: is the name the document is to be saved as, without a file suffix
66
+ imagedir:: is the base directory for local image file names in the source XML
66
67
  stylesheet:: is the full path filename of the CSS stylesheet for Microsoft Word-specific styles. If this is not provided, the program will use the default stylesheet included in the gem, `lib/html2doc/wordstyle.css`. The stylesheet provided must match this stylesheet; you can obtain one by saving a Word document with your desired styles to HTML, and extracting the style definitions from the HTML document header.
67
68
  header_filename:: is the filename of the HTML document containing header and footer for the document, as well as footnote/endnote separators; if there is none, use nil. To generate your own such document, save a Word document with headers/footers and/or footnote/endnote separators as an HTML document; the `header.html` will be in the `{filename}.fld` folder generated along with the HTML. A sample file is available at https://github.com/metanorma/metanorma-iso/blob/master/lib/asciidoctor/iso/word/header.html
68
69
  dir:: is the folder that any ancillary files (images, headers, filelist) are to be saved to. If not provided, it will be created as `{filename}_files`. Anything in the directory will be attached to the Word document; so this folder should only contain the images that accompany the document. (If the images are elsewhere on the local drive, the gem will move them into the folder. External URL images are left alone, and are not downloaded.)
data/html2doc.gemspec CHANGED
@@ -33,7 +33,7 @@ Gem::Specification.new do |spec|
33
33
  spec.add_dependency "thread_safe"
34
34
  spec.add_dependency "uuidtools"
35
35
 
36
- spec.add_development_dependency "byebug", "~> 9.1"
36
+ spec.add_development_dependency "debug"
37
37
  spec.add_development_dependency "equivalent-xml", "~> 0.6"
38
38
  spec.add_development_dependency "guard", "~> 2.14"
39
39
  spec.add_development_dependency "guard-rspec", "~> 4.7"
data/lib/html2doc/base.rb CHANGED
@@ -53,7 +53,7 @@ module Html2Doc
53
53
 
54
54
  def self.cleanup(docxml, hash)
55
55
  namespace(docxml.root)
56
- image_cleanup(docxml, hash[:dir1], File.dirname(hash[:filename]))
56
+ image_cleanup(docxml, hash[:dir1], hash[:imagedir])
57
57
  mathml_to_ooml(docxml)
58
58
  lists(docxml, hash[:liststyles])
59
59
  footnotes(docxml)
@@ -76,6 +76,8 @@ module Html2Doc
76
76
  xml = '<!DOCTYPE html SYSTEM
77
77
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' + xml
78
78
  end
79
+ xml = xml.gsub(/<!--\s*\[([^\]]+)\]>/, "<!-- MSWORD-COMMENT \\1 -->")
80
+ .gsub(/<!\s*\[endif\]\s*-->/, "<!-- MSWORD-COMMENT-END -->")
79
81
  Nokogiri::XML.parse(xml)
80
82
  end
81
83
 
@@ -85,12 +87,16 @@ module Html2Doc
85
87
 
86
88
  def self.from_xhtml(xml)
87
89
  xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
88
- .sub(DOCTYPE, "")
89
- .gsub(%{ />}, "/>")
90
+ .sub(DOCTYPE, "").gsub(%{ />}, "/>")
91
+ .gsub(/<!-- MSWORD-COMMENT (.+?) -->/, "<!--[\\1]>")
92
+ .gsub(/<!-- MSWORD-COMMENT-END -->/, "<![endif]-->")
93
+ .gsub("\n--&gt;\n", "\n-->\n")
90
94
  end
91
95
 
92
96
  def self.msword_fix(doc)
93
97
  # brain damage in MSWord parser
98
+ doc.gsub!(%r{<w:DoNotOptimizeForBrowser></w:DoNotOptimizeForBrowser>},
99
+ "<w:DoNotOptimizeForBrowser/>")
94
100
  doc.gsub!(%r{<span style="mso-special-character:footnote"/>},
95
101
  '<span style="mso-special-character:footnote"></span>')
96
102
  doc.gsub!(%r{<div style="mso-element:footnote-list"></div>},
@@ -116,7 +122,7 @@ module Html2Doc
116
122
  end
117
123
 
118
124
  PRINT_VIEW = <<~XML.freeze
119
- <!--[if gte mso 9]>
125
+
120
126
  <xml>
121
127
  <w:WordDocument>
122
128
  <w:View>Print</w:View>
@@ -124,8 +130,7 @@ module Html2Doc
124
130
  <w:DoNotOptimizeForBrowser/>
125
131
  </w:WordDocument>
126
132
  </xml>
127
- <![endif]-->
128
- <meta http-equiv=Content-Type content="text/html; charset=utf-8"/>
133
+ <meta http-equiv='Content-Type' content="text/html; charset=utf-8"/>
129
134
  XML
130
135
 
131
136
  def self.define_head1(docxml, _dir)
@@ -148,12 +153,16 @@ module Html2Doc
148
153
  end
149
154
  end
150
155
 
151
- def self.stylesheet(_filename, _header_filename, fn)
152
- (fn.nil? || fn.empty?) and
153
- fn = File.join(File.dirname(__FILE__), "wordstyle.css")
154
- stylesheet = File.read(fn, encoding: "UTF-8")
156
+ def self.stylesheet(_filename, _header_filename, cssname)
157
+ (cssname.nil? || cssname.empty?) and
158
+ cssname = File.join(File.dirname(__FILE__), "wordstyle.css")
159
+ stylesheet = File.read(cssname, encoding: "UTF-8")
155
160
  xml = Nokogiri::XML("<style/>")
156
- xml.children.first << Nokogiri::XML::Comment.new(xml, "\n#{stylesheet}\n")
161
+ #s = Nokogiri::XML::CDATA.new(xml, "\n#{stylesheet}\n")
162
+ #xml.children.first << Nokogiri::XML::Comment.new(xml, s)
163
+ xml.children.first << Nokogiri::XML::CDATA
164
+ .new(xml, "\n<!--\n#{stylesheet}\n-->\n")
165
+
157
166
  xml.root.to_s
158
167
  end
159
168
 
@@ -1,3 +1,3 @@
1
1
  module Html2Doc
2
- VERSION = "1.1.4".freeze
2
+ VERSION = "1.3.0.1".freeze
3
3
  end
@@ -41,7 +41,7 @@ WORD_HDR = <<~HDR.freeze
41
41
  Content-Type: text/html; charset="utf-8"
42
42
 
43
43
  <?xml version="1.0"?>
44
- <html xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40"><head><!--[if gte mso 9]>
44
+ <html xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40"><head>
45
45
  <xml>
46
46
  <w:WordDocument>
47
47
  <w:View>Print</w:View>
@@ -49,7 +49,6 @@ WORD_HDR = <<~HDR.freeze
49
49
  <w:DoNotOptimizeForBrowser/>
50
50
  </w:WordDocument>
51
51
  </xml>
52
- <![endif]-->
53
52
  <meta http-equiv=Content-Type content="text/html; charset=utf-8"/>
54
53
 
55
54
  <link rel=File-List href="cid:filelist.xml"/>
@@ -278,6 +277,17 @@ RSpec.describe Html2Doc do
278
277
  expect(Html2Doc::VERSION).not_to be nil
279
278
  end
280
279
 
280
+ it "preserves Word HTML directives" do
281
+ Html2Doc.process(html_input(%[A<!--[if gte mso 9]>X<![endif]-->B]), filename: "test")
282
+ expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
283
+ .to match_fuzzy(<<~OUTPUT)
284
+ #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
285
+ #{word_body(%{A<!--[if gte mso 9]>X<![endif]-->B},
286
+ '<div style="mso-element:footnote-list"/>')}
287
+ #{WORD_FTR1}
288
+ OUTPUT
289
+ end
290
+
281
291
  it "processes a blank document" do
282
292
  Html2Doc.process(html_input(""), filename: "test")
283
293
  expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
@@ -367,7 +377,8 @@ RSpec.describe Html2Doc do
367
377
  File.open("spec/header_img1.html", "w:UTF-8") do |f|
368
378
  f.write(
369
379
  doc.sub(%r{spec/19160-6.png},
370
- File.expand_path(File.join(File.dirname(__FILE__), "19160-6.png"))),
380
+ File.expand_path(File.join(File.dirname(__FILE__),
381
+ "19160-6.png"))),
371
382
  )
372
383
  end
373
384
  Html2Doc.process(html_input(""),
@@ -565,7 +576,8 @@ RSpec.describe Html2Doc do
565
576
 
566
577
  it "resizes images for height, in a file in a subdirectory" do
567
578
  simple_body = '<img src="19160-6.png">'
568
- Html2Doc.process(html_input(simple_body), filename: "spec/test")
579
+ Html2Doc.process(html_input(simple_body), filename: "spec/test",
580
+ imagedir: "spec")
569
581
  testdoc = File.read("spec/test.doc", encoding: "utf-8")
570
582
  expect(testdoc).to match(%r{Content-Type: image/png})
571
583
  expect(image_clean(guid_clean(testdoc))).to match_fuzzy(<<~OUTPUT)
@@ -577,7 +589,7 @@ RSpec.describe Html2Doc do
577
589
 
578
590
  it "resizes images for width" do
579
591
  simple_body = '<img src="spec/19160-7.gif">'
580
- Html2Doc.process(html_input(simple_body), filename: "test")
592
+ Html2Doc.process(html_input(simple_body), filename: "test", imagedir: ".")
581
593
  testdoc = File.read("test.doc", encoding: "utf-8")
582
594
  expect(testdoc).to match(%r{Content-Type: image/gif})
583
595
  expect(image_clean(guid_clean(testdoc))).to match_fuzzy(<<~OUTPUT)
@@ -589,7 +601,7 @@ RSpec.describe Html2Doc do
589
601
 
590
602
  it "resizes images for height" do
591
603
  simple_body = '<img src="spec/19160-8.jpg">'
592
- Html2Doc.process(html_input(simple_body), filename: "test")
604
+ Html2Doc.process(html_input(simple_body), filename: "test", imagedir: ".")
593
605
  testdoc = File.read("test.doc", encoding: "utf-8")
594
606
  expect(testdoc).to match(%r{Content-Type: image/jpeg})
595
607
  expect(image_clean(guid_clean(testdoc))).to match_fuzzy(<<~OUTPUT)
@@ -642,7 +654,7 @@ RSpec.describe Html2Doc do
642
654
 
643
655
  it "does not move images if they are external URLs" do
644
656
  simple_body = '<img src="https://example.com/19160-6.png">'
645
- Html2Doc.process(html_input(simple_body), filename: "test")
657
+ Html2Doc.process(html_input(simple_body), filename: "test", imagedir: ".")
646
658
  testdoc = File.read("test.doc", encoding: "utf-8")
647
659
  expect(image_clean(guid_clean(testdoc))).to match_fuzzy(<<~OUTPUT)
648
660
  #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
@@ -653,7 +665,8 @@ RSpec.describe Html2Doc do
653
665
 
654
666
  it "deals with absolute image locations" do
655
667
  simple_body = %{<img src="#{__dir__}/19160-6.png">}
656
- Html2Doc.process(html_input(simple_body), filename: "spec/test")
668
+ Html2Doc.process(html_input(simple_body), filename: "spec/test",
669
+ imagedir: ".")
657
670
  testdoc = File.read("spec/test.doc", encoding: "utf-8")
658
671
  expect(testdoc).to match(%r{Content-Type: image/png})
659
672
  expect(image_clean(guid_clean(testdoc))).to match_fuzzy(<<~OUTPUT)
@@ -836,7 +849,7 @@ RSpec.describe Html2Doc do
836
849
  it "test image base64 image encoding" do
837
850
  simple_body = '<img src="19160-6.png">'
838
851
  Html2Doc.process(html_input(simple_body),
839
- filename: "spec/test", debug: true)
852
+ filename: "spec/test", debug: true, imagedir: "spec")
840
853
  testdoc = File.read("spec/test.doc", encoding: "utf-8")
841
854
  base64_image = testdoc[/image\/png\n\n(.*?)\n\n----/m, 1].gsub!("\n", "")
842
855
  base64_image_basename = testdoc[%r{Content-ID: <([0-9a-z\-]+)\.png}m, 1]
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2doc
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.4
4
+ version: 1.3.0.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-09-27 00:00:00.000000000 Z
11
+ date: 2022-01-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: asciimath
@@ -123,19 +123,19 @@ dependencies:
123
123
  - !ruby/object:Gem::Version
124
124
  version: '0'
125
125
  - !ruby/object:Gem::Dependency
126
- name: byebug
126
+ name: debug
127
127
  requirement: !ruby/object:Gem::Requirement
128
128
  requirements:
129
- - - "~>"
129
+ - - ">="
130
130
  - !ruby/object:Gem::Version
131
- version: '9.1'
131
+ version: '0'
132
132
  type: :development
133
133
  prerelease: false
134
134
  version_requirements: !ruby/object:Gem::Requirement
135
135
  requirements:
136
- - - "~>"
136
+ - - ">="
137
137
  - !ruby/object:Gem::Version
138
- version: '9.1'
138
+ version: '0'
139
139
  - !ruby/object:Gem::Dependency
140
140
  name: equivalent-xml
141
141
  requirement: !ruby/object:Gem::Requirement
@@ -334,7 +334,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
334
334
  - !ruby/object:Gem::Version
335
335
  version: '0'
336
336
  requirements: []
337
- rubygems_version: 3.1.4
337
+ rubygems_version: 3.2.32
338
338
  signing_key:
339
339
  specification_version: 4
340
340
  summary: Convert HTML document to Microsoft Word document