html2doc 1.2.1 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 99602f2e4d42bf9e809ccc26cdfdba8a602ead1dfbfb68876ca2e90683e091e9
4
- data.tar.gz: aba931c818606124656a92a78760e8870291829fcd4476656bbd1fa656697855
3
+ metadata.gz: a71b394c280e43e4c661958ef48e0d1a7e26f05e9988e3a697837bd972b5a2f5
4
+ data.tar.gz: 243ef6cab6e2674befed8cc1d3190bc3448cceae4360604bacb956fe9bb72efe
5
5
  SHA512:
6
- metadata.gz: 5e94fd597cd70658bd034f6a202c4010b31145b1da2375a59f6f35c648a278b6ad40221c85d6e19c25aaaf9da3e734c3d3fff33b6175226db4a1b468fea842ea
7
- data.tar.gz: 30a8189d7440fa8742c2c83bc3bbff3ac2008e1fc7add0add4534bfaf341b1d80d12434b038051d729eb165aeca9199a1e79b4b9a430a397b386704a5121f461
6
+ metadata.gz: 80dab821665aeccf3c2f89a301af6fc63b911b79659c0c65ffceb4cbe7a1c637342b37d4803f3d41a842ace6d1c694d031d9fc38402a7adbce67d74c30bb15c6
7
+ data.tar.gz: 927dfe85cbbbf65da137465776dc1261364f6267e955b8d26f9fd5de994a79bea210b206dda32522fa758c7ffa0f50549f848d77694b83cbd506c28fc1111c78
@@ -10,23 +10,6 @@ on:
10
10
 
11
11
  jobs:
12
12
  rake:
13
- name: Test on Ruby ${{ matrix.ruby }} ${{ matrix.os }}
14
- runs-on: ${{ matrix.os }}
15
- continue-on-error: ${{ matrix.experimental }}
16
- strategy:
17
- fail-fast: false
18
- matrix:
19
- ruby: [ '3.0', '2.7', '2.6', '2.5' ]
20
- os: [ ubuntu-latest, windows-latest, macos-latest ]
21
- experimental: [ false ]
22
- steps:
23
- - uses: actions/checkout@v2
24
- with:
25
- submodules: true
26
-
27
- - uses: ruby/setup-ruby@v1
28
- with:
29
- ruby-version: ${{ matrix.ruby }}
30
- bundler-cache: true
31
-
32
- - run: bundle exec rake
13
+ uses: metanorma/metanorma-build-scripts/.github/workflows/generic-rake.yml@main
14
+ secrets:
15
+ pat_token: ${{ secrets.METANORMA_CI_PAT_TOKEN }}
data/lib/html2doc/base.rb CHANGED
@@ -76,6 +76,8 @@ module Html2Doc
76
76
  xml = '<!DOCTYPE html SYSTEM
77
77
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' + xml
78
78
  end
79
+ xml = xml.gsub(/<!--\s*\[([^]]+)\]>/, "<!-- MSWORD-COMMENT \\1 -->")
80
+ .gsub(/<!\s*\[endif\]\s*-->/, "<!-- MSWORD-COMMENT-END -->")
79
81
  Nokogiri::XML.parse(xml)
80
82
  end
81
83
 
@@ -85,12 +87,16 @@ module Html2Doc
85
87
 
86
88
  def self.from_xhtml(xml)
87
89
  xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
88
- .sub(DOCTYPE, "")
89
- .gsub(%{ />}, "/>")
90
+ .sub(DOCTYPE, "").gsub(%{ />}, "/>")
91
+ .gsub(/<!-- MSWORD-COMMENT (.+?) -->/, "<!--[\\1]>")
92
+ .gsub(/<!-- MSWORD-COMMENT-END -->/, "<![endif]-->")
93
+ .gsub("\n--&gt;\n", "\n-->\n")
90
94
  end
91
95
 
92
96
  def self.msword_fix(doc)
93
97
  # brain damage in MSWord parser
98
+ doc.gsub!(%r{<w:DoNotOptimizeForBrowser></w:DoNotOptimizeForBrowser>},
99
+ "<w:DoNotOptimizeForBrowser/>")
94
100
  doc.gsub!(%r{<span style="mso-special-character:footnote"/>},
95
101
  '<span style="mso-special-character:footnote"></span>')
96
102
  doc.gsub!(%r{<div style="mso-element:footnote-list"></div>},
@@ -116,7 +122,7 @@ module Html2Doc
116
122
  end
117
123
 
118
124
  PRINT_VIEW = <<~XML.freeze
119
- <!--[if gte mso 9]>
125
+
120
126
  <xml>
121
127
  <w:WordDocument>
122
128
  <w:View>Print</w:View>
@@ -124,8 +130,7 @@ module Html2Doc
124
130
  <w:DoNotOptimizeForBrowser/>
125
131
  </w:WordDocument>
126
132
  </xml>
127
- <![endif]-->
128
- <meta http-equiv=Content-Type content="text/html; charset=utf-8"/>
133
+ <meta http-equiv='Content-Type' content="text/html; charset=utf-8"/>
129
134
  XML
130
135
 
131
136
  def self.define_head1(docxml, _dir)
@@ -148,12 +153,16 @@ module Html2Doc
148
153
  end
149
154
  end
150
155
 
151
- def self.stylesheet(_filename, _header_filename, fn)
152
- (fn.nil? || fn.empty?) and
153
- fn = File.join(File.dirname(__FILE__), "wordstyle.css")
154
- stylesheet = File.read(fn, encoding: "UTF-8")
156
+ def self.stylesheet(_filename, _header_filename, cssname)
157
+ (cssname.nil? || cssname.empty?) and
158
+ cssname = File.join(File.dirname(__FILE__), "wordstyle.css")
159
+ stylesheet = File.read(cssname, encoding: "UTF-8")
155
160
  xml = Nokogiri::XML("<style/>")
156
- xml.children.first << Nokogiri::XML::Comment.new(xml, "\n#{stylesheet}\n")
161
+ #s = Nokogiri::XML::CDATA.new(xml, "\n#{stylesheet}\n")
162
+ #xml.children.first << Nokogiri::XML::Comment.new(xml, s)
163
+ xml.children.first << Nokogiri::XML::CDATA
164
+ .new(xml, "\n<!--\n#{stylesheet}\n-->\n")
165
+
157
166
  xml.root.to_s
158
167
  end
159
168
 
@@ -1,3 +1,3 @@
1
1
  module Html2Doc
2
- VERSION = "1.2.1".freeze
2
+ VERSION = "1.3.0".freeze
3
3
  end
@@ -41,7 +41,7 @@ WORD_HDR = <<~HDR.freeze
41
41
  Content-Type: text/html; charset="utf-8"
42
42
 
43
43
  <?xml version="1.0"?>
44
- <html xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40"><head><!--[if gte mso 9]>
44
+ <html xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40"><head>
45
45
  <xml>
46
46
  <w:WordDocument>
47
47
  <w:View>Print</w:View>
@@ -49,7 +49,6 @@ WORD_HDR = <<~HDR.freeze
49
49
  <w:DoNotOptimizeForBrowser/>
50
50
  </w:WordDocument>
51
51
  </xml>
52
- <![endif]-->
53
52
  <meta http-equiv=Content-Type content="text/html; charset=utf-8"/>
54
53
 
55
54
  <link rel=File-List href="cid:filelist.xml"/>
@@ -278,6 +277,17 @@ RSpec.describe Html2Doc do
278
277
  expect(Html2Doc::VERSION).not_to be nil
279
278
  end
280
279
 
280
+ it "preserves Word HTML directives" do
281
+ Html2Doc.process(html_input(%[A<!--[if gte mso 9]>X<![endif]-->B]), filename: "test")
282
+ expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
283
+ .to match_fuzzy(<<~OUTPUT)
284
+ #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
285
+ #{word_body(%{A<!--[if gte mso 9]>X<![endif]-->B},
286
+ '<div style="mso-element:footnote-list"/>')}
287
+ #{WORD_FTR1}
288
+ OUTPUT
289
+ end
290
+
281
291
  it "processes a blank document" do
282
292
  Html2Doc.process(html_input(""), filename: "test")
283
293
  expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
@@ -367,7 +377,8 @@ RSpec.describe Html2Doc do
367
377
  File.open("spec/header_img1.html", "w:UTF-8") do |f|
368
378
  f.write(
369
379
  doc.sub(%r{spec/19160-6.png},
370
- File.expand_path(File.join(File.dirname(__FILE__), "19160-6.png"))),
380
+ File.expand_path(File.join(File.dirname(__FILE__),
381
+ "19160-6.png"))),
371
382
  )
372
383
  end
373
384
  Html2Doc.process(html_input(""),
@@ -565,7 +576,8 @@ RSpec.describe Html2Doc do
565
576
 
566
577
  it "resizes images for height, in a file in a subdirectory" do
567
578
  simple_body = '<img src="19160-6.png">'
568
- Html2Doc.process(html_input(simple_body), filename: "spec/test", imagedir: "spec")
579
+ Html2Doc.process(html_input(simple_body), filename: "spec/test",
580
+ imagedir: "spec")
569
581
  testdoc = File.read("spec/test.doc", encoding: "utf-8")
570
582
  expect(testdoc).to match(%r{Content-Type: image/png})
571
583
  expect(image_clean(guid_clean(testdoc))).to match_fuzzy(<<~OUTPUT)
@@ -653,7 +665,8 @@ RSpec.describe Html2Doc do
653
665
 
654
666
  it "deals with absolute image locations" do
655
667
  simple_body = %{<img src="#{__dir__}/19160-6.png">}
656
- Html2Doc.process(html_input(simple_body), filename: "spec/test", imagedir: ".")
668
+ Html2Doc.process(html_input(simple_body), filename: "spec/test",
669
+ imagedir: ".")
657
670
  testdoc = File.read("spec/test.doc", encoding: "utf-8")
658
671
  expect(testdoc).to match(%r{Content-Type: image/png})
659
672
  expect(image_clean(guid_clean(testdoc))).to match_fuzzy(<<~OUTPUT)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2doc
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.1
4
+ version: 1.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-11-29 00:00:00.000000000 Z
11
+ date: 2022-01-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: asciimath
@@ -334,7 +334,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
334
334
  - !ruby/object:Gem::Version
335
335
  version: '0'
336
336
  requirements: []
337
- rubygems_version: 3.2.22
337
+ rubygems_version: 3.2.32
338
338
  signing_key:
339
339
  specification_version: 4
340
340
  summary: Convert HTML document to Microsoft Word document