html2doc 1.2.0 → 1.3.1

This diff shows the changes between two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 64cb262d3176610f0887cc69bab66fa2a3a7ca8445f8ad493d9d65c455a091d6
4
- data.tar.gz: 468dc7a8fb687cdbf6db1497cf9d9b5e164687b7d460a5eac1fb983b4673672b
3
+ metadata.gz: 56d8c42bd609845f35a5a994fed43d12ebc9fb0d8d303fd60f9a064f4da26a7b
4
+ data.tar.gz: e9310883dbc5991640e66a1c085d6bcb2ca87155449326b7076489e78d64d187
5
5
  SHA512:
6
- metadata.gz: 2a47bbe5df7ae0767ad2f4ccf52c1f96b8e27fc32d08b3b7b25e5051a3d229b29b9852a51c052a701990f9be6dbf0efc97795ea0c0ff4b3745b63f5a0c7adb4f
7
- data.tar.gz: c4c10a84141889d820fd8d2afc273122b28372794edd9fdb3e60aee28773350d3e545f2a3efb75c4d28eff350d367b020d01d6e5cc2874a957ca612124e78fd4
6
+ metadata.gz: 8d7076b196634dc81a3942a59155c7c80da21b9eb68721dab437170c54876f970b80448fa31f520648145eca9ace1fea0c7751be04021f9c1f95fe0bf3fa64ce
7
+ data.tar.gz: 532b022bda9cc4fb88eafeb467c7d6d26ba8dc5ea21f5553ba251e8b92469de6e906f5947a3fdb3bbee241cbaea8805f3477978bbe78af359a6bb7140399a971
@@ -10,23 +10,6 @@ on:
10
10
 
11
11
  jobs:
12
12
  rake:
13
- name: Test on Ruby ${{ matrix.ruby }} ${{ matrix.os }}
14
- runs-on: ${{ matrix.os }}
15
- continue-on-error: ${{ matrix.experimental }}
16
- strategy:
17
- fail-fast: false
18
- matrix:
19
- ruby: [ '3.0', '2.7', '2.6', '2.5' ]
20
- os: [ ubuntu-latest, windows-latest, macos-latest ]
21
- experimental: [ false ]
22
- steps:
23
- - uses: actions/checkout@v2
24
- with:
25
- submodules: true
26
-
27
- - uses: ruby/setup-ruby@v1
28
- with:
29
- ruby-version: ${{ matrix.ruby }}
30
- bundler-cache: true
31
-
32
- - run: bundle exec rake
13
+ uses: metanorma/metanorma-build-scripts/.github/workflows/generic-rake.yml@main
14
+ secrets:
15
+ pat_token: ${{ secrets.METANORMA_CI_PAT_TOKEN }}
data/html2doc.gemspec CHANGED
@@ -33,7 +33,7 @@ Gem::Specification.new do |spec|
33
33
  spec.add_dependency "thread_safe"
34
34
  spec.add_dependency "uuidtools"
35
35
 
36
- spec.add_development_dependency "byebug", "~> 9.1"
36
+ spec.add_development_dependency "debug"
37
37
  spec.add_development_dependency "equivalent-xml", "~> 0.6"
38
38
  spec.add_development_dependency "guard", "~> 2.14"
39
39
  spec.add_development_dependency "guard-rspec", "~> 4.7"
data/lib/html2doc/base.rb CHANGED
@@ -76,6 +76,8 @@ module Html2Doc
76
76
  xml = '<!DOCTYPE html SYSTEM
77
77
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' + xml
78
78
  end
79
+ xml = xml.gsub(/<!--\s*\[([^\]]+)\]>/, "<!-- MSWORD-COMMENT \\1 -->")
80
+ .gsub(/<!\s*\[endif\]\s*-->/, "<!-- MSWORD-COMMENT-END -->")
79
81
  Nokogiri::XML.parse(xml)
80
82
  end
81
83
 
@@ -85,12 +87,16 @@ module Html2Doc
85
87
 
86
88
  def self.from_xhtml(xml)
87
89
  xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
88
- .sub(DOCTYPE, "")
89
- .gsub(%{ />}, "/>")
90
+ .sub(DOCTYPE, "").gsub(%{ />}, "/>")
91
+ .gsub(/<!-- MSWORD-COMMENT (.+?) -->/, "<!--[\\1]>")
92
+ .gsub(/<!-- MSWORD-COMMENT-END -->/, "<![endif]-->")
93
+ .gsub("\n--&gt;\n", "\n-->\n")
90
94
  end
91
95
 
92
96
  def self.msword_fix(doc)
93
97
  # brain damage in MSWord parser
98
+ doc.gsub!(%r{<w:DoNotOptimizeForBrowser></w:DoNotOptimizeForBrowser>},
99
+ "<w:DoNotOptimizeForBrowser/>")
94
100
  doc.gsub!(%r{<span style="mso-special-character:footnote"/>},
95
101
  '<span style="mso-special-character:footnote"></span>')
96
102
  doc.gsub!(%r{<div style="mso-element:footnote-list"></div>},
@@ -116,7 +122,7 @@ module Html2Doc
116
122
  end
117
123
 
118
124
  PRINT_VIEW = <<~XML.freeze
119
- <!--[if gte mso 9]>
125
+
120
126
  <xml>
121
127
  <w:WordDocument>
122
128
  <w:View>Print</w:View>
@@ -124,8 +130,7 @@ module Html2Doc
124
130
  <w:DoNotOptimizeForBrowser/>
125
131
  </w:WordDocument>
126
132
  </xml>
127
- <![endif]-->
128
- <meta http-equiv=Content-Type content="text/html; charset=utf-8"/>
133
+ <meta http-equiv='Content-Type' content="text/html; charset=utf-8"/>
129
134
  XML
130
135
 
131
136
  def self.define_head1(docxml, _dir)
@@ -148,12 +153,16 @@ module Html2Doc
148
153
  end
149
154
  end
150
155
 
151
- def self.stylesheet(_filename, _header_filename, fn)
152
- (fn.nil? || fn.empty?) and
153
- fn = File.join(File.dirname(__FILE__), "wordstyle.css")
154
- stylesheet = File.read(fn, encoding: "UTF-8")
156
+ def self.stylesheet(_filename, _header_filename, cssname)
157
+ (cssname.nil? || cssname.empty?) and
158
+ cssname = File.join(File.dirname(__FILE__), "wordstyle.css")
159
+ stylesheet = File.read(cssname, encoding: "UTF-8")
155
160
  xml = Nokogiri::XML("<style/>")
156
- xml.children.first << Nokogiri::XML::Comment.new(xml, "\n#{stylesheet}\n")
161
+ #s = Nokogiri::XML::CDATA.new(xml, "\n#{stylesheet}\n")
162
+ #xml.children.first << Nokogiri::XML::Comment.new(xml, s)
163
+ xml.children.first << Nokogiri::XML::CDATA
164
+ .new(xml, "\n<!--\n#{stylesheet}\n-->\n")
165
+
157
166
  xml.root.to_s
158
167
  end
159
168
 
data/lib/html2doc/math.rb CHANGED
@@ -140,7 +140,7 @@ module Html2Doc
140
140
  end
141
141
  end
142
142
 
143
- # We need span and em not to be namespaced. Word can't deal with explicit
143
+ # We need span and em not to be namespaced. Word can't deal with explicit
144
144
  # namespaces.
145
145
  # We will end up stripping them out again under Nokogiri 1.11, which correctly
146
146
  # insists on inheriting namespace from parent.
@@ -154,11 +154,28 @@ module Html2Doc
154
154
  def self.mathml_to_ooml1(xml, docnamespaces)
155
155
  doc = Nokogiri::XML::Document::new
156
156
  doc.root = ooxml_cleanup(xml, docnamespaces)
157
- ooxml = ooml_clean(unitalic(esc_space(@xsltemplate.transform(doc))))
157
+ ooxml = ooml_clean(unitalic(esc_space(accent_tr(@xsltemplate.transform(doc)))))
158
158
  ooxml = uncenter(xml, ooxml)
159
159
  xml.swap(ooxml)
160
160
  end
161
161
 
162
+ def self.accent_tr(xml)
163
+ xml.xpath(".//*[local-name()='accPr']/*[local-name()='chr']").each do |x|
164
+ x["m:val"] &&= accent_tr1(x["m:val"])
165
+ x["val"] &&= accent_tr1(x["val"])
166
+ end
167
+ xml
168
+ end
169
+
170
+ def self.accent_tr1(accent)
171
+ case accent
172
+ when "\u2192" then "\u20D7"
173
+ when "^" then "\u0302"
174
+ when "~" then "\u0303"
175
+ else accent
176
+ end
177
+ end
178
+
162
179
  # escape space as &#x32;; we are removing any spaces generated by
163
180
  # XML indentation
164
181
  def self.esc_space(xml)
@@ -180,7 +197,7 @@ module Html2Doc
180
197
  %w(left right).each do |dir|
181
198
  if alignnode.text.include? ("text-align:#{dir}")
182
199
  ooxml = "<m:oMathPara><m:oMathParaPr><m:jc "\
183
- "m:val='#{dir}'/></m:oMathParaPr>#{ooxml}</m:oMathPara>"
200
+ "m:val='#{dir}'/></m:oMathParaPr>#{ooxml}</m:oMathPara>"
184
201
  end
185
202
  end
186
203
  ooxml
data/lib/html2doc/mime.rb CHANGED
@@ -107,12 +107,13 @@ module Html2Doc
107
107
  # only processes locally stored images
108
108
  def self.image_cleanup(docxml, dir, localdir)
109
109
  docxml.traverse do |i|
110
+ src = i["src"]
110
111
  next unless i.element? && %w(img v:imagedata).include?(i.name)
111
- next if /^http/.match? i["src"]
112
- next if %r{^data:(image|application)/[^;]+;base64}.match? i["src"]
112
+ next if src.nil? || src.empty? || /^http/.match?(src)
113
+ next if %r{^data:(image|application)/[^;]+;base64}.match? src
113
114
 
114
- local_filename = localname(i["src"], localdir)
115
- new_filename = "#{mkuuid}#{File.extname(i['src'])}"
115
+ local_filename = localname(src, localdir)
116
+ new_filename = "#{mkuuid}#{File.extname(src)}"
116
117
  FileUtils.cp local_filename, File.join(dir, new_filename)
117
118
  i["width"], i["height"] = image_resize(i, local_filename, 680, 400)
118
119
  i["src"] = File.join(File.basename(dir), new_filename)
@@ -1,3 +1,3 @@
1
1
  module Html2Doc
2
- VERSION = "1.2.0".freeze
2
+ VERSION = "1.3.1".freeze
3
3
  end
@@ -41,7 +41,7 @@ WORD_HDR = <<~HDR.freeze
41
41
  Content-Type: text/html; charset="utf-8"
42
42
 
43
43
  <?xml version="1.0"?>
44
- <html xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40"><head><!--[if gte mso 9]>
44
+ <html xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40"><head>
45
45
  <xml>
46
46
  <w:WordDocument>
47
47
  <w:View>Print</w:View>
@@ -49,7 +49,6 @@ WORD_HDR = <<~HDR.freeze
49
49
  <w:DoNotOptimizeForBrowser/>
50
50
  </w:WordDocument>
51
51
  </xml>
52
- <![endif]-->
53
52
  <meta http-equiv=Content-Type content="text/html; charset=utf-8"/>
54
53
 
55
54
  <link rel=File-List href="cid:filelist.xml"/>
@@ -278,6 +277,17 @@ RSpec.describe Html2Doc do
278
277
  expect(Html2Doc::VERSION).not_to be nil
279
278
  end
280
279
 
280
+ it "preserves Word HTML directives" do
281
+ Html2Doc.process(html_input(%[A<!--[if gte mso 9]>X<![endif]-->B]), filename: "test")
282
+ expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
283
+ .to match_fuzzy(<<~OUTPUT)
284
+ #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
285
+ #{word_body(%{A<!--[if gte mso 9]>X<![endif]-->B},
286
+ '<div style="mso-element:footnote-list"/>')}
287
+ #{WORD_FTR1}
288
+ OUTPUT
289
+ end
290
+
281
291
  it "processes a blank document" do
282
292
  Html2Doc.process(html_input(""), filename: "test")
283
293
  expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
@@ -367,7 +377,8 @@ RSpec.describe Html2Doc do
367
377
  File.open("spec/header_img1.html", "w:UTF-8") do |f|
368
378
  f.write(
369
379
  doc.sub(%r{spec/19160-6.png},
370
- File.expand_path(File.join(File.dirname(__FILE__), "19160-6.png"))),
380
+ File.expand_path(File.join(File.dirname(__FILE__),
381
+ "19160-6.png"))),
371
382
  )
372
383
  end
373
384
  Html2Doc.process(html_input(""),
@@ -450,7 +461,7 @@ RSpec.describe Html2Doc do
450
461
  OUTPUT
451
462
  end
452
463
 
453
- it "unwraps accent in MathML" do
464
+ it "unwraps and converts accent in MathML" do
454
465
  Html2Doc.process(html_input("<div><math xmlns='http://www.w3.org/1998/Math/MathML'>
455
466
  <mover accent='true'><mrow><mi>p</mi></mrow><mrow><mo>^</mo></mrow></mover>
456
467
  </math></div>"), filename: "test", asciimathdelims: ["{{", "}}"])
@@ -458,7 +469,7 @@ RSpec.describe Html2Doc do
458
469
  .to match_fuzzy(<<~OUTPUT)
459
470
  #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
460
471
  #{word_body('<div><m:oMath>
461
- <m:acc><m:accPr><m:chr m:val="^"></m:chr></m:accPr><m:e><m:r><m:t>p</m:t></m:r></m:e></m:acc>
472
+ <m:acc><m:accPr><m:chr m:val="&#x302;"></m:chr></m:accPr><m:e><m:r><m:t>p</m:t></m:r></m:e></m:acc>
462
473
  </m:oMath>
463
474
  </div>', '<div style="mso-element:footnote-list"/>')}
464
475
  #{WORD_FTR1}
@@ -565,7 +576,8 @@ RSpec.describe Html2Doc do
565
576
 
566
577
  it "resizes images for height, in a file in a subdirectory" do
567
578
  simple_body = '<img src="19160-6.png">'
568
- Html2Doc.process(html_input(simple_body), filename: "spec/test", imagedir: "spec")
579
+ Html2Doc.process(html_input(simple_body), filename: "spec/test",
580
+ imagedir: "spec")
569
581
  testdoc = File.read("spec/test.doc", encoding: "utf-8")
570
582
  expect(testdoc).to match(%r{Content-Type: image/png})
571
583
  expect(image_clean(guid_clean(testdoc))).to match_fuzzy(<<~OUTPUT)
@@ -653,7 +665,8 @@ RSpec.describe Html2Doc do
653
665
 
654
666
  it "deals with absolute image locations" do
655
667
  simple_body = %{<img src="#{__dir__}/19160-6.png">}
656
- Html2Doc.process(html_input(simple_body), filename: "spec/test", imagedir: ".")
668
+ Html2Doc.process(html_input(simple_body), filename: "spec/test",
669
+ imagedir: ".")
657
670
  testdoc = File.read("spec/test.doc", encoding: "utf-8")
658
671
  expect(testdoc).to match(%r{Content-Type: image/png})
659
672
  expect(image_clean(guid_clean(testdoc))).to match_fuzzy(<<~OUTPUT)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2doc
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.0
4
+ version: 1.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-10-12 00:00:00.000000000 Z
11
+ date: 2022-02-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: asciimath
@@ -123,19 +123,19 @@ dependencies:
123
123
  - !ruby/object:Gem::Version
124
124
  version: '0'
125
125
  - !ruby/object:Gem::Dependency
126
- name: byebug
126
+ name: debug
127
127
  requirement: !ruby/object:Gem::Requirement
128
128
  requirements:
129
- - - "~>"
129
+ - - ">="
130
130
  - !ruby/object:Gem::Version
131
- version: '9.1'
131
+ version: '0'
132
132
  type: :development
133
133
  prerelease: false
134
134
  version_requirements: !ruby/object:Gem::Requirement
135
135
  requirements:
136
- - - "~>"
136
+ - - ">="
137
137
  - !ruby/object:Gem::Version
138
- version: '9.1'
138
+ version: '0'
139
139
  - !ruby/object:Gem::Dependency
140
140
  name: equivalent-xml
141
141
  requirement: !ruby/object:Gem::Requirement
@@ -334,7 +334,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
334
334
  - !ruby/object:Gem::Version
335
335
  version: '0'
336
336
  requirements: []
337
- rubygems_version: 3.1.4
337
+ rubygems_version: 3.2.32
338
338
  signing_key:
339
339
  specification_version: 4
340
340
  summary: Convert HTML document to Microsoft Word document