html2doc 1.2.0 → 1.3.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 64cb262d3176610f0887cc69bab66fa2a3a7ca8445f8ad493d9d65c455a091d6
4
- data.tar.gz: 468dc7a8fb687cdbf6db1497cf9d9b5e164687b7d460a5eac1fb983b4673672b
3
+ metadata.gz: 56d8c42bd609845f35a5a994fed43d12ebc9fb0d8d303fd60f9a064f4da26a7b
4
+ data.tar.gz: e9310883dbc5991640e66a1c085d6bcb2ca87155449326b7076489e78d64d187
5
5
  SHA512:
6
- metadata.gz: 2a47bbe5df7ae0767ad2f4ccf52c1f96b8e27fc32d08b3b7b25e5051a3d229b29b9852a51c052a701990f9be6dbf0efc97795ea0c0ff4b3745b63f5a0c7adb4f
7
- data.tar.gz: c4c10a84141889d820fd8d2afc273122b28372794edd9fdb3e60aee28773350d3e545f2a3efb75c4d28eff350d367b020d01d6e5cc2874a957ca612124e78fd4
6
+ metadata.gz: 8d7076b196634dc81a3942a59155c7c80da21b9eb68721dab437170c54876f970b80448fa31f520648145eca9ace1fea0c7751be04021f9c1f95fe0bf3fa64ce
7
+ data.tar.gz: 532b022bda9cc4fb88eafeb467c7d6d26ba8dc5ea21f5553ba251e8b92469de6e906f5947a3fdb3bbee241cbaea8805f3477978bbe78af359a6bb7140399a971
@@ -10,23 +10,6 @@ on:
10
10
 
11
11
  jobs:
12
12
  rake:
13
- name: Test on Ruby ${{ matrix.ruby }} ${{ matrix.os }}
14
- runs-on: ${{ matrix.os }}
15
- continue-on-error: ${{ matrix.experimental }}
16
- strategy:
17
- fail-fast: false
18
- matrix:
19
- ruby: [ '3.0', '2.7', '2.6', '2.5' ]
20
- os: [ ubuntu-latest, windows-latest, macos-latest ]
21
- experimental: [ false ]
22
- steps:
23
- - uses: actions/checkout@v2
24
- with:
25
- submodules: true
26
-
27
- - uses: ruby/setup-ruby@v1
28
- with:
29
- ruby-version: ${{ matrix.ruby }}
30
- bundler-cache: true
31
-
32
- - run: bundle exec rake
13
+ uses: metanorma/metanorma-build-scripts/.github/workflows/generic-rake.yml@main
14
+ secrets:
15
+ pat_token: ${{ secrets.METANORMA_CI_PAT_TOKEN }}
data/html2doc.gemspec CHANGED
@@ -33,7 +33,7 @@ Gem::Specification.new do |spec|
33
33
  spec.add_dependency "thread_safe"
34
34
  spec.add_dependency "uuidtools"
35
35
 
36
- spec.add_development_dependency "byebug", "~> 9.1"
36
+ spec.add_development_dependency "debug"
37
37
  spec.add_development_dependency "equivalent-xml", "~> 0.6"
38
38
  spec.add_development_dependency "guard", "~> 2.14"
39
39
  spec.add_development_dependency "guard-rspec", "~> 4.7"
data/lib/html2doc/base.rb CHANGED
@@ -76,6 +76,8 @@ module Html2Doc
76
76
  xml = '<!DOCTYPE html SYSTEM
77
77
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' + xml
78
78
  end
79
+ xml = xml.gsub(/<!--\s*\[([^\]]+)\]>/, "<!-- MSWORD-COMMENT \\1 -->")
80
+ .gsub(/<!\s*\[endif\]\s*-->/, "<!-- MSWORD-COMMENT-END -->")
79
81
  Nokogiri::XML.parse(xml)
80
82
  end
81
83
 
@@ -85,12 +87,16 @@ module Html2Doc
85
87
 
86
88
  def self.from_xhtml(xml)
87
89
  xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
88
- .sub(DOCTYPE, "")
89
- .gsub(%{ />}, "/>")
90
+ .sub(DOCTYPE, "").gsub(%{ />}, "/>")
91
+ .gsub(/<!-- MSWORD-COMMENT (.+?) -->/, "<!--[\\1]>")
92
+ .gsub(/<!-- MSWORD-COMMENT-END -->/, "<![endif]-->")
93
+ .gsub("\n--&gt;\n", "\n-->\n")
90
94
  end
91
95
 
92
96
  def self.msword_fix(doc)
93
97
  # brain damage in MSWord parser
98
+ doc.gsub!(%r{<w:DoNotOptimizeForBrowser></w:DoNotOptimizeForBrowser>},
99
+ "<w:DoNotOptimizeForBrowser/>")
94
100
  doc.gsub!(%r{<span style="mso-special-character:footnote"/>},
95
101
  '<span style="mso-special-character:footnote"></span>')
96
102
  doc.gsub!(%r{<div style="mso-element:footnote-list"></div>},
@@ -116,7 +122,7 @@ module Html2Doc
116
122
  end
117
123
 
118
124
  PRINT_VIEW = <<~XML.freeze
119
- <!--[if gte mso 9]>
125
+
120
126
  <xml>
121
127
  <w:WordDocument>
122
128
  <w:View>Print</w:View>
@@ -124,8 +130,7 @@ module Html2Doc
124
130
  <w:DoNotOptimizeForBrowser/>
125
131
  </w:WordDocument>
126
132
  </xml>
127
- <![endif]-->
128
- <meta http-equiv=Content-Type content="text/html; charset=utf-8"/>
133
+ <meta http-equiv='Content-Type' content="text/html; charset=utf-8"/>
129
134
  XML
130
135
 
131
136
  def self.define_head1(docxml, _dir)
@@ -148,12 +153,16 @@ module Html2Doc
148
153
  end
149
154
  end
150
155
 
151
- def self.stylesheet(_filename, _header_filename, fn)
152
- (fn.nil? || fn.empty?) and
153
- fn = File.join(File.dirname(__FILE__), "wordstyle.css")
154
- stylesheet = File.read(fn, encoding: "UTF-8")
156
+ def self.stylesheet(_filename, _header_filename, cssname)
157
+ (cssname.nil? || cssname.empty?) and
158
+ cssname = File.join(File.dirname(__FILE__), "wordstyle.css")
159
+ stylesheet = File.read(cssname, encoding: "UTF-8")
155
160
  xml = Nokogiri::XML("<style/>")
156
- xml.children.first << Nokogiri::XML::Comment.new(xml, "\n#{stylesheet}\n")
161
+ #s = Nokogiri::XML::CDATA.new(xml, "\n#{stylesheet}\n")
162
+ #xml.children.first << Nokogiri::XML::Comment.new(xml, s)
163
+ xml.children.first << Nokogiri::XML::CDATA
164
+ .new(xml, "\n<!--\n#{stylesheet}\n-->\n")
165
+
157
166
  xml.root.to_s
158
167
  end
159
168
 
data/lib/html2doc/math.rb CHANGED
@@ -140,7 +140,7 @@ module Html2Doc
140
140
  end
141
141
  end
142
142
 
143
- # We need span and em not to be namespaced. Word can't deal with explicit
143
+ # We need span and em not to be namespaced. Word can't deal with explicit
144
144
  # namespaces.
145
145
  # We will end up stripping them out again under Nokogiri 1.11, which correctly
146
146
  # insists on inheriting namespace from parent.
@@ -154,11 +154,28 @@ module Html2Doc
154
154
  def self.mathml_to_ooml1(xml, docnamespaces)
155
155
  doc = Nokogiri::XML::Document::new
156
156
  doc.root = ooxml_cleanup(xml, docnamespaces)
157
- ooxml = ooml_clean(unitalic(esc_space(@xsltemplate.transform(doc))))
157
+ ooxml = ooml_clean(unitalic(esc_space(accent_tr(@xsltemplate.transform(doc)))))
158
158
  ooxml = uncenter(xml, ooxml)
159
159
  xml.swap(ooxml)
160
160
  end
161
161
 
162
+ def self.accent_tr(xml)
163
+ xml.xpath(".//*[local-name()='accPr']/*[local-name()='chr']").each do |x|
164
+ x["m:val"] &&= accent_tr1(x["m:val"])
165
+ x["val"] &&= accent_tr1(x["val"])
166
+ end
167
+ xml
168
+ end
169
+
170
+ def self.accent_tr1(accent)
171
+ case accent
172
+ when "\u2192" then "\u20D7"
173
+ when "^" then "\u0302"
174
+ when "~" then "\u0303"
175
+ else accent
176
+ end
177
+ end
178
+
162
179
  # escape space as &#x32;; we are removing any spaces generated by
163
180
  # XML indentation
164
181
  def self.esc_space(xml)
@@ -180,7 +197,7 @@ module Html2Doc
180
197
  %w(left right).each do |dir|
181
198
  if alignnode.text.include? ("text-align:#{dir}")
182
199
  ooxml = "<m:oMathPara><m:oMathParaPr><m:jc "\
183
- "m:val='#{dir}'/></m:oMathParaPr>#{ooxml}</m:oMathPara>"
200
+ "m:val='#{dir}'/></m:oMathParaPr>#{ooxml}</m:oMathPara>"
184
201
  end
185
202
  end
186
203
  ooxml
data/lib/html2doc/mime.rb CHANGED
@@ -107,12 +107,13 @@ module Html2Doc
107
107
  # only processes locally stored images
108
108
  def self.image_cleanup(docxml, dir, localdir)
109
109
  docxml.traverse do |i|
110
+ src = i["src"]
110
111
  next unless i.element? && %w(img v:imagedata).include?(i.name)
111
- next if /^http/.match? i["src"]
112
- next if %r{^data:(image|application)/[^;]+;base64}.match? i["src"]
112
+ next if src.nil? || src.empty? || /^http/.match?(src)
113
+ next if %r{^data:(image|application)/[^;]+;base64}.match? src
113
114
 
114
- local_filename = localname(i["src"], localdir)
115
- new_filename = "#{mkuuid}#{File.extname(i['src'])}"
115
+ local_filename = localname(src, localdir)
116
+ new_filename = "#{mkuuid}#{File.extname(src)}"
116
117
  FileUtils.cp local_filename, File.join(dir, new_filename)
117
118
  i["width"], i["height"] = image_resize(i, local_filename, 680, 400)
118
119
  i["src"] = File.join(File.basename(dir), new_filename)
@@ -1,3 +1,3 @@
1
1
  module Html2Doc
2
- VERSION = "1.2.0".freeze
2
+ VERSION = "1.3.1".freeze
3
3
  end
@@ -41,7 +41,7 @@ WORD_HDR = <<~HDR.freeze
41
41
  Content-Type: text/html; charset="utf-8"
42
42
 
43
43
  <?xml version="1.0"?>
44
- <html xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40"><head><!--[if gte mso 9]>
44
+ <html xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40"><head>
45
45
  <xml>
46
46
  <w:WordDocument>
47
47
  <w:View>Print</w:View>
@@ -49,7 +49,6 @@ WORD_HDR = <<~HDR.freeze
49
49
  <w:DoNotOptimizeForBrowser/>
50
50
  </w:WordDocument>
51
51
  </xml>
52
- <![endif]-->
53
52
  <meta http-equiv=Content-Type content="text/html; charset=utf-8"/>
54
53
 
55
54
  <link rel=File-List href="cid:filelist.xml"/>
@@ -278,6 +277,17 @@ RSpec.describe Html2Doc do
278
277
  expect(Html2Doc::VERSION).not_to be nil
279
278
  end
280
279
 
280
+ it "preserves Word HTML directives" do
281
+ Html2Doc.process(html_input(%[A<!--[if gte mso 9]>X<![endif]-->B]), filename: "test")
282
+ expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
283
+ .to match_fuzzy(<<~OUTPUT)
284
+ #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
285
+ #{word_body(%{A<!--[if gte mso 9]>X<![endif]-->B},
286
+ '<div style="mso-element:footnote-list"/>')}
287
+ #{WORD_FTR1}
288
+ OUTPUT
289
+ end
290
+
281
291
  it "processes a blank document" do
282
292
  Html2Doc.process(html_input(""), filename: "test")
283
293
  expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
@@ -367,7 +377,8 @@ RSpec.describe Html2Doc do
367
377
  File.open("spec/header_img1.html", "w:UTF-8") do |f|
368
378
  f.write(
369
379
  doc.sub(%r{spec/19160-6.png},
370
- File.expand_path(File.join(File.dirname(__FILE__), "19160-6.png"))),
380
+ File.expand_path(File.join(File.dirname(__FILE__),
381
+ "19160-6.png"))),
371
382
  )
372
383
  end
373
384
  Html2Doc.process(html_input(""),
@@ -450,7 +461,7 @@ RSpec.describe Html2Doc do
450
461
  OUTPUT
451
462
  end
452
463
 
453
- it "unwraps accent in MathML" do
464
+ it "unwraps and converts accent in MathML" do
454
465
  Html2Doc.process(html_input("<div><math xmlns='http://www.w3.org/1998/Math/MathML'>
455
466
  <mover accent='true'><mrow><mi>p</mi></mrow><mrow><mo>^</mo></mrow></mover>
456
467
  </math></div>"), filename: "test", asciimathdelims: ["{{", "}}"])
@@ -458,7 +469,7 @@ RSpec.describe Html2Doc do
458
469
  .to match_fuzzy(<<~OUTPUT)
459
470
  #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
460
471
  #{word_body('<div><m:oMath>
461
- <m:acc><m:accPr><m:chr m:val="^"></m:chr></m:accPr><m:e><m:r><m:t>p</m:t></m:r></m:e></m:acc>
472
+ <m:acc><m:accPr><m:chr m:val="&#x302;"></m:chr></m:accPr><m:e><m:r><m:t>p</m:t></m:r></m:e></m:acc>
462
473
  </m:oMath>
463
474
  </div>', '<div style="mso-element:footnote-list"/>')}
464
475
  #{WORD_FTR1}
@@ -565,7 +576,8 @@ RSpec.describe Html2Doc do
565
576
 
566
577
  it "resizes images for height, in a file in a subdirectory" do
567
578
  simple_body = '<img src="19160-6.png">'
568
- Html2Doc.process(html_input(simple_body), filename: "spec/test", imagedir: "spec")
579
+ Html2Doc.process(html_input(simple_body), filename: "spec/test",
580
+ imagedir: "spec")
569
581
  testdoc = File.read("spec/test.doc", encoding: "utf-8")
570
582
  expect(testdoc).to match(%r{Content-Type: image/png})
571
583
  expect(image_clean(guid_clean(testdoc))).to match_fuzzy(<<~OUTPUT)
@@ -653,7 +665,8 @@ RSpec.describe Html2Doc do
653
665
 
654
666
  it "deals with absolute image locations" do
655
667
  simple_body = %{<img src="#{__dir__}/19160-6.png">}
656
- Html2Doc.process(html_input(simple_body), filename: "spec/test", imagedir: ".")
668
+ Html2Doc.process(html_input(simple_body), filename: "spec/test",
669
+ imagedir: ".")
657
670
  testdoc = File.read("spec/test.doc", encoding: "utf-8")
658
671
  expect(testdoc).to match(%r{Content-Type: image/png})
659
672
  expect(image_clean(guid_clean(testdoc))).to match_fuzzy(<<~OUTPUT)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2doc
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.0
4
+ version: 1.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-10-12 00:00:00.000000000 Z
11
+ date: 2022-02-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: asciimath
@@ -123,19 +123,19 @@ dependencies:
123
123
  - !ruby/object:Gem::Version
124
124
  version: '0'
125
125
  - !ruby/object:Gem::Dependency
126
- name: byebug
126
+ name: debug
127
127
  requirement: !ruby/object:Gem::Requirement
128
128
  requirements:
129
- - - "~>"
129
+ - - ">="
130
130
  - !ruby/object:Gem::Version
131
- version: '9.1'
131
+ version: '0'
132
132
  type: :development
133
133
  prerelease: false
134
134
  version_requirements: !ruby/object:Gem::Requirement
135
135
  requirements:
136
- - - "~>"
136
+ - - ">="
137
137
  - !ruby/object:Gem::Version
138
- version: '9.1'
138
+ version: '0'
139
139
  - !ruby/object:Gem::Dependency
140
140
  name: equivalent-xml
141
141
  requirement: !ruby/object:Gem::Requirement
@@ -334,7 +334,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
334
334
  - !ruby/object:Gem::Version
335
335
  version: '0'
336
336
  requirements: []
337
- rubygems_version: 3.1.4
337
+ rubygems_version: 3.2.32
338
338
  signing_key:
339
339
  specification_version: 4
340
340
  summary: Convert HTML document to Microsoft Word document