RubyGems - html2doc - Versions diffs - 1.2.0 → 1.3.1 - Mend

html2doc 1.2.0 → 1.3.1

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (9)

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 64cb262d3176610f0887cc69bab66fa2a3a7ca8445f8ad493d9d65c455a091d6
-  data.tar.gz: 468dc7a8fb687cdbf6db1497cf9d9b5e164687b7d460a5eac1fb983b4673672b
+  metadata.gz: 56d8c42bd609845f35a5a994fed43d12ebc9fb0d8d303fd60f9a064f4da26a7b
+  data.tar.gz: e9310883dbc5991640e66a1c085d6bcb2ca87155449326b7076489e78d64d187
 SHA512:
-  metadata.gz: 2a47bbe5df7ae0767ad2f4ccf52c1f96b8e27fc32d08b3b7b25e5051a3d229b29b9852a51c052a701990f9be6dbf0efc97795ea0c0ff4b3745b63f5a0c7adb4f
-  data.tar.gz: c4c10a84141889d820fd8d2afc273122b28372794edd9fdb3e60aee28773350d3e545f2a3efb75c4d28eff350d367b020d01d6e5cc2874a957ca612124e78fd4
+  metadata.gz: 8d7076b196634dc81a3942a59155c7c80da21b9eb68721dab437170c54876f970b80448fa31f520648145eca9ace1fea0c7751be04021f9c1f95fe0bf3fa64ce
+  data.tar.gz: 532b022bda9cc4fb88eafeb467c7d6d26ba8dc5ea21f5553ba251e8b92469de6e906f5947a3fdb3bbee241cbaea8805f3477978bbe78af359a6bb7140399a971

data/.github/workflows/rake.yml CHANGED Viewed

@@ -10,23 +10,6 @@ on:
 jobs:
   rake:
-    name: Test on Ruby ${{ matrix.ruby }} ${{ matrix.os }}
-    runs-on: ${{ matrix.os }}
-    continue-on-error: ${{ matrix.experimental }}
-    strategy:
-      fail-fast: false
-      matrix:
-        ruby: [ '3.0', '2.7', '2.6', '2.5' ]
-        os: [ ubuntu-latest, windows-latest, macos-latest ]
-        experimental: [ false ]
-    steps:
-      - uses: actions/checkout@v2
-        with:
-          submodules: true
-      - uses: ruby/setup-ruby@v1
-        with:
-          ruby-version: ${{ matrix.ruby }}
-          bundler-cache: true
-      - run: bundle exec rake
+    uses: metanorma/metanorma-build-scripts/.github/workflows/generic-rake.yml@main
+    secrets:
+      pat_token: ${{ secrets.METANORMA_CI_PAT_TOKEN }}

data/html2doc.gemspec CHANGED Viewed

@@ -33,7 +33,7 @@ Gem::Specification.new do |spec|
   spec.add_dependency "thread_safe"
   spec.add_dependency "uuidtools"
-  spec.add_development_dependency "byebug", "~> 9.1"
+  spec.add_development_dependency "debug"
   spec.add_development_dependency "equivalent-xml", "~> 0.6"
   spec.add_development_dependency "guard", "~> 2.14"
   spec.add_development_dependency "guard-rspec", "~> 4.7"

data/lib/html2doc/base.rb CHANGED Viewed

@@ -76,6 +76,8 @@ module Html2Doc
       xml = '<!DOCTYPE html SYSTEM
           "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' + xml
     end
+    xml = xml.gsub(/<!--\s*\[([^\]]+)\]>/, "<!-- MSWORD-COMMENT \\1 -->")
+      .gsub(/<!\s*\[endif\]\s*-->/, "<!-- MSWORD-COMMENT-END -->")
     Nokogiri::XML.parse(xml)
   end
@@ -85,12 +87,16 @@ module Html2Doc
   def self.from_xhtml(xml)
     xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
-      .sub(DOCTYPE, "")
-      .gsub(%{ />}, "/>")
+      .sub(DOCTYPE, "").gsub(%{ />}, "/>")
+      .gsub(/<!-- MSWORD-COMMENT (.+?) -->/, "<!--[\\1]>")
+      .gsub(/<!-- MSWORD-COMMENT-END -->/, "<![endif]-->")
+      .gsub("\n--&gt;\n", "\n-->\n")
   end
   def self.msword_fix(doc)
     # brain damage in MSWord parser
+    doc.gsub!(%r{<w:DoNotOptimizeForBrowser></w:DoNotOptimizeForBrowser>},
+              "<w:DoNotOptimizeForBrowser/>")
     doc.gsub!(%r{<span style="mso-special-character:footnote"/>},
               '<span style="mso-special-character:footnote"></span>')
     doc.gsub!(%r{<div style="mso-element:footnote-list"></div>},
@@ -116,7 +122,7 @@ module Html2Doc
   end
   PRINT_VIEW = <<~XML.freeze
-    <!--[if gte mso 9]>
     <xml>
     <w:WordDocument>
     <w:View>Print</w:View>
@@ -124,8 +130,7 @@ module Html2Doc
     <w:DoNotOptimizeForBrowser/>
     </w:WordDocument>
     </xml>
-    <![endif]-->
-    <meta http-equiv=Content-Type content="text/html; charset=utf-8"/>
+    <meta http-equiv='Content-Type' content="text/html; charset=utf-8"/>
   XML
   def self.define_head1(docxml, _dir)
@@ -148,12 +153,16 @@ module Html2Doc
     end
   end
-  def self.stylesheet(_filename, _header_filename, fn)
-    (fn.nil? || fn.empty?) and
-      fn = File.join(File.dirname(__FILE__), "wordstyle.css")
-    stylesheet = File.read(fn, encoding: "UTF-8")
+  def self.stylesheet(_filename, _header_filename, cssname)
+    (cssname.nil? || cssname.empty?) and
+      cssname = File.join(File.dirname(__FILE__), "wordstyle.css")
+    stylesheet = File.read(cssname, encoding: "UTF-8")
     xml = Nokogiri::XML("<style/>")
-    xml.children.first << Nokogiri::XML::Comment.new(xml, "\n#{stylesheet}\n")
+    #s = Nokogiri::XML::CDATA.new(xml, "\n#{stylesheet}\n")
+    #xml.children.first << Nokogiri::XML::Comment.new(xml, s)
+    xml.children.first << Nokogiri::XML::CDATA
+      .new(xml, "\n<!--\n#{stylesheet}\n-->\n")
     xml.root.to_s
   end

data/lib/html2doc/math.rb CHANGED Viewed

@@ -140,7 +140,7 @@ module Html2Doc
     end
   end
-  # We need span and em not to be namespaced. Word can't deal with explicit
+  # We need span and em not to be namespaced. Word can't deal with explicit
   # namespaces.
   # We will end up stripping them out again under Nokogiri 1.11, which correctly
   # insists on inheriting namespace from parent.
@@ -154,11 +154,28 @@ module Html2Doc
   def self.mathml_to_ooml1(xml, docnamespaces)
     doc = Nokogiri::XML::Document::new
     doc.root = ooxml_cleanup(xml, docnamespaces)
-      ooxml = ooml_clean(unitalic(esc_space(@xsltemplate.transform(doc))))
+    ooxml = ooml_clean(unitalic(esc_space(accent_tr(@xsltemplate.transform(doc)))))
     ooxml = uncenter(xml, ooxml)
     xml.swap(ooxml)
   end
+  def self.accent_tr(xml)
+    xml.xpath(".//*[local-name()='accPr']/*[local-name()='chr']").each do |x|
+      x["m:val"] &&= accent_tr1(x["m:val"])
+      x["val"] &&= accent_tr1(x["val"])
+    end
+    xml
+  end
+  def self.accent_tr1(accent)
+    case accent
+    when "\u2192" then "\u20D7"
+    when "^" then "\u0302"
+    when "~" then "\u0303"
+    else accent
+    end
+  end
   # escape space as &#x32;; we are removing any spaces generated by
   # XML indentation
   def self.esc_space(xml)
@@ -180,7 +197,7 @@ module Html2Doc
     %w(left right).each do |dir|
       if alignnode.text.include? ("text-align:#{dir}")
         ooxml = "<m:oMathPara><m:oMathParaPr><m:jc "\
-          "m:val='#{dir}'/></m:oMathParaPr>#{ooxml}</m:oMathPara>"
+                "m:val='#{dir}'/></m:oMathParaPr>#{ooxml}</m:oMathPara>"
       end
     end
     ooxml

data/lib/html2doc/mime.rb CHANGED Viewed

@@ -107,12 +107,13 @@ module Html2Doc
   # only processes locally stored images
   def self.image_cleanup(docxml, dir, localdir)
     docxml.traverse do |i|
+      src = i["src"]
       next unless i.element? && %w(img v:imagedata).include?(i.name)
-      next if /^http/.match? i["src"]
-      next if %r{^data:(image|application)/[^;]+;base64}.match? i["src"]
+      next if src.nil? || src.empty? || /^http/.match?(src)
+      next if %r{^data:(image|application)/[^;]+;base64}.match? src
-      local_filename = localname(i["src"], localdir)
-      new_filename = "#{mkuuid}#{File.extname(i['src'])}"
+      local_filename = localname(src, localdir)
+      new_filename = "#{mkuuid}#{File.extname(src)}"
       FileUtils.cp local_filename, File.join(dir, new_filename)
       i["width"], i["height"] = image_resize(i, local_filename, 680, 400)
       i["src"] = File.join(File.basename(dir), new_filename)

data/lib/html2doc/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Html2Doc
-  VERSION = "1.2.0".freeze
+  VERSION = "1.3.1".freeze
 end

data/spec/html2doc_spec.rb CHANGED Viewed

@@ -41,7 +41,7 @@ WORD_HDR = <<~HDR.freeze
   Content-Type: text/html; charset="utf-8"
   <?xml version="1.0"?>
-  <html xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40"><head><!--[if gte mso 9]>
+  <html xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40"><head>
   <xml>
   <w:WordDocument>
   <w:View>Print</w:View>
@@ -49,7 +49,6 @@ WORD_HDR = <<~HDR.freeze
   <w:DoNotOptimizeForBrowser/>
   </w:WordDocument>
   </xml>
-  <![endif]-->
   <meta http-equiv=Content-Type content="text/html; charset=utf-8"/>
     <link rel=File-List href="cid:filelist.xml"/>
@@ -278,6 +277,17 @@ RSpec.describe Html2Doc do
     expect(Html2Doc::VERSION).not_to be nil
   end
+  it "preserves Word HTML directives" do
+    Html2Doc.process(html_input(%[A<!--[if gte mso 9]>X<![endif]-->B]), filename: "test")
+    expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
+      .to match_fuzzy(<<~OUTPUT)
+        #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
+        #{word_body(%{A<!--[if gte mso 9]>X<![endif]-->B},
+                   '<div style="mso-element:footnote-list"/>')}
+        #{WORD_FTR1}
+      OUTPUT
+  end
   it "processes a blank document" do
     Html2Doc.process(html_input(""), filename: "test")
     expect(guid_clean(File.read("test.doc", encoding: "utf-8")))
@@ -367,7 +377,8 @@ RSpec.describe Html2Doc do
     File.open("spec/header_img1.html", "w:UTF-8") do |f|
       f.write(
         doc.sub(%r{spec/19160-6.png},
-                File.expand_path(File.join(File.dirname(__FILE__), "19160-6.png"))),
+                File.expand_path(File.join(File.dirname(__FILE__),
+                                           "19160-6.png"))),
       )
     end
     Html2Doc.process(html_input(""),
@@ -450,7 +461,7 @@ RSpec.describe Html2Doc do
       OUTPUT
   end
-  it "unwraps accent in MathML" do
+  it "unwraps and converts accent in MathML" do
     Html2Doc.process(html_input("<div><math xmlns='http://www.w3.org/1998/Math/MathML'>
                                 <mover accent='true'><mrow><mi>p</mi></mrow><mrow><mo>^</mo></mrow></mover>
 </math></div>"), filename: "test", asciimathdelims: ["{{", "}}"])
@@ -458,7 +469,7 @@ RSpec.describe Html2Doc do
       .to match_fuzzy(<<~OUTPUT)
         #{WORD_HDR} #{DEFAULT_STYLESHEET} #{WORD_HDR_END}
         #{word_body('<div><m:oMath>
-                <m:acc><m:accPr><m:chr m:val="^"></m:chr></m:accPr><m:e><m:r><m:t>p</m:t></m:r></m:e></m:acc>
+                <m:acc><m:accPr><m:chr m:val="&#x302;"></m:chr></m:accPr><m:e><m:r><m:t>p</m:t></m:r></m:e></m:acc>
                 </m:oMath>
                 </div>', '<div style="mso-element:footnote-list"/>')}
         #{WORD_FTR1}
@@ -565,7 +576,8 @@ RSpec.describe Html2Doc do
   it "resizes images for height, in a file in a subdirectory" do
     simple_body = '<img src="19160-6.png">'
-    Html2Doc.process(html_input(simple_body), filename: "spec/test", imagedir: "spec")
+    Html2Doc.process(html_input(simple_body), filename: "spec/test",
+                                              imagedir: "spec")
     testdoc = File.read("spec/test.doc", encoding: "utf-8")
     expect(testdoc).to match(%r{Content-Type: image/png})
     expect(image_clean(guid_clean(testdoc))).to match_fuzzy(<<~OUTPUT)
@@ -653,7 +665,8 @@ RSpec.describe Html2Doc do
   it "deals with absolute image locations" do
     simple_body = %{<img src="#{__dir__}/19160-6.png">}
-    Html2Doc.process(html_input(simple_body), filename: "spec/test", imagedir: ".")
+    Html2Doc.process(html_input(simple_body), filename: "spec/test",
+                                              imagedir: ".")
     testdoc = File.read("spec/test.doc", encoding: "utf-8")
     expect(testdoc).to match(%r{Content-Type: image/png})
     expect(image_clean(guid_clean(testdoc))).to match_fuzzy(<<~OUTPUT)

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: html2doc
 version: !ruby/object:Gem::Version
-  version: 1.2.0
+  version: 1.3.1
 platform: ruby
 authors:
 - Ribose Inc.
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2021-10-12 00:00:00.000000000 Z
+date: 2022-02-06 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: asciimath
@@ -123,19 +123,19 @@ dependencies:
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
-  name: byebug
+  name: debug
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '9.1'
+        version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '9.1'
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: equivalent-xml
   requirement: !ruby/object:Gem::Requirement
@@ -334,7 +334,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.1.4
+rubygems_version: 3.2.32
 signing_key:
 specification_version: 4
 summary: Convert HTML document to Microsoft Word document