RubyGems - html2doc - Versions diffs - 1.5.3 → 1.5.5 - Mend

html2doc 1.5.3 → 1.5.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 114c21f3bbc33c244fb49577d4c85334789a388be61d65c04765d33ed1913208
-  data.tar.gz: 60c00e3a300eb16db1f2332393d5ff9a45f65c60a8a1d40a1cfaba2767744249
+  metadata.gz: 46856bf56ad5dd95f8f5781dc11049bb4600060c28c49715a262837ece8028bf
+  data.tar.gz: 866ba19867f233b45aeee436df719679623d671902b30476b61952f7a6357e1f
 SHA512:
-  metadata.gz: 606e87a4dcfd0a3e270588461c6150dd38b1baad57bfc90b3965806477945deeae0b2eaeca87af9800b70d214133b11588a66f8205108e3af87f13464306b34a
-  data.tar.gz: ecc2ad3631a0cafb9ff3b7f04022b9a977efac71099f7b256f840ec6d15e418bb18f426c729d6a40e9a9cc196f3e02d4cf0c5c448a99c6097c2da9d6b2bfb4d0
+  metadata.gz: b949f47c356437ce418f65ce7fd1c497648d0d0e960fe1e05d7318d280ddf6de23ddad8e5ab94a18f447b3eaba948b2a4db69ca2d00f0dcfebd692933a64c1da
+  data.tar.gz: 953999bd39aa1c1b6a0e1a34c939dcbdba01242c898f5a587eab546b4c9a4578e8051c9c68b49900390fcb3ab5d2de4ad6f0f1bce1af5a8ee6e7bd4daf300966

data/lib/html2doc/base.rb CHANGED Viewed

@@ -30,8 +30,7 @@ class Html2Doc
   end
   def process_header(headerfile)
-    return if headerfile.nil?
+    headerfile.nil? and return
     doc = File.read(headerfile, encoding: "utf-8")
     doc = header_image_cleanup(doc, @dir1, @filename,
                                File.dirname(@filename))
@@ -66,6 +65,7 @@ class Html2Doc
   end
   def cleanup(docxml)
+    locate_landscape(docxml)
     namespace(docxml.root)
     image_cleanup(docxml, @dir1, @imagedir)
     mathml_to_ooml(docxml)
@@ -76,76 +76,11 @@ class Html2Doc
     docxml
   end
-  NOKOHEAD = <<~HERE.freeze
-    <!DOCTYPE html SYSTEM
-    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
-    <html xmlns="http://www.w3.org/1999/xhtml">
-    <head> <title></title> <meta charset="UTF-8" /> </head>
-    <body> </body> </html>
-  HERE
-  def to_xhtml(xml)
-    xml.gsub!(/<\?xml[^>]*>/, "")
-    unless /<!DOCTYPE /.match? xml
-      xml = '<!DOCTYPE html SYSTEM
-          "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' + xml
-    end
-    xml = xml.gsub(/<!--\s*\[([^\]]+)\]>/, "<!-- MSWORD-COMMENT \\1 -->")
-      .gsub(/<!\s*\[endif\]\s*-->/, "<!-- MSWORD-COMMENT-END -->")
-    Nokogiri::XML.parse(xml)
-  end
-  DOCTYPE = <<~"DOCTYPE".freeze
-    <!DOCTYPE html SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
-  DOCTYPE
-  def from_xhtml(xml)
-    xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
-      .sub(DOCTYPE, "").gsub(%{ />}, "/>")
-      .gsub(/<!-- MSWORD-COMMENT (.+?) -->/, "<!--[\\1]>")
-      .gsub(/<!-- MSWORD-COMMENT-END -->/, "<![endif]-->")
-      .gsub("\n--&gt;\n", "\n-->\n")
-  end
-  def msword_fix(doc)
-    # brain damage in MSWord parser
-    doc.gsub!(%r{<w:DoNotOptimizeForBrowser></w:DoNotOptimizeForBrowser>},
-              "<w:DoNotOptimizeForBrowser/>")
-    doc.gsub!(%r{<span style="mso-special-character:footnote"/>},
-              '<span style="mso-special-character:footnote"></span>')
-    doc.gsub!(%r{<div style="mso-element:footnote-list"></div>},
-              '<div style="mso-element:footnote-list"/>')
-    doc.gsub!(%r{(<a style="mso-comment-reference:[^>/]+)/>}, "\\1></a>")
-    doc.gsub!(%r{<link rel="File-List"}, "<link rel=File-List")
-    doc.gsub!(%r{<meta http-equiv="Content-Type"},
-              "<meta http-equiv=Content-Type")
-    doc.gsub!(%r{></m:jc>}, "/>")
-    doc.gsub!(%r{></v:stroke>}, "/>")
-    doc.gsub!(%r{></v:f>}, "/>")
-    doc.gsub!(%r{></v:path>}, "/>")
-    doc.gsub!(%r{></o:lock>}, "/>")
-    doc.gsub!(%r{></v:imagedata>}, "/>")
-    doc.gsub!(%r{></w:wrap>}, "/>")
-    doc.gsub!(%r{<(/)?m:(span|em)\b}, "<\\1\\2")
-    doc.gsub!(%r{&tab;|&amp;tab;},
-              '<span style="mso-tab-count:1">&#xA0; </span>')
-    doc.split(%r{(<m:oMath>|</m:oMath>)}).each_slice(4).map do |a|
-      a.size > 2 and a[2] = a[2].gsub(/>\s+</, "><")
-      a
-    end.join
-  end
-  PRINT_VIEW = <<~XML.freeze
-    <xml>
-    <w:WordDocument>
-    <w:View>Print</w:View>
-    <w:Zoom>100</w:Zoom>
-    <w:DoNotOptimizeForBrowser/>
-    </w:WordDocument>
-    </xml>
-    <meta http-equiv='Content-Type' content="text/html; charset=utf-8"/>
-  XML
+  def locate_landscape(_docxml)
+    css = read_stylesheet(@stylesheet)
+    @landscape = css.scan(/div\.\S+\s+\{\s*page:\s*[^;]+L;\s*\}/m)
+      .map { |e| e.sub(/^div\.(\S+).*$/m, "\\1") }
+  end
   def define_head1(docxml, _dir)
     docxml.xpath("//*[local-name() = 'head']").each do |h|
@@ -168,18 +103,21 @@ class Html2Doc
   end
   def stylesheet(_filename, _header_filename, cssname)
-    (cssname.nil? || cssname.empty?) and
-      cssname = File.join(File.dirname(__FILE__), "wordstyle.css")
-    stylesheet = File.read(cssname, encoding: "UTF-8")
+    stylesheet = read_stylesheet(cssname)
     xml = Nokogiri::XML("<style/>")
     # s = Nokogiri::XML::CDATA.new(xml, "\n#{stylesheet}\n")
     # xml.children.first << Nokogiri::XML::Comment.new(xml, s)
     xml.children.first << Nokogiri::XML::CDATA
       .new(xml, "\n<!--\n#{stylesheet}\n-->\n")
     xml.root.to_s
   end
+  def read_stylesheet(cssname)
+    (cssname.nil? || cssname.empty?) and
+      cssname = File.join(File.dirname(__FILE__), "wordstyle.css")
+    File.read(cssname, encoding: "UTF-8")
+  end
   def define_head(docxml)
     title = docxml.at("//*[local-name() = 'head']/*[local-name() = 'title']")
     head = docxml.at("//*[local-name() = 'head']")
@@ -195,30 +133,15 @@ class Html2Doc
       head.add_child css
     elsif title.nil?
       head.children.first.add_previous_sibling css
-    else
-      title.add_next_sibling css
+    else title.add_next_sibling css
     end
   end
-  def namespace(root)
-    {
-      o: "urn:schemas-microsoft-com:office:office",
-      w: "urn:schemas-microsoft-com:office:word",
-      v: "urn:schemas-microsoft-com:vml",
-      m: "http://schemas.microsoft.com/office/2004/12/omml",
-    }.each { |k, v| root.add_namespace_definition(k.to_s, v) }
-  end
-  def rootnamespace(root)
-    root.add_namespace(nil, "http://www.w3.org/TR/REC-html40")
-  end
   def bookmarks(docxml)
     docxml.xpath("//*[@id][not(@name)][not(@style = 'mso-element:footnote')]")
       .each do |x|
-      next if x["id"].empty? ||
-        %w(shapetype v:shapetype shape v:shape).include?(x.name)
+      (x["id"].empty? ||
+        %w(shapetype v:shapetype shape v:shape).include?(x.name)) and next
       if x.children.empty? then x.add_child("<a name='#{x['id']}'></a>")
       else x.children.first.previous = "<a name='#{x['id']}'></a>"
       end

data/lib/html2doc/mime.rb CHANGED Viewed

@@ -76,11 +76,9 @@ class Html2Doc
     end
   end
-  # max width for Word document is 400, max height is 680
   def image_resize(img, path, maxheight, maxwidth)
     s, realsize = get_image_size(img, path)
-    return s if s[0] == nil && s[1] == nil
+    s[0] == nil && s[1] == nil and return s
     if img.name == "svg" && !img["viewBox"]
       img["viewBox"] = "0 0 #{s[0]} #{s[1]}"
     end
@@ -115,21 +113,100 @@ class Html2Doc
   # only processes locally stored images
   def image_cleanup(docxml, dir, localdir)
+    maxheight, maxwidth = page_dimensions(docxml)
     docxml.traverse do |i|
-      src = i["src"]
-      next unless i.element? && %w(img v:imagedata).include?(i.name)
-      next if src.nil? || src.empty? || /^http/.match?(src)
-      next if %r{^data:(image|application)/[^;]+;base64}.match? src
-      local_filename = localname(src, localdir)
-      new_filename = "#{mkuuid}#{File.extname(src)}"
-      FileUtils.cp local_filename, File.join(dir, new_filename)
-      i["width"], i["height"] = image_resize(i, local_filename, 680, 400)
-      i["src"] = File.join(File.basename(dir), new_filename)
+      skip_image_cleanup?(i) and next
+      local_filename = rename_image(i, dir, localdir)
+      i["width"], i["height"] =
+        if landscape?(i)
+          image_resize(i, local_filename, maxwidth, maxheight)
+        else
+          image_resize(i, local_filename, maxheight, maxwidth)
+        end
     end
     docxml
   end
+  def landscape?(img)
+    img.ancestors.each do |a|
+      a.name == "div" or next
+      @landscape.include?(a["class"]) and return true
+    end
+    false
+  end
+  def rename_image(img, dir, localdir)
+    local_filename = localname(img["src"], localdir)
+    new_filename = "#{mkuuid}#{File.extname(img['src'])}"
+    FileUtils.cp local_filename, File.join(dir, new_filename)
+    img["src"] = File.join(File.basename(dir), new_filename)
+    local_filename
+  end
+  def skip_image_cleanup?(img)
+    src = img["src"]
+    (img.element? && %w(img v:imagedata).include?(img.name)) or return true
+    (src.nil? || src.empty? || /^http/.match?(src) ||
+      %r{^data:(image|application)/[^;]+;base64}.match?(src)) and return true
+    false
+  end
+  # we are going to use the 2nd instance of @page in the Word CSS,
+  # skipping the cover page. Currently doesn't deal with Landscape.
+  # Scan both @stylesheet and docxml.to_xml (where @standardstylesheet has ended up)
+  # Allow 0.9 * height to fit caption
+  def page_dimensions(docxml)
+    stylesheet = read_stylesheet(@stylesheet)
+    page_size = find_page_size_in_doc(stylesheet, docxml.to_xml) or
+      return [680, 400]
+    m_size = /size:\s*(\S+)\s+(\S+)\s*;/.match(page_size) or return [680, 400]
+    m_marg = /margin:\s*(\S+)\s+(\S+)\s*(\S+)\s*(\S+)\s*;/.match(page_size) or
+      return [680, 400]
+    [0.9 * (units_to_px(m_size[2]) - units_to_px(m_marg[1]) - units_to_px(m_marg[3])),
+     units_to_px(m_size[1]) - units_to_px(m_marg[2]) - units_to_px(m_marg[4])]
+  rescue StandardError
+    [680, 400]
+  end
+  def find_page_size_in_doc(stylesheet, doc)
+    find_page_size(stylesheet, "WordSection2", false) ||
+      find_page_size(stylesheet, "WordSection3", false) ||
+      find_page_size(doc, "WordSection2", true) ||
+      find_page_size(doc, "WordSection3", true) ||
+      find_page_size(stylesheet, "", false) || find_page_size(doc, "", true)
+  end
+  # if in_xml, CSS is embedded in XML <style> tag
+  def find_page_size(stylesheet, klass, in_xml)
+    xml_found = false
+    found = false
+    ret = ""
+    stylesheet&.lines&.each do |l|
+      in_xml && l.include?("<style") and xml_found = true and found = false
+      in_xml && l.include?("</style>") and xml_found = false
+      /^\s*@page\s+#{klass}/.match?(l) and found = true
+      found && /^\s*\{?size:/.match?(l) and ret += l
+      found && /^\s*\{?margin:/.match?(l) and ret += l
+      if found && /}/.match?(l)
+        !ret.blank? && (!in_xml || xml_found) and return ret
+        ret = ""
+        found = false
+      end
+    end
+    nil
+  end
+  def units_to_px(measure)
+    m = /^(\S+)(pt|cm)/.match(measure)
+    ret = case m[2]
+          when "px" then (m[1].to_f * 0.75)
+          when "pt" then m[1].to_f
+          when "cm" then (m[1].to_f * 28.346456693)
+          when "in" then (m[1].to_f * 72)
+          end
+    ret.to_i
+  end
   # do not parse the header through Nokogiri, since it will contain
   # non-XML like <![if !supportFootnotes]>
   def header_image_cleanup(doc, dir, filename, localdir)
@@ -155,8 +232,7 @@ class Html2Doc
       f.write %{<xml xmlns:o="urn:schemas-microsoft-com:office:office">
         <o:MainFile HRef="../#{filename}.htm"/>}
       Dir.entries(dir).sort.each do |item|
-        next if item == "." || item == ".." || /^\./.match(item)
+        (item == "." || item == ".." || /^\./.match(item)) and next
         f.write %{  <o:File HRef="#{item}"/>\n}
       end
       f.write("</xml>\n")

data/lib/html2doc/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 class Html2Doc
-  VERSION = "1.5.3".freeze
+  VERSION = "1.5.5".freeze
 end

data/lib/html2doc/xml.rb ADDED Viewed

@@ -0,0 +1,83 @@
+class Html2Doc
+  NOKOHEAD = <<~HERE.freeze
+    <!DOCTYPE html SYSTEM
+    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+    <html xmlns="http://www.w3.org/1999/xhtml">
+    <head> <title></title> <meta charset="UTF-8" /> </head>
+    <body> </body> </html>
+  HERE
+  def to_xhtml(xml)
+    xml.gsub!(/<\?xml[^>]*>/, "")
+    unless /<!DOCTYPE /.match? xml
+      xml = '<!DOCTYPE html SYSTEM
+          "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' + xml
+    end
+    xml = xml.gsub(/<!--\s*\[([^\]]+)\]>/, "<!-- MSWORD-COMMENT \\1 -->")
+      .gsub(/<!\s*\[endif\]\s*-->/, "<!-- MSWORD-COMMENT-END -->")
+    Nokogiri::XML.parse(xml)
+  end
+  DOCTYPE = <<~DOCTYPE.freeze
+    <!DOCTYPE html SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+  DOCTYPE
+  def from_xhtml(xml)
+    xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
+      .sub(DOCTYPE, "").gsub(%{ />}, "/>")
+      .gsub(/<!-- MSWORD-COMMENT (.+?) -->/, "<!--[\\1]>")
+      .gsub(/<!-- MSWORD-COMMENT-END -->/, "<![endif]-->")
+      .gsub("\n--&gt;\n", "\n-->\n")
+  end
+  def msword_fix(doc)
+    # brain damage in MSWord parser
+    doc.gsub!(%r{<w:DoNotOptimizeForBrowser></w:DoNotOptimizeForBrowser>},
+              "<w:DoNotOptimizeForBrowser/>")
+    doc.gsub!(%r{<span style="mso-special-character:footnote"/>},
+              '<span style="mso-special-character:footnote"></span>')
+    doc.gsub!(%r{<div style="mso-element:footnote-list"></div>},
+              '<div style="mso-element:footnote-list"/>')
+    doc.gsub!(%r{(<a style="mso-comment-reference:[^>/]+)/>}, "\\1></a>")
+    doc.gsub!(%r{<link rel="File-List"}, "<link rel=File-List")
+    doc.gsub!(%r{<meta http-equiv="Content-Type"},
+              "<meta http-equiv=Content-Type")
+    doc.gsub!(%r{></m:jc>}, "/>")
+    doc.gsub!(%r{></v:stroke>}, "/>")
+    doc.gsub!(%r{></v:f>}, "/>")
+    doc.gsub!(%r{></v:path>}, "/>")
+    doc.gsub!(%r{></o:lock>}, "/>")
+    doc.gsub!(%r{></v:imagedata>}, "/>")
+    doc.gsub!(%r{></w:wrap>}, "/>")
+    doc.gsub!(%r{<(/)?m:(span|em)\b}, "<\\1\\2")
+    doc.gsub!(%r{&tab;|&amp;tab;},
+              '<span style="mso-tab-count:1">&#xA0; </span>')
+    doc.split(%r{(<m:oMath>|</m:oMath>)}).each_slice(4).map do |a|
+      a.size > 2 and a[2] = a[2].gsub(/>\s+</, "><")
+      a
+    end.join
+  end
+  PRINT_VIEW = <<~XML.freeze
+    <xml>
+    <w:WordDocument>
+    <w:View>Print</w:View>
+    <w:Zoom>100</w:Zoom>
+    <w:DoNotOptimizeForBrowser/>
+    </w:WordDocument>
+    </xml>
+    <meta http-equiv='Content-Type' content="text/html; charset=utf-8"/>
+  XML
+  def namespace(root)
+    { o: "urn:schemas-microsoft-com:office:office",
+      w: "urn:schemas-microsoft-com:office:word",
+      v: "urn:schemas-microsoft-com:vml",
+      m: "http://schemas.microsoft.com/office/2004/12/omml" }.each { |k, v| root.add_namespace_definition(k.to_s, v) }
+  end
+  def rootnamespace(root)
+    root.add_namespace(nil, "http://www.w3.org/TR/REC-html40")
+  end
+end

data/lib/html2doc.rb CHANGED Viewed

@@ -4,3 +4,4 @@ require_relative "html2doc/mime"
 require_relative "html2doc/notes"
 require_relative "html2doc/math"
 require_relative "html2doc/lists"
+require_relative "html2doc/xml"

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: html2doc
 version: !ruby/object:Gem::Version
-  version: 1.5.3
+  version: 1.5.5
 platform: ruby
 authors:
 - Ribose Inc.
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2023-04-10 00:00:00.000000000 Z
+date: 2023-06-29 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: htmlentities
@@ -293,6 +293,7 @@ files:
 - lib/html2doc/notes.rb
 - lib/html2doc/version.rb
 - lib/html2doc/wordstyle.css
+- lib/html2doc/xml.rb
 homepage: https://github.com/metanorma/html2doc
 licenses:
 - CC-BY-SA-3.0