RubyGems - html2doc - Versions diffs - 1.5.4 → 1.6.0 - Mend

html2doc 1.5.4 → 1.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 74b05f46f1fd365f9ff0766e95d884bd2959c01b92c70d4a080651adfc2e8d3c
-  data.tar.gz: f70eb009e705ff767b34922fc0444740be8dde80da8b78c503784e02be0e4560
+  metadata.gz: 47535bf46876ee49a732b6c136f78b58a9ac009f880b95c5a73c8770293f3735
+  data.tar.gz: a052e0c3ba3ee27ca208b2624d7a832cba67bcc959f1ec5da36f9a7049c26c35
 SHA512:
-  metadata.gz: e3d93501d63bd27ed6e5245cb18dbc49013fcecd83bc57acf3a5d3c797636b928b91e148e33e8326f10f77f9b94a7175d85294eb86a1b4b2261aafb7dfe9d7a4
-  data.tar.gz: 4cbb8887089e622b9d9d1fd82dc4e5fd4e8e81a28a59dcf02ccce22cb3c9e6e7c4c7802177259557c268d697ced17ec09e4181e82c0dc851a613553e7f5b58c1
+  metadata.gz: '096dc5a7fe4b35e5afdec632f37b28f9980fcbcba4222e1ec1eb81fe4653a62cc00c5d9b90ed38ae54d4320ea8bf7e0fd0698625045504d371bcb90fb6247a54'
+  data.tar.gz: 7aeebef3892dc2273bc4ab9899624fc113b1989c3af097204b4f73eb250a7d52e8b3cfe62439e84fb179c1e9bbc96563af669d87f56b15dd82b4fc99953a2227

data/lib/html2doc/base.rb CHANGED

@@ -30,8 +30,7 @@ class Html2Doc
   end
   def process_header(headerfile)
-    return if headerfile.nil?
+    headerfile.nil? and return
     doc = File.read(headerfile, encoding: "utf-8")
     doc = header_image_cleanup(doc, @dir1, @filename,
                                File.dirname(@filename))
@@ -54,7 +53,7 @@ class Html2Doc
   end
   def process_html(result)
-    docxml = to_xhtml(asciimath_to_mathml(result, @asciimathdelims))
+    docxml = to_xhtml(result)
     define_head(cleanup(docxml))
     msword_fix(from_xhtml(docxml))
   end
@@ -66,6 +65,7 @@ class Html2Doc
   end
   def cleanup(docxml)
+    locate_landscape(docxml)
     namespace(docxml.root)
     image_cleanup(docxml, @dir1, @imagedir)
     mathml_to_ooml(docxml)
@@ -76,76 +76,11 @@ class Html2Doc
     docxml
   end
-  NOKOHEAD = <<~HERE.freeze
-    <!DOCTYPE html SYSTEM
-    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
-    <html xmlns="http://www.w3.org/1999/xhtml">
-    <head> <title></title> <meta charset="UTF-8" /> </head>
-    <body> </body> </html>
-  HERE
-  def to_xhtml(xml)
-    xml.gsub!(/<\?xml[^>]*>/, "")
-    unless /<!DOCTYPE /.match? xml
-      xml = '<!DOCTYPE html SYSTEM
-          "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' + xml
-    end
-    xml = xml.gsub(/<!--\s*\[([^\]]+)\]>/, "<!-- MSWORD-COMMENT \\1 -->")
-      .gsub(/<!\s*\[endif\]\s*-->/, "<!-- MSWORD-COMMENT-END -->")
-    Nokogiri::XML.parse(xml)
-  end
-  DOCTYPE = <<~"DOCTYPE".freeze
-    <!DOCTYPE html SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
-  DOCTYPE
-  def from_xhtml(xml)
-    xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
-      .sub(DOCTYPE, "").gsub(%{ />}, "/>")
-      .gsub(/<!-- MSWORD-COMMENT (.+?) -->/, "<!--[\\1]>")
-      .gsub(/<!-- MSWORD-COMMENT-END -->/, "<![endif]-->")
-      .gsub("\n--&gt;\n", "\n-->\n")
-  end
-  def msword_fix(doc)
-    # brain damage in MSWord parser
-    doc.gsub!(%r{<w:DoNotOptimizeForBrowser></w:DoNotOptimizeForBrowser>},
-              "<w:DoNotOptimizeForBrowser/>")
-    doc.gsub!(%r{<span style="mso-special-character:footnote"/>},
-              '<span style="mso-special-character:footnote"></span>')
-    doc.gsub!(%r{<div style="mso-element:footnote-list"></div>},
-              '<div style="mso-element:footnote-list"/>')
-    doc.gsub!(%r{(<a style="mso-comment-reference:[^>/]+)/>}, "\\1></a>")
-    doc.gsub!(%r{<link rel="File-List"}, "<link rel=File-List")
-    doc.gsub!(%r{<meta http-equiv="Content-Type"},
-              "<meta http-equiv=Content-Type")
-    doc.gsub!(%r{></m:jc>}, "/>")
-    doc.gsub!(%r{></v:stroke>}, "/>")
-    doc.gsub!(%r{></v:f>}, "/>")
-    doc.gsub!(%r{></v:path>}, "/>")
-    doc.gsub!(%r{></o:lock>}, "/>")
-    doc.gsub!(%r{></v:imagedata>}, "/>")
-    doc.gsub!(%r{></w:wrap>}, "/>")
-    doc.gsub!(%r{<(/)?m:(span|em)\b}, "<\\1\\2")
-    doc.gsub!(%r{&tab;|&amp;tab;},
-              '<span style="mso-tab-count:1">&#xA0; </span>')
-    doc.split(%r{(<m:oMath>|</m:oMath>)}).each_slice(4).map do |a|
-      a.size > 2 and a[2] = a[2].gsub(/>\s+</, "><")
-      a
-    end.join
-  end
-  PRINT_VIEW = <<~XML.freeze
-    <xml>
-    <w:WordDocument>
-    <w:View>Print</w:View>
-    <w:Zoom>100</w:Zoom>
-    <w:DoNotOptimizeForBrowser/>
-    </w:WordDocument>
-    </xml>
-    <meta http-equiv='Content-Type' content="text/html; charset=utf-8"/>
-  XML
+  def locate_landscape(_docxml)
+    css = read_stylesheet(@stylesheet)
+    @landscape = css.scan(/div\.\S+\s+\{\s*page:\s*[^;]+L;\s*\}/m)
+      .map { |e| e.sub(/^div\.(\S+).*$/m, "\\1") }
+  end
   def define_head1(docxml, _dir)
     docxml.xpath("//*[local-name() = 'head']").each do |h|
@@ -174,7 +109,6 @@ class Html2Doc
     # xml.children.first << Nokogiri::XML::Comment.new(xml, s)
     xml.children.first << Nokogiri::XML::CDATA
       .new(xml, "\n<!--\n#{stylesheet}\n-->\n")
     xml.root.to_s
   end
@@ -199,30 +133,15 @@ class Html2Doc
       head.add_child css
     elsif title.nil?
       head.children.first.add_previous_sibling css
-    else
-      title.add_next_sibling css
+    else title.add_next_sibling css
     end
   end
-  def namespace(root)
-    {
-      o: "urn:schemas-microsoft-com:office:office",
-      w: "urn:schemas-microsoft-com:office:word",
-      v: "urn:schemas-microsoft-com:vml",
-      m: "http://schemas.microsoft.com/office/2004/12/omml",
-    }.each { |k, v| root.add_namespace_definition(k.to_s, v) }
-  end
-  def rootnamespace(root)
-    root.add_namespace(nil, "http://www.w3.org/TR/REC-html40")
-  end
   def bookmarks(docxml)
     docxml.xpath("//*[@id][not(@name)][not(@style = 'mso-element:footnote')]")
       .each do |x|
-      next if x["id"].empty? ||
-        %w(shapetype v:shapetype shape v:shape).include?(x.name)
+      (x["id"].empty? ||
+        %w(shapetype v:shapetype shape v:shape).include?(x.name)) and next
       if x.children.empty? then x.add_child("<a name='#{x['id']}'></a>")
       else x.children.first.previous = "<a name='#{x['id']}'></a>"
       end

data/lib/html2doc/math.rb CHANGED

@@ -5,29 +5,6 @@ require "nokogiri"
 require "plane1converter"
 class Html2Doc
-  def asciimath_to_mathml1(expr, retain_asciimath)
-    ret = Plurimath::Math.parse(HTMLEntities.new.decode(expr), "asciimath").to_mathml
-      .gsub(/<math>/, "<math xmlns='http://www.w3.org/1998/Math/MathML'>").strip
-    retain_asciimath and
-      ret += "<asciimath>#{@c.encode(@c.decode(expr), :basic)}</asciimath>"
-    ret
-  rescue StandardError => e
-    puts "parsing: #{expr}"
-    puts e.message
-    raise e
-  end
-  def asciimath_to_mathml(doc, delims, retain_asciimath: false)
-    return doc if delims.nil? || delims.size < 2
-    m = doc.split(/(#{Regexp.escape(delims[0])}|#{Regexp.escape(delims[1])})/)
-    m.each_slice(4).map.with_index do |(*a), i|
-      progress_conv(i, 500, (m.size / 4).floor, 1000, "AsciiMath")
-      a[2].nil? or a[2] = asciimath_to_mathml1(a[2], retain_asciimath)
-      a.size > 1 ? a[0] + a[2] : a[0]
-    end.join
-  end
   def progress_conv(idx, step, total, threshold, msg)
     return unless (idx % step).zero? && total > threshold && idx.positive?

data/lib/html2doc/mime.rb CHANGED

@@ -78,8 +78,7 @@ class Html2Doc
   def image_resize(img, path, maxheight, maxwidth)
     s, realsize = get_image_size(img, path)
-    return s if s[0] == nil && s[1] == nil
+    s[0] == nil && s[1] == nil and return s
     if img.name == "svg" && !img["viewBox"]
       img["viewBox"] = "0 0 #{s[0]} #{s[1]}"
     end
@@ -118,12 +117,24 @@ class Html2Doc
     docxml.traverse do |i|
       skip_image_cleanup?(i) and next
       local_filename = rename_image(i, dir, localdir)
-      i["width"], i["height"] = image_resize(i, local_filename, maxheight,
-                                             maxwidth)
+      i["width"], i["height"] =
+        if landscape?(i)
+          image_resize(i, local_filename, maxwidth, maxheight)
+        else
+          image_resize(i, local_filename, maxheight, maxwidth)
+        end
     end
     docxml
   end
+  def landscape?(img)
+    img.ancestors.each do |a|
+      a.name == "div" or next
+      @landscape.include?(a["class"]) and return true
+    end
+    false
+  end
   def rename_image(img, dir, localdir)
     local_filename = localname(img["src"], localdir)
     new_filename = "#{mkuuid}#{File.extname(img['src'])}"
@@ -134,10 +145,9 @@ class Html2Doc
   def skip_image_cleanup?(img)
     src = img["src"]
-    return true unless img.element? && %w(img v:imagedata).include?(img.name)
-    return true if src.nil? || src.empty? || /^http/.match?(src) ||
-      %r{^data:(image|application)/[^;]+;base64}.match?(src)
+    (img.element? && %w(img v:imagedata).include?(img.name)) or return true
+    (src.nil? || src.empty? || /^http/.match?(src) ||
+      %r{^data:(image|application)/[^;]+;base64}.match?(src)) and return true
     false
   end
@@ -222,8 +232,7 @@ class Html2Doc
       f.write %{<xml xmlns:o="urn:schemas-microsoft-com:office:office">
         <o:MainFile HRef="../#{filename}.htm"/>}
       Dir.entries(dir).sort.each do |item|
-        next if item == "." || item == ".." || /^\./.match(item)
+        (item == "." || item == ".." || /^\./.match(item)) and next
         f.write %{  <o:File HRef="#{item}"/>\n}
       end
       f.write("</xml>\n")

data/lib/html2doc/version.rb CHANGED

@@ -1,3 +1,3 @@
 class Html2Doc
-  VERSION = "1.5.4".freeze
+  VERSION = "1.6.0".freeze
 end

data/lib/html2doc/xml.rb ADDED

@@ -0,0 +1,83 @@
+class Html2Doc
+  NOKOHEAD = <<~HERE.freeze
+    <!DOCTYPE html SYSTEM
+    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+    <html xmlns="http://www.w3.org/1999/xhtml">
+    <head> <title></title> <meta charset="UTF-8" /> </head>
+    <body> </body> </html>
+  HERE
+  def to_xhtml(xml)
+    xml.gsub!(/<\?xml[^>]*>/, "")
+    unless /<!DOCTYPE /.match? xml
+      xml = '<!DOCTYPE html SYSTEM
+          "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' + xml
+    end
+    xml = xml.gsub(/<!--\s*\[([^\]]+)\]>/, "<!-- MSWORD-COMMENT \\1 -->")
+      .gsub(/<!\s*\[endif\]\s*-->/, "<!-- MSWORD-COMMENT-END -->")
+    Nokogiri::XML.parse(xml)
+  end
+  DOCTYPE = <<~DOCTYPE.freeze
+    <!DOCTYPE html SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+  DOCTYPE
+  def from_xhtml(xml)
+    xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
+      .sub(DOCTYPE, "").gsub(%{ />}, "/>")
+      .gsub(/<!-- MSWORD-COMMENT (.+?) -->/, "<!--[\\1]>")
+      .gsub(/<!-- MSWORD-COMMENT-END -->/, "<![endif]-->")
+      .gsub("\n--&gt;\n", "\n-->\n")
+  end
+  def msword_fix(doc)
+    # brain damage in MSWord parser
+    doc.gsub!(%r{<w:DoNotOptimizeForBrowser></w:DoNotOptimizeForBrowser>},
+              "<w:DoNotOptimizeForBrowser/>")
+    doc.gsub!(%r{<span style="mso-special-character:footnote"/>},
+              '<span style="mso-special-character:footnote"></span>')
+    doc.gsub!(%r{<div style="mso-element:footnote-list"></div>},
+              '<div style="mso-element:footnote-list"/>')
+    doc.gsub!(%r{(<a style="mso-comment-reference:[^>/]+)/>}, "\\1></a>")
+    doc.gsub!(%r{<link rel="File-List"}, "<link rel=File-List")
+    doc.gsub!(%r{<meta http-equiv="Content-Type"},
+              "<meta http-equiv=Content-Type")
+    doc.gsub!(%r{></m:jc>}, "/>")
+    doc.gsub!(%r{></v:stroke>}, "/>")
+    doc.gsub!(%r{></v:f>}, "/>")
+    doc.gsub!(%r{></v:path>}, "/>")
+    doc.gsub!(%r{></o:lock>}, "/>")
+    doc.gsub!(%r{></v:imagedata>}, "/>")
+    doc.gsub!(%r{></w:wrap>}, "/>")
+    doc.gsub!(%r{<(/)?m:(span|em)\b}, "<\\1\\2")
+    doc.gsub!(%r{&tab;|&amp;tab;},
+              '<span style="mso-tab-count:1">&#xA0; </span>')
+    doc.split(%r{(<m:oMath>|</m:oMath>)}).each_slice(4).map do |a|
+      a.size > 2 and a[2] = a[2].gsub(/>\s+</, "><")
+      a
+    end.join
+  end
+  PRINT_VIEW = <<~XML.freeze
+    <xml>
+    <w:WordDocument>
+    <w:View>Print</w:View>
+    <w:Zoom>100</w:Zoom>
+    <w:DoNotOptimizeForBrowser/>
+    </w:WordDocument>
+    </xml>
+    <meta http-equiv='Content-Type' content="text/html; charset=utf-8"/>
+  XML
+  def namespace(root)
+    { o: "urn:schemas-microsoft-com:office:office",
+      w: "urn:schemas-microsoft-com:office:word",
+      v: "urn:schemas-microsoft-com:vml",
+      m: "http://schemas.microsoft.com/office/2004/12/omml" }.each { |k, v| root.add_namespace_definition(k.to_s, v) }
+  end
+  def rootnamespace(root)
+    root.add_namespace(nil, "http://www.w3.org/TR/REC-html40")
+  end
+end

data/lib/html2doc.rb CHANGED

@@ -4,3 +4,4 @@ require_relative "html2doc/mime"
 require_relative "html2doc/notes"
 require_relative "html2doc/math"
 require_relative "html2doc/lists"
+require_relative "html2doc/xml"

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: html2doc
 version: !ruby/object:Gem::Version
-  version: 1.5.4
+  version: 1.6.0
 platform: ruby
 authors:
 - Ribose Inc.
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2023-05-19 00:00:00.000000000 Z
+date: 2023-08-07 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: htmlentities
@@ -293,6 +293,7 @@ files:
 - lib/html2doc/notes.rb
 - lib/html2doc/version.rb
 - lib/html2doc/wordstyle.css
+- lib/html2doc/xml.rb
 homepage: https://github.com/metanorma/html2doc
 licenses:
 - CC-BY-SA-3.0