html2doc 1.6.0 → 1.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/html2doc.gemspec +2 -1
- data/lib/html2doc/base.rb +1 -1
- data/lib/html2doc/math.rb +106 -47
- data/lib/html2doc/version.rb +1 -1
- metadata +20 -6
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA256:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: 020bfc8d51718ee0fbc78ba2ce57fe1d306a50e93cb41949e98088228190dee5
         | 
| 4 | 
            +
              data.tar.gz: 4fbad0486ed1cc59f7b67be1a5c2629ff840063b2bd8f1ffdbe50a7af082cc4e
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: 45e4f66e7ebc8591620a04e35362761ec48a20b51f735dd9977f05de6698e6cdbca67bb0ddf1615596d2924b7ee336b6e4b196f0dbccb00132e59ec660fab425
         | 
| 7 | 
            +
              data.tar.gz: 868e01b43b930657accc0861c917116f78a9342512163803d038a70da8926c401ffe58a486d824a80115283cfc4a327743ab8d04931ce02c720a0f4aef5d8207
         | 
    
        data/html2doc.gemspec
    CHANGED
    
    | @@ -28,10 +28,11 @@ Gem::Specification.new do |spec| | |
| 28 28 |  | 
| 29 29 | 
             
              spec.add_dependency "htmlentities", "~> 4.3.4"
         | 
| 30 30 | 
             
              spec.add_dependency "image_size", ">= 3.2.0"
         | 
| 31 | 
            +
              spec.add_dependency "metanorma-utils"
         | 
| 31 32 | 
             
              spec.add_dependency "mime-types"
         | 
| 32 33 | 
             
              spec.add_dependency "nokogiri", "~> 1.14"
         | 
| 33 34 | 
             
              spec.add_dependency "plane1converter", "~> 0.0.1"
         | 
| 34 | 
            -
              spec.add_dependency "plurimath"
         | 
| 35 | 
            +
              spec.add_dependency "plurimath", "~> 0.5.0"
         | 
| 35 36 | 
             
              spec.add_dependency "thread_safe"
         | 
| 36 37 | 
             
              spec.add_dependency "uuidtools"
         | 
| 37 38 |  | 
    
        data/lib/html2doc/base.rb
    CHANGED
    
    | @@ -141,7 +141,7 @@ class Html2Doc | |
| 141 141 | 
             
                docxml.xpath("//*[@id][not(@name)][not(@style = 'mso-element:footnote')]")
         | 
| 142 142 | 
             
                  .each do |x|
         | 
| 143 143 | 
             
                  (x["id"].empty? ||
         | 
| 144 | 
            -
                    %w(shapetype v: | 
| 144 | 
            +
                    %w(v:shapetype v:shape v:rect v:line v:group).include?(x.name)) and next
         | 
| 145 145 | 
             
                  if x.children.empty? then x.add_child("<a name='#{x['id']}'></a>")
         | 
| 146 146 | 
             
                  else x.children.first.previous = "<a name='#{x['id']}'></a>"
         | 
| 147 147 | 
             
                  end
         | 
    
        data/lib/html2doc/math.rb
    CHANGED
    
    | @@ -3,6 +3,20 @@ require "plurimath" | |
| 3 3 | 
             
            require "htmlentities"
         | 
| 4 4 | 
             
            require "nokogiri"
         | 
| 5 5 | 
             
            require "plane1converter"
         | 
| 6 | 
            +
            require "metanorma-utils"
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            module Nokogiri
         | 
| 9 | 
            +
              module XML
         | 
| 10 | 
            +
                class Node
         | 
| 11 | 
            +
                  OOXML_NS = "http://schemas.openxmlformats.org/officeDocument/2006/math".freeze
         | 
| 12 | 
            +
             | 
| 13 | 
            +
                  def ooxml_xpath(path)
         | 
| 14 | 
            +
                    p = Metanorma::Utils::ns(path).gsub("xmlns:", "m:")
         | 
| 15 | 
            +
                    xpath(p, "m" => OOXML_NS)
         | 
| 16 | 
            +
                  end
         | 
| 17 | 
            +
                end
         | 
| 18 | 
            +
              end
         | 
| 19 | 
            +
            end
         | 
| 6 20 |  | 
| 7 21 | 
             
            class Html2Doc
         | 
| 8 22 | 
             
              def progress_conv(idx, step, total, threshold, msg)
         | 
| @@ -20,17 +34,30 @@ class Html2Doc | |
| 20 34 | 
             
                doc
         | 
| 21 35 | 
             
              end
         | 
| 22 36 |  | 
| 37 | 
            +
              MATHML_NS = "http://www.w3.org/1998/Math/MathML".freeze
         | 
| 38 | 
            +
             | 
| 23 39 | 
             
              # random fixes to MathML input that OOXML needs to render properly
         | 
| 24 40 | 
             
              def ooxml_cleanup(math, docnamespaces)
         | 
| 25 | 
            -
                 | 
| 26 | 
            -
                   | 
| 27 | 
            -
                     | 
| 28 | 
            -
             | 
| 29 | 
            -
             | 
| 30 | 
            -
             | 
| 41 | 
            +
                #encode_math(
         | 
| 42 | 
            +
                  unwrap_accents(
         | 
| 43 | 
            +
                    mathml_preserve_space(
         | 
| 44 | 
            +
                      mathml_insert_rows(math, docnamespaces), docnamespaces
         | 
| 45 | 
            +
                    ),
         | 
| 46 | 
            +
                  )
         | 
| 47 | 
            +
                #)
         | 
| 48 | 
            +
                math.add_namespace(nil, MATHML_NS)
         | 
| 31 49 | 
             
                math
         | 
| 32 50 | 
             
              end
         | 
| 33 51 |  | 
| 52 | 
            +
              def encode_math(elem)
         | 
| 53 | 
            +
                elem.traverse do |e|
         | 
| 54 | 
            +
                  e.text? or next
         | 
| 55 | 
            +
                  e.text.strip.empty? and next
         | 
| 56 | 
            +
                  e.replace(@c.encode(e.text, :hexadecimal))
         | 
| 57 | 
            +
                end
         | 
| 58 | 
            +
                elem
         | 
| 59 | 
            +
              end
         | 
| 60 | 
            +
             | 
| 34 61 | 
             
              def mathml_insert_rows(math, docnamespaces)
         | 
| 35 62 | 
             
                math.xpath(%w(msup msub msubsup munder mover munderover)
         | 
| 36 63 | 
             
                        .map { |m| ".//xmlns:#{m}" }.join(" | "), docnamespaces).each do |x|
         | 
| @@ -50,47 +77,57 @@ class Html2Doc | |
| 50 77 |  | 
| 51 78 | 
             
              HTML_NS = 'xmlns="http://www.w3.org/1999/xhtml"'.freeze
         | 
| 52 79 |  | 
| 80 | 
            +
              def wrap_text(elem, wrapper)
         | 
| 81 | 
            +
                elem.traverse do |e|
         | 
| 82 | 
            +
                  e.text? or next
         | 
| 83 | 
            +
                  e.text.strip.empty? and next
         | 
| 84 | 
            +
                  e.wrap(wrapper)
         | 
| 85 | 
            +
                end
         | 
| 86 | 
            +
              end
         | 
| 87 | 
            +
             | 
| 53 88 | 
             
              def unitalic(math)
         | 
| 54 | 
            -
                math. | 
| 55 | 
            -
                  x | 
| 89 | 
            +
                math.ooxml_xpath(".//r[rPr[not(m:scr)]/sty[@m:val = 'p']]").each do |x|
         | 
| 90 | 
            +
                  wrap_text(x, "<span #{HTML_NS} style='font-style:normal;'></span>")
         | 
| 56 91 | 
             
                end
         | 
| 57 | 
            -
                math. | 
| 58 | 
            -
                  x | 
| 92 | 
            +
                math.ooxml_xpath(".//r[rPr[not(m:scr)]/sty[@m:val = 'bi']]").each do |x|
         | 
| 93 | 
            +
                  wrap_text(x,
         | 
| 94 | 
            +
                            "<span #{HTML_NS} class='nostem' style='font-weight:bold;'><em></em></span>")
         | 
| 59 95 | 
             
                end
         | 
| 60 | 
            -
                math. | 
| 61 | 
            -
                  x | 
| 96 | 
            +
                math.ooxml_xpath(".//r[rPr[not(m:scr)]/sty[@m:val = 'i']]").each do |x|
         | 
| 97 | 
            +
                  wrap_text(x, "<span #{HTML_NS} class='nostem'><em></em></span>")
         | 
| 62 98 | 
             
                end
         | 
| 63 | 
            -
                math. | 
| 64 | 
            -
                  x | 
| 99 | 
            +
                math.ooxml_xpath(".//r[rPr[not(m:scr)]/sty[@m:val = 'b']]").each do |x|
         | 
| 100 | 
            +
                  wrap_text(x,
         | 
| 101 | 
            +
                            "<span #{HTML_NS} style='font-style:normal;font-weight:bold;'></span>")
         | 
| 65 102 | 
             
                end
         | 
| 66 | 
            -
                math. | 
| 103 | 
            +
                math.ooxml_xpath(".//r[rPr/scr[@m:val = 'monospace']]").each do |x|
         | 
| 67 104 | 
             
                  to_plane1(x, :monospace)
         | 
| 68 105 | 
             
                end
         | 
| 69 | 
            -
                math. | 
| 106 | 
            +
                math.ooxml_xpath(".//r[rPr/scr[@m:val = 'double-struck']]").each do |x|
         | 
| 70 107 | 
             
                  to_plane1(x, :doublestruck)
         | 
| 71 108 | 
             
                end
         | 
| 72 | 
            -
                math. | 
| 109 | 
            +
                math.ooxml_xpath(".//r[rPr[not(m:sty) or m:sty/@m:val = 'p']/scr[@m:val = 'script']]").each do |x|
         | 
| 73 110 | 
             
                  to_plane1(x, :script)
         | 
| 74 111 | 
             
                end
         | 
| 75 | 
            -
                math. | 
| 112 | 
            +
                math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'b']/scr[@m:val = 'script']]").each do |x|
         | 
| 76 113 | 
             
                  to_plane1(x, :scriptbold)
         | 
| 77 114 | 
             
                end
         | 
| 78 | 
            -
                math. | 
| 115 | 
            +
                math.ooxml_xpath(".//r[rPr[not(m:sty) or m:sty/@m:val = 'p']/scr[@m:val = 'fraktur']]").each do |x|
         | 
| 79 116 | 
             
                  to_plane1(x, :fraktur)
         | 
| 80 117 | 
             
                end
         | 
| 81 | 
            -
                math. | 
| 118 | 
            +
                math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'b']/scr[@m:val = 'fraktur']]").each do |x|
         | 
| 82 119 | 
             
                  to_plane1(x, :frakturbold)
         | 
| 83 120 | 
             
                end
         | 
| 84 | 
            -
                math. | 
| 121 | 
            +
                math.ooxml_xpath(".//r[rPr[not(m:sty) or m:sty/@m:val = 'p']/scr[@m:val = 'sans-serif']]").each do |x|
         | 
| 85 122 | 
             
                  to_plane1(x, :sans)
         | 
| 86 123 | 
             
                end
         | 
| 87 | 
            -
                math. | 
| 124 | 
            +
                math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'b']/scr[@m:val = 'sans-serif']]").each do |x|
         | 
| 88 125 | 
             
                  to_plane1(x, :sansbold)
         | 
| 89 126 | 
             
                end
         | 
| 90 | 
            -
                math. | 
| 127 | 
            +
                math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'i']/scr[@m:val = 'sans-serif']]").each do |x|
         | 
| 91 128 | 
             
                  to_plane1(x, :sansitalic)
         | 
| 92 129 | 
             
                end
         | 
| 93 | 
            -
                math. | 
| 130 | 
            +
                math.ooxml_xpath(".//r[rPr[m:sty/@m:val = 'bi']/scr[@m:val = 'sans-serif']]").each do |x|
         | 
| 94 131 | 
             
                  to_plane1(x, :sansbolditalic)
         | 
| 95 132 | 
             
                end
         | 
| 96 133 | 
             
                math
         | 
| @@ -119,22 +156,26 @@ class Html2Doc | |
| 119 156 | 
             
              # We will end up stripping them out again under Nokogiri 1.11, which correctly
         | 
| 120 157 | 
             
              # insists on inheriting namespace from parent.
         | 
| 121 158 | 
             
              def ooml_clean(xml)
         | 
| 122 | 
            -
                xml. | 
| 159 | 
            +
                xml.to_xml(indent: 0)
         | 
| 123 160 | 
             
                  .gsub(/<\?[^>]+>\s*/, "")
         | 
| 124 161 | 
             
                  .gsub(/ xmlns(:[^=]+)?="[^"]+"/, "")
         | 
| 125 | 
            -
             | 
| 162 | 
            +
                # .gsub(%r{<(/)?(?!span)(?!em)([a-z])}, "<\\1m:\\2")
         | 
| 126 163 | 
             
              end
         | 
| 127 164 |  | 
| 128 165 | 
             
              def mathml_to_ooml1(xml, docnamespaces)
         | 
| 129 166 | 
             
                doc = Nokogiri::XML::Document::new
         | 
| 130 167 | 
             
                doc.root = ooxml_cleanup(xml, docnamespaces)
         | 
| 131 | 
            -
                ooxml =  | 
| 132 | 
            -
                 | 
| 168 | 
            +
                # ooxml = @xsltemplate.transform(doc)
         | 
| 169 | 
            +
                d = xml.parent["block"] != "false" # display_style
         | 
| 170 | 
            +
                ooxml = Nokogiri::XML(Plurimath::Math.parse(doc.to_xml(indent: 0),
         | 
| 171 | 
            +
                                                            :mathml).to_omml)
         | 
| 172 | 
            +
                ooxml = unitalic(accent_tr(ooxml))
         | 
| 173 | 
            +
                ooxml = ooml_clean(uncenter(xml, ooxml))
         | 
| 133 174 | 
             
                xml.swap(ooxml)
         | 
| 134 175 | 
             
              end
         | 
| 135 176 |  | 
| 136 177 | 
             
              def accent_tr(xml)
         | 
| 137 | 
            -
                xml. | 
| 178 | 
            +
                xml.ooxml_xpath(".//accPr/chr").each do |x|
         | 
| 138 179 | 
             
                  x["m:val"] &&= accent_tr1(x["m:val"])
         | 
| 139 180 | 
             
                  x["val"] &&= accent_tr1(x["val"])
         | 
| 140 181 | 
             
                end
         | 
| @@ -150,30 +191,48 @@ class Html2Doc | |
| 150 191 | 
             
                end
         | 
| 151 192 | 
             
              end
         | 
| 152 193 |  | 
| 153 | 
            -
               | 
| 154 | 
            -
              # XML indentation
         | 
| 155 | 
            -
              def esc_space(xml)
         | 
| 156 | 
            -
                xml.traverse do |n|
         | 
| 157 | 
            -
                  next unless n.text?
         | 
| 194 | 
            +
              OOXML_NS = "http://schemas.openxmlformats.org/officeDocument/2006/math".freeze
         | 
| 158 195 |  | 
| 159 | 
            -
             | 
| 160 | 
            -
                 | 
| 161 | 
            -
                 | 
| 196 | 
            +
              def math_only_para?(node)
         | 
| 197 | 
            +
                x = node.dup
         | 
| 198 | 
            +
                x.xpath(".//m:math", "m" => MATHML_NS).each(&:remove)
         | 
| 199 | 
            +
                x.xpath(".//m:oMathPara | .//m:oMath", "m" => OOXML_NS).each(&:remove)
         | 
| 200 | 
            +
                x.xpath(".//m:oMathPara | .//m:oMath").each(&:remove)
         | 
| 201 | 
            +
                # namespace can go missing during processing
         | 
| 202 | 
            +
                x.text.strip.empty?
         | 
| 203 | 
            +
              end
         | 
| 204 | 
            +
             | 
| 205 | 
            +
              def math_block?(_ooxml, mathml)
         | 
| 206 | 
            +
                # ooxml.name == "oMathPara" || mathml["displaystyle"] == "true"
         | 
| 207 | 
            +
                mathml["displaystyle"] == "true"
         | 
| 162 208 | 
             
              end
         | 
| 163 209 |  | 
| 210 | 
            +
              STYLE_BEARING_NODE =
         | 
| 211 | 
            +
                %w(p div td th li).map { |x| ".//ancestor::#{x}" }.join(" | ").freeze
         | 
| 212 | 
            +
             | 
| 164 213 | 
             
              # if oomml has no siblings, by default it is centered; override this with
         | 
| 165 214 | 
             
              # left/right if parent is so tagged
         | 
| 215 | 
            +
              # also if ooml has mathPara already, or is in para with only oMath content
         | 
| 166 216 | 
             
              def uncenter(math, ooxml)
         | 
| 167 | 
            -
                alignnode = math. | 
| 168 | 
            -
             | 
| 169 | 
            -
                 | 
| 170 | 
            -
             | 
| 171 | 
            -
                 | 
| 172 | 
            -
             | 
| 173 | 
            -
             | 
| 174 | 
            -
             | 
| 175 | 
            -
                   | 
| 176 | 
            -
                end
         | 
| 217 | 
            +
                alignnode = math.xpath(STYLE_BEARING_NODE).last
         | 
| 218 | 
            +
                ooxml.document? and ooxml = ooxml.root
         | 
| 219 | 
            +
                ret = uncenter_unneeded(math, ooxml, alignnode) and return ret
         | 
| 220 | 
            +
                dir = "left"
         | 
| 221 | 
            +
                alignnode["style"]&.include?("text-align:right") and dir = "right"
         | 
| 222 | 
            +
                ooxml.name == "oMathPara" or
         | 
| 223 | 
            +
                  ooxml.wrap("<m:oMathPara></m:oMathPara>")
         | 
| 224 | 
            +
                ooxml.elements.first.previous =
         | 
| 225 | 
            +
                  "<m:oMathParaPr><m:jc m:val='#{dir}'/></m:oMathParaPr>"
         | 
| 177 226 | 
             
                ooxml
         | 
| 178 227 | 
             
              end
         | 
| 228 | 
            +
             | 
| 229 | 
            +
              def uncenter_unneeded(math, ooxml, alignnode)
         | 
| 230 | 
            +
                (math_block?(ooxml, math) || !alignnode) and return ooxml
         | 
| 231 | 
            +
                if !math_only_para?(alignnode)
         | 
| 232 | 
            +
                  ooxml.name == "oMathPara" and
         | 
| 233 | 
            +
                    ooxml = ooxml.elements.detect { |x| x.name == "oMath" }
         | 
| 234 | 
            +
                  return ooxml
         | 
| 235 | 
            +
                end
         | 
| 236 | 
            +
                nil
         | 
| 237 | 
            +
              end
         | 
| 179 238 | 
             
            end
         | 
    
        data/lib/html2doc/version.rb
    CHANGED
    
    
    
        metadata
    CHANGED
    
    | @@ -1,14 +1,14 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: html2doc
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 1. | 
| 4 | 
            +
              version: 1.7.0
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - Ribose Inc.
         | 
| 8 8 | 
             
            autorequire: 
         | 
| 9 9 | 
             
            bindir: bin
         | 
| 10 10 | 
             
            cert_chain: []
         | 
| 11 | 
            -
            date: 2023-08 | 
| 11 | 
            +
            date: 2023-09-08 00:00:00.000000000 Z
         | 
| 12 12 | 
             
            dependencies:
         | 
| 13 13 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 14 14 | 
             
              name: htmlentities
         | 
| @@ -38,6 +38,20 @@ dependencies: | |
| 38 38 | 
             
                - - ">="
         | 
| 39 39 | 
             
                  - !ruby/object:Gem::Version
         | 
| 40 40 | 
             
                    version: 3.2.0
         | 
| 41 | 
            +
            - !ruby/object:Gem::Dependency
         | 
| 42 | 
            +
              name: metanorma-utils
         | 
| 43 | 
            +
              requirement: !ruby/object:Gem::Requirement
         | 
| 44 | 
            +
                requirements:
         | 
| 45 | 
            +
                - - ">="
         | 
| 46 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 47 | 
            +
                    version: '0'
         | 
| 48 | 
            +
              type: :runtime
         | 
| 49 | 
            +
              prerelease: false
         | 
| 50 | 
            +
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 51 | 
            +
                requirements:
         | 
| 52 | 
            +
                - - ">="
         | 
| 53 | 
            +
                  - !ruby/object:Gem::Version
         | 
| 54 | 
            +
                    version: '0'
         | 
| 41 55 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 42 56 | 
             
              name: mime-types
         | 
| 43 57 | 
             
              requirement: !ruby/object:Gem::Requirement
         | 
| @@ -84,16 +98,16 @@ dependencies: | |
| 84 98 | 
             
              name: plurimath
         | 
| 85 99 | 
             
              requirement: !ruby/object:Gem::Requirement
         | 
| 86 100 | 
             
                requirements:
         | 
| 87 | 
            -
                - - " | 
| 101 | 
            +
                - - "~>"
         | 
| 88 102 | 
             
                  - !ruby/object:Gem::Version
         | 
| 89 | 
            -
                    version:  | 
| 103 | 
            +
                    version: 0.5.0
         | 
| 90 104 | 
             
              type: :runtime
         | 
| 91 105 | 
             
              prerelease: false
         | 
| 92 106 | 
             
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 93 107 | 
             
                requirements:
         | 
| 94 | 
            -
                - - " | 
| 108 | 
            +
                - - "~>"
         | 
| 95 109 | 
             
                  - !ruby/object:Gem::Version
         | 
| 96 | 
            -
                    version:  | 
| 110 | 
            +
                    version: 0.5.0
         | 
| 97 111 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 98 112 | 
             
              name: thread_safe
         | 
| 99 113 | 
             
              requirement: !ruby/object:Gem::Requirement
         |