html2doc 1.0.7 → 1.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/rake.yml +6 -37
- data/.gitignore +2 -0
- data/.hound.yml +3 -1
- data/.rubocop.yml +7 -7
- data/Gemfile +2 -2
- data/Rakefile +1 -1
- data/bin/html2doc +1 -2
- data/bin/rspec +1 -1
- data/html2doc.gemspec +8 -9
- data/lib/html2doc.rb +0 -3
- data/lib/html2doc/base.rb +58 -47
- data/lib/html2doc/lists.rb +47 -42
- data/lib/html2doc/math.rb +100 -73
- data/lib/html2doc/mime.rb +53 -37
- data/lib/html2doc/notes.rb +42 -36
- data/lib/html2doc/version.rb +1 -1
- data/spec/html2doc_spec.rb +575 -517
- metadata +44 -46
- data/.rubocop.ribose.yml +0 -65
- data/.rubocop.tb.yml +0 -650
    
        data/lib/html2doc/math.rb
    CHANGED
    
    | @@ -9,23 +9,34 @@ module Html2Doc | |
| 9 9 | 
             
                Nokogiri::XSLT(File.read(File.join(File.dirname(__FILE__), "mml2omml.xsl"),
         | 
| 10 10 | 
             
                                         encoding: "utf-8"))
         | 
| 11 11 |  | 
| 12 | 
            -
              def self.asciimath_to_mathml1( | 
| 13 | 
            -
                AsciiMath::MathMLBuilder.new(: | 
| 14 | 
            -
                  AsciiMath.parse(HTMLEntities.new.decode( | 
| 15 | 
            -
             | 
| 12 | 
            +
              def self.asciimath_to_mathml1(expr)
         | 
| 13 | 
            +
                AsciiMath::MathMLBuilder.new(msword: true).append_expression(
         | 
| 14 | 
            +
                  AsciiMath.parse(HTMLEntities.new.decode(expr)).ast,
         | 
| 15 | 
            +
                ).to_s
         | 
| 16 | 
            +
                  .gsub(/<math>/, "<math xmlns='http://www.w3.org/1998/Math/MathML'>")
         | 
| 17 | 
            +
              rescue StandardError => e
         | 
| 18 | 
            +
                puts "parsing: #{expr}"
         | 
| 19 | 
            +
                puts e.message
         | 
| 20 | 
            +
                raise e
         | 
| 16 21 | 
             
              end
         | 
| 17 22 |  | 
| 18 23 | 
             
              def self.asciimath_to_mathml(doc, delims)
         | 
| 19 24 | 
             
                return doc if delims.nil? || delims.size < 2
         | 
| 25 | 
            +
             | 
| 20 26 | 
             
                m = doc.split(/(#{Regexp.escape(delims[0])}|#{Regexp.escape(delims[1])})/)
         | 
| 21 27 | 
             
                m.each_slice(4).map.with_index do |(*a), i|
         | 
| 22 | 
            -
                  i  | 
| 23 | 
            -
                    warn "MathML #{i} of #{(m.size / 4).floor}"
         | 
| 28 | 
            +
                  progress_conv(i, 500, (m.size / 4).floor, 1000, "AsciiMath")
         | 
| 24 29 | 
             
                  a[2].nil? || a[2] = asciimath_to_mathml1(a[2])
         | 
| 25 30 | 
             
                  a.size > 1 ? a[0] + a[2] : a[0]
         | 
| 26 31 | 
             
                end.join
         | 
| 27 32 | 
             
              end
         | 
| 28 33 |  | 
| 34 | 
            +
              def self.progress_conv(idx, step, total, threshold, msg)
         | 
| 35 | 
            +
                return unless (idx % step).zero? && total > threshold && idx.positive?
         | 
| 36 | 
            +
             | 
| 37 | 
            +
                warn "#{msg} #{idx} of #{total}"
         | 
| 38 | 
            +
              end
         | 
| 39 | 
            +
             | 
| 29 40 | 
             
              def self.unwrap_accents(doc)
         | 
| 30 41 | 
             
                doc.xpath("//*[@accent = 'true']").each do |x|
         | 
| 31 42 | 
             
                  x.elements.length > 1 or next
         | 
| @@ -36,106 +47,124 @@ module Html2Doc | |
| 36 47 | 
             
              end
         | 
| 37 48 |  | 
| 38 49 | 
             
              # random fixes to MathML input that OOXML needs to render properly
         | 
| 39 | 
            -
              def self.ooxml_cleanup( | 
| 40 | 
            -
                 | 
| 41 | 
            -
                   | 
| 42 | 
            -
             | 
| 43 | 
            -
             | 
| 50 | 
            +
              def self.ooxml_cleanup(math, docnamespaces)
         | 
| 51 | 
            +
                math = unwrap_accents(
         | 
| 52 | 
            +
                  mathml_preserve_space(
         | 
| 53 | 
            +
                    mathml_insert_rows(math, docnamespaces), docnamespaces
         | 
| 54 | 
            +
                  ),
         | 
| 55 | 
            +
                )
         | 
| 56 | 
            +
                math.add_namespace(nil, "http://www.w3.org/1998/Math/MathML")
         | 
| 57 | 
            +
                math
         | 
| 44 58 | 
             
              end
         | 
| 45 59 |  | 
| 46 | 
            -
              def self.mathml_insert_rows( | 
| 47 | 
            -
                 | 
| 48 | 
            -
                        map { |m| ".//xmlns:#{m}" }.join(" | "), docnamespaces).each do |x|
         | 
| 60 | 
            +
              def self.mathml_insert_rows(math, docnamespaces)
         | 
| 61 | 
            +
                math.xpath(%w(msup msub msubsup munder mover munderover)
         | 
| 62 | 
            +
                        .map { |m| ".//xmlns:#{m}" }.join(" | "), docnamespaces).each do |x|
         | 
| 49 63 | 
             
                  next unless x.next_element && x.next_element != "mrow"
         | 
| 64 | 
            +
             | 
| 50 65 | 
             
                  x.next_element.wrap("<mrow/>")
         | 
| 51 66 | 
             
                end
         | 
| 52 | 
            -
                 | 
| 67 | 
            +
                math
         | 
| 53 68 | 
             
              end
         | 
| 54 69 |  | 
| 55 | 
            -
              def self.mathml_preserve_space( | 
| 56 | 
            -
                 | 
| 70 | 
            +
              def self.mathml_preserve_space(math, docnamespaces)
         | 
| 71 | 
            +
                math.xpath(".//xmlns:mtext", docnamespaces).each do |x|
         | 
| 57 72 | 
             
                  x.children = x.children.to_xml.gsub(/^\s/, " ").gsub(/\s$/, " ")
         | 
| 58 73 | 
             
                end
         | 
| 59 | 
            -
                 | 
| 74 | 
            +
                math
         | 
| 60 75 | 
             
              end
         | 
| 61 76 |  | 
| 62 | 
            -
               | 
| 63 | 
            -
             | 
| 64 | 
            -
             | 
| 77 | 
            +
              HTML_NS = 'xmlns="http://www.w3.org/1999/xhtml"'.freeze
         | 
| 78 | 
            +
             | 
| 79 | 
            +
              def self.unitalic(math)
         | 
| 80 | 
            +
                math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'p']]").each do |x|
         | 
| 81 | 
            +
                  x.wrap("<span #{HTML_NS} style='font-style:normal;'></span>")
         | 
| 65 82 | 
             
                end
         | 
| 66 | 
            -
                 | 
| 67 | 
            -
                  x.wrap("<span class='nostem' style='font-weight:bold;'><em></em></span>")
         | 
| 83 | 
            +
                math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'bi']]").each do |x|
         | 
| 84 | 
            +
                  x.wrap("<span #{HTML_NS} class='nostem' style='font-weight:bold;'><em></em></span>")
         | 
| 68 85 | 
             
                end
         | 
| 69 | 
            -
                 | 
| 70 | 
            -
                  x.wrap("<span class='nostem'><em></em></span>")
         | 
| 86 | 
            +
                math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'i']]").each do |x|
         | 
| 87 | 
            +
                  x.wrap("<span #{HTML_NS} class='nostem'><em></em></span>")
         | 
| 71 88 | 
             
                end
         | 
| 72 | 
            -
                 | 
| 73 | 
            -
                  x.wrap("<span style='font-style:normal;font-weight:bold;'></span>")
         | 
| 89 | 
            +
                math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:scr)]/xmlns:sty[@m:val = 'b']]").each do |x|
         | 
| 90 | 
            +
                  x.wrap("<span #{HTML_NS} style='font-style:normal;font-weight:bold;'></span>")
         | 
| 74 91 | 
             
                end
         | 
| 75 | 
            -
                 | 
| 76 | 
            -
                   | 
| 92 | 
            +
                math.xpath(".//xmlns:r[xmlns:rPr/xmlns:scr[@m:val = 'monospace']]").each do |x|
         | 
| 93 | 
            +
                  to_plane1(x, :monospace)
         | 
| 77 94 | 
             
                end
         | 
| 78 | 
            -
                 | 
| 79 | 
            -
                   | 
| 95 | 
            +
                math.xpath(".//xmlns:r[xmlns:rPr/xmlns:scr[@m:val = 'double-struck']]").each do |x|
         | 
| 96 | 
            +
                  to_plane1(x, :doublestruck)
         | 
| 80 97 | 
             
                end
         | 
| 81 | 
            -
                 | 
| 82 | 
            -
                   | 
| 98 | 
            +
                math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'script']]").each do |x|
         | 
| 99 | 
            +
                  to_plane1(x, :script)
         | 
| 83 100 | 
             
                end
         | 
| 84 | 
            -
                 | 
| 85 | 
            -
                   | 
| 101 | 
            +
                math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'script']]").each do |x|
         | 
| 102 | 
            +
                  to_plane1(x, :scriptbold)
         | 
| 86 103 | 
             
                end
         | 
| 87 | 
            -
                 | 
| 88 | 
            -
                   | 
| 104 | 
            +
                math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'fraktur']]").each do |x|
         | 
| 105 | 
            +
                  to_plane1(x, :fraktur)
         | 
| 89 106 | 
             
                end
         | 
| 90 | 
            -
                 | 
| 91 | 
            -
                   | 
| 107 | 
            +
                math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'fraktur']]").each do |x|
         | 
| 108 | 
            +
                  to_plane1(x, :frakturbold)
         | 
| 92 109 | 
             
                end
         | 
| 93 | 
            -
                 | 
| 94 | 
            -
                   | 
| 110 | 
            +
                math.xpath(".//xmlns:r[xmlns:rPr[not(xmlns:sty) or xmlns:sty/@m:val = 'p']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
         | 
| 111 | 
            +
                  to_plane1(x, :sans)
         | 
| 95 112 | 
             
                end
         | 
| 96 | 
            -
                 | 
| 97 | 
            -
                   | 
| 113 | 
            +
                math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'b']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
         | 
| 114 | 
            +
                  to_plane1(x, :sansbold)
         | 
| 98 115 | 
             
                end
         | 
| 99 | 
            -
                 | 
| 100 | 
            -
                   | 
| 116 | 
            +
                math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'i']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
         | 
| 117 | 
            +
                  to_plane1(x, :sansitalic)
         | 
| 101 118 | 
             
                end
         | 
| 102 | 
            -
                 | 
| 103 | 
            -
                   | 
| 119 | 
            +
                math.xpath(".//xmlns:r[xmlns:rPr[xmlns:sty/@m:val = 'bi']/xmlns:scr[@m:val = 'sans-serif']]").each do |x|
         | 
| 120 | 
            +
                  to_plane1(x, :sansbolditalic)
         | 
| 104 121 | 
             
                end
         | 
| 105 | 
            -
                 | 
| 122 | 
            +
                math
         | 
| 106 123 | 
             
              end
         | 
| 107 124 |  | 
| 108 | 
            -
              def self. | 
| 109 | 
            -
                 | 
| 125 | 
            +
              def self.to_plane1(xml, font)
         | 
| 126 | 
            +
                xml.traverse do |n|
         | 
| 110 127 | 
             
                  next unless n.text?
         | 
| 128 | 
            +
             | 
| 111 129 | 
             
                  n.replace(Plane1Converter.conv(HTMLEntities.new.decode(n.text), font))
         | 
| 112 130 | 
             
                end
         | 
| 113 | 
            -
                 | 
| 131 | 
            +
                xml
         | 
| 114 132 | 
             
              end
         | 
| 115 133 |  | 
| 116 134 | 
             
              def self.mathml_to_ooml(docxml)
         | 
| 117 135 | 
             
                docnamespaces = docxml.collect_namespaces
         | 
| 118 136 | 
             
                m = docxml.xpath("//*[local-name() = 'math']")
         | 
| 119 137 | 
             
                m.each_with_index do |x, i|
         | 
| 120 | 
            -
                  i  | 
| 121 | 
            -
             | 
| 122 | 
            -
                  element = ooxml_cleanup(x, docnamespaces)
         | 
| 123 | 
            -
                  doc = Nokogiri::XML::Document::new()
         | 
| 124 | 
            -
                  doc.root = element
         | 
| 125 | 
            -
                  ooxml = (unitalic(esc_space(@xsltemplate.transform(doc)))).to_s.
         | 
| 126 | 
            -
                    gsub(/<\?[^>]+>\s*/, "").
         | 
| 127 | 
            -
                    gsub(/ xmlns(:[^=]+)?="[^"]+"/, "").
         | 
| 128 | 
            -
                    gsub(%r{<(/)?(?!span)(?!em)([a-z])}, "<\\1m:\\2")
         | 
| 129 | 
            -
                  ooxml = uncenter(x, ooxml)
         | 
| 130 | 
            -
                  x.swap(ooxml)
         | 
| 138 | 
            +
                  progress_conv(i, 100, m.size, 500, "Math OOXML")
         | 
| 139 | 
            +
                  mathml_to_ooml1(x, docnamespaces)
         | 
| 131 140 | 
             
                end
         | 
| 132 141 | 
             
              end
         | 
| 133 142 |  | 
| 134 | 
            -
              #  | 
| 143 | 
            +
              # We need span and em not to be namespaced. Word can't deal with explicit 
         | 
| 144 | 
            +
              # namespaces.
         | 
| 145 | 
            +
              # We will end up stripping them out again under Nokogiri 1.11, which correctly
         | 
| 146 | 
            +
              # insists on inheriting namespace from parent.
         | 
| 147 | 
            +
              def self.ooml_clean(xml)
         | 
| 148 | 
            +
                xml.to_s
         | 
| 149 | 
            +
                  .gsub(/<\?[^>]+>\s*/, "")
         | 
| 150 | 
            +
                  .gsub(/ xmlns(:[^=]+)?="[^"]+"/, "")
         | 
| 151 | 
            +
                  .gsub(%r{<(/)?(?!span)(?!em)([a-z])}, "<\\1m:\\2")
         | 
| 152 | 
            +
              end
         | 
| 153 | 
            +
             | 
| 154 | 
            +
              def self.mathml_to_ooml1(xml, docnamespaces)
         | 
| 155 | 
            +
                doc = Nokogiri::XML::Document::new
         | 
| 156 | 
            +
                doc.root = ooxml_cleanup(xml, docnamespaces)
         | 
| 157 | 
            +
                  ooxml = ooml_clean(unitalic(esc_space(@xsltemplate.transform(doc))))
         | 
| 158 | 
            +
                ooxml = uncenter(xml, ooxml)
         | 
| 159 | 
            +
                xml.swap(ooxml)
         | 
| 160 | 
            +
              end
         | 
| 161 | 
            +
             | 
| 162 | 
            +
              # escape space as 2; we are removing any spaces generated by
         | 
| 135 163 | 
             
              # XML indentation
         | 
| 136 164 | 
             
              def self.esc_space(xml)
         | 
| 137 165 | 
             
                xml.traverse do |n|
         | 
| 138 166 | 
             
                  next unless n.text?
         | 
| 167 | 
            +
             | 
| 139 168 | 
             
                  n = n.text.gsub(/ /, "2")
         | 
| 140 169 | 
             
                end
         | 
| 141 170 | 
             
                xml
         | 
| @@ -143,17 +172,15 @@ module Html2Doc | |
| 143 172 |  | 
| 144 173 | 
             
              # if ooxml has no siblings, by default it is centered; override this with
         | 
| 145 174 | 
             
              # left/right if parent is so tagged
         | 
| 146 | 
            -
              def self.uncenter( | 
| 147 | 
            -
                 | 
| 148 | 
            -
             | 
| 149 | 
            -
             | 
| 150 | 
            -
             | 
| 151 | 
            -
             | 
| 152 | 
            -
             | 
| 153 | 
            -
                      "m:val='left'/></m:oMathParaPr>#{ooxml}</m:oMathPara>"
         | 
| 154 | 
            -
                  elsif alignnode.text.include? ("text-align:right")
         | 
| 175 | 
            +
              def self.uncenter(math, ooxml)
         | 
| 176 | 
            +
                alignnode = math.at(".//ancestor::*[@style][local-name() = 'p' or "\
         | 
| 177 | 
            +
                                    "local-name() = 'div' or local-name() = 'td']/@style")
         | 
| 178 | 
            +
                return ooxml unless alignnode && (math.next == nil && math.previous == nil)
         | 
| 179 | 
            +
             | 
| 180 | 
            +
                %w(left right).each do |dir|
         | 
| 181 | 
            +
                  if alignnode.text.include? ("text-align:#{dir}")
         | 
| 155 182 | 
             
                    ooxml = "<m:oMathPara><m:oMathParaPr><m:jc "\
         | 
| 156 | 
            -
                      "m:val=' | 
| 183 | 
            +
                      "m:val='#{dir}'/></m:oMathParaPr>#{ooxml}</m:oMathPara>"
         | 
| 157 184 | 
             
                  end
         | 
| 158 185 | 
             
                end
         | 
| 159 186 | 
             
                ooxml
         | 
    
        data/lib/html2doc/mime.rb
    CHANGED
    
    | @@ -7,19 +7,20 @@ require "fileutils" | |
| 7 7 | 
             
            module Html2Doc
         | 
| 8 8 | 
             
              def self.mime_preamble(boundary, filename, result)
         | 
| 9 9 | 
             
                <<~"PREAMBLE"
         | 
| 10 | 
            -
             | 
| 11 | 
            -
             | 
| 10 | 
            +
                  MIME-Version: 1.0
         | 
| 11 | 
            +
                  Content-Type: multipart/related; boundary="#{boundary}"
         | 
| 12 12 |  | 
| 13 | 
            -
             | 
| 14 | 
            -
             | 
| 15 | 
            -
             | 
| 13 | 
            +
                  --#{boundary}
         | 
| 14 | 
            +
                  Content-ID: <#{File.basename(filename)}>
         | 
| 15 | 
            +
                  Content-Disposition: inline; filename="#{File.basename(filename)}"
         | 
| 16 | 
            +
                  Content-Type: text/html; charset="utf-8"
         | 
| 16 17 |  | 
| 17 | 
            -
             | 
| 18 | 
            +
                  #{result}
         | 
| 18 19 |  | 
| 19 20 | 
             
                PREAMBLE
         | 
| 20 21 | 
             
              end
         | 
| 21 22 |  | 
| 22 | 
            -
              def self.mime_attachment(boundary,  | 
| 23 | 
            +
              def self.mime_attachment(boundary, _filename, item, dir)
         | 
| 23 24 | 
             
                content_type = mime_type(item)
         | 
| 24 25 | 
             
                text_mode = %w[text application].any? { |p| content_type.start_with? p }
         | 
| 25 26 |  | 
| @@ -28,12 +29,13 @@ module Html2Doc | |
| 28 29 |  | 
| 29 30 | 
             
                encoded_file = Base64.strict_encode64(content).gsub(/(.{76})/, "\\1\n")
         | 
| 30 31 | 
             
                <<~"FILE"
         | 
| 31 | 
            -
             | 
| 32 | 
            -
             | 
| 33 | 
            -
             | 
| 34 | 
            -
             | 
| 32 | 
            +
                  --#{boundary}
         | 
| 33 | 
            +
                  Content-ID: <#{File.basename(item)}>
         | 
| 34 | 
            +
                  Content-Disposition: inline; filename="#{File.basename(item)}"
         | 
| 35 | 
            +
                  Content-Transfer-Encoding: base64
         | 
| 36 | 
            +
                  Content-Type: #{content_type}
         | 
| 35 37 |  | 
| 36 | 
            -
             | 
| 38 | 
            +
                  #{encoded_file}
         | 
| 37 39 |  | 
| 38 40 | 
             
                FILE
         | 
| 39 41 | 
             
              end
         | 
| @@ -41,7 +43,7 @@ module Html2Doc | |
| 41 43 | 
             
              def self.mime_type(item)
         | 
| 42 44 | 
             
                types = MIME::Types.type_for(item)
         | 
| 43 45 | 
             
                type = types ? types.first.to_s : 'text/plain; charset="utf-8"'
         | 
| 44 | 
            -
                type = type  | 
| 46 | 
            +
                type = %(#{type} charset="utf-8") if /^text/.match(type) && types
         | 
| 45 47 | 
             
                type
         | 
| 46 48 | 
             
              end
         | 
| 47 49 |  | 
| @@ -52,25 +54,37 @@ module Html2Doc | |
| 52 54 |  | 
| 53 55 | 
             
              def self.mime_package(result, filename, dir)
         | 
| 54 56 | 
             
                boundary = mime_boundary
         | 
| 55 | 
            -
                mhtml = mime_preamble(boundary, filename, result)
         | 
| 56 | 
            -
                mhtml += mime_attachment(boundary, filename, "filelist.xml", dir)
         | 
| 57 | 
            +
                mhtml = mime_preamble(boundary, "#{filename}.htm", result)
         | 
| 58 | 
            +
                mhtml += mime_attachment(boundary, "#{filename}.htm", "filelist.xml", dir)
         | 
| 57 59 | 
             
                Dir.foreach(dir) do |item|
         | 
| 58 60 | 
             
                  next if item == "." || item == ".." || /^\./.match(item) ||
         | 
| 59 61 | 
             
                    item == "filelist.xml"
         | 
| 60 | 
            -
             | 
| 62 | 
            +
             | 
| 63 | 
            +
                  mhtml += mime_attachment(boundary, "#{filename}.htm", item, dir)
         | 
| 61 64 | 
             
                end
         | 
| 62 65 | 
             
                mhtml += "--#{boundary}--"
         | 
| 63 | 
            -
                File.open("#{filename}.doc", "w:UTF-8") { |f| f.write mhtml }
         | 
| 66 | 
            +
                File.open("#{filename}.doc", "w:UTF-8") { |f| f.write contentid(mhtml) }
         | 
| 67 | 
            +
              end
         | 
| 68 | 
            +
             | 
| 69 | 
            +
              def self.contentid(mhtml)
         | 
| 70 | 
            +
                mhtml.gsub %r{(<img[^>]*?src=")([^\"']+)(['"])}m do |m|
         | 
| 71 | 
            +
                  repl = "#{$1}cid:#{File.basename($2)}#{$3}"
         | 
| 72 | 
            +
                  /^data:|^https?:/.match($2) ? m : repl
         | 
| 73 | 
            +
                end.gsub %r{(<v:imagedata[^>]*?src=")([^\"']+)(['"])}m do |m|
         | 
| 74 | 
            +
                  repl = "#{$1}cid:#{File.basename($2)}#{$3}"
         | 
| 75 | 
            +
                  /^data:|^https?:/.match($2) ? m : repl
         | 
| 76 | 
            +
                end
         | 
| 64 77 | 
             
              end
         | 
| 65 78 |  | 
| 66 79 | 
             
              # max width for Word document is 400, max height is 680
         | 
| 67 | 
            -
              def self.image_resize( | 
| 68 | 
            -
                 | 
| 69 | 
            -
                s = [ | 
| 70 | 
            -
                s =  | 
| 71 | 
            -
                return [nil, nil] if  | 
| 72 | 
            -
             | 
| 73 | 
            -
                s[ | 
| 80 | 
            +
              def self.image_resize(img, path, maxheight, maxwidth)
         | 
| 81 | 
            +
                realsize = ImageSize.path(path).size
         | 
| 82 | 
            +
                s = [img["width"].to_i, img["height"].to_i]
         | 
| 83 | 
            +
                s = realsize if s[0].zero? && s[1].zero?
         | 
| 84 | 
            +
                return [nil, nil] if realsize.nil? || realsize[0].nil? || realsize[1].nil?
         | 
| 85 | 
            +
             | 
| 86 | 
            +
                s[1] = s[0] * realsize[1] / realsize[0] if s[1].zero? && !s[0].zero?
         | 
| 87 | 
            +
                s[0] = s[1] * realsize[0] / realsize[1] if s[0].zero? && !s[1].zero?
         | 
| 74 88 | 
             
                s = [(s[0] * maxheight / s[1]).ceil, maxheight] if s[1] > maxheight
         | 
| 75 89 | 
             
                s = [maxwidth, (s[1] * maxwidth / s[0]).ceil] if s[0] > maxwidth
         | 
| 76 90 | 
             
                s
         | 
| @@ -83,19 +97,22 @@ module Html2Doc | |
| 83 97 | 
             
              end
         | 
| 84 98 |  | 
| 85 99 | 
             
              def self.warnsvg(src)
         | 
| 86 | 
            -
                warn "#{src}: SVG not supported" if /\.svg$/i.match(src)
         | 
| 100 | 
            +
                warn "#{src}: SVG not supported" if /\.svg$/i.match?(src)
         | 
| 101 | 
            +
              end
         | 
| 102 | 
            +
             | 
| 103 | 
            +
              def self.localname(src, localdir)
         | 
| 104 | 
            +
                %r{^([A-Z]:)?/}.match?(src) ? src : File.join(localdir, src)
         | 
| 87 105 | 
             
              end
         | 
| 88 106 |  | 
| 89 107 | 
             
              # only processes locally stored images
         | 
| 90 108 | 
             
              def self.image_cleanup(docxml, dir, localdir)
         | 
| 91 109 | 
             
                docxml.traverse do |i|
         | 
| 92 110 | 
             
                  next unless i.element? && %w(img v:imagedata).include?(i.name)
         | 
| 93 | 
            -
                   | 
| 94 | 
            -
                  next if  | 
| 95 | 
            -
             | 
| 96 | 
            -
                  local_filename =  | 
| 97 | 
            -
             | 
| 98 | 
            -
                  new_filename = "#{mkuuid}#{File.extname(i["src"])}"
         | 
| 111 | 
            +
                  next if /^http/.match? i["src"]
         | 
| 112 | 
            +
                  next if %r{^data:(image|application)/[^;]+;base64}.match? i["src"]
         | 
| 113 | 
            +
             | 
| 114 | 
            +
                  local_filename = localname(i["src"], localdir)
         | 
| 115 | 
            +
                  new_filename = "#{mkuuid}#{File.extname(i['src'])}"
         | 
| 99 116 | 
             
                  FileUtils.cp local_filename, File.join(dir, new_filename)
         | 
| 100 117 | 
             
                  i["width"], i["height"] = image_resize(i, local_filename, 680, 400)
         | 
| 101 118 | 
             
                  i["src"] = File.join(File.basename(dir), new_filename)
         | 
| @@ -103,7 +120,7 @@ module Html2Doc | |
| 103 120 | 
             
                docxml
         | 
| 104 121 | 
             
              end
         | 
| 105 122 |  | 
| 106 | 
            -
              # do not parse the header through Nokogiri, since it will contain | 
| 123 | 
            +
              # do not parse the header through Nokogiri, since it will contain
         | 
| 107 124 | 
             
              # non-XML like <![if !supportFootnotes]>
         | 
| 108 125 | 
             
              def self.header_image_cleanup(doc, dir, filename, localdir)
         | 
| 109 126 | 
             
                doc.split(%r{(<img [^>]*>|<v:imagedata [^>]*>)}).each_slice(2).map do |a|
         | 
| @@ -111,16 +128,14 @@ module Html2Doc | |
| 111 128 | 
             
                end.join
         | 
| 112 129 | 
             
              end
         | 
| 113 130 |  | 
| 114 | 
            -
              def self.header_image_cleanup1(a, dir,  | 
| 131 | 
            +
              def self.header_image_cleanup1(a, dir, _filename, localdir)
         | 
| 115 132 | 
             
                if a.size == 2 && !(/ src="https?:/.match a[1]) &&
         | 
| 116 133 | 
             
                    !(%r{ src="data:(image|application)/[^;]+;base64}.match a[1])
         | 
| 117 134 | 
             
                  m = / src=['"](?<src>[^"']+)['"]/.match a[1]
         | 
| 118 | 
            -
                  #warnsvg(m[:src])
         | 
| 119 135 | 
             
                  m2 = /\.(?<suffix>[a-zA-Z_0-9]+)$/.match m[:src]
         | 
| 120 136 | 
             
                  new_filename = "#{mkuuid}.#{m2[:suffix]}"
         | 
| 121 | 
            -
                   | 
| 122 | 
            -
                   | 
| 123 | 
            -
                  a[1].sub!(%r{ src=['"](?<src>[^"']+)['"]}, " src='file:///C:/Doc/#{filename}_files/#{new_filename}'")
         | 
| 137 | 
            +
                  FileUtils.cp localname(m[:src], localdir), File.join(dir, new_filename)
         | 
| 138 | 
            +
                  a[1].sub!(%r{ src=['"](?<src>[^"']+)['"]}, " src='cid:#{new_filename}'")
         | 
| 124 139 | 
             
                end
         | 
| 125 140 | 
             
                a.join
         | 
| 126 141 | 
             
              end
         | 
| @@ -131,6 +146,7 @@ module Html2Doc | |
| 131 146 | 
             
                    <o:MainFile HRef="../#{filename}.htm"/>}
         | 
| 132 147 | 
             
                  Dir.entries(dir).sort.each do |item|
         | 
| 133 148 | 
             
                    next if item == "." || item == ".." || /^\./.match(item)
         | 
| 149 | 
            +
             | 
| 134 150 | 
             
                    f.write %{  <o:File HRef="#{item}"/>\n}
         | 
| 135 151 | 
             
                  end
         | 
| 136 152 | 
             
                  f.write("</xml>\n")
         | 
    
        data/lib/html2doc/notes.rb
    CHANGED
    
    | @@ -6,6 +6,7 @@ module Html2Doc | |
| 6 6 | 
             
                fn = []
         | 
| 7 7 | 
             
                docxml.xpath("//a").each do |a|
         | 
| 8 8 | 
             
                  next unless process_footnote_link(docxml, a, i, fn)
         | 
| 9 | 
            +
             | 
| 9 10 | 
             
                  i += 1
         | 
| 10 11 | 
             
                end
         | 
| 11 12 | 
             
                process_footnote_texts(docxml, fn)
         | 
| @@ -22,13 +23,13 @@ module Html2Doc | |
| 22 23 | 
             
                footnote_cleanup(docxml)
         | 
| 23 24 | 
             
              end
         | 
| 24 25 |  | 
| 25 | 
            -
              def self.footnote_div_to_p( | 
| 26 | 
            -
                if %w{div aside}.include?  | 
| 27 | 
            -
                  if  | 
| 28 | 
            -
                     | 
| 26 | 
            +
              def self.footnote_div_to_p(elem)
         | 
| 27 | 
            +
                if %w{div aside}.include? elem.name
         | 
| 28 | 
            +
                  if elem.at(".//p")
         | 
| 29 | 
            +
                    elem.replace(elem.children)
         | 
| 29 30 | 
             
                  else
         | 
| 30 | 
            -
                     | 
| 31 | 
            -
                     | 
| 31 | 
            +
                    elem.name = "p"
         | 
| 32 | 
            +
                    elem["class"] = "MsoFootnoteText"
         | 
| 32 33 | 
             
                  end
         | 
| 33 34 | 
             
                end
         | 
| 34 35 | 
             
              end
         | 
| @@ -36,34 +37,39 @@ module Html2Doc | |
| 36 37 | 
             
              FN = "<span class='MsoFootnoteReference'>"\
         | 
| 37 38 | 
             
                "<span style='mso-special-character:footnote'/></span>".freeze
         | 
| 38 39 |  | 
| 39 | 
            -
              def self.footnote_container(docxml,  | 
| 40 | 
            -
                ref = docxml&.at("//a[@href='#_ftn#{ | 
| 41 | 
            -
                  gsub(/>\n</, "><") || FN
         | 
| 40 | 
            +
              def self.footnote_container(docxml, idx)
         | 
| 41 | 
            +
                ref = docxml&.at("//a[@href='#_ftn#{idx}']")&.children&.to_xml(indent: 0)
         | 
| 42 | 
            +
                  &.gsub(/>\n</, "><") || FN
         | 
| 42 43 | 
             
                <<~DIV
         | 
| 43 | 
            -
                  <div style='mso-element:footnote' id='ftn#{ | 
| 44 | 
            -
                    <a style='mso-footnote-id:ftn#{ | 
| 45 | 
            -
                       name='_ftnref#{ | 
| 44 | 
            +
                  <div style='mso-element:footnote' id='ftn#{idx}'>
         | 
| 45 | 
            +
                    <a style='mso-footnote-id:ftn#{idx}' href='#_ftn#{idx}'
         | 
| 46 | 
            +
                       name='_ftnref#{idx}' title='' id='_ftnref#{idx}'>#{ref.strip}</a></div>
         | 
| 46 47 | 
             
                DIV
         | 
| 47 48 | 
             
              end
         | 
| 48 49 |  | 
| 49 | 
            -
              def self.process_footnote_link(docxml,  | 
| 50 | 
            -
                return false unless footnote?( | 
| 51 | 
            -
             | 
| 50 | 
            +
              def self.process_footnote_link(docxml, elem, idx, footnote)
         | 
| 51 | 
            +
                return false unless footnote?(elem)
         | 
| 52 | 
            +
             | 
| 53 | 
            +
                href = elem["href"].gsub(/^#/, "")
         | 
| 52 54 | 
             
                note = docxml.at("//*[@name = '#{href}' or @id = '#{href}']")
         | 
| 53 55 | 
             
                return false if note.nil?
         | 
| 54 | 
            -
             | 
| 55 | 
            -
                 | 
| 56 | 
            -
             | 
| 57 | 
            -
             | 
| 58 | 
            -
             | 
| 59 | 
            -
             | 
| 60 | 
            -
             | 
| 61 | 
            -
             | 
| 56 | 
            +
             | 
| 57 | 
            +
                set_footnote_link_attrs(elem, idx)
         | 
| 58 | 
            +
                if elem.at("./span[@class = 'MsoFootnoteReference']")
         | 
| 59 | 
            +
                  process_footnote_link1(elem)
         | 
| 60 | 
            +
                else elem.children = FN
         | 
| 61 | 
            +
                end
         | 
| 62 | 
            +
                footnote << transform_footnote_text(note)
         | 
| 63 | 
            +
              end
         | 
| 64 | 
            +
             | 
| 65 | 
            +
              def self.process_footnote_link1(elem)
         | 
| 66 | 
            +
                elem.children.each do |c|
         | 
| 67 | 
            +
                  if c.name == "span" && c["class"] == "MsoFootnoteReference"
         | 
| 68 | 
            +
                    c.replace(FN)
         | 
| 69 | 
            +
                  else
         | 
| 70 | 
            +
                    c.wrap("<span class='MsoFootnoteReference'></span>")
         | 
| 62 71 | 
             
                  end
         | 
| 63 | 
            -
                else
         | 
| 64 | 
            -
                  a.children = FN
         | 
| 65 72 | 
             
                end
         | 
| 66 | 
            -
                fn << transform_footnote_text(note)
         | 
| 67 73 | 
             
              end
         | 
| 68 74 |  | 
| 69 75 | 
             
              def self.transform_footnote_text(note)
         | 
| @@ -76,16 +82,16 @@ module Html2Doc | |
| 76 82 | 
             
                note.remove
         | 
| 77 83 | 
             
              end
         | 
| 78 84 |  | 
| 79 | 
            -
              def self.footnote?( | 
| 80 | 
            -
                 | 
| 81 | 
            -
                   | 
| 85 | 
            +
              def self.footnote?(elem)
         | 
| 86 | 
            +
                elem["epub:type"]&.casecmp("footnote")&.zero? ||
         | 
| 87 | 
            +
                  elem["class"]&.casecmp("footnote")&.zero?
         | 
| 82 88 | 
             
              end
         | 
| 83 89 |  | 
| 84 | 
            -
              def self.set_footnote_link_attrs( | 
| 85 | 
            -
                 | 
| 86 | 
            -
                 | 
| 87 | 
            -
                 | 
| 88 | 
            -
                 | 
| 90 | 
            +
              def self.set_footnote_link_attrs(elem, idx)
         | 
| 91 | 
            +
                elem["style"] = "mso-footnote-id:ftn#{idx}"
         | 
| 92 | 
            +
                elem["href"] = "#_ftn#{idx}"
         | 
| 93 | 
            +
                elem["name"] = "_ftnref#{idx}"
         | 
| 94 | 
            +
                elem["title"] = ""
         | 
| 89 95 | 
             
              end
         | 
| 90 96 |  | 
| 91 97 | 
             
              # We expect that the content of the footnote text received is one or
         | 
| @@ -94,8 +100,8 @@ module Html2Doc | |
| 94 100 | 
             
              # are present in the HTML, they need to have been cleaned out before
         | 
| 95 101 | 
             
              # passing to this gem
         | 
| 96 102 | 
             
              def self.footnote_cleanup(docxml)
         | 
| 97 | 
            -
                docxml.xpath('//div[@style="mso-element:footnote"]/a') | 
| 98 | 
            -
                  each do |x|
         | 
| 103 | 
            +
                docxml.xpath('//div[@style="mso-element:footnote"]/a')
         | 
| 104 | 
            +
                  .each do |x|
         | 
| 99 105 | 
             
                  n = x.next_element
         | 
| 100 106 | 
             
                  n&.children&.first&.add_previous_sibling(x.remove)
         | 
| 101 107 | 
             
                end
         |