html2doc 1.10.4 → 1.10.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 152467633084047452eb3055fb70033671821c5ba0b9918b9eff4a19373f8aea
4
- data.tar.gz: 283ec00b5bb8660322c1cd0d55aa0f63bad3d4bbde9a1b23619d466ff681ae6f
3
+ metadata.gz: 891f686836c7fa66be96cbcde889b12c9a45df09ed3321e9b78fef625b8f4102
4
+ data.tar.gz: 0135175121e52e6ca97ed471b8659b36f79bcbf1ee72453bb51112b7dba738f3
5
5
  SHA512:
6
- metadata.gz: 73004619c0d6f067037b411721fe580129a8a9ba781df3a180764b8283ab931ab8e82620605aeccffe9605ea9cee08b96caa9554fb176bbbd4adbd22339eaa78
7
- data.tar.gz: a5fa1f0947456b12d2fa902a3beeddb05397b9d7f816ceb18214345b3c06364def154387eedf0878771ee7ef5dfde46b69a0f4faf4bbc1c10c1b7816503391f7
6
+ metadata.gz: 5d614fff856c3cadf255cb364aedb8d697a226fd1c8f6dd41431dd84c2b72aaf2b7616a7f0632a6ed30df1a8d6bbbbb0ef9a60b11f8103e63924f8c618dfae71
7
+ data.tar.gz: 9aa106cfb435df572e57303a3fad1c58531f8da9d4d728dd9a8fddbbbee6bf23360b3e1024722009b23710e1c052809dc3ea4eeaa197273b5b4efe8861847903
@@ -1,3 +1,3 @@
1
1
  class Html2Doc
2
- VERSION = "1.10.4".freeze
2
+ VERSION = "1.10.5".freeze
3
3
  end
data/lib/html2doc/xml.rb CHANGED
@@ -15,19 +15,49 @@ class Html2Doc
15
15
  end
16
16
  xml = xml.gsub(/<!--\s*\[([^\<\]]+)\]>/, "<!-- MSWORD-COMMENT \\1 -->")
17
17
  .gsub(/<!\s*\[endif\]\s*-->/, "<!-- MSWORD-COMMENT-END -->")
18
+ # Escape & to &amp; in href attributes before XML parsing to prevent stripping
19
+ xml = escape_amp_in_hrefs(xml)
18
20
  Nokogiri::XML.parse(xml)
19
21
  end
20
22
 
23
+ # Escape plain & to &amp; in href attributes
24
+ # This prevents Nokogiri from stripping invalid HTML entities during XML parsing
25
+ def escape_amp_in_hrefs(html)
26
+ # Match href="..." and href='...' separately
27
+ html.gsub(/(href\s*=\s*")([^"]*)"|(href\s*=\s*')([^']*)'/) do
28
+ if Regexp.last_match(1)
29
+ "#{Regexp.last_match(1)}#{Regexp.last_match(2).gsub('&', '&amp;')}\""
30
+ else
31
+ "#{Regexp.last_match(3)}#{Regexp.last_match(4).gsub('&', '&amp;')}'"
32
+ end
33
+ end
34
+ end
35
+
21
36
  DOCTYPE = <<~DOCTYPE.freeze
22
37
  <!DOCTYPE html SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
23
38
  DOCTYPE
24
39
 
25
40
  def from_xhtml(xml)
26
- xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
41
+ result = xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
27
42
  .sub(DOCTYPE, "").gsub(%{ />}, "/>")
28
43
  .gsub(/<!-- MSWORD-COMMENT (.+?) -->/, "<!--[\\1]>")
29
44
  .gsub(/<!-- MSWORD-COMMENT-END -->/, "<![endif]-->")
30
45
  .gsub("\n--&gt;\n", "\n-->\n")
46
+ # Unescape &amp; to & in href attributes for proper URL handling
47
+ unescape_amp_in_hrefs(result)
48
+ end
49
+
50
+ # Unescape &amp; to & in href attributes only
51
+ # This ensures URLs work correctly in Word while preserving &amp; in text
52
+ def unescape_amp_in_hrefs(html)
53
+ # Match href="..." and href='...' separately
54
+ html.gsub(/(href\s*=\s*")([^"]*)"|(href\s*=\s*')([^']*)'/) do
55
+ if Regexp.last_match(1)
56
+ "#{Regexp.last_match(1)}#{Regexp.last_match(2).gsub('&amp;', '&')}\""
57
+ else
58
+ "#{Regexp.last_match(3)}#{Regexp.last_match(4).gsub('&amp;', '&')}'"
59
+ end
60
+ end
31
61
  end
32
62
 
33
63
  def msword_fix(doc)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2doc
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.10.4
4
+ version: 1.10.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2026-03-02 00:00:00.000000000 Z
11
+ date: 2026-03-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: base64