html2doc 1.5.4 → 1.5.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 74b05f46f1fd365f9ff0766e95d884bd2959c01b92c70d4a080651adfc2e8d3c
4
- data.tar.gz: f70eb009e705ff767b34922fc0444740be8dde80da8b78c503784e02be0e4560
3
+ metadata.gz: 46856bf56ad5dd95f8f5781dc11049bb4600060c28c49715a262837ece8028bf
4
+ data.tar.gz: 866ba19867f233b45aeee436df719679623d671902b30476b61952f7a6357e1f
5
5
  SHA512:
6
- metadata.gz: e3d93501d63bd27ed6e5245cb18dbc49013fcecd83bc57acf3a5d3c797636b928b91e148e33e8326f10f77f9b94a7175d85294eb86a1b4b2261aafb7dfe9d7a4
7
- data.tar.gz: 4cbb8887089e622b9d9d1fd82dc4e5fd4e8e81a28a59dcf02ccce22cb3c9e6e7c4c7802177259557c268d697ced17ec09e4181e82c0dc851a613553e7f5b58c1
6
+ metadata.gz: b949f47c356437ce418f65ce7fd1c497648d0d0e960fe1e05d7318d280ddf6de23ddad8e5ab94a18f447b3eaba948b2a4db69ca2d00f0dcfebd692933a64c1da
7
+ data.tar.gz: 953999bd39aa1c1b6a0e1a34c939dcbdba01242c898f5a587eab546b4c9a4578e8051c9c68b49900390fcb3ab5d2de4ad6f0f1bce1af5a8ee6e7bd4daf300966
data/lib/html2doc/base.rb CHANGED
@@ -30,8 +30,7 @@ class Html2Doc
30
30
  end
31
31
 
32
32
  def process_header(headerfile)
33
- return if headerfile.nil?
34
-
33
+ headerfile.nil? and return
35
34
  doc = File.read(headerfile, encoding: "utf-8")
36
35
  doc = header_image_cleanup(doc, @dir1, @filename,
37
36
  File.dirname(@filename))
@@ -66,6 +65,7 @@ class Html2Doc
66
65
  end
67
66
 
68
67
  def cleanup(docxml)
68
+ locate_landscape(docxml)
69
69
  namespace(docxml.root)
70
70
  image_cleanup(docxml, @dir1, @imagedir)
71
71
  mathml_to_ooml(docxml)
@@ -76,76 +76,11 @@ class Html2Doc
76
76
  docxml
77
77
  end
78
78
 
79
- NOKOHEAD = <<~HERE.freeze
80
- <!DOCTYPE html SYSTEM
81
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
82
- <html xmlns="http://www.w3.org/1999/xhtml">
83
- <head> <title></title> <meta charset="UTF-8" /> </head>
84
- <body> </body> </html>
85
- HERE
86
-
87
- def to_xhtml(xml)
88
- xml.gsub!(/<\?xml[^>]*>/, "")
89
- unless /<!DOCTYPE /.match? xml
90
- xml = '<!DOCTYPE html SYSTEM
91
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' + xml
92
- end
93
- xml = xml.gsub(/<!--\s*\[([^\]]+)\]>/, "<!-- MSWORD-COMMENT \\1 -->")
94
- .gsub(/<!\s*\[endif\]\s*-->/, "<!-- MSWORD-COMMENT-END -->")
95
- Nokogiri::XML.parse(xml)
96
- end
97
-
98
- DOCTYPE = <<~"DOCTYPE".freeze
99
- <!DOCTYPE html SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
100
- DOCTYPE
101
-
102
- def from_xhtml(xml)
103
- xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
104
- .sub(DOCTYPE, "").gsub(%{ />}, "/>")
105
- .gsub(/<!-- MSWORD-COMMENT (.+?) -->/, "<!--[\\1]>")
106
- .gsub(/<!-- MSWORD-COMMENT-END -->/, "<![endif]-->")
107
- .gsub("\n--&gt;\n", "\n-->\n")
108
- end
109
-
110
- def msword_fix(doc)
111
- # brain damage in MSWord parser
112
- doc.gsub!(%r{<w:DoNotOptimizeForBrowser></w:DoNotOptimizeForBrowser>},
113
- "<w:DoNotOptimizeForBrowser/>")
114
- doc.gsub!(%r{<span style="mso-special-character:footnote"/>},
115
- '<span style="mso-special-character:footnote"></span>')
116
- doc.gsub!(%r{<div style="mso-element:footnote-list"></div>},
117
- '<div style="mso-element:footnote-list"/>')
118
- doc.gsub!(%r{(<a style="mso-comment-reference:[^>/]+)/>}, "\\1></a>")
119
- doc.gsub!(%r{<link rel="File-List"}, "<link rel=File-List")
120
- doc.gsub!(%r{<meta http-equiv="Content-Type"},
121
- "<meta http-equiv=Content-Type")
122
- doc.gsub!(%r{></m:jc>}, "/>")
123
- doc.gsub!(%r{></v:stroke>}, "/>")
124
- doc.gsub!(%r{></v:f>}, "/>")
125
- doc.gsub!(%r{></v:path>}, "/>")
126
- doc.gsub!(%r{></o:lock>}, "/>")
127
- doc.gsub!(%r{></v:imagedata>}, "/>")
128
- doc.gsub!(%r{></w:wrap>}, "/>")
129
- doc.gsub!(%r{<(/)?m:(span|em)\b}, "<\\1\\2")
130
- doc.gsub!(%r{&tab;|&amp;tab;},
131
- '<span style="mso-tab-count:1">&#xA0; </span>')
132
- doc.split(%r{(<m:oMath>|</m:oMath>)}).each_slice(4).map do |a|
133
- a.size > 2 and a[2] = a[2].gsub(/>\s+</, "><")
134
- a
135
- end.join
136
- end
137
-
138
- PRINT_VIEW = <<~XML.freeze
139
-
140
- <xml>
141
- <w:WordDocument>
142
- <w:View>Print</w:View>
143
- <w:Zoom>100</w:Zoom>
144
- <w:DoNotOptimizeForBrowser/>
145
- </w:WordDocument>
146
- </xml>
147
- <meta http-equiv='Content-Type' content="text/html; charset=utf-8"/>
148
- XML
79
+ def locate_landscape(_docxml)
80
+ css = read_stylesheet(@stylesheet)
81
+ @landscape = css.scan(/div\.\S+\s+\{\s*page:\s*[^;]+L;\s*\}/m)
82
+ .map { |e| e.sub(/^div\.(\S+).*$/m, "\\1") }
83
+ end
149
84
 
150
85
  def define_head1(docxml, _dir)
151
86
  docxml.xpath("//*[local-name() = 'head']").each do |h|
@@ -174,7 +109,6 @@ class Html2Doc
174
109
  # xml.children.first << Nokogiri::XML::Comment.new(xml, s)
175
110
  xml.children.first << Nokogiri::XML::CDATA
176
111
  .new(xml, "\n<!--\n#{stylesheet}\n-->\n")
177
-
178
112
  xml.root.to_s
179
113
  end
180
114
 
@@ -199,30 +133,15 @@ class Html2Doc
199
133
  head.add_child css
200
134
  elsif title.nil?
201
135
  head.children.first.add_previous_sibling css
202
- else
203
- title.add_next_sibling css
136
+ else title.add_next_sibling css
204
137
  end
205
138
  end
206
139
 
207
- def namespace(root)
208
- {
209
- o: "urn:schemas-microsoft-com:office:office",
210
- w: "urn:schemas-microsoft-com:office:word",
211
- v: "urn:schemas-microsoft-com:vml",
212
- m: "http://schemas.microsoft.com/office/2004/12/omml",
213
- }.each { |k, v| root.add_namespace_definition(k.to_s, v) }
214
- end
215
-
216
- def rootnamespace(root)
217
- root.add_namespace(nil, "http://www.w3.org/TR/REC-html40")
218
- end
219
-
220
140
  def bookmarks(docxml)
221
141
  docxml.xpath("//*[@id][not(@name)][not(@style = 'mso-element:footnote')]")
222
142
  .each do |x|
223
- next if x["id"].empty? ||
224
- %w(shapetype v:shapetype shape v:shape).include?(x.name)
225
-
143
+ (x["id"].empty? ||
144
+ %w(shapetype v:shapetype shape v:shape).include?(x.name)) and next
226
145
  if x.children.empty? then x.add_child("<a name='#{x['id']}'></a>")
227
146
  else x.children.first.previous = "<a name='#{x['id']}'></a>"
228
147
  end
data/lib/html2doc/mime.rb CHANGED
@@ -78,8 +78,7 @@ class Html2Doc
78
78
 
79
79
  def image_resize(img, path, maxheight, maxwidth)
80
80
  s, realsize = get_image_size(img, path)
81
- return s if s[0] == nil && s[1] == nil
82
-
81
+ s[0] == nil && s[1] == nil and return s
83
82
  if img.name == "svg" && !img["viewBox"]
84
83
  img["viewBox"] = "0 0 #{s[0]} #{s[1]}"
85
84
  end
@@ -118,12 +117,24 @@ class Html2Doc
118
117
  docxml.traverse do |i|
119
118
  skip_image_cleanup?(i) and next
120
119
  local_filename = rename_image(i, dir, localdir)
121
- i["width"], i["height"] = image_resize(i, local_filename, maxheight,
122
- maxwidth)
120
+ i["width"], i["height"] =
121
+ if landscape?(i)
122
+ image_resize(i, local_filename, maxwidth, maxheight)
123
+ else
124
+ image_resize(i, local_filename, maxheight, maxwidth)
125
+ end
123
126
  end
124
127
  docxml
125
128
  end
126
129
 
130
+ def landscape?(img)
131
+ img.ancestors.each do |a|
132
+ a.name == "div" or next
133
+ @landscape.include?(a["class"]) and return true
134
+ end
135
+ false
136
+ end
137
+
127
138
  def rename_image(img, dir, localdir)
128
139
  local_filename = localname(img["src"], localdir)
129
140
  new_filename = "#{mkuuid}#{File.extname(img['src'])}"
@@ -134,10 +145,9 @@ class Html2Doc
134
145
 
135
146
  def skip_image_cleanup?(img)
136
147
  src = img["src"]
137
- return true unless img.element? && %w(img v:imagedata).include?(img.name)
138
- return true if src.nil? || src.empty? || /^http/.match?(src) ||
139
- %r{^data:(image|application)/[^;]+;base64}.match?(src)
140
-
148
+ (img.element? && %w(img v:imagedata).include?(img.name)) or return true
149
+ (src.nil? || src.empty? || /^http/.match?(src) ||
150
+ %r{^data:(image|application)/[^;]+;base64}.match?(src)) and return true
141
151
  false
142
152
  end
143
153
 
@@ -222,8 +232,7 @@ class Html2Doc
222
232
  f.write %{<xml xmlns:o="urn:schemas-microsoft-com:office:office">
223
233
  <o:MainFile HRef="../#{filename}.htm"/>}
224
234
  Dir.entries(dir).sort.each do |item|
225
- next if item == "." || item == ".." || /^\./.match(item)
226
-
235
+ (item == "." || item == ".." || /^\./.match(item)) and next
227
236
  f.write %{ <o:File HRef="#{item}"/>\n}
228
237
  end
229
238
  f.write("</xml>\n")
@@ -1,3 +1,3 @@
1
1
  class Html2Doc
2
- VERSION = "1.5.4".freeze
2
+ VERSION = "1.5.5".freeze
3
3
  end
@@ -0,0 +1,83 @@
1
+ class Html2Doc
2
+ NOKOHEAD = <<~HERE.freeze
3
+ <!DOCTYPE html SYSTEM
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
5
+ <html xmlns="http://www.w3.org/1999/xhtml">
6
+ <head> <title></title> <meta charset="UTF-8" /> </head>
7
+ <body> </body> </html>
8
+ HERE
9
+
10
+ def to_xhtml(xml)
11
+ xml.gsub!(/<\?xml[^>]*>/, "")
12
+ unless /<!DOCTYPE /.match? xml
13
+ xml = '<!DOCTYPE html SYSTEM
14
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' + xml
15
+ end
16
+ xml = xml.gsub(/<!--\s*\[([^\]]+)\]>/, "<!-- MSWORD-COMMENT \\1 -->")
17
+ .gsub(/<!\s*\[endif\]\s*-->/, "<!-- MSWORD-COMMENT-END -->")
18
+ Nokogiri::XML.parse(xml)
19
+ end
20
+
21
+ DOCTYPE = <<~DOCTYPE.freeze
22
+ <!DOCTYPE html SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
23
+ DOCTYPE
24
+
25
+ def from_xhtml(xml)
26
+ xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
27
+ .sub(DOCTYPE, "").gsub(%{ />}, "/>")
28
+ .gsub(/<!-- MSWORD-COMMENT (.+?) -->/, "<!--[\\1]>")
29
+ .gsub(/<!-- MSWORD-COMMENT-END -->/, "<![endif]-->")
30
+ .gsub("\n--&gt;\n", "\n-->\n")
31
+ end
32
+
33
+ def msword_fix(doc)
34
+ # brain damage in MSWord parser
35
+ doc.gsub!(%r{<w:DoNotOptimizeForBrowser></w:DoNotOptimizeForBrowser>},
36
+ "<w:DoNotOptimizeForBrowser/>")
37
+ doc.gsub!(%r{<span style="mso-special-character:footnote"/>},
38
+ '<span style="mso-special-character:footnote"></span>')
39
+ doc.gsub!(%r{<div style="mso-element:footnote-list"></div>},
40
+ '<div style="mso-element:footnote-list"/>')
41
+ doc.gsub!(%r{(<a style="mso-comment-reference:[^>/]+)/>}, "\\1></a>")
42
+ doc.gsub!(%r{<link rel="File-List"}, "<link rel=File-List")
43
+ doc.gsub!(%r{<meta http-equiv="Content-Type"},
44
+ "<meta http-equiv=Content-Type")
45
+ doc.gsub!(%r{></m:jc>}, "/>")
46
+ doc.gsub!(%r{></v:stroke>}, "/>")
47
+ doc.gsub!(%r{></v:f>}, "/>")
48
+ doc.gsub!(%r{></v:path>}, "/>")
49
+ doc.gsub!(%r{></o:lock>}, "/>")
50
+ doc.gsub!(%r{></v:imagedata>}, "/>")
51
+ doc.gsub!(%r{></w:wrap>}, "/>")
52
+ doc.gsub!(%r{<(/)?m:(span|em)\b}, "<\\1\\2")
53
+ doc.gsub!(%r{&tab;|&amp;tab;},
54
+ '<span style="mso-tab-count:1">&#xA0; </span>')
55
+ doc.split(%r{(<m:oMath>|</m:oMath>)}).each_slice(4).map do |a|
56
+ a.size > 2 and a[2] = a[2].gsub(/>\s+</, "><")
57
+ a
58
+ end.join
59
+ end
60
+
61
+ PRINT_VIEW = <<~XML.freeze
62
+
63
+ <xml>
64
+ <w:WordDocument>
65
+ <w:View>Print</w:View>
66
+ <w:Zoom>100</w:Zoom>
67
+ <w:DoNotOptimizeForBrowser/>
68
+ </w:WordDocument>
69
+ </xml>
70
+ <meta http-equiv='Content-Type' content="text/html; charset=utf-8"/>
71
+ XML
72
+
73
+ def namespace(root)
74
+ { o: "urn:schemas-microsoft-com:office:office",
75
+ w: "urn:schemas-microsoft-com:office:word",
76
+ v: "urn:schemas-microsoft-com:vml",
77
+ m: "http://schemas.microsoft.com/office/2004/12/omml" }.each { |k, v| root.add_namespace_definition(k.to_s, v) }
78
+ end
79
+
80
+ def rootnamespace(root)
81
+ root.add_namespace(nil, "http://www.w3.org/TR/REC-html40")
82
+ end
83
+ end
data/lib/html2doc.rb CHANGED
@@ -4,3 +4,4 @@ require_relative "html2doc/mime"
4
4
  require_relative "html2doc/notes"
5
5
  require_relative "html2doc/math"
6
6
  require_relative "html2doc/lists"
7
+ require_relative "html2doc/xml"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2doc
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.5.4
4
+ version: 1.5.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-05-19 00:00:00.000000000 Z
11
+ date: 2023-06-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: htmlentities
@@ -293,6 +293,7 @@ files:
293
293
  - lib/html2doc/notes.rb
294
294
  - lib/html2doc/version.rb
295
295
  - lib/html2doc/wordstyle.css
296
+ - lib/html2doc/xml.rb
296
297
  homepage: https://github.com/metanorma/html2doc
297
298
  licenses:
298
299
  - CC-BY-SA-3.0