html2doc 1.5.3 → 1.5.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 114c21f3bbc33c244fb49577d4c85334789a388be61d65c04765d33ed1913208
4
- data.tar.gz: 60c00e3a300eb16db1f2332393d5ff9a45f65c60a8a1d40a1cfaba2767744249
3
+ metadata.gz: 46856bf56ad5dd95f8f5781dc11049bb4600060c28c49715a262837ece8028bf
4
+ data.tar.gz: 866ba19867f233b45aeee436df719679623d671902b30476b61952f7a6357e1f
5
5
  SHA512:
6
- metadata.gz: 606e87a4dcfd0a3e270588461c6150dd38b1baad57bfc90b3965806477945deeae0b2eaeca87af9800b70d214133b11588a66f8205108e3af87f13464306b34a
7
- data.tar.gz: ecc2ad3631a0cafb9ff3b7f04022b9a977efac71099f7b256f840ec6d15e418bb18f426c729d6a40e9a9cc196f3e02d4cf0c5c448a99c6097c2da9d6b2bfb4d0
6
+ metadata.gz: b949f47c356437ce418f65ce7fd1c497648d0d0e960fe1e05d7318d280ddf6de23ddad8e5ab94a18f447b3eaba948b2a4db69ca2d00f0dcfebd692933a64c1da
7
+ data.tar.gz: 953999bd39aa1c1b6a0e1a34c939dcbdba01242c898f5a587eab546b4c9a4578e8051c9c68b49900390fcb3ab5d2de4ad6f0f1bce1af5a8ee6e7bd4daf300966
data/lib/html2doc/base.rb CHANGED
@@ -30,8 +30,7 @@ class Html2Doc
30
30
  end
31
31
 
32
32
  def process_header(headerfile)
33
- return if headerfile.nil?
34
-
33
+ headerfile.nil? and return
35
34
  doc = File.read(headerfile, encoding: "utf-8")
36
35
  doc = header_image_cleanup(doc, @dir1, @filename,
37
36
  File.dirname(@filename))
@@ -66,6 +65,7 @@ class Html2Doc
66
65
  end
67
66
 
68
67
  def cleanup(docxml)
68
+ locate_landscape(docxml)
69
69
  namespace(docxml.root)
70
70
  image_cleanup(docxml, @dir1, @imagedir)
71
71
  mathml_to_ooml(docxml)
@@ -76,76 +76,11 @@ class Html2Doc
76
76
  docxml
77
77
  end
78
78
 
79
- NOKOHEAD = <<~HERE.freeze
80
- <!DOCTYPE html SYSTEM
81
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
82
- <html xmlns="http://www.w3.org/1999/xhtml">
83
- <head> <title></title> <meta charset="UTF-8" /> </head>
84
- <body> </body> </html>
85
- HERE
86
-
87
- def to_xhtml(xml)
88
- xml.gsub!(/<\?xml[^>]*>/, "")
89
- unless /<!DOCTYPE /.match? xml
90
- xml = '<!DOCTYPE html SYSTEM
91
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' + xml
92
- end
93
- xml = xml.gsub(/<!--\s*\[([^\]]+)\]>/, "<!-- MSWORD-COMMENT \\1 -->")
94
- .gsub(/<!\s*\[endif\]\s*-->/, "<!-- MSWORD-COMMENT-END -->")
95
- Nokogiri::XML.parse(xml)
96
- end
97
-
98
- DOCTYPE = <<~"DOCTYPE".freeze
99
- <!DOCTYPE html SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
100
- DOCTYPE
101
-
102
- def from_xhtml(xml)
103
- xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
104
- .sub(DOCTYPE, "").gsub(%{ />}, "/>")
105
- .gsub(/<!-- MSWORD-COMMENT (.+?) -->/, "<!--[\\1]>")
106
- .gsub(/<!-- MSWORD-COMMENT-END -->/, "<![endif]-->")
107
- .gsub("\n--&gt;\n", "\n-->\n")
108
- end
109
-
110
- def msword_fix(doc)
111
- # brain damage in MSWord parser
112
- doc.gsub!(%r{<w:DoNotOptimizeForBrowser></w:DoNotOptimizeForBrowser>},
113
- "<w:DoNotOptimizeForBrowser/>")
114
- doc.gsub!(%r{<span style="mso-special-character:footnote"/>},
115
- '<span style="mso-special-character:footnote"></span>')
116
- doc.gsub!(%r{<div style="mso-element:footnote-list"></div>},
117
- '<div style="mso-element:footnote-list"/>')
118
- doc.gsub!(%r{(<a style="mso-comment-reference:[^>/]+)/>}, "\\1></a>")
119
- doc.gsub!(%r{<link rel="File-List"}, "<link rel=File-List")
120
- doc.gsub!(%r{<meta http-equiv="Content-Type"},
121
- "<meta http-equiv=Content-Type")
122
- doc.gsub!(%r{></m:jc>}, "/>")
123
- doc.gsub!(%r{></v:stroke>}, "/>")
124
- doc.gsub!(%r{></v:f>}, "/>")
125
- doc.gsub!(%r{></v:path>}, "/>")
126
- doc.gsub!(%r{></o:lock>}, "/>")
127
- doc.gsub!(%r{></v:imagedata>}, "/>")
128
- doc.gsub!(%r{></w:wrap>}, "/>")
129
- doc.gsub!(%r{<(/)?m:(span|em)\b}, "<\\1\\2")
130
- doc.gsub!(%r{&tab;|&amp;tab;},
131
- '<span style="mso-tab-count:1">&#xA0; </span>')
132
- doc.split(%r{(<m:oMath>|</m:oMath>)}).each_slice(4).map do |a|
133
- a.size > 2 and a[2] = a[2].gsub(/>\s+</, "><")
134
- a
135
- end.join
136
- end
137
-
138
- PRINT_VIEW = <<~XML.freeze
139
-
140
- <xml>
141
- <w:WordDocument>
142
- <w:View>Print</w:View>
143
- <w:Zoom>100</w:Zoom>
144
- <w:DoNotOptimizeForBrowser/>
145
- </w:WordDocument>
146
- </xml>
147
- <meta http-equiv='Content-Type' content="text/html; charset=utf-8"/>
148
- XML
79
+ def locate_landscape(_docxml)
80
+ css = read_stylesheet(@stylesheet)
81
+ @landscape = css.scan(/div\.\S+\s+\{\s*page:\s*[^;]+L;\s*\}/m)
82
+ .map { |e| e.sub(/^div\.(\S+).*$/m, "\\1") }
83
+ end
149
84
 
150
85
  def define_head1(docxml, _dir)
151
86
  docxml.xpath("//*[local-name() = 'head']").each do |h|
@@ -168,18 +103,21 @@ class Html2Doc
168
103
  end
169
104
 
170
105
  def stylesheet(_filename, _header_filename, cssname)
171
- (cssname.nil? || cssname.empty?) and
172
- cssname = File.join(File.dirname(__FILE__), "wordstyle.css")
173
- stylesheet = File.read(cssname, encoding: "UTF-8")
106
+ stylesheet = read_stylesheet(cssname)
174
107
  xml = Nokogiri::XML("<style/>")
175
108
  # s = Nokogiri::XML::CDATA.new(xml, "\n#{stylesheet}\n")
176
109
  # xml.children.first << Nokogiri::XML::Comment.new(xml, s)
177
110
  xml.children.first << Nokogiri::XML::CDATA
178
111
  .new(xml, "\n<!--\n#{stylesheet}\n-->\n")
179
-
180
112
  xml.root.to_s
181
113
  end
182
114
 
115
+ def read_stylesheet(cssname)
116
+ (cssname.nil? || cssname.empty?) and
117
+ cssname = File.join(File.dirname(__FILE__), "wordstyle.css")
118
+ File.read(cssname, encoding: "UTF-8")
119
+ end
120
+
183
121
  def define_head(docxml)
184
122
  title = docxml.at("//*[local-name() = 'head']/*[local-name() = 'title']")
185
123
  head = docxml.at("//*[local-name() = 'head']")
@@ -195,30 +133,15 @@ class Html2Doc
195
133
  head.add_child css
196
134
  elsif title.nil?
197
135
  head.children.first.add_previous_sibling css
198
- else
199
- title.add_next_sibling css
136
+ else title.add_next_sibling css
200
137
  end
201
138
  end
202
139
 
203
- def namespace(root)
204
- {
205
- o: "urn:schemas-microsoft-com:office:office",
206
- w: "urn:schemas-microsoft-com:office:word",
207
- v: "urn:schemas-microsoft-com:vml",
208
- m: "http://schemas.microsoft.com/office/2004/12/omml",
209
- }.each { |k, v| root.add_namespace_definition(k.to_s, v) }
210
- end
211
-
212
- def rootnamespace(root)
213
- root.add_namespace(nil, "http://www.w3.org/TR/REC-html40")
214
- end
215
-
216
140
  def bookmarks(docxml)
217
141
  docxml.xpath("//*[@id][not(@name)][not(@style = 'mso-element:footnote')]")
218
142
  .each do |x|
219
- next if x["id"].empty? ||
220
- %w(shapetype v:shapetype shape v:shape).include?(x.name)
221
-
143
+ (x["id"].empty? ||
144
+ %w(shapetype v:shapetype shape v:shape).include?(x.name)) and next
222
145
  if x.children.empty? then x.add_child("<a name='#{x['id']}'></a>")
223
146
  else x.children.first.previous = "<a name='#{x['id']}'></a>"
224
147
  end
data/lib/html2doc/mime.rb CHANGED
@@ -76,11 +76,9 @@ class Html2Doc
76
76
  end
77
77
  end
78
78
 
79
- # max width for Word document is 400, max height is 680
80
79
  def image_resize(img, path, maxheight, maxwidth)
81
80
  s, realsize = get_image_size(img, path)
82
- return s if s[0] == nil && s[1] == nil
83
-
81
+ s[0] == nil && s[1] == nil and return s
84
82
  if img.name == "svg" && !img["viewBox"]
85
83
  img["viewBox"] = "0 0 #{s[0]} #{s[1]}"
86
84
  end
@@ -115,21 +113,100 @@ class Html2Doc
115
113
 
116
114
  # only processes locally stored images
117
115
  def image_cleanup(docxml, dir, localdir)
116
+ maxheight, maxwidth = page_dimensions(docxml)
118
117
  docxml.traverse do |i|
119
- src = i["src"]
120
- next unless i.element? && %w(img v:imagedata).include?(i.name)
121
- next if src.nil? || src.empty? || /^http/.match?(src)
122
- next if %r{^data:(image|application)/[^;]+;base64}.match? src
123
-
124
- local_filename = localname(src, localdir)
125
- new_filename = "#{mkuuid}#{File.extname(src)}"
126
- FileUtils.cp local_filename, File.join(dir, new_filename)
127
- i["width"], i["height"] = image_resize(i, local_filename, 680, 400)
128
- i["src"] = File.join(File.basename(dir), new_filename)
118
+ skip_image_cleanup?(i) and next
119
+ local_filename = rename_image(i, dir, localdir)
120
+ i["width"], i["height"] =
121
+ if landscape?(i)
122
+ image_resize(i, local_filename, maxwidth, maxheight)
123
+ else
124
+ image_resize(i, local_filename, maxheight, maxwidth)
125
+ end
129
126
  end
130
127
  docxml
131
128
  end
132
129
 
130
+ def landscape?(img)
131
+ img.ancestors.each do |a|
132
+ a.name == "div" or next
133
+ @landscape.include?(a["class"]) and return true
134
+ end
135
+ false
136
+ end
137
+
138
+ def rename_image(img, dir, localdir)
139
+ local_filename = localname(img["src"], localdir)
140
+ new_filename = "#{mkuuid}#{File.extname(img['src'])}"
141
+ FileUtils.cp local_filename, File.join(dir, new_filename)
142
+ img["src"] = File.join(File.basename(dir), new_filename)
143
+ local_filename
144
+ end
145
+
146
+ def skip_image_cleanup?(img)
147
+ src = img["src"]
148
+ (img.element? && %w(img v:imagedata).include?(img.name)) or return true
149
+ (src.nil? || src.empty? || /^http/.match?(src) ||
150
+ %r{^data:(image|application)/[^;]+;base64}.match?(src)) and return true
151
+ false
152
+ end
153
+
154
+ # we are going to use the 2nd instance of @page in the Word CSS,
155
+ # skipping the cover page. Currently doesn't deal with Landscape.
156
+ # Scan both @stylesheet and docxml.to_xml (where @standardstylesheet has ended up)
157
+ # Allow 0.9 * height to fit caption
158
+ def page_dimensions(docxml)
159
+ stylesheet = read_stylesheet(@stylesheet)
160
+ page_size = find_page_size_in_doc(stylesheet, docxml.to_xml) or
161
+ return [680, 400]
162
+ m_size = /size:\s*(\S+)\s+(\S+)\s*;/.match(page_size) or return [680, 400]
163
+ m_marg = /margin:\s*(\S+)\s+(\S+)\s*(\S+)\s*(\S+)\s*;/.match(page_size) or
164
+ return [680, 400]
165
+ [0.9 * (units_to_px(m_size[2]) - units_to_px(m_marg[1]) - units_to_px(m_marg[3])),
166
+ units_to_px(m_size[1]) - units_to_px(m_marg[2]) - units_to_px(m_marg[4])]
167
+ rescue StandardError
168
+ [680, 400]
169
+ end
170
+
171
+ def find_page_size_in_doc(stylesheet, doc)
172
+ find_page_size(stylesheet, "WordSection2", false) ||
173
+ find_page_size(stylesheet, "WordSection3", false) ||
174
+ find_page_size(doc, "WordSection2", true) ||
175
+ find_page_size(doc, "WordSection3", true) ||
176
+ find_page_size(stylesheet, "", false) || find_page_size(doc, "", true)
177
+ end
178
+
179
+ # if in_xml, CSS is embedded in XML <style> tag
180
+ def find_page_size(stylesheet, klass, in_xml)
181
+ xml_found = false
182
+ found = false
183
+ ret = ""
184
+ stylesheet&.lines&.each do |l|
185
+ in_xml && l.include?("<style") and xml_found = true and found = false
186
+ in_xml && l.include?("</style>") and xml_found = false
187
+ /^\s*@page\s+#{klass}/.match?(l) and found = true
188
+ found && /^\s*\{?size:/.match?(l) and ret += l
189
+ found && /^\s*\{?margin:/.match?(l) and ret += l
190
+ if found && /}/.match?(l)
191
+ !ret.blank? && (!in_xml || xml_found) and return ret
192
+ ret = ""
193
+ found = false
194
+ end
195
+ end
196
+ nil
197
+ end
198
+
199
+ def units_to_px(measure)
200
+ m = /^(\S+)(pt|cm)/.match(measure)
201
+ ret = case m[2]
202
+ when "px" then (m[1].to_f * 0.75)
203
+ when "pt" then m[1].to_f
204
+ when "cm" then (m[1].to_f * 28.346456693)
205
+ when "in" then (m[1].to_f * 72)
206
+ end
207
+ ret.to_i
208
+ end
209
+
133
210
  # do not parse the header through Nokogiri, since it will contain
134
211
  # non-XML like <![if !supportFootnotes]>
135
212
  def header_image_cleanup(doc, dir, filename, localdir)
@@ -155,8 +232,7 @@ class Html2Doc
155
232
  f.write %{<xml xmlns:o="urn:schemas-microsoft-com:office:office">
156
233
  <o:MainFile HRef="../#{filename}.htm"/>}
157
234
  Dir.entries(dir).sort.each do |item|
158
- next if item == "." || item == ".." || /^\./.match(item)
159
-
235
+ (item == "." || item == ".." || /^\./.match(item)) and next
160
236
  f.write %{ <o:File HRef="#{item}"/>\n}
161
237
  end
162
238
  f.write("</xml>\n")
data/lib/html2doc/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  class Html2Doc
2
- VERSION = "1.5.3".freeze
2
+ VERSION = "1.5.5".freeze
3
3
  end
data/lib/html2doc/xml.rb ADDED
@@ -0,0 +1,83 @@
1
+ class Html2Doc
2
+ NOKOHEAD = <<~HERE.freeze
3
+ <!DOCTYPE html SYSTEM
4
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
5
+ <html xmlns="http://www.w3.org/1999/xhtml">
6
+ <head> <title></title> <meta charset="UTF-8" /> </head>
7
+ <body> </body> </html>
8
+ HERE
9
+
10
+ def to_xhtml(xml)
11
+ xml.gsub!(/<\?xml[^>]*>/, "")
12
+ unless /<!DOCTYPE /.match? xml
13
+ xml = '<!DOCTYPE html SYSTEM
14
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' + xml
15
+ end
16
+ xml = xml.gsub(/<!--\s*\[([^\]]+)\]>/, "<!-- MSWORD-COMMENT \\1 -->")
17
+ .gsub(/<!\s*\[endif\]\s*-->/, "<!-- MSWORD-COMMENT-END -->")
18
+ Nokogiri::XML.parse(xml)
19
+ end
20
+
21
+ DOCTYPE = <<~DOCTYPE.freeze
22
+ <!DOCTYPE html SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
23
+ DOCTYPE
24
+
25
+ def from_xhtml(xml)
26
+ xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
27
+ .sub(DOCTYPE, "").gsub(%{ />}, "/>")
28
+ .gsub(/<!-- MSWORD-COMMENT (.+?) -->/, "<!--[\\1]>")
29
+ .gsub(/<!-- MSWORD-COMMENT-END -->/, "<![endif]-->")
30
+ .gsub("\n--&gt;\n", "\n-->\n")
31
+ end
32
+
33
+ def msword_fix(doc)
34
+ # brain damage in MSWord parser
35
+ doc.gsub!(%r{<w:DoNotOptimizeForBrowser></w:DoNotOptimizeForBrowser>},
36
+ "<w:DoNotOptimizeForBrowser/>")
37
+ doc.gsub!(%r{<span style="mso-special-character:footnote"/>},
38
+ '<span style="mso-special-character:footnote"></span>')
39
+ doc.gsub!(%r{<div style="mso-element:footnote-list"></div>},
40
+ '<div style="mso-element:footnote-list"/>')
41
+ doc.gsub!(%r{(<a style="mso-comment-reference:[^>/]+)/>}, "\\1></a>")
42
+ doc.gsub!(%r{<link rel="File-List"}, "<link rel=File-List")
43
+ doc.gsub!(%r{<meta http-equiv="Content-Type"},
44
+ "<meta http-equiv=Content-Type")
45
+ doc.gsub!(%r{></m:jc>}, "/>")
46
+ doc.gsub!(%r{></v:stroke>}, "/>")
47
+ doc.gsub!(%r{></v:f>}, "/>")
48
+ doc.gsub!(%r{></v:path>}, "/>")
49
+ doc.gsub!(%r{></o:lock>}, "/>")
50
+ doc.gsub!(%r{></v:imagedata>}, "/>")
51
+ doc.gsub!(%r{></w:wrap>}, "/>")
52
+ doc.gsub!(%r{<(/)?m:(span|em)\b}, "<\\1\\2")
53
+ doc.gsub!(%r{&tab;|&amp;tab;},
54
+ '<span style="mso-tab-count:1">&#xA0; </span>')
55
+ doc.split(%r{(<m:oMath>|</m:oMath>)}).each_slice(4).map do |a|
56
+ a.size > 2 and a[2] = a[2].gsub(/>\s+</, "><")
57
+ a
58
+ end.join
59
+ end
60
+
61
+ PRINT_VIEW = <<~XML.freeze
62
+
63
+ <xml>
64
+ <w:WordDocument>
65
+ <w:View>Print</w:View>
66
+ <w:Zoom>100</w:Zoom>
67
+ <w:DoNotOptimizeForBrowser/>
68
+ </w:WordDocument>
69
+ </xml>
70
+ <meta http-equiv='Content-Type' content="text/html; charset=utf-8"/>
71
+ XML
72
+
73
+ def namespace(root)
74
+ { o: "urn:schemas-microsoft-com:office:office",
75
+ w: "urn:schemas-microsoft-com:office:word",
76
+ v: "urn:schemas-microsoft-com:vml",
77
+ m: "http://schemas.microsoft.com/office/2004/12/omml" }.each { |k, v| root.add_namespace_definition(k.to_s, v) }
78
+ end
79
+
80
+ def rootnamespace(root)
81
+ root.add_namespace(nil, "http://www.w3.org/TR/REC-html40")
82
+ end
83
+ end
data/lib/html2doc.rb CHANGED
@@ -4,3 +4,4 @@ require_relative "html2doc/mime"
4
4
  require_relative "html2doc/notes"
5
5
  require_relative "html2doc/math"
6
6
  require_relative "html2doc/lists"
7
+ require_relative "html2doc/xml"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2doc
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.5.3
4
+ version: 1.5.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-04-10 00:00:00.000000000 Z
11
+ date: 2023-06-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: htmlentities
@@ -293,6 +293,7 @@ files:
293
293
  - lib/html2doc/notes.rb
294
294
  - lib/html2doc/version.rb
295
295
  - lib/html2doc/wordstyle.css
296
+ - lib/html2doc/xml.rb
296
297
  homepage: https://github.com/metanorma/html2doc
297
298
  licenses:
298
299
  - CC-BY-SA-3.0