html2doc 1.5.3 → 1.5.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 114c21f3bbc33c244fb49577d4c85334789a388be61d65c04765d33ed1913208
4
- data.tar.gz: 60c00e3a300eb16db1f2332393d5ff9a45f65c60a8a1d40a1cfaba2767744249
3
+ metadata.gz: 46856bf56ad5dd95f8f5781dc11049bb4600060c28c49715a262837ece8028bf
4
+ data.tar.gz: 866ba19867f233b45aeee436df719679623d671902b30476b61952f7a6357e1f
5
5
  SHA512:
6
- metadata.gz: 606e87a4dcfd0a3e270588461c6150dd38b1baad57bfc90b3965806477945deeae0b2eaeca87af9800b70d214133b11588a66f8205108e3af87f13464306b34a
7
- data.tar.gz: ecc2ad3631a0cafb9ff3b7f04022b9a977efac71099f7b256f840ec6d15e418bb18f426c729d6a40e9a9cc196f3e02d4cf0c5c448a99c6097c2da9d6b2bfb4d0
6
+ metadata.gz: b949f47c356437ce418f65ce7fd1c497648d0d0e960fe1e05d7318d280ddf6de23ddad8e5ab94a18f447b3eaba948b2a4db69ca2d00f0dcfebd692933a64c1da
7
+ data.tar.gz: 953999bd39aa1c1b6a0e1a34c939dcbdba01242c898f5a587eab546b4c9a4578e8051c9c68b49900390fcb3ab5d2de4ad6f0f1bce1af5a8ee6e7bd4daf300966
data/lib/html2doc/base.rb CHANGED
@@ -30,8 +30,7 @@ class Html2Doc
30
30
  end
31
31
 
32
32
  def process_header(headerfile)
33
- return if headerfile.nil?
34
-
33
+ headerfile.nil? and return
35
34
  doc = File.read(headerfile, encoding: "utf-8")
36
35
  doc = header_image_cleanup(doc, @dir1, @filename,
37
36
  File.dirname(@filename))
@@ -66,6 +65,7 @@ class Html2Doc
66
65
  end
67
66
 
68
67
  def cleanup(docxml)
68
+ locate_landscape(docxml)
69
69
  namespace(docxml.root)
70
70
  image_cleanup(docxml, @dir1, @imagedir)
71
71
  mathml_to_ooml(docxml)
@@ -76,76 +76,11 @@ class Html2Doc
76
76
  docxml
77
77
  end
78
78
 
79
- NOKOHEAD = <<~HERE.freeze
80
- <!DOCTYPE html SYSTEM
81
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
82
- <html xmlns="http://www.w3.org/1999/xhtml">
83
- <head> <title></title> <meta charset="UTF-8" /> </head>
84
- <body> </body> </html>
85
- HERE
86
-
87
- def to_xhtml(xml)
88
- xml.gsub!(/<\?xml[^>]*>/, "")
89
- unless /<!DOCTYPE /.match? xml
90
- xml = '<!DOCTYPE html SYSTEM
91
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' + xml
92
- end
93
- xml = xml.gsub(/<!--\s*\[([^\]]+)\]>/, "<!-- MSWORD-COMMENT \\1 -->")
94
- .gsub(/<!\s*\[endif\]\s*-->/, "<!-- MSWORD-COMMENT-END -->")
95
- Nokogiri::XML.parse(xml)
96
- end
97
-
98
- DOCTYPE = <<~"DOCTYPE".freeze
99
- <!DOCTYPE html SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
100
- DOCTYPE
101
-
102
- def from_xhtml(xml)
103
- xml.to_xml.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
104
- .sub(DOCTYPE, "").gsub(%{ />}, "/>")
105
- .gsub(/<!-- MSWORD-COMMENT (.+?) -->/, "<!--[\\1]>")
106
- .gsub(/<!-- MSWORD-COMMENT-END -->/, "<![endif]-->")
107
- .gsub("\n--&gt;\n", "\n-->\n")
108
- end
109
-
110
- def msword_fix(doc)
111
- # brain damage in MSWord parser
112
- doc.gsub!(%r{<w:DoNotOptimizeForBrowser></w:DoNotOptimizeForBrowser>},
113
- "<w:DoNotOptimizeForBrowser/>")
114
- doc.gsub!(%r{<span style="mso-special-character:footnote"/>},
115
- '<span style="mso-special-character:footnote"></span>')
116
- doc.gsub!(%r{<div style="mso-element:footnote-list"></div>},
117
- '<div style="mso-element:footnote-list"/>')
118
- doc.gsub!(%r{(<a style="mso-comment-reference:[^>/]+)/>}, "\\1></a>")
119
- doc.gsub!(%r{<link rel="File-List"}, "<link rel=File-List")
120
- doc.gsub!(%r{<meta http-equiv="Content-Type"},
121
- "<meta http-equiv=Content-Type")
122
- doc.gsub!(%r{></m:jc>}, "/>")
123
- doc.gsub!(%r{></v:stroke>}, "/>")
124
- doc.gsub!(%r{></v:f>}, "/>")
125
- doc.gsub!(%r{></v:path>}, "/>")
126
- doc.gsub!(%r{></o:lock>}, "/>")
127
- doc.gsub!(%r{></v:imagedata>}, "/>")
128
- doc.gsub!(%r{></w:wrap>}, "/>")
129
- doc.gsub!(%r{<(/)?m:(span|em)\b}, "<\\1\\2")
130
- doc.gsub!(%r{&tab;|&amp;tab;},
131
- '<span style="mso-tab-count:1">&#xA0; </span>')
132
- doc.split(%r{(<m:oMath>|</m:oMath>)}).each_slice(4).map do |a|
133
- a.size > 2 and a[2] = a[2].gsub(/>\s+</, "><")
134
- a
135
- end.join
136
- end
137
-
138
- PRINT_VIEW = <<~XML.freeze
139
-
140
- <xml>
141
- <w:WordDocument>
142
- <w:View>Print</w:View>
143
- <w:Zoom>100</w:Zoom>
144
- <w:DoNotOptimizeForBrowser/>
145
- </w:WordDocument>
146
- </xml>
147
- <meta http-equiv='Content-Type' content="text/html; charset=utf-8"/>
148
- XML
79
# Collect into @landscape the class names of every `div.<class>` rule in the
# stylesheet whose body is a single `page:` declaration ending in "L;".
# The document argument is not consulted: only the stylesheet named by
# @stylesheet is read (via read_stylesheet).
#
# @param _docxml [Object] unused
# @return [Array<String>] the class names found (also stored in @landscape)
def locate_landscape(_docxml)
  landscape_rule = /div\.\S+\s+\{\s*page:\s*[^;]+L;\s*\}/m
  rules = read_stylesheet(@stylesheet).scan(landscape_rule)
  # Reduce each matched rule to the class name that follows "div.".
  @landscape = rules.map { |rule| rule.sub(/^div\.(\S+).*$/m, "\\1") }
end
149
84
 
150
85
  def define_head1(docxml, _dir)
151
86
  docxml.xpath("//*[local-name() = 'head']").each do |h|
@@ -168,18 +103,21 @@ class Html2Doc
168
103
  end
169
104
 
170
105
  def stylesheet(_filename, _header_filename, cssname)
171
- (cssname.nil? || cssname.empty?) and
172
- cssname = File.join(File.dirname(__FILE__), "wordstyle.css")
173
- stylesheet = File.read(cssname, encoding: "UTF-8")
106
+ stylesheet = read_stylesheet(cssname)
174
107
  xml = Nokogiri::XML("<style/>")
175
108
  # s = Nokogiri::XML::CDATA.new(xml, "\n#{stylesheet}\n")
176
109
  # xml.children.first << Nokogiri::XML::Comment.new(xml, s)
177
110
  xml.children.first << Nokogiri::XML::CDATA
178
111
  .new(xml, "\n<!--\n#{stylesheet}\n-->\n")
179
-
180
112
  xml.root.to_s
181
113
  end
182
114
 
115
# Read a CSS file as UTF-8 text.
#
# @param cssname [String, nil] path of the stylesheet; when nil or empty the
#   wordstyle.css file located next to this source file is read instead
# @return [String] the file contents
def read_stylesheet(cssname)
  if cssname.nil? || cssname.empty?
    cssname = File.join(File.dirname(__FILE__), "wordstyle.css")
  end
  File.read(cssname, encoding: "UTF-8")
end
120
+
183
121
  def define_head(docxml)
184
122
  title = docxml.at("//*[local-name() = 'head']/*[local-name() = 'title']")
185
123
  head = docxml.at("//*[local-name() = 'head']")
@@ -195,30 +133,15 @@ class Html2Doc
195
133
  head.add_child css
196
134
  elsif title.nil?
197
135
  head.children.first.add_previous_sibling css
198
- else
199
- title.add_next_sibling css
136
+ else title.add_next_sibling css
200
137
  end
201
138
  end
202
139
 
203
- def namespace(root)
204
- {
205
- o: "urn:schemas-microsoft-com:office:office",
206
- w: "urn:schemas-microsoft-com:office:word",
207
- v: "urn:schemas-microsoft-com:vml",
208
- m: "http://schemas.microsoft.com/office/2004/12/omml",
209
- }.each { |k, v| root.add_namespace_definition(k.to_s, v) }
210
- end
211
-
212
- def rootnamespace(root)
213
- root.add_namespace(nil, "http://www.w3.org/TR/REC-html40")
214
- end
215
-
216
140
  def bookmarks(docxml)
217
141
  docxml.xpath("//*[@id][not(@name)][not(@style = 'mso-element:footnote')]")
218
142
  .each do |x|
219
- next if x["id"].empty? ||
220
- %w(shapetype v:shapetype shape v:shape).include?(x.name)
221
-
143
+ (x["id"].empty? ||
144
+ %w(shapetype v:shapetype shape v:shape).include?(x.name)) and next
222
145
  if x.children.empty? then x.add_child("<a name='#{x['id']}'></a>")
223
146
  else x.children.first.previous = "<a name='#{x['id']}'></a>"
224
147
  end
data/lib/html2doc/mime.rb CHANGED
@@ -76,11 +76,9 @@ class Html2Doc
76
76
  end
77
77
  end
78
78
 
79
- # max width for Word document is 400, max height is 680
80
79
  def image_resize(img, path, maxheight, maxwidth)
81
80
  s, realsize = get_image_size(img, path)
82
- return s if s[0] == nil && s[1] == nil
83
-
81
+ s[0] == nil && s[1] == nil and return s
84
82
  if img.name == "svg" && !img["viewBox"]
85
83
  img["viewBox"] = "0 0 #{s[0]} #{s[1]}"
86
84
  end
@@ -115,21 +113,100 @@ class Html2Doc
115
113
 
116
114
  # only processes locally stored images
117
115
  def image_cleanup(docxml, dir, localdir)
116
+ maxheight, maxwidth = page_dimensions(docxml)
118
117
  docxml.traverse do |i|
119
- src = i["src"]
120
- next unless i.element? && %w(img v:imagedata).include?(i.name)
121
- next if src.nil? || src.empty? || /^http/.match?(src)
122
- next if %r{^data:(image|application)/[^;]+;base64}.match? src
123
-
124
- local_filename = localname(src, localdir)
125
- new_filename = "#{mkuuid}#{File.extname(src)}"
126
- FileUtils.cp local_filename, File.join(dir, new_filename)
127
- i["width"], i["height"] = image_resize(i, local_filename, 680, 400)
128
- i["src"] = File.join(File.basename(dir), new_filename)
118
+ skip_image_cleanup?(i) and next
119
+ local_filename = rename_image(i, dir, localdir)
120
+ i["width"], i["height"] =
121
+ if landscape?(i)
122
+ image_resize(i, local_filename, maxwidth, maxheight)
123
+ else
124
+ image_resize(i, local_filename, maxheight, maxwidth)
125
+ end
129
126
  end
130
127
  docxml
131
128
  end
132
129
 
130
# Tell whether a node sits inside a div whose class attribute is one of the
# names recorded in @landscape.
#
# @param img [#ancestors] node whose ancestors are inspected
# @return [Boolean] true as soon as one matching div ancestor is found
def landscape?(img)
  img.ancestors.any? do |ancestor|
    # The class attribute is compared as a whole string, not token by token.
    ancestor.name == "div" && @landscape.include?(ancestor["class"])
  end
end
137
+
138
# Copy the image referenced by img["src"] into dir under a freshly generated
# name (mkuuid plus the original extension) and point img["src"] at the copy.
#
# @param img [#[], #[]=] node carrying the "src" attribute
# @param dir [String] directory receiving the copy
# @param localdir [String] passed to localname to resolve the source path
# @return [String] the resolved path of the original file
def rename_image(img, dir, localdir)
  src = img["src"]
  source_path = localname(src, localdir)
  copy_name = "#{mkuuid}#{File.extname(src)}"
  FileUtils.cp source_path, File.join(dir, copy_name)
  # The new src is relative: last component of dir, then the new file name.
  img["src"] = File.join(File.basename(dir), copy_name)
  source_path
end
145
+
146
# Decide whether a traversed node should be left alone by image cleanup.
# A node is skipped unless it is an img or v:imagedata element whose src is
# present, non-empty, does not begin a line with "http", and is not a base64
# data: URI of type image/* or application/*.
#
# @param img [#[], #element?, #name] node under consideration
# @return [Boolean] true when the node must be skipped
def skip_image_cleanup?(img)
  src = img["src"]
  return true unless img.element? && %w(img v:imagedata).include?(img.name)

  src.nil? || src.empty? || /^http/.match?(src) ||
    %r{^data:(image|application)/[^;]+;base64}.match?(src)
end
153
+
154
# Work out the largest image height and width that fit on a page.
# The page rule is looked up by find_page_size_in_doc, which prefers the
# second @page instance of the Word CSS so that the cover page is skipped.
# Landscape is not dealt with here. Both the @stylesheet file and
# docxml.to_xml (where @standardstylesheet has ended up) are scanned.
# Only 0.9 of the usable height is returned, leaving room for a caption.
#
# @param docxml [#to_xml] the document being processed
# @return [Array<Numeric>] [max height, max width]; [680, 400] when no usable
#   page rule is found or any error is raised while computing the values
def page_dimensions(docxml)
  css = read_stylesheet(@stylesheet)
  rule = find_page_size_in_doc(css, docxml.to_xml)
  return [680, 400] unless rule

  size = /size:\s*(\S+)\s+(\S+)\s*;/.match(rule)
  return [680, 400] unless size

  margin = /margin:\s*(\S+)\s+(\S+)\s*(\S+)\s*(\S+)\s*;/.match(rule)
  return [680, 400] unless margin

  width, height = size.captures
  # The four margin values are used as top, right, bottom, left.
  top, right, bottom, left = margin.captures
  [0.9 * (units_to_px(height) - units_to_px(top) - units_to_px(bottom)),
   units_to_px(width) - units_to_px(right) - units_to_px(left)]
rescue StandardError
  [680, 400]
end
170
+
171
# Look for a page rule, trying in order: WordSection2 then WordSection3 in
# the stylesheet, the same two names in the document, and finally any @page
# rule in the stylesheet and then in the document. The search stops at the
# first attempt that yields a result.
#
# @param stylesheet [String, nil] CSS text
# @param doc [String, nil] document text with CSS embedded in <style> tags
# @return [String, nil] whatever find_page_size returned for the first
#   successful attempt, or the (falsy) result of the last attempt
def find_page_size_in_doc(stylesheet, doc)
  attempts = [
    [stylesheet, "WordSection2", false],
    [stylesheet, "WordSection3", false],
    [doc, "WordSection2", true],
    [doc, "WordSection3", true],
    [stylesheet, "", false],
    [doc, "", true],
  ]
  result = nil
  attempts.each do |text, klass, in_xml|
    result = find_page_size(text, klass, in_xml)
    return result if result
  end
  result
end
178
+
179
# Return the `size:` and `margin:` declaration lines of the first @page rule
# whose name starts with klass, scanning the text line by line.
#
# A rule begins on a line matching `@page <klass>` and ends on the next line
# containing "}". Declarations are only recognised at the start of a line
# (optionally after "{"). A rule that yields no declaration is skipped and
# scanning continues.
#
# Uses core String#empty? rather than String#blank?: blank? is an Active
# Support extension and nothing visible here loads it, so in plain Ruby the
# call raised NoMethodError (NOTE(review): confirm no other dependency
# defines it). The collected text holds only matched declaration lines, so
# "empty" and "blank" coincide.
#
# @param stylesheet [String, nil] text to scan; nil yields nil
# @param klass [String] page name, interpolated into the regexp as-is;
#   "" matches any named @page rule
# @param in_xml [Boolean] when true the CSS is embedded in XML <style> tags,
#   and a rule only counts if a line containing "<style" has been seen
#   without a later line containing "</style>"
# @return [String, nil] the matched declaration lines concatenated, or nil
def find_page_size(stylesheet, klass, in_xml)
  # Built once: an interpolated regexp literal is recompiled on every use.
  page_rule = /^\s*@page\s+#{klass}/
  inside_style = false
  in_rule = false
  decls = +""
  stylesheet&.lines&.each do |line|
    if in_xml
      if line.include?("<style")
        inside_style = true
        in_rule = false
      end
      inside_style = false if line.include?("</style>")
    end
    in_rule = true if page_rule.match?(line)
    next unless in_rule

    decls << line if /^\s*\{?(?:size|margin):/.match?(line)
    next unless line.include?("}")

    # End of the rule: accept it if it produced declarations and, for
    # embedded CSS, we are inside a <style> element.
    return decls if !decls.empty? && (!in_xml || inside_style)

    decls = +""
    in_rule = false
  end
  nil
end
198
+
199
# Convert a CSS length such as "595.3pt" or "2.5cm" to a whole number, using
# the factors below (pt is passed through unchanged, so the result is
# expressed in the same unit as a pt measure).
#
# All four units of the conversion table are accepted. Previously the regexp
# only admitted pt and cm, which left the px and in conversions unreachable
# and made such measures fail with NoMethodError on a nil match.
#
# @param measure [String] a number immediately followed by px, pt, cm or in
# @return [Integer] the converted length, truncated by to_i
# @raise [ArgumentError] when measure carries none of the supported units
def units_to_px(measure)
  m = /^(\S+)(px|pt|cm|in)/.match(measure)
  raise ArgumentError, "unsupported CSS length: #{measure.inspect}" unless m

  factor = { "px" => 0.75, "pt" => 1.0, "cm" => 28.346456693, "in" => 72.0 }
  (m[1].to_f * factor.fetch(m[2])).to_i
end
209
+
133
210
  # do not parse the header through Nokogiri, since it will contain
134
211
  # non-XML like <![if !supportFootnotes]>
135
212
  def header_image_cleanup(doc, dir, filename, localdir)
@@ -155,8 +232,7 @@ class Html2Doc
155
232
  f.write %{<xml xmlns:o="urn:schemas-microsoft-com:office:office">
156
233
  <o:MainFile HRef="../#{filename}.htm"/>}
157
234
  Dir.entries(dir).sort.each do |item|
158
- next if item == "." || item == ".." || /^\./.match(item)
159
-
235
+ (item == "." || item == ".." || /^\./.match(item)) and next
160
236
  f.write %{ <o:File HRef="#{item}"/>\n}
161
237
  end
162
238
  f.write("</xml>\n")
@@ -1,3 +1,3 @@
1
1
  class Html2Doc
2
- VERSION = "1.5.3".freeze
2
+ VERSION = "1.5.5".freeze
3
3
  end
@@ -0,0 +1,83 @@
1
# XML/XHTML plumbing for Html2Doc: converting the incoming HTML text to a
# parsed document and back, and patching the serialised output so that the
# MS Word parser accepts it.
class Html2Doc
  # Empty XHTML 1.0 Strict document skeleton.
  NOKOHEAD = <<~HERE.freeze
    <!DOCTYPE html SYSTEM
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
    <html xmlns="http://www.w3.org/1999/xhtml">
    <head> <title></title> <meta charset="UTF-8" /> </head>
    <body> </body> </html>
  HERE

  # Parse an HTML/XHTML string into a Nokogiri XML document.
  # XML declarations are stripped from the argument in place, a DOCTYPE is
  # prepended when none is present, and Word conditional-comment markers
  # are rewritten as ordinary comments (reversed by from_xhtml).
  #
  # @param xml [String] markup to parse; modified in place by gsub!
  # @return [Nokogiri::XML::Document]
  def to_xhtml(xml)
    xml.gsub!(/<\?xml[^>]*>/, "")
    unless /<!DOCTYPE /.match? xml
      doctype = "<!DOCTYPE html SYSTEM\n" \
                '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">'
      xml = doctype + xml
    end
    opened = xml.gsub(/<!--\s*\[([^\]]+)\]>/, "<!-- MSWORD-COMMENT \\1 -->")
    closed = opened.gsub(/<!\s*\[endif\]\s*-->/, "<!-- MSWORD-COMMENT-END -->")
    Nokogiri::XML.parse(closed)
  end

  # DOCTYPE line that from_xhtml removes from serialised output.
  DOCTYPE = <<~DOCTYPE.freeze
    <!DOCTYPE html SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
  DOCTYPE

  # Serialise a document and undo the adjustments made by to_xhtml: drop the
  # first XHTML namespace declaration and the DOCTYPE, tighten " />" to
  # "/>", and restore the Word conditional-comment markers.
  #
  # @param xml [#to_xml] document to serialise
  # @return [String]
  def from_xhtml(xml)
    out = xml.to_xml
    out = out.sub(%{ xmlns="http://www.w3.org/1999/xhtml"}, "")
    out = out.sub(DOCTYPE, "")
    out = out.gsub(%{ />}, "/>")
    out = out.gsub(/<!-- MSWORD-COMMENT (.+?) -->/, "<!--[\\1]>")
    out = out.gsub(/<!-- MSWORD-COMMENT-END -->/, "<![endif]-->")
    out.gsub("\n--&gt;\n", "\n-->\n")
  end

  # Apply textual workarounds for quirks of the MS Word parser.
  # The substitutions are applied to the argument in place, in the order
  # listed; whitespace between tags inside <m:oMath> elements is then
  # removed in the returned copy.
  #
  # @param doc [String] serialised document; modified in place by gsub!
  # @return [String] the patched document
  def msword_fix(doc)
    substitutions = [
      [%r{<w:DoNotOptimizeForBrowser></w:DoNotOptimizeForBrowser>},
       "<w:DoNotOptimizeForBrowser/>"],
      [%r{<span style="mso-special-character:footnote"/>},
       '<span style="mso-special-character:footnote"></span>'],
      [%r{<div style="mso-element:footnote-list"></div>},
       '<div style="mso-element:footnote-list"/>'],
      [%r{(<a style="mso-comment-reference:[^>/]+)/>}, "\\1></a>"],
      [%r{<link rel="File-List"}, "<link rel=File-List"],
      [%r{<meta http-equiv="Content-Type"}, "<meta http-equiv=Content-Type"],
      [%r{></m:jc>}, "/>"],
      [%r{></v:stroke>}, "/>"],
      [%r{></v:f>}, "/>"],
      [%r{></v:path>}, "/>"],
      [%r{></o:lock>}, "/>"],
      [%r{></v:imagedata>}, "/>"],
      [%r{></w:wrap>}, "/>"],
      [%r{<(/)?m:(span|em)\b}, "<\\1\\2"],
      [%r{&tab;|&amp;tab;}, '<span style="mso-tab-count:1">&#xA0; </span>'],
    ]
    substitutions.each { |pattern, replacement| doc.gsub!(pattern, replacement) }

    # Splitting on the captured oMath tags yields groups of four pieces:
    # text, opening tag, math content, closing tag. Only the math content
    # (index 2) has its inter-tag whitespace removed.
    pieces = doc.split(%r{(<m:oMath>|</m:oMath>)})
    pieces.each_slice(4).map do |group|
      group[2] = group[2].gsub(/>\s+</, "><") if group.size > 2
      group
    end.join
  end

  # Word settings fragment requesting Print view at 100% zoom, followed by
  # a UTF-8 Content-Type meta element.
  PRINT_VIEW = <<~XML.freeze

    <xml>
    <w:WordDocument>
    <w:View>Print</w:View>
    <w:Zoom>100</w:Zoom>
    <w:DoNotOptimizeForBrowser/>
    </w:WordDocument>
    </xml>
    <meta http-equiv='Content-Type' content="text/html; charset=utf-8"/>
  XML

  # Declare the o, w, v and m namespace prefixes on the given element.
  #
  # @param root [#add_namespace_definition]
  # @return [Hash] the prefix => URI table that was applied
  def namespace(root)
    prefixes = {
      o: "urn:schemas-microsoft-com:office:office",
      w: "urn:schemas-microsoft-com:office:word",
      v: "urn:schemas-microsoft-com:vml",
      m: "http://schemas.microsoft.com/office/2004/12/omml",
    }
    prefixes.each do |prefix, uri|
      root.add_namespace_definition(prefix.to_s, uri)
    end
  end

  # Set the default (unprefixed) namespace of the given element.
  #
  # @param root [#add_namespace]
  def rootnamespace(root)
    root.add_namespace(nil, "http://www.w3.org/TR/REC-html40")
  end
end
data/lib/html2doc.rb CHANGED
@@ -4,3 +4,4 @@ require_relative "html2doc/mime"
4
4
  require_relative "html2doc/notes"
5
5
  require_relative "html2doc/math"
6
6
  require_relative "html2doc/lists"
7
+ require_relative "html2doc/xml"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2doc
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.5.3
4
+ version: 1.5.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-04-10 00:00:00.000000000 Z
11
+ date: 2023-06-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: htmlentities
@@ -293,6 +293,7 @@ files:
293
293
  - lib/html2doc/notes.rb
294
294
  - lib/html2doc/version.rb
295
295
  - lib/html2doc/wordstyle.css
296
+ - lib/html2doc/xml.rb
296
297
  homepage: https://github.com/metanorma/html2doc
297
298
  licenses:
298
299
  - CC-BY-SA-3.0