html2doc 1.5.3 → 1.5.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 114c21f3bbc33c244fb49577d4c85334789a388be61d65c04765d33ed1913208
4
- data.tar.gz: 60c00e3a300eb16db1f2332393d5ff9a45f65c60a8a1d40a1cfaba2767744249
3
+ metadata.gz: 74b05f46f1fd365f9ff0766e95d884bd2959c01b92c70d4a080651adfc2e8d3c
4
+ data.tar.gz: f70eb009e705ff767b34922fc0444740be8dde80da8b78c503784e02be0e4560
5
5
  SHA512:
6
- metadata.gz: 606e87a4dcfd0a3e270588461c6150dd38b1baad57bfc90b3965806477945deeae0b2eaeca87af9800b70d214133b11588a66f8205108e3af87f13464306b34a
7
- data.tar.gz: ecc2ad3631a0cafb9ff3b7f04022b9a977efac71099f7b256f840ec6d15e418bb18f426c729d6a40e9a9cc196f3e02d4cf0c5c448a99c6097c2da9d6b2bfb4d0
6
+ metadata.gz: e3d93501d63bd27ed6e5245cb18dbc49013fcecd83bc57acf3a5d3c797636b928b91e148e33e8326f10f77f9b94a7175d85294eb86a1b4b2261aafb7dfe9d7a4
7
+ data.tar.gz: 4cbb8887089e622b9d9d1fd82dc4e5fd4e8e81a28a59dcf02ccce22cb3c9e6e7c4c7802177259557c268d697ced17ec09e4181e82c0dc851a613553e7f5b58c1
data/lib/html2doc/base.rb CHANGED
@@ -168,9 +168,7 @@ class Html2Doc
168
168
  end
169
169
 
170
170
  def stylesheet(_filename, _header_filename, cssname)
171
- (cssname.nil? || cssname.empty?) and
172
- cssname = File.join(File.dirname(__FILE__), "wordstyle.css")
173
- stylesheet = File.read(cssname, encoding: "UTF-8")
171
+ stylesheet = read_stylesheet(cssname)
174
172
  xml = Nokogiri::XML("<style/>")
175
173
  # s = Nokogiri::XML::CDATA.new(xml, "\n#{stylesheet}\n")
176
174
  # xml.children.first << Nokogiri::XML::Comment.new(xml, s)
@@ -180,6 +178,12 @@ class Html2Doc
180
178
  xml.root.to_s
181
179
  end
182
180
 
181
# Returns the contents of the CSS file at +cssname+, read as UTF-8.
# When +cssname+ is nil or empty, the "wordstyle.css" file sitting in the
# same directory as this source file is read instead.
def read_stylesheet(cssname)
  path = cssname
  if path.nil? || path.empty?
    path = File.join(File.dirname(__FILE__), "wordstyle.css")
  end
  File.read(path, encoding: "UTF-8")
end
186
+
183
187
  def define_head(docxml)
184
188
  title = docxml.at("//*[local-name() = 'head']/*[local-name() = 'title']")
185
189
  head = docxml.at("//*[local-name() = 'head']")
data/lib/html2doc/mime.rb CHANGED
@@ -76,7 +76,6 @@ class Html2Doc
76
76
  end
77
77
  end
78
78
 
79
- # max width for Word document is 400, max height is 680
80
79
  def image_resize(img, path, maxheight, maxwidth)
81
80
  s, realsize = get_image_size(img, path)
82
81
  return s if s[0] == nil && s[1] == nil
@@ -115,21 +114,89 @@ class Html2Doc
115
114
 
116
115
  # only processes locally stored images
117
116
  def image_cleanup(docxml, dir, localdir)
117
+ maxheight, maxwidth = page_dimensions(docxml)
118
118
  docxml.traverse do |i|
119
- src = i["src"]
120
- next unless i.element? && %w(img v:imagedata).include?(i.name)
121
- next if src.nil? || src.empty? || /^http/.match?(src)
122
- next if %r{^data:(image|application)/[^;]+;base64}.match? src
123
-
124
- local_filename = localname(src, localdir)
125
- new_filename = "#{mkuuid}#{File.extname(src)}"
126
- FileUtils.cp local_filename, File.join(dir, new_filename)
127
- i["width"], i["height"] = image_resize(i, local_filename, 680, 400)
128
- i["src"] = File.join(File.basename(dir), new_filename)
119
+ skip_image_cleanup?(i) and next
120
+ local_filename = rename_image(i, dir, localdir)
121
+ i["width"], i["height"] = image_resize(i, local_filename, maxheight,
122
+ maxwidth)
129
123
  end
130
124
  docxml
131
125
  end
132
126
 
127
# Copies the image referenced by img["src"] into +dir+ under a fresh name
# (the value of mkuuid followed by the original file extension), then
# rewrites img["src"] to the basename of +dir+ joined with that new name.
# Returns the source path as resolved by localname(src, localdir).
def rename_image(img, dir, localdir)
  original_src = img["src"]
  source_path = localname(original_src, localdir)
  copy_name = "#{mkuuid}#{File.extname(original_src)}"
  FileUtils.cp source_path, File.join(dir, copy_name)
  img["src"] = File.join(File.basename(dir), copy_name)
  source_path
end
134
+
135
# Decides whether a traversed node should be left alone by image cleanup.
# Returns true when the node is not an <img> / <v:imagedata> element, when
# it has no usable src, or when src is an http(s) URL or a base64 data URI;
# returns false for everything else (i.e. locally stored images).
def skip_image_cleanup?(img)
  source = img["src"]
  image_element = img.element? && %w(img v:imagedata).include?(img.name)
  return true unless image_element
  return true if source.nil? || source.empty?

  remote = /^http/.match?(source)
  embedded = %r{^data:(image|application)/[^;]+;base64}.match?(source)
  remote || embedded
end
143
+
144
# Derives the maximum image height and width from the page geometry
# declared in the Word CSS; returns them as [maxheight, maxwidth].
#
# We are going to use the 2nd instance of @page in the Word CSS,
# skipping the cover page. Currently doesn't deal with Landscape.
# Both @stylesheet and docxml.to_xml (where @standardstylesheet has ended
# up) are scanned.
#
# The usable height is the second `size:` value minus the first and third
# `margin:` values (top and bottom in CSS shorthand order), scaled by 0.9
# to leave room for a caption. The usable width is the first `size:` value
# minus the second and fourth `margin:` values (right and left).
#
# Falls back to [680, 400] when no page rule is found, when size or a
# four-value margin cannot be parsed, or when any error is raised while
# reading or converting the values.
def page_dimensions(docxml)
  fallback = [680, 400]
  stylesheet = read_stylesheet(@stylesheet)
  page_size = find_page_size_in_doc(stylesheet, docxml.to_xml)
  return fallback unless page_size

  m_size = /size:\s*(\S+)\s+(\S+)\s*;/.match(page_size)
  # Whitespace is mandatory between all four margin values; with optional
  # whitespace a shorter margin list would match by splitting one token
  # into several bogus "values".
  m_marg = /margin:\s*(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s*;/.match(page_size)
  return fallback unless m_size && m_marg

  page_width, page_height = m_size.captures.map { |v| units_to_px(v) }
  top, right, bottom, left = m_marg.captures.map { |v| units_to_px(v) }
  [0.9 * (page_height - top - bottom), page_width - right - left]
rescue StandardError
  # Best effort: any failure reading or converting values yields the default.
  [680, 400]
end
160
+
161
# Looks for an @page rule carrying size/margin declarations, trying each
# source in turn and returning the first hit from find_page_size:
# the stylesheet text for WordSection2 then WordSection3, the document XML
# for WordSection2 then WordSection3, and finally any @page rule in the
# stylesheet and then in the document XML.
# Returns whatever the last attempted find_page_size call returned when
# nothing matches.
def find_page_size_in_doc(stylesheet, doc)
  searches = [
    [stylesheet, "WordSection2", false],
    [stylesheet, "WordSection3", false],
    [doc, "WordSection2", true],
    [doc, "WordSection3", true],
    [stylesheet, "", false],
    [doc, "", true],
  ]
  result = nil
  searches.each do |css, klass, in_xml|
    result = find_page_size(css, klass, in_xml)
    break if result
  end
  result
end
168
+
169
# Scans +stylesheet+ line by line for an `@page <klass>` rule and returns
# the lines of that rule which start with `size:` or `margin:` (optionally
# preceded by `{`), concatenated in order of appearance.
#
# stylesheet - String of CSS, or of XML containing CSS when +in_xml+ is
#              true; nil yields nil.
# klass      - text expected after `@page ` (interpolated into the regex as
#              is); an empty string matches any @page rule.
# in_xml     - if true, the CSS is embedded in an XML <style> tag, and a
#              rule only counts while the scan is between a line containing
#              "<style" and a line containing "</style>".
#
# Returns the collected lines as a String, or nil when no matching rule
# with size/margin lines is found. A rule is considered finished at the
# first line containing "}".
def find_page_size(stylesheet, klass, in_xml)
  return nil if stylesheet.nil?

  # Built once instead of once per line.
  page_rule = /^\s*@page\s+#{klass}/
  inside_style = false
  in_rule = false
  collected = ""
  stylesheet.lines.each do |line|
    if in_xml
      if line.include?("<style")
        inside_style = true
        in_rule = false
      end
      inside_style = false if line.include?("</style>")
    end
    in_rule = true if page_rule.match?(line)
    collected += line if in_rule && /^\s*\{?size:/.match?(line)
    collected += line if in_rule && /^\s*\{?margin:/.match?(line)
    next unless in_rule && line.include?("}")

    # Core Ruby strings have no #blank?; strip.empty? is the equivalent
    # check and works without ActiveSupport.
    return collected if !collected.strip.empty? && (!in_xml || inside_style)

    collected = ""
    in_rule = false
  end
  nil
end
188
+
189
# Converts a CSS length such as "72pt", "2.5cm", "1in" or "100px" into an
# Integer, truncating any fraction.
#
# Despite the method name, the scale is the one in which "pt" values pass
# through unchanged: px is multiplied by 0.75, cm by 28.346456693 and
# in by 72.
#
# measure - String made of a decimal number immediately followed by one of
#           the units px, pt, cm, in.
#
# Raises ArgumentError when +measure+ is not a number followed by a
# supported unit.
def units_to_px(measure)
  # All four units handled by the case below must be accepted here,
  # otherwise the px and in branches can never be reached.
  m = /\A([+-]?(?:\d+\.?\d*|\.\d+))(px|pt|cm|in)/.match(measure.to_s)
  raise ArgumentError, "unsupported CSS measure: #{measure.inspect}" unless m

  value = m[1].to_f
  scaled = case m[2]
           when "px" then value * 0.75
           when "pt" then value
           when "cm" then value * 28.346456693
           when "in" then value * 72
           end
  scaled.to_i
end
199
+
133
200
  # do not parse the header through Nokogiri, since it will contain
134
201
  # non-XML like <![if !supportFootnotes]>
135
202
  def header_image_cleanup(doc, dir, filename, localdir)
@@ -1,3 +1,3 @@
1
1
  class Html2Doc
2
- VERSION = "1.5.3".freeze
2
+ VERSION = "1.5.4".freeze
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2doc
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.5.3
4
+ version: 1.5.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-04-10 00:00:00.000000000 Z
11
+ date: 2023-05-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: htmlentities