html2doc 1.5.3 → 1.5.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 114c21f3bbc33c244fb49577d4c85334789a388be61d65c04765d33ed1913208
4
- data.tar.gz: 60c00e3a300eb16db1f2332393d5ff9a45f65c60a8a1d40a1cfaba2767744249
3
+ metadata.gz: 74b05f46f1fd365f9ff0766e95d884bd2959c01b92c70d4a080651adfc2e8d3c
4
+ data.tar.gz: f70eb009e705ff767b34922fc0444740be8dde80da8b78c503784e02be0e4560
5
5
  SHA512:
6
- metadata.gz: 606e87a4dcfd0a3e270588461c6150dd38b1baad57bfc90b3965806477945deeae0b2eaeca87af9800b70d214133b11588a66f8205108e3af87f13464306b34a
7
- data.tar.gz: ecc2ad3631a0cafb9ff3b7f04022b9a977efac71099f7b256f840ec6d15e418bb18f426c729d6a40e9a9cc196f3e02d4cf0c5c448a99c6097c2da9d6b2bfb4d0
6
+ metadata.gz: e3d93501d63bd27ed6e5245cb18dbc49013fcecd83bc57acf3a5d3c797636b928b91e148e33e8326f10f77f9b94a7175d85294eb86a1b4b2261aafb7dfe9d7a4
7
+ data.tar.gz: 4cbb8887089e622b9d9d1fd82dc4e5fd4e8e81a28a59dcf02ccce22cb3c9e6e7c4c7802177259557c268d697ced17ec09e4181e82c0dc851a613553e7f5b58c1
data/lib/html2doc/base.rb CHANGED
@@ -168,9 +168,7 @@ class Html2Doc
168
168
  end
169
169
 
170
170
  def stylesheet(_filename, _header_filename, cssname)
171
- (cssname.nil? || cssname.empty?) and
172
- cssname = File.join(File.dirname(__FILE__), "wordstyle.css")
173
- stylesheet = File.read(cssname, encoding: "UTF-8")
171
+ stylesheet = read_stylesheet(cssname)
174
172
  xml = Nokogiri::XML("<style/>")
175
173
  # s = Nokogiri::XML::CDATA.new(xml, "\n#{stylesheet}\n")
176
174
  # xml.children.first << Nokogiri::XML::Comment.new(xml, s)
@@ -180,6 +178,12 @@ class Html2Doc
180
178
  xml.root.to_s
181
179
  end
182
180
 
181
# Returns the text of the CSS file at +cssname+, read as UTF-8.
# When +cssname+ is nil or empty, the "wordstyle.css" file that sits
# next to this source file is read instead.
def read_stylesheet(cssname)
  path = if cssname.nil? || cssname.empty?
           File.join(File.dirname(__FILE__), "wordstyle.css")
         else
           cssname
         end
  File.read(path, encoding: "UTF-8")
end
186
+
183
187
  def define_head(docxml)
184
188
  title = docxml.at("//*[local-name() = 'head']/*[local-name() = 'title']")
185
189
  head = docxml.at("//*[local-name() = 'head']")
data/lib/html2doc/mime.rb CHANGED
@@ -76,7 +76,6 @@ class Html2Doc
76
76
  end
77
77
  end
78
78
 
79
- # max width for Word document is 400, max height is 680
80
79
  def image_resize(img, path, maxheight, maxwidth)
81
80
  s, realsize = get_image_size(img, path)
82
81
  return s if s[0] == nil && s[1] == nil
@@ -115,21 +114,89 @@ class Html2Doc
115
114
 
116
115
  # only processes locally stored images
117
116
  def image_cleanup(docxml, dir, localdir)
117
+ maxheight, maxwidth = page_dimensions(docxml)
118
118
  docxml.traverse do |i|
119
- src = i["src"]
120
- next unless i.element? && %w(img v:imagedata).include?(i.name)
121
- next if src.nil? || src.empty? || /^http/.match?(src)
122
- next if %r{^data:(image|application)/[^;]+;base64}.match? src
123
-
124
- local_filename = localname(src, localdir)
125
- new_filename = "#{mkuuid}#{File.extname(src)}"
126
- FileUtils.cp local_filename, File.join(dir, new_filename)
127
- i["width"], i["height"] = image_resize(i, local_filename, 680, 400)
128
- i["src"] = File.join(File.basename(dir), new_filename)
119
+ skip_image_cleanup?(i) and next
120
+ local_filename = rename_image(i, dir, localdir)
121
+ i["width"], i["height"] = image_resize(i, local_filename, maxheight,
122
+ maxwidth)
129
123
  end
130
124
  docxml
131
125
  end
132
126
 
127
# Copies the image referenced by img["src"] into +dir+ under a fresh
# mkuuid-based file name (keeping the original extension), rewrites
# img["src"] to "<basename of dir>/<new name>", and returns the path of
# the original local file as resolved by localname.
def rename_image(img, dir, localdir)
  localname(img["src"], localdir).tap do |source_path|
    target_name = "#{mkuuid}#{File.extname(img['src'])}"
    FileUtils.cp(source_path, File.join(dir, target_name))
    img["src"] = File.join(File.basename(dir), target_name)
  end
end
134
+
135
# True when +img+ should be left untouched by image cleanup: it is not
# an <img> / <v:imagedata> element, it has no src, or its src starts
# with "http" or is an inline base64 data: URI.
def skip_image_cleanup?(img)
  src = img["src"]
  image_element = img.element? && %w(img v:imagedata).include?(img.name)
  return true unless image_element

  src.nil? || src.empty? || /^http/.match?(src) ||
    %r{^data:(image|application)/[^;]+;base64}.match?(src)
end
143
+
144
# Derives [maxheight, maxwidth] for images from the @page rule of the
# Word CSS: the WordSection2 / WordSection3 rules are preferred over a
# generic @page (the intent being to skip the cover page's section).
# Both the @stylesheet file and docxml.to_xml (where the standard
# stylesheet has ended up) are scanned. Landscape is not dealt with.
# Height is the page height minus top and bottom margins, scaled by 0.9
# to leave room for a caption; width is the page width minus right and
# left margins. Falls back to [680, 400] whenever no usable rule is
# found or anything goes wrong while reading or parsing it.
def page_dimensions(docxml)
  fallback = [680, 400]
  css = read_stylesheet(@stylesheet)
  rule = find_page_size_in_doc(css, docxml.to_xml)
  return fallback unless rule

  size = /size:\s*(\S+)\s+(\S+)\s*;/.match(rule)
  return fallback unless size

  margin = /margin:\s*(\S+)\s+(\S+)\s*(\S+)\s*(\S+)\s*;/.match(rule)
  return fallback unless margin

  # CSS order: size is "width height", margin is "top right bottom left"
  width, height = size.captures.map { |v| units_to_px(v) }
  top, right, bottom, left = margin.captures.map { |v| units_to_px(v) }
  [0.9 * (height - top - bottom), width - right - left]
rescue StandardError
  [680, 400]
end
160
+
161
# Looks for a page-size rule, in priority order: WordSection2 then
# WordSection3 in the stylesheet text, the same two in the document
# XML, and finally any @page rule in the stylesheet, then in the
# document. Returns the first hit, or nil when nothing is found.
def find_page_size_in_doc(stylesheet, doc)
  candidates = [
    [stylesheet, "WordSection2", false],
    [stylesheet, "WordSection3", false],
    [doc, "WordSection2", true],
    [doc, "WordSection3", true],
    [stylesheet, "", false],
    [doc, "", true],
  ]
  candidates.each do |css, klass, in_xml|
    rule = find_page_size(css, klass, in_xml)
    return rule if rule
  end
  nil
end
168
+
169
# Scans +stylesheet+ line by line for an "@page <klass>" rule and
# returns the concatenated "size:" and "margin:" lines of the first such
# rule that has any, or nil if there is none (or +stylesheet+ is nil).
# An empty +klass+ matches any "@page " rule.
# If +in_xml+ is true, the CSS is embedded in an XML <style> tag: a rule
# is only accepted while inside <style> ... </style>.
def find_page_size(stylesheet, klass, in_xml)
  # klass is literal text, so escape it; built once rather than per line
  page_rule = /^\s*@page\s+#{Regexp.escape(klass.to_s)}/
  xml_found = false
  found = false
  rule_lines = []
  stylesheet&.lines&.each do |l|
    if in_xml
      if l.include?("<style")
        xml_found = true
        found = false
      end
      l.include?("</style>") and xml_found = false
    end
    page_rule.match?(l) and found = true
    found && /^\s*\{?size:/.match?(l) and rule_lines << l
    found && /^\s*\{?margin:/.match?(l) and rule_lines << l
    next unless found && /}/.match?(l)

    # End of the current @page rule. Plain emptiness check: String#blank?
    # is not core Ruby, and the collected lines are never whitespace-only.
    !rule_lines.empty? && (!in_xml || xml_found) and return rule_lines.join
    rule_lines = []
    found = false
  end
  nil
end
188
+
189
# Converts a CSS length such as "595.3pt", "21cm", "8.5in" or "100px"
# into a whole number, truncating any fraction.
# NOTE(review): despite the method name, the conversion factors yield
# points (pt is passed through, 1in = 72, 1px = 0.75) — confirm callers
# expect that scale.
# Raises ArgumentError for a measure without a supported unit; the
# original regex only accepted pt and cm, so px and in measures failed
# even though the case statement handled them.
def units_to_px(measure)
  m = /\A(\S+)(px|pt|cm|in)/.match(measure) or
    raise ArgumentError, "unsupported CSS measure: #{measure.inspect}"
  value = m[1].to_f
  ret = case m[2]
        when "px" then value * 0.75
        when "pt" then value
        when "cm" then value * 28.346456693
        when "in" then value * 72
        end
  ret.to_i
end
199
+
133
200
  # do not parse the header through Nokogiri, since it will contain
134
201
  # non-XML like <![if !supportFootnotes]>
135
202
  def header_image_cleanup(doc, dir, filename, localdir)
@@ -1,3 +1,3 @@
1
1
  class Html2Doc
2
- VERSION = "1.5.3".freeze
2
+ VERSION = "1.5.4".freeze
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: html2doc
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.5.3
4
+ version: 1.5.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-04-10 00:00:00.000000000 Z
11
+ date: 2023-05-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: htmlentities