libis-format 1.2.3 → 1.2.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8bb468afbfdd7031a466736c6d355919395bf295356032f6b3d3488908bb61b5
4
- data.tar.gz: b2ddc4588f0f3a1ac4ddde120ce726af21ab9013c98bae303674dca2033fec58
3
+ metadata.gz: 19055f943473fa0d2a9a1fdebf499c87652fcab845239a4b1f21341229ad08ad
4
+ data.tar.gz: f6f64bcaf5472f0e11aceb2cbe97b3f367b0ed4ccc4dfc389e5857f191c59897
5
5
  SHA512:
6
- metadata.gz: 176685c4eb8fac9734666928c7d22b18cc870158b4a45151b35f5cc81922d4f6f0521af026719efb04d9b3c3920ae887c271739b91ade104e5d1f2e6651e8f3e
7
- data.tar.gz: f3ca3c913aef08d75ff1dadef3790e7ad6838429bc91dbcff3173391f8d052d3f23ef9bcf3309c4c062650064947821cc046431cfe1196644964c04236cff6a7
6
+ metadata.gz: 7b0ee24177c3f541a8cd12733e04a04649d03b2785c68c8eaa44ca3fdebd7c657346afc9a824d59e872666bfa3a657483b00b6126970cc739ef8433889621847
7
+ data.tar.gz: 0bdbc5a928d6e556a816f65d05f2151e6b6ac331de37ce2d12030ace291f2b4cae452420236d9d5371f3566a7346f60854976dc33181e17549663b8ada5302f6
@@ -0,0 +1,21 @@
1
+ {
2
+ // Use IntelliSense to learn about possible attributes.
3
+ // Hover to view descriptions of existing attributes.
4
+ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5
+ "version": "0.2.0",
6
+ "configurations": [
7
+ {
8
+ "type": "rdbg",
9
+ "name": "Debug current file with rdbg",
10
+ "request": "launch",
11
+ "script": "${file}",
12
+ "args": [],
13
+ "askParameters": true
14
+ },
15
+ {
16
+ "type": "rdbg",
17
+ "name": "Attach with rdbg",
18
+ "request": "attach"
19
+ }
20
+ ]
21
+ }
data/Rakefile CHANGED
@@ -1,6 +1,10 @@
1
+ require 'rake'
2
+
1
3
  require 'bundler/gem_tasks'
2
4
  require 'rspec/core/rake_task'
3
5
 
6
+ require_relative 'spec/spec_helper'
7
+
4
8
  RSpec::Core::RakeTask.new
5
9
 
6
10
  task :default => :spec
@@ -1,10 +1,10 @@
1
1
  # encoding: utf-8
2
2
  require 'os'
3
+ require 'libis-tools'
3
4
 
4
5
  module Libis
5
6
  module Format
6
7
 
7
- # noinspection RubyConstantNamingConvention
8
8
  Config = ::Libis::Tools::Config
9
9
 
10
10
  Config[:converter_chain_max_level] = 8
@@ -21,8 +21,7 @@ module Libis
21
21
  Config[:fido_formats] = [(File.join(Libis::Format::DATA_DIR, 'lias_formats.xml'))]
22
22
  Config[:pdf_tool] = File.join(Libis::Format::TOOL_DIR, 'PdfTool.jar')
23
23
  Config[:preflight_jar] = File.join(Libis::Format::TOOL_DIR, 'pdfbox', 'preflight-app-2.0.13.jar')
24
- Config[:email2pdf_jar] = File.join(Libis::Format::TOOL_DIR, 'emailconverter.jar')
25
- # noinspection RubyStringKeysInHashInspection
24
+ Config[:wkhtmltopdf] = 'wkhtmltopdf'
26
25
  Config[:xml_validations] = [['archive/ead', File.join(Libis::Format::DATA_DIR, 'ead.xsd')]]
27
26
  Config[:type_database] = File.join(Libis::Format::DATA_DIR, 'types.yml')
28
27
  Config[:raw_audio_convert_cmd] = 'sox -V1 %s -e signed -b 16 -t wav %s rate %d channels %d'
@@ -2,7 +2,7 @@
2
2
 
3
3
  require_relative 'base'
4
4
 
5
- require 'libis/format/tool/email_to_pdf'
5
+ require 'libis/format/tool/msg_to_pdf'
6
6
  require 'libis/format/type_database'
7
7
  require 'rexml/document'
8
8
 
@@ -28,7 +28,7 @@ module Libis
28
28
  def convert(source, target, format, opts = {})
29
29
  super
30
30
 
31
- Format::Tool::EmailToPdf.run(source, target)
31
+ Format::Tool::MsgToPdf.run(source, target)
32
32
  end
33
33
 
34
34
  end
@@ -0,0 +1,298 @@
1
+ # encoding: utf-8
2
+
3
+ require 'mapi/msg'
4
+ require 'rfc_2047'
5
+ require 'cgi'
6
+ require 'pdfkit'
7
+
8
+ require 'fileutils'
9
+
10
+ require 'libis/format/config'
11
+
12
+ module Libis
13
+ module Format
14
+ module Tool
15
+ class MsgToPdf
16
+ include ::Libis::Tools::Logger
17
+
18
+ HEADER_STYLE = '<style>.header-table {margin: 0 0 20 0;padding: 0;font-family: Arial, Helvetica, sans-serif;}.header-name {padding-right: 5px;color: #9E9E9E;text-align: right;vertical-align: top;font-size: 12px;}.header-value {font-size: 12px;}#header_fields {background: white;margin: 0;border: 1px solid #DDD;border-radius: 3px;padding: 8px;width: 100%%;box-sizing: border-box;}</style><script type="text/javascript">function timer() {try {parent.postMessage(Math.max(document.body.offsetHeight, document.body.scrollHeight), \'*\');} catch (r) {}setTimeout(timer, 10);};timer();</script>'
19
+ HEADER_TABLE_TEMPLATE = '<div class="header-table"><table id="header_fields"><tbody>%s</tbody></table></div>'
20
+ HEADER_FIELD_TEMPLATE = '<tr><td class="header-name">%s</td><td class="header-value">%s</td></tr>'
21
+ HTML_WRAPPER_TEMPLATE = '<!DOCTYPE html><html><head><style>body {font-size: 0.5cm;}</style><title>title</title></head><body>%s</body></html>'
22
+
23
+ IMG_CID_PLAIN_REGEX = %r/\[cid:(.*?)\]/m
24
+ IMG_CID_HTML_REGEX = %r/cid:([^"]*)/m
25
+
26
+ def self.installed?
27
+ File.exist?(Libis::Format::Config[:wkhtmltopdf])
28
+ end
29
+
30
+ def self.run(source, target, options = {})
31
+ new.run source, target, options
32
+ end
33
+
34
+ def run(source, target, options = {})
35
+ # Preliminary checks
36
+ # ------------------
37
+
38
+ @warnings = []
39
+
40
+ # Check if source file exists
41
+ raise "File #{source} does not exist" unless File.exist?(source)
42
+
43
+ # Retrieving the message
44
+ # ----------------------
45
+
46
+ # Open the message
47
+ msg = Mapi::Msg.open(source)
48
+
49
+ target_format = options.delete(:to_html) ? :HTML : :PDF
50
+ result = msg_to_pdf(msg, target, target_format, options)
51
+ msg.close
52
+ return result
53
+ end
54
+
55
+ def msg_to_pdf(msg, target, target_format, pdf_options, reraise: false)
56
+
57
+ # Make sure the target directory exists
58
+ outdir = File.dirname(target)
59
+ FileUtils.mkdir_p(outdir)
60
+
61
+ # puts "Headers:"
62
+ # puts '--------'
63
+ # pp msg.headers
64
+
65
+ # puts "Recipients:"
66
+ # puts '-----------'
67
+ # pp msg.recipients
68
+
69
+ # puts "Body:"
70
+ # puts '-----'
71
+ # puts msg.properties.body
72
+ # puts '-----'
73
+ # puts msg.properties.body_rtf
74
+ # puts '-----'
75
+ # puts msg.properties.body_html
76
+
77
+ # puts "Attachments:"
78
+ # puts '------------'
79
+ # msg.attachments.each {|a| p "#{a.filename} - #{a.properties.attach_content_id}"}
80
+
81
+ # puts "Converting:"
82
+ # puts '-----------'
83
+
84
+ # Get the body of the message in HTML
85
+ body = msg.properties.body_html
86
+ body ||= begin
87
+ # Embed plain body in HTML as a fallback
88
+ HTML_WRAPPER_TEMPLATE % msg.properties.body
89
+ end
90
+
91
+ # Check and fix the character encoding
92
+ begin
93
+ # Try to encode into UTF-8
94
+ body.encode!('UTF-8', universal_newline: true)
95
+ rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError
96
+ begin
97
+ # If it fails, the text may be in Windows' Latin1 (ISO-8859-1)
98
+ body.force_encoding('ISO-8859-1').encode!('UTF-8', universal_newline: true)
99
+ rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError => e
100
+ # If that fails too, log a warning and replace the invalid/unknown with a ? character.
101
+ @warnings << "#{e.class}: #{e.message}"
102
+ body.encode!('UTF-8', universal_newline: true, invalid: :replace, undef: :replace)
103
+ end
104
+ end
105
+
106
+ # Process headers
107
+ # ---------------
108
+ headers = {}
109
+ hdr_html = ''
110
+
111
+ %w"From To Cc Subject Date".each do |key|
112
+ value = find_hdr(msg.headers, key)
113
+ if value
114
+ headers[key.downcase.to_sym] = value
115
+ hdr_html += hdr_html(key, value)
116
+ end
117
+ end
118
+
119
+ # Add header section to the HTML body
120
+ unless hdr_html.empty?
121
+ # Insert header block styles
122
+ if body =~ /<\/head>/
123
+ # if head exists, append the style block
124
+ body.gsub!(/<\/head>/, HEADER_STYLE + '</head>')
125
+ else
126
+ # otherwise insert a head section before the body tag
127
+ body.gsub!(/<body/, '<head>' + HEADER_STYLE + '</head><body')
128
+ end
129
+ # Add the headers html table as first element in the body section
130
+ body.gsub!(/<body[^>]*>/) {|m| "#{m}#{HEADER_TABLE_TEMPLATE % hdr_html}"}
131
+ end
132
+
133
+ # Embed inline images
134
+ # -------------------
135
+ attachments = msg.attachments
136
+ used_files = []
137
+
138
+ # First process plaintext cid entries
139
+ body.gsub!(IMG_CID_PLAIN_REGEX) do |match|
140
+ # puts "CID found: #{match}, looking for #{$1}"
141
+ data = getAttachmentData(attachments, $1)
142
+ unless data
143
+ # puts "cid #{$1} not found"
144
+ return '<img src=""/>'
145
+ end
146
+ # puts "cid #{$1} data: #{data.inspect}"
147
+ used_files << $1
148
+ "<img src=\"data:#{data[:mime_type]};base64,#{data[:base64]}\"/>"
149
+ end
150
+
151
+ # Then process HTML img tags with CID entries
152
+ body.gsub!(IMG_CID_HTML_REGEX) do |match|
153
+ # puts "CID found: #{match}, looking for #{$1}"
154
+ data = getAttachmentData(attachments, $1)
155
+ unless data
156
+ # puts "cid #{$1} not found"
157
+ return ''
158
+ end
159
+ # puts "cid #{$1} data: #{data.inspect}"
160
+ used_files << $1
161
+ "data:#{data[:mime_type]};base64,#{data[:base64]}"
162
+ end
163
+
164
+ # Create PDF
165
+ # ----------
166
+ files = []
167
+
168
+ if target_format == :PDF
169
+ # PDF creation options
170
+ pdf_options = {
171
+ page_size: 'A4',
172
+ margin_top: '10mm',
173
+ margin_bottom: '10mm',
174
+ margin_left: '10mm',
175
+ margin_right: '10mm',
176
+ dpi: 300,
177
+ # image_quality: 100,
178
+ # viewport_size: '2480x3508',
179
+ }.merge pdf_options
180
+
181
+ # pp pdf_options
182
+ # puts "Final HTML body:"
183
+ # pp body
184
+ subject = find_hdr(msg.headers, 'Subject')
185
+ kit = PDFKit.new(body, title: (subject || 'message'), **pdf_options)
186
+ pdf = kit.to_pdf
187
+ File.open(target, 'wb') {|f| f.write(pdf)}
188
+ # puts "message #{subject} converted to PDF file '#{target}'"
189
+ else
190
+ File.open(target, 'wb') {|f| f.write(body)}
191
+ # puts "message #{subject} converted to HTML file '#{target}'"
192
+ end
193
+ files << target if File.exist?(target)
194
+
195
+ # Save attachments
196
+ # ----------------
197
+ outdir = File.join(outdir, "#{File.basename(target)}.attachments")
198
+ digits = ((attachments.count + 1)/ 10) + 1
199
+ i = 0
200
+ attachments.delete_if {|a| a.properties.attachment_hidden}.each do |a|
201
+ prefix = "#{"%0*d" % [digits, i]}-"
202
+ if sub_msg = a.instance_variable_get(:@embedded_msg)
203
+ # puts "Embedded email message ..."
204
+ subject = a.properties[:display_name] || sub_msg.subject || ""
205
+ file = File.join(outdir, "#{prefix}#{subject}.#{target_format.to_s.downcase}")
206
+
207
+ result = msg_to_pdf(sub_msg, file, target_format, pdf_options, reraise: true)
208
+ if e = result[:error]
209
+ raise
210
+ end
211
+ files += result[:files]
212
+ elsif a.filename
213
+ next if used_files.include?(a.filename)
214
+ file = File.join(outdir, "#{prefix}#{a.filename}")
215
+
216
+ FileUtils.mkdir_p(File.dirname(file))
217
+ File.open(file, 'wb') {|f| a.save(f)}
218
+ files << file
219
+ # puts "Attachment file '#{file}' created"
220
+ else
221
+ @warnings << "Attachment #{a.properties[:display_name]} cannot be saved"
222
+ next
223
+ end
224
+ i += 1
225
+ end
226
+
227
+ {
228
+ command: {status: 0},
229
+ files: files,
230
+ headers: headers,
231
+ warnings: @warnings
232
+ }
233
+
234
+ rescue Exception => e
235
+ # puts "ERROR: Exception #{e.class} raised: #{e.message}"
236
+ # e.backtrace.each {|t| puts " - #{t}"}
237
+ raise if reraise
238
+ msg.close
239
+ return {
240
+ command: {status: -1},
241
+ files: [],
242
+ headers: {},
243
+ errors: [
244
+ {
245
+ error: e.message,
246
+ error_class: e.class.name,
247
+ error_trace: e.backtrace,
248
+ }
249
+ ],
250
+ warnings: @warnings
251
+ }
252
+ end
253
+
254
+ protected
255
+
256
+ def eml_to_html
257
+
258
+ end
259
+
260
+ private
261
+
262
+ def find_hdr(list, key)
263
+ keys = list.keys
264
+ if k = keys.find {|x| x.to_s =~ /^#{key}$/i}
265
+ v = list[k]
266
+ v = v.first if v.is_a? Array
267
+ v = Rfc2047.decode(v).strip if v.is_a? String
268
+ return v
269
+ end
270
+ nil
271
+ end
272
+
273
+ def hdr_html(key, value)
274
+ return HEADER_FIELD_TEMPLATE % [key, CGI::escapeHTML(value)] if key.is_a?(String) && value.is_a?(String) && !value.empty?
275
+ ''
276
+ end
277
+
278
+ def getAttachmentData(attachments, cid)
279
+ attachments.each do |attachment|
280
+ if attachment.properties.attach_content_id == cid
281
+ attachment.data.rewind
282
+ return {
283
+ mime_type: attachment.properties.attach_mime_tag,
284
+ base64: Base64.encode64(attachment.data.read).gsub(/[\r\n]/, '')
285
+ }
286
+ end
287
+ end
288
+ return nil
289
+ end
290
+
291
+ def read_header(headers_file)
292
+ headers = YAML.load_file(headers_file)
293
+ headers.symbolize_keys
294
+ end
295
+ end
296
+ end
297
+ end
298
+ end
@@ -18,6 +18,7 @@ module Libis
18
18
  autoload :PdfSplit, 'libis/format/tool/pdf_split'
19
19
  autoload :PdfToPdfa, 'libis/format/tool/pdf_to_pdfa'
20
20
  autoload :PdfaValidator, 'libis/format/tool/pdfa_validator'
21
+ autoload :MsgToPdf, 'libis/format/tool/msg_to_pdf'
21
22
 
22
23
  end
23
24
  end
@@ -1,5 +1,5 @@
1
1
  module Libis
2
2
  module Format
3
- VERSION = '1.2.3'
3
+ VERSION = '1.2.5'
4
4
  end
5
5
  end
data/libis-format.gemspec CHANGED
@@ -40,5 +40,8 @@ Gem::Specification.new do |spec|
40
40
  spec.add_runtime_dependency 'chromaprint', '~> 0.0.2'
41
41
  spec.add_runtime_dependency 'naturally', '~> 2.1'
42
42
  spec.add_runtime_dependency 'pdfinfo', '~> 1.4'
43
+ spec.add_runtime_dependency 'ruby-msg-nx'
44
+ spec.add_runtime_dependency 'new_rfc_2047'
45
+ spec.add_runtime_dependency 'pdfkit'
43
46
 
44
47
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: libis-format
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.3
4
+ version: 1.2.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kris Dekeyser
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-07-19 00:00:00.000000000 Z
11
+ date: 2023-08-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake
@@ -178,6 +178,48 @@ dependencies:
178
178
  - - "~>"
179
179
  - !ruby/object:Gem::Version
180
180
  version: '1.4'
181
+ - !ruby/object:Gem::Dependency
182
+ name: ruby-msg-nx
183
+ requirement: !ruby/object:Gem::Requirement
184
+ requirements:
185
+ - - ">="
186
+ - !ruby/object:Gem::Version
187
+ version: '0'
188
+ type: :runtime
189
+ prerelease: false
190
+ version_requirements: !ruby/object:Gem::Requirement
191
+ requirements:
192
+ - - ">="
193
+ - !ruby/object:Gem::Version
194
+ version: '0'
195
+ - !ruby/object:Gem::Dependency
196
+ name: new_rfc_2047
197
+ requirement: !ruby/object:Gem::Requirement
198
+ requirements:
199
+ - - ">="
200
+ - !ruby/object:Gem::Version
201
+ version: '0'
202
+ type: :runtime
203
+ prerelease: false
204
+ version_requirements: !ruby/object:Gem::Requirement
205
+ requirements:
206
+ - - ">="
207
+ - !ruby/object:Gem::Version
208
+ version: '0'
209
+ - !ruby/object:Gem::Dependency
210
+ name: pdfkit
211
+ requirement: !ruby/object:Gem::Requirement
212
+ requirements:
213
+ - - ">="
214
+ - !ruby/object:Gem::Version
215
+ version: '0'
216
+ type: :runtime
217
+ prerelease: false
218
+ version_requirements: !ruby/object:Gem::Requirement
219
+ requirements:
220
+ - - ">="
221
+ - !ruby/object:Gem::Version
222
+ version: '0'
181
223
  description: Collection of tools and classes that help to identify formats of binary
182
224
  files and create derivative copies (e.g. PDF from Word).
183
225
  email:
@@ -194,6 +236,7 @@ files:
194
236
  - ".coveralls.yml"
195
237
  - ".gitignore"
196
238
  - ".travis.yml"
239
+ - ".vscode/launch.json"
197
240
  - Gemfile
198
241
  - LICENSE.txt
199
242
  - README.md
@@ -240,13 +283,13 @@ files:
240
283
  - lib/libis/format/identifier.rb
241
284
  - lib/libis/format/tool.rb
242
285
  - lib/libis/format/tool/droid.rb
243
- - lib/libis/format/tool/email_to_pdf.rb
244
286
  - lib/libis/format/tool/extension_identification.rb
245
287
  - lib/libis/format/tool/ff_mpeg.rb
246
288
  - lib/libis/format/tool/fido.rb
247
289
  - lib/libis/format/tool/file_tool.rb
248
290
  - lib/libis/format/tool/fop_pdf.rb
249
291
  - lib/libis/format/tool/identification_tool.rb
292
+ - lib/libis/format/tool/msg_to_pdf.rb
250
293
  - lib/libis/format/tool/office_to_pdf.rb
251
294
  - lib/libis/format/tool/pdf_copy.rb
252
295
  - lib/libis/format/tool/pdf_merge.rb
@@ -262,7 +305,6 @@ files:
262
305
  - tools/PdfTool.jar
263
306
  - tools/bcpkix-jdk15on-1.49.jar
264
307
  - tools/bcprov-jdk15on-1.49.jar
265
- - tools/emailconverter.jar
266
308
  - tools/fop/build/fop.jar
267
309
  - tools/fop/conf/fop.xconf
268
310
  - tools/fop/fop
@@ -335,7 +377,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
335
377
  - !ruby/object:Gem::Version
336
378
  version: '0'
337
379
  requirements: []
338
- rubygems_version: 3.1.6
380
+ rubygems_version: 3.3.7
339
381
  signing_key:
340
382
  specification_version: 4
341
383
  summary: LIBIS File format format services.
@@ -1,75 +0,0 @@
1
- require "fileutils"
2
-
3
- require "libis/tools/extend/string"
4
- require "libis/tools/logger"
5
- require "libis/tools/command"
6
-
7
- require "libis/format/config"
8
- require "rexml/document"
9
-
10
- module Libis
11
- module Format
12
- module Tool
13
- class EmailToPdf
14
- include ::Libis::Tools::Logger
15
-
16
- def self.installed?
17
- result = Libis::Tools::Command.run(Libis::Format::Config[:email2pdf_cmd], "-v")
18
- result[:status] == 0
19
- end
20
-
21
- def self.run(source, target, options = {})
22
- new.run source, target, options
23
- end
24
-
25
- def run(source, target, _ = {})
26
- timeout = Libis::Format::Config[:timeouts][:email2pdf] || 120
27
- result = Libis::Tools::Command.run(
28
- Libis::Format::Config[:java_cmd],
29
- "-Duser.timezone=Europe/Brussels", "-Duser.language=nl", "-Duser.country=BE",
30
- "-jar", Libis::Format::Config[:email2pdf_jar],
31
- "-e", "-hd", "-a",
32
- "-o", target,
33
- source,
34
- timeout: timeout,
35
- kill_after: timeout * 2
36
- )
37
-
38
- warn "EmailToPdf conversion messages: \n\t#{result[:err].join("\n\t")}" unless result[:err].empty?
39
-
40
- raise "#{self.class} took too long (> #{timeout} seconds) to complete" if result[:timeout]
41
- raise "#{self.class} failed to generate target file #{target}" unless File.exist?(target)
42
- raise "#{self.class} command failed with status code #{result[:status]}" unless result[:status] == 0
43
-
44
- base_path = File.join(File.dirname(target), File.basename(target, ".*"))
45
- headers_file = "#{base_path}.headers.xml"
46
- headers = read_header(headers_file)
47
-
48
- {
49
- command: result,
50
- files: [target, headers_file] + headers[:attachments].map { |a| File.join("#{base_path}-attachments", a) },
51
- headers: headers
52
- }
53
- end
54
-
55
- private
56
-
57
- def read_header(headers_file)
58
- headers = {}
59
- return headers unless File.exist?(headers_file)
60
- doc = REXML::Document.new(File.new(headers_file))
61
- root = doc.root
62
- root.children.each do |element|
63
- case element.name
64
- when "attachments"
65
- headers[:attachments] = element.elements.map { |e| e.text }
66
- else
67
- headers[element.name.to_sym] = element.text
68
- end
69
- end
70
- headers
71
- end
72
- end
73
- end
74
- end
75
- end
Binary file