libis-format 1.2.3 → 1.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8bb468afbfdd7031a466736c6d355919395bf295356032f6b3d3488908bb61b5
4
- data.tar.gz: b2ddc4588f0f3a1ac4ddde120ce726af21ab9013c98bae303674dca2033fec58
3
+ metadata.gz: 19055f943473fa0d2a9a1fdebf499c87652fcab845239a4b1f21341229ad08ad
4
+ data.tar.gz: f6f64bcaf5472f0e11aceb2cbe97b3f367b0ed4ccc4dfc389e5857f191c59897
5
5
  SHA512:
6
- metadata.gz: 176685c4eb8fac9734666928c7d22b18cc870158b4a45151b35f5cc81922d4f6f0521af026719efb04d9b3c3920ae887c271739b91ade104e5d1f2e6651e8f3e
7
- data.tar.gz: f3ca3c913aef08d75ff1dadef3790e7ad6838429bc91dbcff3173391f8d052d3f23ef9bcf3309c4c062650064947821cc046431cfe1196644964c04236cff6a7
6
+ metadata.gz: 7b0ee24177c3f541a8cd12733e04a04649d03b2785c68c8eaa44ca3fdebd7c657346afc9a824d59e872666bfa3a657483b00b6126970cc739ef8433889621847
7
+ data.tar.gz: 0bdbc5a928d6e556a816f65d05f2151e6b6ac331de37ce2d12030ace291f2b4cae452420236d9d5371f3566a7346f60854976dc33181e17549663b8ada5302f6
@@ -0,0 +1,21 @@
1
+ {
2
+ // Use IntelliSense to learn about possible attributes.
3
+ // Hover to view descriptions of existing attributes.
4
+ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5
+ "version": "0.2.0",
6
+ "configurations": [
7
+ {
8
+ "type": "rdbg",
9
+ "name": "Debug current file with rdbg",
10
+ "request": "launch",
11
+ "script": "${file}",
12
+ "args": [],
13
+ "askParameters": true
14
+ },
15
+ {
16
+ "type": "rdbg",
17
+ "name": "Attach with rdbg",
18
+ "request": "attach"
19
+ }
20
+ ]
21
+ }
data/Rakefile CHANGED
@@ -1,6 +1,10 @@
1
+ require 'rake'
2
+
1
3
  require 'bundler/gem_tasks'
2
4
  require 'rspec/core/rake_task'
3
5
 
6
+ require_relative 'spec/spec_helper'
7
+
4
8
  RSpec::Core::RakeTask.new
5
9
 
6
10
  task :default => :spec
@@ -1,10 +1,10 @@
1
1
  # encoding: utf-8
2
2
  require 'os'
3
+ require 'libis-tools'
3
4
 
4
5
  module Libis
5
6
  module Format
6
7
 
7
- # noinspection RubyConstantNamingConvention
8
8
  Config = ::Libis::Tools::Config
9
9
 
10
10
  Config[:converter_chain_max_level] = 8
@@ -21,8 +21,7 @@ module Libis
21
21
  Config[:fido_formats] = [(File.join(Libis::Format::DATA_DIR, 'lias_formats.xml'))]
22
22
  Config[:pdf_tool] = File.join(Libis::Format::TOOL_DIR, 'PdfTool.jar')
23
23
  Config[:preflight_jar] = File.join(Libis::Format::TOOL_DIR, 'pdfbox', 'preflight-app-2.0.13.jar')
24
- Config[:email2pdf_jar] = File.join(Libis::Format::TOOL_DIR, 'emailconverter.jar')
25
- # noinspection RubyStringKeysInHashInspection
24
+ Config[:wkhtmltopdf] = 'wkhtmltopdf'
26
25
  Config[:xml_validations] = [['archive/ead', File.join(Libis::Format::DATA_DIR, 'ead.xsd')]]
27
26
  Config[:type_database] = File.join(Libis::Format::DATA_DIR, 'types.yml')
28
27
  Config[:raw_audio_convert_cmd] = 'sox -V1 %s -e signed -b 16 -t wav %s rate %d channels %d'
@@ -2,7 +2,7 @@
2
2
 
3
3
  require_relative 'base'
4
4
 
5
- require 'libis/format/tool/email_to_pdf'
5
+ require 'libis/format/tool/msg_to_pdf'
6
6
  require 'libis/format/type_database'
7
7
  require 'rexml/document'
8
8
 
@@ -28,7 +28,7 @@ module Libis
28
28
  def convert(source, target, format, opts = {})
29
29
  super
30
30
 
31
- Format::Tool::EmailToPdf.run(source, target)
31
+ Format::Tool::MsgToPdf.run(source, target)
32
32
  end
33
33
 
34
34
  end
@@ -0,0 +1,298 @@
1
+ # encoding: utf-8
2
+
3
+ require 'mapi/msg'
4
+ require 'rfc_2047'
5
+ require 'cgi'
6
+ require 'pdfkit'
7
+
8
+ require 'fileutils'
9
+
10
+ require 'libis/format/config'
11
+
12
+ module Libis
13
+ module Format
14
+ module Tool
15
+ class MsgToPdf
16
+ include ::Libis::Tools::Logger
17
+
18
+ HEADER_STYLE = '<style>.header-table {margin: 0 0 20 0;padding: 0;font-family: Arial, Helvetica, sans-serif;}.header-name {padding-right: 5px;color: #9E9E9E;text-align: right;vertical-align: top;font-size: 12px;}.header-value {font-size: 12px;}#header_fields {background: white;margin: 0;border: 1px solid #DDD;border-radius: 3px;padding: 8px;width: 100%%;box-sizing: border-box;}</style><script type="text/javascript">function timer() {try {parent.postMessage(Math.max(document.body.offsetHeight, document.body.scrollHeight), \'*\');} catch (r) {}setTimeout(timer, 10);};timer();</script>'
19
+ HEADER_TABLE_TEMPLATE = '<div class="header-table"><table id="header_fields"><tbody>%s</tbody></table></div>'
20
+ HEADER_FIELD_TEMPLATE = '<tr><td class="header-name">%s</td><td class="header-value">%s</td></tr>'
21
+ HTML_WRAPPER_TEMPLATE = '<!DOCTYPE html><html><head><style>body {font-size: 0.5cm;}</style><title>title</title></head><body>%s</body></html>'
22
+
23
+ IMG_CID_PLAIN_REGEX = %r/\[cid:(.*?)\]/m
24
+ IMG_CID_HTML_REGEX = %r/cid:([^"]*)/m
25
+
26
# Tells whether the wkhtmltopdf executable named by
# Libis::Format::Config[:wkhtmltopdf] can be found.
#
# A value with a directory part is checked as a path. A bare command name is
# looked up in every PATH directory, because File.exist? on a bare name would
# only inspect the current working directory.
#
# @return [Boolean] true when an executable file was found
def self.installed?
  cmd = Libis::Format::Config[:wkhtmltopdf].to_s
  return false if cmd.empty?

  # Windows executables carry one of the PATHEXT suffixes; without PATHEXT only '' is tried.
  suffixes = [''] + ENV.fetch('PATHEXT', '').split(';')
  runnable = lambda do |path|
    suffixes.any? { |ext| File.file?(path + ext) && File.executable?(path + ext) }
  end

  # The value already points somewhere specific: test exactly that location.
  return runnable.call(cmd) unless File.basename(cmd) == cmd

  ENV.fetch('PATH', '').split(File::PATH_SEPARATOR).reject(&:empty?).any? do |dir|
    runnable.call(File.join(dir, cmd))
  end
end
29
+
30
# Class-level shortcut: creates a converter instance and hands the
# arguments to its #run method, returning whatever that returns.
def self.run(source, target, options = {})
  converter = new
  converter.run(source, target, options)
end
33
+
34
# Converts the message file +source+ into +target+.
#
# @param source [String] path of the message file; opened with Mapi::Msg.open
# @param target [String] path of the file to create
# @param options [Hash] a truthy :to_html selects HTML output instead of PDF;
#   every other entry is passed on to #msg_to_pdf as a PDF option
# @return [Hash] the result of #msg_to_pdf
# @raise [RuntimeError] when +source+ does not exist
def run(source, target, options = {})
  @warnings = []

  raise "File #{source} does not exist" unless File.exist?(source)

  # Work on a copy so that removing :to_html does not alter the caller's hash.
  options = options.dup
  target_format = options.delete(:to_html) ? :HTML : :PDF

  msg = Mapi::Msg.open(source)
  begin
    msg_to_pdf(msg, target, target_format, options)
  ensure
    # Release the message file even if the conversion raises.
    msg.close
  end
end
54
+
55
# Renders one message to +target+ and stores its attachments.
#
# The HTML body is used when the message has one; otherwise the plain text
# body is escaped and wrapped in a minimal HTML page. The From/To/Cc/Subject/
# Date headers are added as a table at the top of the body, images referenced
# by content id are embedded as data URIs, and the remaining visible
# attachments are written to "<target>.attachments" next to the target.
# Embedded messages are converted recursively into that same directory.
#
# The message is not closed here: #run, which opens it, closes it once this
# method has returned.
#
# @param msg [Object] message offering #properties, #headers and #attachments
# @param target [String] path of the file to create
# @param target_format [Symbol] :PDF renders through PDFKit; any other value writes the HTML itself
# @param pdf_options [Hash] PDFKit options, merged over the page defaults below
# @param reraise [Boolean] when true, errors are raised instead of being reported in the result
# @return [Hash] :command, :files, :headers and :warnings; after a failure
#   :command is {status: -1} and :errors describes the exception
def msg_to_pdf(msg, target, target_format, pdf_options, reraise: false)
  # Allow direct calls that did not go through #run.
  @warnings ||= []

  outdir = File.dirname(target)
  FileUtils.mkdir_p(outdir)

  # Body
  # ----
  # Plain text is escaped so that characters such as '<' and '&' survive in the HTML page.
  # The copy keeps the in-place edits below away from the message's own data.
  body = (msg.properties.body_html ||
          (HTML_WRAPPER_TEMPLATE % CGI.escapeHTML(msg.properties.body.to_s))).dup

  begin
    body.encode!('UTF-8', universal_newline: true)
  rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError
    begin
      # Second attempt: treat the bytes as ISO-8859-1.
      body.force_encoding('ISO-8859-1').encode!('UTF-8', universal_newline: true)
    rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError => e
      # Last resort: keep going and replace whatever cannot be converted.
      @warnings << "#{e.class}: #{e.message}"
      body.encode!('UTF-8', universal_newline: true, invalid: :replace, undef: :replace)
    end
  end

  # Headers
  # -------
  headers = {}
  header_rows = +''
  %w[From To Cc Subject Date].each do |key|
    value = find_hdr(msg.headers, key)
    next unless value

    headers[key.downcase.to_sym] = value
    header_rows << hdr_html(key, value)
  end

  unless header_rows.empty?
    # HEADER_STYLE spells its CSS percentage as "100%%"; emit a single '%'.
    style = HEADER_STYLE.gsub('%%', '%')
    if body =~ /<\/head>/
      body.gsub!(/<\/head>/) { "#{style}</head>" }
    else
      body.gsub!(/<body/) { "<head>#{style}</head><body" }
    end
    # The header table becomes the first element inside <body>.
    body.gsub!(/<body[^>]*>/) { |tag| "#{tag}#{HEADER_TABLE_TEMPLATE % header_rows}" }
  end

  # Inline images
  # -------------
  attachments = msg.attachments
  embedded_cids = []

  # Plain text references: [cid:...]
  body.gsub!(IMG_CID_PLAIN_REGEX) do
    cid = Regexp.last_match(1)
    data = getAttachmentData(attachments, cid)
    # `next` supplies the replacement text; `return` would abandon the whole conversion.
    next '<img src=""/>' unless data

    embedded_cids << cid
    "<img src=\"data:#{data[:mime_type]};base64,#{data[:base64]}\"/>"
  end

  # HTML references: cid:... inside an attribute value
  body.gsub!(IMG_CID_HTML_REGEX) do
    cid = Regexp.last_match(1)
    data = getAttachmentData(attachments, cid)
    next '' unless data

    embedded_cids << cid
    "data:#{data[:mime_type]};base64,#{data[:base64]}"
  end

  # Output file
  # -----------
  files = []
  if target_format == :PDF
    pdf_options = {
      page_size: 'A4',
      margin_top: '10mm',
      margin_bottom: '10mm',
      margin_left: '10mm',
      margin_right: '10mm',
      dpi: 300
    }.merge(pdf_options)

    subject = find_hdr(msg.headers, 'Subject')
    kit = PDFKit.new(body, title: (subject || 'message'), **pdf_options)
    # Render first so that a failing conversion does not leave an empty target behind.
    pdf = kit.to_pdf
    File.binwrite(target, pdf)
  else
    File.binwrite(target, body)
  end
  files << target if File.exist?(target)

  # Attachments
  # -----------
  attachments_dir = File.join(outdir, "#{File.basename(target)}.attachments")
  # reject (not delete_if) leaves the message's own attachment list untouched.
  visible = attachments.reject { |a| a.properties.attachment_hidden }
  # Width of the zero padded index: number of digits of the largest possible index.
  digits = [visible.size - 1, 0].max.to_s.length
  index = 0

  visible.each do |a|
    prefix = format('%0*d-', digits, index)
    # NOTE(review): subject and filename are used in the path as they come from the message;
    # confirm that the Mapi library already strips directory separators from them.
    if (sub_msg = a.instance_variable_get(:@embedded_msg))
      subject = a.properties[:display_name] || sub_msg.subject || ''
      file = File.join(attachments_dir, "#{prefix}#{subject}.#{target_format.to_s.downcase}")
      # With reraise: true a failing embedded message raises and is reported by the rescue below.
      files.concat(msg_to_pdf(sub_msg, file, target_format, pdf_options, reraise: true)[:files])
    elsif a.filename
      # Images already embedded in the body are recorded by content id, so compare content ids.
      next if embedded_cids.include?(a.properties.attach_content_id)

      file = File.join(attachments_dir, "#{prefix}#{a.filename}")
      FileUtils.mkdir_p(File.dirname(file))
      File.open(file, 'wb') { |f| a.save(f) }
      files << file
    else
      @warnings << "Attachment #{a.properties[:display_name]} cannot be saved"
      next
    end
    index += 1
  end

  {
    command: { status: 0 },
    files: files,
    headers: headers,
    warnings: @warnings
  }
rescue StandardError => e
  # StandardError only: interrupts and exit requests must keep propagating.
  raise if reraise

  {
    command: { status: -1 },
    files: [],
    headers: {},
    errors: [
      {
        error: e.message,
        error_class: e.class.name,
        error_trace: e.backtrace
      }
    ],
    warnings: @warnings
  }
end
253
+
254
+ protected
255
+
256
# Empty placeholder: no behaviour is implemented, so calling it returns nil.
def eml_to_html; end
259
+
260
+ private
261
+
262
# Looks up a header by name, ignoring case.
#
# @param list [Hash] header names mapped to a value or an array of values
# @param key [String] header name to look for
# @return [Object, nil] the value stored for the header (the first one when it
#   is an Array); a String value is passed through Rfc2047.decode and stripped.
#   nil when no header with that name exists.
def find_hdr(list, key)
  wanted = key.to_s
  # Compare as text: interpolating the name into a regexp would turn characters
  # such as '.' into wildcards.
  name = list.keys.find { |candidate| candidate.to_s.casecmp?(wanted) }
  return nil unless name

  value = list[name]
  value = value.first if value.is_a?(Array)
  value = Rfc2047.decode(value).strip if value.is_a?(String)
  value
end
272
+
273
# Builds the HTML table row for one header from HEADER_FIELD_TEMPLATE.
#
# @param key [String] header name
# @param value [String] header value
# @return [String] the row with name and value HTML-escaped, or '' when either
#   argument is not a String or the value is empty
def hdr_html(key, value)
  return '' unless key.is_a?(String) && value.is_a?(String) && !value.empty?

  # Both parts end up inside markup, so both are escaped.
  format(HEADER_FIELD_TEMPLATE, CGI.escapeHTML(key), CGI.escapeHTML(value))
end
277
+
278
# Finds the attachment with content id +cid+ and returns its content in the
# form needed for a data URI.
#
# @param attachments [Enumerable] attachments offering #properties and #data
#   (an object responding to #rewind and #read)
# @param cid [String] content id to look for
# @return [Hash, nil] {mime_type:, base64:} with the base64 text free of line
#   breaks, or nil when no attachment has that content id
def getAttachmentData(attachments, cid)
  attachment = attachments.find { |a| a.properties.attach_content_id == cid }
  return nil unless attachment

  io = attachment.data
  # The stream may have been read before; always start from the beginning.
  io.rewind
  {
    mime_type: attachment.properties.attach_mime_tag,
    # pack('m0') yields base64 without line feeds and needs no extra library.
    base64: [io.read].pack('m0')
  }
end
290
+
291
# Reads a YAML file of headers and returns it with symbol keys.
#
# @param headers_file [String] path of the YAML file
# @return [Hash] the top-level mapping with every key that supports #to_sym
#   converted to a Symbol; {} when the file holds no document
def read_header(headers_file)
  # Loaded here on purpose: nothing else in this file uses YAML and the
  # file's require list does not load it.
  require 'yaml'

  # NOTE(review): load_file is only appropriate for files this tool wrote itself.
  headers = YAML.load_file(headers_file) || {}
  # Core Hash has no symbolize_keys (it comes from ActiveSupport), so convert explicitly.
  headers.transform_keys { |key| key.respond_to?(:to_sym) ? key.to_sym : key }
end
295
+ end
296
+ end
297
+ end
298
+ end
@@ -18,6 +18,7 @@ module Libis
18
18
  autoload :PdfSplit, 'libis/format/tool/pdf_split'
19
19
  autoload :PdfToPdfa, 'libis/format/tool/pdf_to_pdfa'
20
20
  autoload :PdfaValidator, 'libis/format/tool/pdfa_validator'
21
+ autoload :MsgToPdf, 'libis/format/tool/msg_to_pdf'
21
22
 
22
23
  end
23
24
  end
@@ -1,5 +1,5 @@
1
1
  module Libis
2
2
  module Format
3
- VERSION = '1.2.3'
3
+ VERSION = '1.2.5'
4
4
  end
5
5
  end
data/libis-format.gemspec CHANGED
@@ -40,5 +40,8 @@ Gem::Specification.new do |spec|
40
40
  spec.add_runtime_dependency 'chromaprint', '~> 0.0.2'
41
41
  spec.add_runtime_dependency 'naturally', '~> 2.1'
42
42
  spec.add_runtime_dependency 'pdfinfo', '~> 1.4'
43
+ spec.add_runtime_dependency 'ruby-msg-nx'
44
+ spec.add_runtime_dependency 'new_rfc_2047'
45
+ spec.add_runtime_dependency 'pdfkit'
43
46
 
44
47
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: libis-format
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.3
4
+ version: 1.2.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Kris Dekeyser
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-07-19 00:00:00.000000000 Z
11
+ date: 2023-08-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake
@@ -178,6 +178,48 @@ dependencies:
178
178
  - - "~>"
179
179
  - !ruby/object:Gem::Version
180
180
  version: '1.4'
181
+ - !ruby/object:Gem::Dependency
182
+ name: ruby-msg-nx
183
+ requirement: !ruby/object:Gem::Requirement
184
+ requirements:
185
+ - - ">="
186
+ - !ruby/object:Gem::Version
187
+ version: '0'
188
+ type: :runtime
189
+ prerelease: false
190
+ version_requirements: !ruby/object:Gem::Requirement
191
+ requirements:
192
+ - - ">="
193
+ - !ruby/object:Gem::Version
194
+ version: '0'
195
+ - !ruby/object:Gem::Dependency
196
+ name: new_rfc_2047
197
+ requirement: !ruby/object:Gem::Requirement
198
+ requirements:
199
+ - - ">="
200
+ - !ruby/object:Gem::Version
201
+ version: '0'
202
+ type: :runtime
203
+ prerelease: false
204
+ version_requirements: !ruby/object:Gem::Requirement
205
+ requirements:
206
+ - - ">="
207
+ - !ruby/object:Gem::Version
208
+ version: '0'
209
+ - !ruby/object:Gem::Dependency
210
+ name: pdfkit
211
+ requirement: !ruby/object:Gem::Requirement
212
+ requirements:
213
+ - - ">="
214
+ - !ruby/object:Gem::Version
215
+ version: '0'
216
+ type: :runtime
217
+ prerelease: false
218
+ version_requirements: !ruby/object:Gem::Requirement
219
+ requirements:
220
+ - - ">="
221
+ - !ruby/object:Gem::Version
222
+ version: '0'
181
223
  description: Collection of tools and classes that help to identify formats of binary
182
224
  files and create derivative copies (e.g. PDF from Word).
183
225
  email:
@@ -194,6 +236,7 @@ files:
194
236
  - ".coveralls.yml"
195
237
  - ".gitignore"
196
238
  - ".travis.yml"
239
+ - ".vscode/launch.json"
197
240
  - Gemfile
198
241
  - LICENSE.txt
199
242
  - README.md
@@ -240,13 +283,13 @@ files:
240
283
  - lib/libis/format/identifier.rb
241
284
  - lib/libis/format/tool.rb
242
285
  - lib/libis/format/tool/droid.rb
243
- - lib/libis/format/tool/email_to_pdf.rb
244
286
  - lib/libis/format/tool/extension_identification.rb
245
287
  - lib/libis/format/tool/ff_mpeg.rb
246
288
  - lib/libis/format/tool/fido.rb
247
289
  - lib/libis/format/tool/file_tool.rb
248
290
  - lib/libis/format/tool/fop_pdf.rb
249
291
  - lib/libis/format/tool/identification_tool.rb
292
+ - lib/libis/format/tool/msg_to_pdf.rb
250
293
  - lib/libis/format/tool/office_to_pdf.rb
251
294
  - lib/libis/format/tool/pdf_copy.rb
252
295
  - lib/libis/format/tool/pdf_merge.rb
@@ -262,7 +305,6 @@ files:
262
305
  - tools/PdfTool.jar
263
306
  - tools/bcpkix-jdk15on-1.49.jar
264
307
  - tools/bcprov-jdk15on-1.49.jar
265
- - tools/emailconverter.jar
266
308
  - tools/fop/build/fop.jar
267
309
  - tools/fop/conf/fop.xconf
268
310
  - tools/fop/fop
@@ -335,7 +377,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
335
377
  - !ruby/object:Gem::Version
336
378
  version: '0'
337
379
  requirements: []
338
- rubygems_version: 3.1.6
380
+ rubygems_version: 3.3.7
339
381
  signing_key:
340
382
  specification_version: 4
341
383
  summary: LIBIS File format format services.
@@ -1,75 +0,0 @@
1
- require "fileutils"
2
-
3
- require "libis/tools/extend/string"
4
- require "libis/tools/logger"
5
- require "libis/tools/command"
6
-
7
- require "libis/format/config"
8
- require "rexml/document"
9
-
10
- module Libis
11
- module Format
12
- module Tool
13
- class EmailToPdf
14
- include ::Libis::Tools::Logger
15
-
16
- def self.installed?
17
- result = Libis::Tools::Command.run(Libis::Format::Config[:email2pdf_cmd], "-v")
18
- result[:status] == 0
19
- end
20
-
21
- def self.run(source, target, options = {})
22
- new.run source, target, options
23
- end
24
-
25
- def run(source, target, _ = {})
26
- timeout = Libis::Format::Config[:timeouts][:email2pdf] || 120
27
- result = Libis::Tools::Command.run(
28
- Libis::Format::Config[:java_cmd],
29
- "-Duser.timezone=Europe/Brussels", "-Duser.language=nl", "-Duser.country=BE",
30
- "-jar", Libis::Format::Config[:email2pdf_jar],
31
- "-e", "-hd", "-a",
32
- "-o", target,
33
- source,
34
- timeout: timeout,
35
- kill_after: timeout * 2
36
- )
37
-
38
- warn "EmailToPdf conversion messages: \n\t#{result[:err].join("\n\t")}" unless result[:err].empty?
39
-
40
- raise "#{self.class} took too long (> #{timeout} seconds) to complete" if result[:timeout]
41
- raise "#{self.class} failed to generate target file #{target}" unless File.exist?(target)
42
- raise "#{self.class} command failed with status code #{result[:status]}" unless result[:status] == 0
43
-
44
- base_path = File.join(File.dirname(target), File.basename(target, ".*"))
45
- headers_file = "#{base_path}.headers.xml"
46
- headers = read_header(headers_file)
47
-
48
- {
49
- command: result,
50
- files: [target, headers_file] + headers[:attachments].map { |a| File.join("#{base_path}-attachments", a) },
51
- headers: headers
52
- }
53
- end
54
-
55
- private
56
-
57
- def read_header(headers_file)
58
- headers = {}
59
- return headers unless File.exist?(headers_file)
60
- doc = REXML::Document.new(File.new(headers_file))
61
- root = doc.root
62
- root.children.each do |element|
63
- case element.name
64
- when "attachments"
65
- headers[:attachments] = element.elements.map { |e| e.text }
66
- else
67
- headers[element.name.to_sym] = element.text
68
- end
69
- end
70
- headers
71
- end
72
- end
73
- end
74
- end
75
- end
Binary file