mbox2csv 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +4 -0
  3. data/lib/mbox2csv.rb +128 -3
  4. metadata +3 -7
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: fff3b927d3dc15547ce87943b6bfe1b747423222391f82cb336dbb03b710a056
4
- data.tar.gz: 143acbaaf7f95ae5f2b61df3e242bf976d328dfa780d4e3439d764a294ddffd7
3
+ metadata.gz: d01ca30a89acfe78cea4db6c1b61ee76a10ee9f268ffc51c80ed15136c98b279
4
+ data.tar.gz: 295a46bbcee3b62796febe837c145a002df6ad03dcbf1f2e9d3b3a455a46f3c7
5
5
  SHA512:
6
- metadata.gz: f68e2ca1627bcb9cf8d1325000b6b00d96b7f856227b1ea2a768962312def6bbd29dd4cce478c286ad1a610915fd5b0bbe90310fd0c370a28ca916681f2e4b35
7
- data.tar.gz: 0cf68cb50e9886e174d55844f1b68533795c23cb3a38824bf5c9c3d8c90e2b5454317db10b681ca1328eb483a0a7fca1222d18d1b20106e1a81ecfab336452fc
6
+ metadata.gz: e17144ea47a735f13d00070bca37c2e7fbbecd8e91685abb943628fc08f2e1b68e845864b69492fc333b3e72c8c07e98e78d662951d6aa4d8ecba54b6fe1dfe1
7
+ data.tar.gz: 833c1c6e20dbb6faa17d27e65db529877b08810eca35288277dc78b54f13f6034ba94318ae5376764613bd1ba0024b54a8705dbf6dcda06f1580c55dfd318132
data/README.md CHANGED
@@ -34,6 +34,10 @@ parser = Mbox2CSV::MboxParser.new(mbox_file, all_emails, sender_stats_all_emails
34
34
 
35
35
  # Parse the MBOX file, save email data, and generate statistics
36
36
  parser.parse
37
+
38
+ # Extract attachments
39
+ parser.extract_attachments(extract: true, filetypes: %w[pdf jpg], output_folder: "exports")
40
+
37
41
  ```
38
42
 
39
43
  ## 🔑 License
data/lib/mbox2csv.rb CHANGED
@@ -5,7 +5,8 @@ require 'fileutils'
5
5
  require 'ruby-progressbar'
6
6
 
7
7
  module Mbox2CSV
8
- # Main class for parsing MBOX files and saving email data and statistics to CSV files.
8
+ # Main class for parsing MBOX files, saving email data/statistics to CSV,
9
+ # and (optionally) extracting selected attachment types to disk.
9
10
  class MboxParser
10
11
  # Initializes the MboxParser with file paths for the MBOX file, output CSV file,
11
12
  # and statistics CSV files for sender and recipient statistics.
@@ -55,6 +56,44 @@ module Mbox2CSV
55
56
  puts "Error processing MBOX file: #{e.message}"
56
57
  end
57
58
 
59
# Extract selected attachment file types from the MBOX into a folder.
#
# The MBOX file is read twice: once to count its lines (the progress bar
# total) and once to split it into messages at every line starting with
# "From ". Each message is handed to process_attachment_block.
#
# @param [Boolean] extract Flag to enable/disable extraction.
# @param [Array<String>] filetypes Extensions to extract (e.g., %w[pdf jpg png]);
#   case and one leading dot are ignored.
# @param [String] output_folder Directory to write attachments into (created if missing).
# @return [Integer] Number of files successfully written; 0 when extraction is
#   disabled or when an error aborts the run (the error is printed).
# @raise [ArgumentError] if extraction is enabled but no file types are given.
def extract_attachments(extract: true, filetypes: [], output_folder: 'attachments')
  return 0 unless extract

  # Validate outside the rescued region below: a caller mistake must surface
  # as an exception instead of being printed and reported as "0 files".
  wanted_exts = Array(filetypes).map { |e| e.to_s.downcase.sub(/\A\./, '') }.uniq
  raise ArgumentError, "filetypes must not be empty when extract: true" if wanted_exts.empty?

  begin
    FileUtils.mkdir_p(output_folder)
    total_written = 0

    # First pass only sizes the progress bar.
    total_lines = File.foreach(@mbox_file).count
    progressbar = ProgressBar.create(title: "Extracting Attachments", total: total_lines, format: "%t: |%B| %p%%")

    File.open(@mbox_file, 'r') do |mbox|
      buffer = +"" # unary plus keeps the buffer appendable under frozen_string_literal
      mbox.each_line do |line|
        progressbar.increment
        # A line starting with "From " opens the next message: flush the previous one.
        if line.start_with?("From ")
          total_written += process_attachment_block(buffer, wanted_exts, output_folder) unless buffer.empty?
          buffer = +""
        end
        buffer << line
      end
      # The last message has no following "From " line to trigger a flush.
      total_written += process_attachment_block(buffer, wanted_exts, output_folder) unless buffer.empty?
    end

    puts "Attachment extraction completed. #{total_written} file(s) saved to #{output_folder}"
    total_written
  rescue => e
    puts "Error extracting attachments: #{e.message}"
    0
  end
end
96
+
58
97
  private
59
98
 
60
99
  # Processes an individual email block from the MBOX file, extracts the email fields,
@@ -86,7 +125,8 @@ module Mbox2CSV
86
125
  # @param [Mail] mail The mail object to decode.
87
126
  # @return [String] The decoded email body.
88
127
  def decode_body(mail)
89
- body = if mail.multipart?
128
+ body =
129
+ if mail.multipart?
90
130
  part = mail.text_part || mail.html_part
91
131
  part&.body&.decoded || ''
92
132
  else
@@ -129,7 +169,7 @@ module Mbox2CSV
129
169
  sender_file = File.join(@senders_folder, "#{sanitize_filename(from)}.csv")
130
170
 
131
171
  CSV.open(sender_file, 'a') do |csv|
132
- if File.size(sender_file).zero?
172
+ if File.size?(sender_file).nil? || File.size(sender_file).zero?
133
173
  csv << ['From', 'To', 'Subject', 'Date', 'Body'] # Add header if file is new
134
174
  end
135
175
  csv << [from, to, subject, date, body]
@@ -145,6 +185,86 @@ module Mbox2CSV
145
185
  def sanitize_filename(filename)
146
186
  filename.gsub(/[^0-9A-Za-z\-]/, '_')
147
187
  end
188
+
189
+ # --- Helpers for attachment extraction ---
190
+
191
# Parses one raw message and saves each attachment whose extension is wanted.
#
# Saved files are named "<sanitized base>_<YYYY-MM-DD>_<HH-MM-SS>.<ext>" using
# the message date, or "unknown_date"/"unknown_time" when no date is
# available. The path is passed through uniquify_path before writing.
#
# @param [String] buffer Raw text of a single message.
# @param [Array<String>] wanted_exts Extensions to keep, compared against the
#   lower-cased attachment extension without its dot.
# @param [String] output_folder Directory the files are written into.
# @return [Integer] Number of attachments written; 0 if the message could not
#   be processed (the error is printed).
def process_attachment_block(buffer, wanted_exts, output_folder)
  return 0 if buffer.nil? || buffer.empty?

  message = Mail.read_from_string(buffer)
  return 0 unless message

  # Reading the date may fail; treat any such failure as "no date".
  sent_at = begin
    message.date
  rescue StandardError
    nil
  end

  day_stamp, clock_stamp =
    if sent_at
      [sent_at.strftime("%Y-%m-%d"), sent_at.strftime("%H-%M-%S")]
    else
      ["unknown_date", "unknown_time"]
    end

  # Count the attachments for which the block reports a successful write.
  Array(message.attachments).count do |att|
    begin
      source_name = att.filename || att.name || "attachment"
      stem = File.basename(source_name, ".*")
      extension = File.extname(source_name).downcase.sub(/\A\./, '')

      # No extension in the name: fall back to the MIME type, if there is one.
      if extension.empty? && att.mime_type
        extension = mime_to_ext(att.mime_type)
      end

      # Not one of the requested types: skip without counting.
      next false unless wanted_exts.include?(extension.downcase)

      target = File.join(output_folder, "#{sanitize_filename(stem)}_#{day_stamp}_#{clock_stamp}.#{extension}")
      target = uniquify_path(target)

      File.open(target, "wb") { |out| out.write(att.body.decoded) }
      true
    rescue => e
      # One bad attachment must not stop the remaining ones.
      puts "Failed to save attachment '#{att&.filename}': #{e.message}"
      false
    end
  end
rescue => e
  puts "Error processing attachment block: #{e.message}"
  0
end
235
+
236
# Minimal MIME→extension mapping; extend as needed.
#
# The lookup ignores case, surrounding whitespace and any parameters after
# ";" (e.g. "Text/Plain; charset=utf-8" maps to "txt").
#
# @param [String, nil] mime MIME type of an attachment.
# @return [String] Extension without a leading dot; "bin" for nil or unknown types.
def mime_to_ext(mime)
  map = {
    'application/pdf' => 'pdf',
    'image/jpeg' => 'jpg',
    'image/jpg' => 'jpg',
    'image/png' => 'png',
    'image/gif' => 'gif',
    'text/plain' => 'txt',
    'application/zip' => 'zip',
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document' => 'docx',
    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' => 'xlsx',
    'application/msword' => 'doc',
    'application/vnd.ms-excel' => 'xls'
  }
  # Keep only the "type/subtype" part; the inner to_s covers nil, the outer
  # one covers an empty string (whose split yields no first element).
  key = mime.to_s.split(';', 2).first.to_s.strip.downcase
  map.fetch(key, 'bin')
end
253
+
254
# Returns a path that does not exist yet, derived from the given one.
#
# If nothing exists at +path+ it is returned unchanged. Otherwise "_1", "_2",
# ... is inserted before the extension until an unused name is found.
# The check and the later write are separate steps, so this does not guard
# against another process creating the same file in between.
#
# @param [String] path Desired file path.
# @return [String] +path+ itself or the first free suffixed variant.
def uniquify_path(path)
  return path unless File.exist?(path)

  dir = File.dirname(path)
  ext = File.extname(path)
  base = File.basename(path, ".*")

  # Lazily walk base_1, base_2, ... and stop at the first unused name.
  1.step.lazy
   .map { |i| File.join(dir, "#{base}_#{i}#{ext}") }
   .find { |candidate| !File.exist?(candidate) }
end
148
268
  end
149
269
 
150
270
  # The EmailStatistics class is responsible for gathering and writing statistics related to emails.
@@ -214,3 +334,8 @@ class EmailStatistics
214
334
  end
215
335
  end
216
336
  end
337
+
338
+ # --- Usage example ---
339
+ # parser = Mbox2CSV::MboxParser.new("inbox.mbox", "emails.csv", "sender_stats.csv", "recipient_stats.csv")
340
+ # parser.parse
341
+ # parser.extract_attachments(extract: true, filetypes: %w[pdf jpg], output_folder: "exports")
metadata CHANGED
@@ -1,14 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: mbox2csv
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - firefly-cpp
8
- autorequire:
9
8
  bindir: bin
10
9
  cert_chain: []
11
- date: 2025-01-25 00:00:00.000000000 Z
10
+ date: 1980-01-02 00:00:00.000000000 Z
12
11
  dependencies:
13
12
  - !ruby/object:Gem::Dependency
14
13
  name: base64
@@ -66,7 +65,6 @@ dependencies:
66
65
  - - "~>"
67
66
  - !ruby/object:Gem::Version
68
67
  version: '1.11'
69
- description:
70
68
  email:
71
69
  - iztok@iztok-jr-fister.eu
72
70
  executables: []
@@ -83,7 +81,6 @@ metadata:
83
81
  homepage_uri: https://github.com/firefly-cpp/mbox2csv
84
82
  source_code_uri: https://github.com/firefly-cpp/mbox2csv
85
83
  changelog_uri: https://github.com/firefly-cpp/mbox2csv
86
- post_install_message:
87
84
  rdoc_options: []
88
85
  require_paths:
89
86
  - lib
@@ -98,8 +95,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
98
95
  - !ruby/object:Gem::Version
99
96
  version: '0'
100
97
  requirements: []
101
- rubygems_version: 3.5.22
102
- signing_key:
98
+ rubygems_version: 3.6.7
103
99
  specification_version: 4
104
100
  summary: Parse MBOX files and export email data into CSV format
105
101
  test_files: []