mbox2csv 0.1.2 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4)
  1. checksums.yaml +4 -4
  2. data/README.md +4 -0
  3. data/lib/mbox2csv.rb +177 -15
  4. metadata +17 -7
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4075cf789bd3819e1642195c3787a2406ce5f04c3cea31345bb2a76a2783a960
4
- data.tar.gz: dd347f1302b8b90363b1b975753c2eda79a52d09ce90bf620df8946e37ae3363
3
+ metadata.gz: d01ca30a89acfe78cea4db6c1b61ee76a10ee9f268ffc51c80ed15136c98b279
4
+ data.tar.gz: 295a46bbcee3b62796febe837c145a002df6ad03dcbf1f2e9d3b3a455a46f3c7
5
5
  SHA512:
6
- metadata.gz: 93d4bf15e4932ab1bd47c6553b1cd3136d5763dee587ce3bcf06a607910770707be5d4c8c7a21017c0469acb7725fccff037376bf0bdadd42e0ee8e06ba84bb5
7
- data.tar.gz: 0fc729d86a2d57552a98eab459a78bd7d0472853ede8aeb3488f9f110daed88a38c9161ff497262b38da3f5558897993335b6bbe8d331c7848415f488e308353
6
+ metadata.gz: e17144ea47a735f13d00070bca37c2e7fbbecd8e91685abb943628fc08f2e1b68e845864b69492fc333b3e72c8c07e98e78d662951d6aa4d8ecba54b6fe1dfe1
7
+ data.tar.gz: 833c1c6e20dbb6faa17d27e65db529877b08810eca35288277dc78b54f13f6034ba94318ae5376764613bd1ba0024b54a8705dbf6dcda06f1580c55dfd318132
data/README.md CHANGED
@@ -34,6 +34,10 @@ parser = Mbox2CSV::MboxParser.new(mbox_file, all_emails, sender_stats_all_emails
34
34
 
35
35
  # Parse the MBOX file, save email data, and generate statistics
36
36
  parser.parse
37
+
38
+ # Extract attachments
39
+ parser.extract_attachments(extract: true, filetypes: %w[pdf jpg], output_folder: "exports")
40
+
37
41
  ```
38
42
 
39
43
  ## 🔑 License
data/lib/mbox2csv.rb CHANGED
@@ -1,12 +1,15 @@
1
1
  require 'base64'
2
2
  require 'csv'
3
3
  require 'mail'
4
+ require 'fileutils'
5
+ require 'ruby-progressbar'
4
6
 
5
7
  module Mbox2CSV
6
- # Main class
8
+ # Main class for parsing MBOX files, saving email data/statistics to CSV,
9
+ # and (optionally) extracting selected attachment types to disk.
7
10
  class MboxParser
8
11
  # Initializes the MboxParser with file paths for the MBOX file, output CSV file,
9
- # and statistics CSV files for sender statistics.
12
+ # and statistics CSV files for sender and recipient statistics.
10
13
  #
11
14
  # @param [String] mbox_file Path to the MBOX file to be parsed.
12
15
  # @param [String] csv_file Path to the output CSV file where parsed email data will be saved.
@@ -18,40 +21,84 @@ module Mbox2CSV
18
21
  @statistics = EmailStatistics.new
19
22
  @stats_csv_file = stats_csv_file
20
23
  @recipient_stats_csv_file = recipient_stats_csv_file
24
+ @senders_folder = 'senders/'
25
+ FileUtils.mkdir_p(@senders_folder) # Create the senders folder if it doesn't exist
21
26
  end
22
27
 
23
28
  # Parses the MBOX file and writes the email data to the specified CSV file.
24
29
  # It also saves sender and recipient statistics to separate CSV files.
30
+ # A progress bar is displayed during the parsing process.
25
31
  def parse
32
+ total_lines = File.foreach(@mbox_file).inject(0) { |c, _line| c + 1 }
33
+ progressbar = ProgressBar.create(title: "Parsing Emails", total: total_lines, format: "%t: |%B| %p%%")
34
+
26
35
  CSV.open(@csv_file, 'w') do |csv|
27
- # Write CSV header
28
36
  csv << ['From', 'To', 'Subject', 'Date', 'Body']
29
37
 
30
38
  File.open(@mbox_file, 'r') do |mbox|
31
39
  buffer = ""
32
40
  mbox.each_line do |line|
41
+ progressbar.increment
33
42
  if line.start_with?("From ")
34
43
  process_email_block(buffer, csv) unless buffer.empty?
35
- buffer = "" # Reset buffer
44
+ buffer = ""
36
45
  end
37
- buffer << line # Append line to buffer
46
+ buffer << line
38
47
  end
39
- process_email_block(buffer, csv) unless buffer.empty? # Process last email block
48
+ process_email_block(buffer, csv) unless buffer.empty?
40
49
  end
41
50
  end
42
51
  puts "Parsing completed. Data saved to #{@csv_file}"
43
52
 
44
- # Save and print statistics after parsing
45
53
  @statistics.save_sender_statistics(@stats_csv_file)
46
54
  @statistics.save_recipient_statistics(@recipient_stats_csv_file)
47
55
  rescue => e
48
56
  puts "Error processing MBOX file: #{e.message}"
49
57
  end
50
58
 
59
+ # Extract selected attachment file types from the MBOX into a folder.
60
+ #
61
+ # @param [Boolean] extract Flag to enable/disable extraction.
62
+ # @param [Array<String>] filetypes Array of extensions to extract (e.g., %w[pdf jpg png]).
63
+ # @param [String] output_folder Directory to write attachments into.
64
+ # @return [Integer] Number of files successfully written.
65
+ def extract_attachments(extract: true, filetypes: [], output_folder: 'attachments')
66
+ return 0 unless extract
67
+
68
+ wanted_exts = Array(filetypes).map { |e| e.to_s.downcase.sub(/\A\./, '') }.uniq
69
+ raise ArgumentError, "filetypes must not be empty when extract: true" if wanted_exts.empty?
70
+
71
+ FileUtils.mkdir_p(output_folder)
72
+ total_written = 0
73
+
74
+ total_lines = File.foreach(@mbox_file).inject(0) { |c, _| c + 1 }
75
+ progressbar = ProgressBar.create(title: "Extracting Attachments", total: total_lines, format: "%t: |%B| %p%%")
76
+
77
+ File.open(@mbox_file, 'r') do |mbox|
78
+ buffer = ""
79
+ mbox.each_line do |line|
80
+ progressbar.increment
81
+ if line.start_with?("From ")
82
+ total_written += process_attachment_block(buffer, wanted_exts, output_folder) unless buffer.empty?
83
+ buffer = ""
84
+ end
85
+ buffer << line
86
+ end
87
+ total_written += process_attachment_block(buffer, wanted_exts, output_folder) unless buffer.empty?
88
+ end
89
+
90
+ puts "Attachment extraction completed. #{total_written} file(s) saved to #{output_folder}"
91
+ total_written
92
+ rescue => e
93
+ puts "Error extracting attachments: #{e.message}"
94
+ 0
95
+ end
96
+
51
97
  private
52
98
 
53
99
  # Processes an individual email block from the MBOX file, extracts the email fields,
54
- # and writes them to the CSV. Also records email statistics for analysis.
100
+ # and writes them to the CSV. Also records email statistics for analysis and creates
101
+ # sender-specific CSV files.
55
102
  #
56
103
  # @param [String] buffer The email block from the MBOX file.
57
104
  # @param [CSV] csv The CSV object where email data is written.
@@ -62,14 +109,13 @@ module Mbox2CSV
62
109
  to = ensure_utf8(mail.to ? mail.to.join(", ") : '', 'UTF-8')
63
110
  subject = ensure_utf8(mail.subject ? mail.subject : '', 'UTF-8')
64
111
  date = ensure_utf8(mail.date ? mail.date.to_s : '', 'UTF-8')
65
-
66
112
  body = decode_body(mail)
67
113
 
68
- # Write to CSV
69
114
  csv << [from, to, subject, date, body]
70
115
 
71
- # Record email for statistics
72
116
  @statistics.record_email(from, to, body.length)
117
+
118
+ save_email_to_sender_csv(from, to, subject, date, body)
73
119
  rescue => e
74
120
  puts "Error processing email block: #{e.message}"
75
121
  end
@@ -79,7 +125,8 @@ module Mbox2CSV
79
125
  # @param [Mail] mail The mail object to decode.
80
126
  # @return [String] The decoded email body.
81
127
  def decode_body(mail)
82
- body = if mail.multipart?
128
+ body =
129
+ if mail.multipart?
83
130
  part = mail.text_part || mail.html_part
84
131
  part&.body&.decoded || ''
85
132
  else
@@ -108,15 +155,125 @@ module Mbox2CSV
108
155
  text = text.force_encoding(charset) if charset
109
156
  text.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
110
157
  end
158
+
159
+ # Saves an email to a sender-specific CSV file.
160
+ #
161
+ # @param [String] from The sender of the email.
162
+ # @param [String] to The recipient(s) of the email.
163
+ # @param [String] subject The subject of the email.
164
+ # @param [String] date The date of the email.
165
+ # @param [String] body The body of the email.
166
+ def save_email_to_sender_csv(from, to, subject, date, body)
167
+ return if from.empty?
168
+
169
+ sender_file = File.join(@senders_folder, "#{sanitize_filename(from)}.csv")
170
+
171
+ CSV.open(sender_file, 'a') do |csv|
172
+ if File.size?(sender_file).nil? || File.size(sender_file).zero?
173
+ csv << ['From', 'To', 'Subject', 'Date', 'Body'] # Add header if file is new
174
+ end
175
+ csv << [from, to, subject, date, body]
176
+ end
177
+ rescue => e
178
+ puts "Error writing to sender CSV file for #{from}: #{e.message}"
179
+ end
180
+
181
+ # Sanitizes filenames by replacing invalid characters with underscores.
182
+ #
183
+ # @param [String] filename The input filename.
184
+ # @return [String] A sanitized version of the filename.
185
+ def sanitize_filename(filename)
186
+ filename.gsub(/[^0-9A-Za-z\-]/, '_')
187
+ end
188
+
189
+ # --- Helpers for attachment extraction ---
190
+
191
+ # Process a single email block to extract wanted attachments.
192
+ def process_attachment_block(buffer, wanted_exts, output_folder)
193
+ return 0 if buffer.nil? || buffer.empty?
194
+
195
+ mail = Mail.read_from_string(buffer)
196
+ return 0 unless mail
197
+
198
+ written = 0
199
+ date = (mail.date rescue nil)
200
+ date_str = date ? date.strftime("%Y-%m-%d") : "unknown_date"
201
+ time_str = date ? date.strftime("%H-%M-%S") : "unknown_time"
202
+
203
+ Array(mail.attachments).each do |att|
204
+ begin
205
+ original_name = att.filename || att.name || "attachment"
206
+ base = File.basename(original_name, ".*")
207
+ ext = File.extname(original_name).downcase.sub(/\A\./, '')
208
+
209
+ # If no ext present, try to infer from MIME type
210
+ ext = mime_to_ext(att.mime_type) if ext.empty? && att.mime_type
211
+
212
+ # Skip if extension not desired
213
+ next unless wanted_exts.include?(ext.downcase)
214
+
215
+ safe_base = sanitize_filename(base)
216
+ fname = "#{safe_base}_#{date_str}_#{time_str}.#{ext}"
217
+ path = File.join(output_folder, fname)
218
+
219
+ # Ensure uniqueness if file already exists
220
+ path = uniquify_path(path)
221
+
222
+ # Write decoded content
223
+ File.open(path, "wb") { |f| f.write(att.body.decoded) }
224
+ written += 1
225
+ rescue => e
226
+ puts "Failed to save attachment '#{att&.filename}': #{e.message}"
227
+ end
228
+ end
229
+
230
+ written
231
+ rescue => e
232
+ puts "Error processing attachment block: #{e.message}"
233
+ 0
234
+ end
235
+
236
+ # Minimal MIME→extension mapping; extend as needed.
237
+ def mime_to_ext(mime)
238
+ map = {
239
+ 'application/pdf' => 'pdf',
240
+ 'image/jpeg' => 'jpg',
241
+ 'image/jpg' => 'jpg',
242
+ 'image/png' => 'png',
243
+ 'image/gif' => 'gif',
244
+ 'text/plain' => 'txt',
245
+ 'application/zip' => 'zip',
246
+ 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' => 'docx',
247
+ 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' => 'xlsx',
248
+ 'application/msword' => 'doc',
249
+ 'application/vnd.ms-excel' => 'xls'
250
+ }
251
+ map[mime] || 'bin'
252
+ end
253
+
254
+ # If a path exists, append an incrementing suffix before the extension.
255
+ def uniquify_path(path)
256
+ return path unless File.exist?(path)
257
+ dir = File.dirname(path)
258
+ base = File.basename(path, ".*")
259
+ ext = File.extname(path)
260
+ i = 1
261
+ new_path = File.join(dir, "#{base}_#{i}#{ext}")
262
+ while File.exist?(new_path)
263
+ i += 1
264
+ new_path = File.join(dir, "#{base}_#{i}#{ext}")
265
+ end
266
+ new_path
267
+ end
111
268
  end
112
269
 
113
270
  # The EmailStatistics class is responsible for gathering and writing statistics related to emails.
114
271
  # It tracks sender frequency, recipient frequency, and calculates the average email body length per sender.
115
272
  class EmailStatistics
116
273
  def initialize
117
- @sender_counts = Hash.new(0) # Keeps count of emails per sender
118
- @recipient_counts = Hash.new(0) # Keeps count of emails per recipient
119
- @body_lengths = Hash.new { |hash, key| hash[key] = [] } # Stores body lengths per sender
274
+ @sender_counts = Hash.new(0)
275
+ @recipient_counts = Hash.new(0)
276
+ @body_lengths = Hash.new { |hash, key| hash[key] = [] }
120
277
  end
121
278
 
122
279
  # Records an email's sender, recipients, and body length for statistical purposes.
@@ -177,3 +334,8 @@ class EmailStatistics
177
334
  end
178
335
  end
179
336
  end
337
+
338
+ # --- Usage example ---
339
+ # parser = Mbox2CSV::MboxParser.new("inbox.mbox", "emails.csv", "sender_stats.csv", "recipient_stats.csv")
340
+ # parser.parse
341
+ # parser.extract_attachments(extract: true, filetypes: %w[pdf jpg], output_folder: "exports")
metadata CHANGED
@@ -1,14 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: mbox2csv
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - firefly-cpp
8
- autorequire:
9
8
  bindir: bin
10
9
  cert_chain: []
11
- date: 2025-01-10 00:00:00.000000000 Z
10
+ date: 1980-01-02 00:00:00.000000000 Z
12
11
  dependencies:
13
12
  - !ruby/object:Gem::Dependency
14
13
  name: base64
@@ -52,7 +51,20 @@ dependencies:
52
51
  - - "~>"
53
52
  - !ruby/object:Gem::Version
54
53
  version: 2.8.1
55
- description:
54
+ - !ruby/object:Gem::Dependency
55
+ name: ruby-progressbar
56
+ requirement: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - "~>"
59
+ - !ruby/object:Gem::Version
60
+ version: '1.11'
61
+ type: :runtime
62
+ prerelease: false
63
+ version_requirements: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - "~>"
66
+ - !ruby/object:Gem::Version
67
+ version: '1.11'
56
68
  email:
57
69
  - iztok@iztok-jr-fister.eu
58
70
  executables: []
@@ -69,7 +81,6 @@ metadata:
69
81
  homepage_uri: https://github.com/firefly-cpp/mbox2csv
70
82
  source_code_uri: https://github.com/firefly-cpp/mbox2csv
71
83
  changelog_uri: https://github.com/firefly-cpp/mbox2csv
72
- post_install_message:
73
84
  rdoc_options: []
74
85
  require_paths:
75
86
  - lib
@@ -84,8 +95,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
84
95
  - !ruby/object:Gem::Version
85
96
  version: '0'
86
97
  requirements: []
87
- rubygems_version: 3.5.22
88
- signing_key:
98
+ rubygems_version: 3.6.7
89
99
  specification_version: 4
90
100
  summary: Parse MBOX files and export email data into CSV format
91
101
  test_files: []