mbox2csv 0.1.2 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +4 -0
- data/lib/mbox2csv.rb +177 -15
- metadata +17 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d01ca30a89acfe78cea4db6c1b61ee76a10ee9f268ffc51c80ed15136c98b279
|
4
|
+
data.tar.gz: 295a46bbcee3b62796febe837c145a002df6ad03dcbf1f2e9d3b3a455a46f3c7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e17144ea47a735f13d00070bca37c2e7fbbecd8e91685abb943628fc08f2e1b68e845864b69492fc333b3e72c8c07e98e78d662951d6aa4d8ecba54b6fe1dfe1
|
7
|
+
data.tar.gz: 833c1c6e20dbb6faa17d27e65db529877b08810eca35288277dc78b54f13f6034ba94318ae5376764613bd1ba0024b54a8705dbf6dcda06f1580c55dfd318132
|
data/README.md
CHANGED
@@ -34,6 +34,10 @@ parser = Mbox2CSV::MboxParser.new(mbox_file, all_emails, sender_stats_all_emails
|
|
34
34
|
|
35
35
|
# Parse the MBOX file, save email data, and generate statistics
|
36
36
|
parser.parse
|
37
|
+
|
38
|
+
# Extract attachments
|
39
|
+
parser.extract_attachments(extract: true, filetypes: %w[pdf jpg], output_folder: "exports")
|
40
|
+
|
37
41
|
```
|
38
42
|
|
39
43
|
## 🔑 License
|
data/lib/mbox2csv.rb
CHANGED
@@ -1,12 +1,15 @@
|
|
1
1
|
require 'base64'
|
2
2
|
require 'csv'
|
3
3
|
require 'mail'
|
4
|
+
require 'fileutils'
|
5
|
+
require 'ruby-progressbar'
|
4
6
|
|
5
7
|
module Mbox2CSV
|
6
|
-
# Main class
|
8
|
+
# Main class for parsing MBOX files, saving email data/statistics to CSV,
|
9
|
+
# and (optionally) extracting selected attachment types to disk.
|
7
10
|
class MboxParser
|
8
11
|
# Initializes the MboxParser with file paths for the MBOX file, output CSV file,
|
9
|
-
# and statistics CSV files for sender statistics.
|
12
|
+
# and statistics CSV files for sender and recipient statistics.
|
10
13
|
#
|
11
14
|
# @param [String] mbox_file Path to the MBOX file to be parsed.
|
12
15
|
# @param [String] csv_file Path to the output CSV file where parsed email data will be saved.
|
@@ -18,40 +21,84 @@ module Mbox2CSV
|
|
18
21
|
@statistics = EmailStatistics.new
|
19
22
|
@stats_csv_file = stats_csv_file
|
20
23
|
@recipient_stats_csv_file = recipient_stats_csv_file
|
24
|
+
@senders_folder = 'senders/'
|
25
|
+
FileUtils.mkdir_p(@senders_folder) # Create the senders folder if it doesn't exist
|
21
26
|
end
|
22
27
|
|
23
28
|
# Parses the MBOX file and writes the email data to the specified CSV file.
|
24
29
|
# It also saves sender and recipient statistics to separate CSV files.
|
30
|
+
# A progress bar is displayed during the parsing process.
|
25
31
|
def parse
|
32
|
+
total_lines = File.foreach(@mbox_file).inject(0) { |c, _line| c + 1 }
|
33
|
+
progressbar = ProgressBar.create(title: "Parsing Emails", total: total_lines, format: "%t: |%B| %p%%")
|
34
|
+
|
26
35
|
CSV.open(@csv_file, 'w') do |csv|
|
27
|
-
# Write CSV header
|
28
36
|
csv << ['From', 'To', 'Subject', 'Date', 'Body']
|
29
37
|
|
30
38
|
File.open(@mbox_file, 'r') do |mbox|
|
31
39
|
buffer = ""
|
32
40
|
mbox.each_line do |line|
|
41
|
+
progressbar.increment
|
33
42
|
if line.start_with?("From ")
|
34
43
|
process_email_block(buffer, csv) unless buffer.empty?
|
35
|
-
buffer = ""
|
44
|
+
buffer = ""
|
36
45
|
end
|
37
|
-
buffer << line
|
46
|
+
buffer << line
|
38
47
|
end
|
39
|
-
process_email_block(buffer, csv) unless buffer.empty?
|
48
|
+
process_email_block(buffer, csv) unless buffer.empty?
|
40
49
|
end
|
41
50
|
end
|
42
51
|
puts "Parsing completed. Data saved to #{@csv_file}"
|
43
52
|
|
44
|
-
# Save and print statistics after parsing
|
45
53
|
@statistics.save_sender_statistics(@stats_csv_file)
|
46
54
|
@statistics.save_recipient_statistics(@recipient_stats_csv_file)
|
47
55
|
rescue => e
|
48
56
|
puts "Error processing MBOX file: #{e.message}"
|
49
57
|
end
|
50
58
|
|
59
|
+
# Extract selected attachment file types from the MBOX into a folder.
|
60
|
+
#
|
61
|
+
# @param [Boolean] extract Flag to enable/disable extraction.
|
62
|
+
# @param [Array<String>] filetypes Array of extensions to extract (e.g., %w[pdf jpg png]).
|
63
|
+
# @param [String] output_folder Directory to write attachments into.
|
64
|
+
# @return [Integer] Number of files successfully written.
|
65
|
+
def extract_attachments(extract: true, filetypes: [], output_folder: 'attachments')
|
66
|
+
return 0 unless extract
|
67
|
+
|
68
|
+
wanted_exts = Array(filetypes).map { |e| e.to_s.downcase.sub(/\A\./, '') }.uniq
|
69
|
+
raise ArgumentError, "filetypes must not be empty when extract: true" if wanted_exts.empty?
|
70
|
+
|
71
|
+
FileUtils.mkdir_p(output_folder)
|
72
|
+
total_written = 0
|
73
|
+
|
74
|
+
total_lines = File.foreach(@mbox_file).inject(0) { |c, _| c + 1 }
|
75
|
+
progressbar = ProgressBar.create(title: "Extracting Attachments", total: total_lines, format: "%t: |%B| %p%%")
|
76
|
+
|
77
|
+
File.open(@mbox_file, 'r') do |mbox|
|
78
|
+
buffer = ""
|
79
|
+
mbox.each_line do |line|
|
80
|
+
progressbar.increment
|
81
|
+
if line.start_with?("From ")
|
82
|
+
total_written += process_attachment_block(buffer, wanted_exts, output_folder) unless buffer.empty?
|
83
|
+
buffer = ""
|
84
|
+
end
|
85
|
+
buffer << line
|
86
|
+
end
|
87
|
+
total_written += process_attachment_block(buffer, wanted_exts, output_folder) unless buffer.empty?
|
88
|
+
end
|
89
|
+
|
90
|
+
puts "Attachment extraction completed. #{total_written} file(s) saved to #{output_folder}"
|
91
|
+
total_written
|
92
|
+
rescue => e
|
93
|
+
puts "Error extracting attachments: #{e.message}"
|
94
|
+
0
|
95
|
+
end
|
96
|
+
|
51
97
|
private
|
52
98
|
|
53
99
|
# Processes an individual email block from the MBOX file, extracts the email fields,
|
54
|
-
# and writes them to the CSV. Also records email statistics for analysis
|
100
|
+
# and writes them to the CSV. Also records email statistics for analysis and creates
|
101
|
+
# sender-specific CSV files.
|
55
102
|
#
|
56
103
|
# @param [String] buffer The email block from the MBOX file.
|
57
104
|
# @param [CSV] csv The CSV object where email data is written.
|
@@ -62,14 +109,13 @@ module Mbox2CSV
|
|
62
109
|
to = ensure_utf8(mail.to ? mail.to.join(", ") : '', 'UTF-8')
|
63
110
|
subject = ensure_utf8(mail.subject ? mail.subject : '', 'UTF-8')
|
64
111
|
date = ensure_utf8(mail.date ? mail.date.to_s : '', 'UTF-8')
|
65
|
-
|
66
112
|
body = decode_body(mail)
|
67
113
|
|
68
|
-
# Write to CSV
|
69
114
|
csv << [from, to, subject, date, body]
|
70
115
|
|
71
|
-
# Record email for statistics
|
72
116
|
@statistics.record_email(from, to, body.length)
|
117
|
+
|
118
|
+
save_email_to_sender_csv(from, to, subject, date, body)
|
73
119
|
rescue => e
|
74
120
|
puts "Error processing email block: #{e.message}"
|
75
121
|
end
|
@@ -79,7 +125,8 @@ module Mbox2CSV
|
|
79
125
|
# @param [Mail] mail The mail object to decode.
|
80
126
|
# @return [String] The decoded email body.
|
81
127
|
def decode_body(mail)
|
82
|
-
body =
|
128
|
+
body =
|
129
|
+
if mail.multipart?
|
83
130
|
part = mail.text_part || mail.html_part
|
84
131
|
part&.body&.decoded || ''
|
85
132
|
else
|
@@ -108,15 +155,125 @@ module Mbox2CSV
|
|
108
155
|
text = text.force_encoding(charset) if charset
|
109
156
|
text.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
|
110
157
|
end
|
158
|
+
|
159
|
+
# Saves an email to a sender-specific CSV file.
|
160
|
+
#
|
161
|
+
# @param [String] from The sender of the email.
|
162
|
+
# @param [String] to The recipient(s) of the email.
|
163
|
+
# @param [String] subject The subject of the email.
|
164
|
+
# @param [String] date The date of the email.
|
165
|
+
# @param [String] body The body of the email.
|
166
|
+
def save_email_to_sender_csv(from, to, subject, date, body)
|
167
|
+
return if from.empty?
|
168
|
+
|
169
|
+
sender_file = File.join(@senders_folder, "#{sanitize_filename(from)}.csv")
|
170
|
+
|
171
|
+
CSV.open(sender_file, 'a') do |csv|
|
172
|
+
if File.size?(sender_file).nil? || File.size(sender_file).zero?
|
173
|
+
csv << ['From', 'To', 'Subject', 'Date', 'Body'] # Add header if file is new
|
174
|
+
end
|
175
|
+
csv << [from, to, subject, date, body]
|
176
|
+
end
|
177
|
+
rescue => e
|
178
|
+
puts "Error writing to sender CSV file for #{from}: #{e.message}"
|
179
|
+
end
|
180
|
+
|
181
|
+
# Sanitizes filenames by replacing invalid characters with underscores.
|
182
|
+
#
|
183
|
+
# @param [String] filename The input filename.
|
184
|
+
# @return [String] A sanitized version of the filename.
|
185
|
+
def sanitize_filename(filename)
|
186
|
+
filename.gsub(/[^0-9A-Za-z\-]/, '_')
|
187
|
+
end
|
188
|
+
|
189
|
+
# --- Helpers for attachment extraction ---
|
190
|
+
|
191
|
+
# Process a single email block to extract wanted attachments.
|
192
|
+
def process_attachment_block(buffer, wanted_exts, output_folder)
|
193
|
+
return 0 if buffer.nil? || buffer.empty?
|
194
|
+
|
195
|
+
mail = Mail.read_from_string(buffer)
|
196
|
+
return 0 unless mail
|
197
|
+
|
198
|
+
written = 0
|
199
|
+
date = (mail.date rescue nil)
|
200
|
+
date_str = date ? date.strftime("%Y-%m-%d") : "unknown_date"
|
201
|
+
time_str = date ? date.strftime("%H-%M-%S") : "unknown_time"
|
202
|
+
|
203
|
+
Array(mail.attachments).each do |att|
|
204
|
+
begin
|
205
|
+
original_name = att.filename || att.name || "attachment"
|
206
|
+
base = File.basename(original_name, ".*")
|
207
|
+
ext = File.extname(original_name).downcase.sub(/\A\./, '')
|
208
|
+
|
209
|
+
# If no ext present, try to infer from MIME type
|
210
|
+
ext = mime_to_ext(att.mime_type) if ext.empty? && att.mime_type
|
211
|
+
|
212
|
+
# Skip if extension not desired
|
213
|
+
next unless wanted_exts.include?(ext.downcase)
|
214
|
+
|
215
|
+
safe_base = sanitize_filename(base)
|
216
|
+
fname = "#{safe_base}_#{date_str}_#{time_str}.#{ext}"
|
217
|
+
path = File.join(output_folder, fname)
|
218
|
+
|
219
|
+
# Ensure uniqueness if file already exists
|
220
|
+
path = uniquify_path(path)
|
221
|
+
|
222
|
+
# Write decoded content
|
223
|
+
File.open(path, "wb") { |f| f.write(att.body.decoded) }
|
224
|
+
written += 1
|
225
|
+
rescue => e
|
226
|
+
puts "Failed to save attachment '#{att&.filename}': #{e.message}"
|
227
|
+
end
|
228
|
+
end
|
229
|
+
|
230
|
+
written
|
231
|
+
rescue => e
|
232
|
+
puts "Error processing attachment block: #{e.message}"
|
233
|
+
0
|
234
|
+
end
|
235
|
+
|
236
|
+
# Minimal MIME→extension mapping; extend as needed.
|
237
|
+
def mime_to_ext(mime)
|
238
|
+
map = {
|
239
|
+
'application/pdf' => 'pdf',
|
240
|
+
'image/jpeg' => 'jpg',
|
241
|
+
'image/jpg' => 'jpg',
|
242
|
+
'image/png' => 'png',
|
243
|
+
'image/gif' => 'gif',
|
244
|
+
'text/plain' => 'txt',
|
245
|
+
'application/zip' => 'zip',
|
246
|
+
'application/vnd.openxmlformats-officedocument.wordprocessingml.document' => 'docx',
|
247
|
+
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' => 'xlsx',
|
248
|
+
'application/msword' => 'doc',
|
249
|
+
'application/vnd.ms-excel' => 'xls'
|
250
|
+
}
|
251
|
+
map[mime] || 'bin'
|
252
|
+
end
|
253
|
+
|
254
|
+
# If a path exists, append an incrementing suffix before the extension.
|
255
|
+
def uniquify_path(path)
|
256
|
+
return path unless File.exist?(path)
|
257
|
+
dir = File.dirname(path)
|
258
|
+
base = File.basename(path, ".*")
|
259
|
+
ext = File.extname(path)
|
260
|
+
i = 1
|
261
|
+
new_path = File.join(dir, "#{base}_#{i}#{ext}")
|
262
|
+
while File.exist?(new_path)
|
263
|
+
i += 1
|
264
|
+
new_path = File.join(dir, "#{base}_#{i}#{ext}")
|
265
|
+
end
|
266
|
+
new_path
|
267
|
+
end
|
111
268
|
end
|
112
269
|
|
113
270
|
# The EmailStatistics class is responsible for gathering and writing statistics related to emails.
|
114
271
|
# It tracks sender frequency, recipient frequency, and calculates the average email body length per sender.
|
115
272
|
class EmailStatistics
|
116
273
|
def initialize
|
117
|
-
@sender_counts = Hash.new(0)
|
118
|
-
@recipient_counts = Hash.new(0)
|
119
|
-
@body_lengths = Hash.new { |hash, key| hash[key] = [] }
|
274
|
+
@sender_counts = Hash.new(0)
|
275
|
+
@recipient_counts = Hash.new(0)
|
276
|
+
@body_lengths = Hash.new { |hash, key| hash[key] = [] }
|
120
277
|
end
|
121
278
|
|
122
279
|
# Records an email's sender, recipients, and body length for statistical purposes.
|
@@ -177,3 +334,8 @@ class EmailStatistics
|
|
177
334
|
end
|
178
335
|
end
|
179
336
|
end
|
337
|
+
|
338
|
+
# --- Usage example ---
|
339
|
+
# parser = Mbox2CSV::MboxParser.new("inbox.mbox", "emails.csv", "sender_stats.csv", "recipient_stats.csv")
|
340
|
+
# parser.parse
|
341
|
+
# parser.extract_attachments(extract: true, filetypes: %w[pdf jpg], output_folder: "exports")
|
metadata
CHANGED
@@ -1,14 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: mbox2csv
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- firefly-cpp
|
8
|
-
autorequire:
|
9
8
|
bindir: bin
|
10
9
|
cert_chain: []
|
11
|
-
date:
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
12
11
|
dependencies:
|
13
12
|
- !ruby/object:Gem::Dependency
|
14
13
|
name: base64
|
@@ -52,7 +51,20 @@ dependencies:
|
|
52
51
|
- - "~>"
|
53
52
|
- !ruby/object:Gem::Version
|
54
53
|
version: 2.8.1
|
55
|
-
|
54
|
+
- !ruby/object:Gem::Dependency
|
55
|
+
name: ruby-progressbar
|
56
|
+
requirement: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - "~>"
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: '1.11'
|
61
|
+
type: :runtime
|
62
|
+
prerelease: false
|
63
|
+
version_requirements: !ruby/object:Gem::Requirement
|
64
|
+
requirements:
|
65
|
+
- - "~>"
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
version: '1.11'
|
56
68
|
email:
|
57
69
|
- iztok@iztok-jr-fister.eu
|
58
70
|
executables: []
|
@@ -69,7 +81,6 @@ metadata:
|
|
69
81
|
homepage_uri: https://github.com/firefly-cpp/mbox2csv
|
70
82
|
source_code_uri: https://github.com/firefly-cpp/mbox2csv
|
71
83
|
changelog_uri: https://github.com/firefly-cpp/mbox2csv
|
72
|
-
post_install_message:
|
73
84
|
rdoc_options: []
|
74
85
|
require_paths:
|
75
86
|
- lib
|
@@ -84,8 +95,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
84
95
|
- !ruby/object:Gem::Version
|
85
96
|
version: '0'
|
86
97
|
requirements: []
|
87
|
-
rubygems_version: 3.
|
88
|
-
signing_key:
|
98
|
+
rubygems_version: 3.6.7
|
89
99
|
specification_version: 4
|
90
100
|
summary: Parse MBOX files and export email data into CSV format
|
91
101
|
test_files: []
|