mbox2csv 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +4 -0
- data/lib/mbox2csv.rb +128 -3
- metadata +3 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d01ca30a89acfe78cea4db6c1b61ee76a10ee9f268ffc51c80ed15136c98b279
|
4
|
+
data.tar.gz: 295a46bbcee3b62796febe837c145a002df6ad03dcbf1f2e9d3b3a455a46f3c7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e17144ea47a735f13d00070bca37c2e7fbbecd8e91685abb943628fc08f2e1b68e845864b69492fc333b3e72c8c07e98e78d662951d6aa4d8ecba54b6fe1dfe1
|
7
|
+
data.tar.gz: 833c1c6e20dbb6faa17d27e65db529877b08810eca35288277dc78b54f13f6034ba94318ae5376764613bd1ba0024b54a8705dbf6dcda06f1580c55dfd318132
|
data/README.md
CHANGED
@@ -34,6 +34,10 @@ parser = Mbox2CSV::MboxParser.new(mbox_file, all_emails, sender_stats_all_emails
|
|
34
34
|
|
35
35
|
# Parse the MBOX file, save email data, and generate statistics
|
36
36
|
parser.parse
|
37
|
+
|
38
|
+
# Extract attachments
|
39
|
+
parser.extract_attachments(extract: true, filetypes: %w[pdf jpg], output_folder: "exports")
|
40
|
+
|
37
41
|
```
|
38
42
|
|
39
43
|
## 🔑 License
|
data/lib/mbox2csv.rb
CHANGED
@@ -5,7 +5,8 @@ require 'fileutils'
|
|
5
5
|
require 'ruby-progressbar'
|
6
6
|
|
7
7
|
module Mbox2CSV
|
8
|
-
# Main class for parsing MBOX files
|
8
|
+
# Main class for parsing MBOX files, saving email data/statistics to CSV,
|
9
|
+
# and (optionally) extracting selected attachment types to disk.
|
9
10
|
class MboxParser
|
10
11
|
# Initializes the MboxParser with file paths for the MBOX file, output CSV file,
|
11
12
|
# and statistics CSV files for sender and recipient statistics.
|
@@ -55,6 +56,44 @@ module Mbox2CSV
|
|
55
56
|
puts "Error processing MBOX file: #{e.message}"
|
56
57
|
end
|
57
58
|
|
59
|
+
# Extract selected attachment file types from the MBOX into a folder.
|
60
|
+
#
|
61
|
+
# @param [Boolean] extract Flag to enable/disable extraction.
|
62
|
+
# @param [Array<String>] filetypes Array of extensions to extract (e.g., %w[pdf jpg png]).
|
63
|
+
# @param [String] output_folder Directory to write attachments into.
|
64
|
+
# @return [Integer] Number of files successfully written.
|
65
|
+
def extract_attachments(extract: true, filetypes: [], output_folder: 'attachments')
|
66
|
+
return 0 unless extract
|
67
|
+
|
68
|
+
wanted_exts = Array(filetypes).map { |e| e.to_s.downcase.sub(/\A\./, '') }.uniq
|
69
|
+
raise ArgumentError, "filetypes must not be empty when extract: true" if wanted_exts.empty?
|
70
|
+
|
71
|
+
FileUtils.mkdir_p(output_folder)
|
72
|
+
total_written = 0
|
73
|
+
|
74
|
+
total_lines = File.foreach(@mbox_file).inject(0) { |c, _| c + 1 }
|
75
|
+
progressbar = ProgressBar.create(title: "Extracting Attachments", total: total_lines, format: "%t: |%B| %p%%")
|
76
|
+
|
77
|
+
File.open(@mbox_file, 'r') do |mbox|
|
78
|
+
buffer = ""
|
79
|
+
mbox.each_line do |line|
|
80
|
+
progressbar.increment
|
81
|
+
if line.start_with?("From ")
|
82
|
+
total_written += process_attachment_block(buffer, wanted_exts, output_folder) unless buffer.empty?
|
83
|
+
buffer = ""
|
84
|
+
end
|
85
|
+
buffer << line
|
86
|
+
end
|
87
|
+
total_written += process_attachment_block(buffer, wanted_exts, output_folder) unless buffer.empty?
|
88
|
+
end
|
89
|
+
|
90
|
+
puts "Attachment extraction completed. #{total_written} file(s) saved to #{output_folder}"
|
91
|
+
total_written
|
92
|
+
rescue => e
|
93
|
+
puts "Error extracting attachments: #{e.message}"
|
94
|
+
0
|
95
|
+
end
|
96
|
+
|
58
97
|
private
|
59
98
|
|
60
99
|
# Processes an individual email block from the MBOX file, extracts the email fields,
|
@@ -86,7 +125,8 @@ module Mbox2CSV
|
|
86
125
|
# @param [Mail] mail The mail object to decode.
|
87
126
|
# @return [String] The decoded email body.
|
88
127
|
def decode_body(mail)
|
89
|
-
body = if mail.multipart?
|
128
|
+
body =
|
129
|
+
if mail.multipart?
|
90
130
|
part = mail.text_part || mail.html_part
|
91
131
|
part&.body&.decoded || ''
|
92
132
|
else
|
@@ -129,7 +169,7 @@ module Mbox2CSV
|
|
129
169
|
sender_file = File.join(@senders_folder, "#{sanitize_filename(from)}.csv")
|
130
170
|
|
131
171
|
CSV.open(sender_file, 'a') do |csv|
|
132
|
-
if File.size(sender_file).zero?
|
172
|
+
if File.size?(sender_file).nil? || File.size(sender_file).zero?
|
133
173
|
csv << ['From', 'To', 'Subject', 'Date', 'Body'] # Add header if file is new
|
134
174
|
end
|
135
175
|
csv << [from, to, subject, date, body]
|
@@ -145,6 +185,86 @@ module Mbox2CSV
|
|
145
185
|
def sanitize_filename(filename)
|
146
186
|
filename.gsub(/[^0-9A-Za-z\-]/, '_')
|
147
187
|
end
|
188
|
+
|
189
|
+
# --- Helpers for attachment extraction ---
|
190
|
+
|
191
|
+
# Process a single email block to extract wanted attachments.
|
192
|
+
def process_attachment_block(buffer, wanted_exts, output_folder)
|
193
|
+
return 0 if buffer.nil? || buffer.empty?
|
194
|
+
|
195
|
+
mail = Mail.read_from_string(buffer)
|
196
|
+
return 0 unless mail
|
197
|
+
|
198
|
+
written = 0
|
199
|
+
date = (mail.date rescue nil)
|
200
|
+
date_str = date ? date.strftime("%Y-%m-%d") : "unknown_date"
|
201
|
+
time_str = date ? date.strftime("%H-%M-%S") : "unknown_time"
|
202
|
+
|
203
|
+
Array(mail.attachments).each do |att|
|
204
|
+
begin
|
205
|
+
original_name = att.filename || att.name || "attachment"
|
206
|
+
base = File.basename(original_name, ".*")
|
207
|
+
ext = File.extname(original_name).downcase.sub(/\A\./, '')
|
208
|
+
|
209
|
+
# If no ext present, try to infer from MIME type
|
210
|
+
ext = mime_to_ext(att.mime_type) if ext.empty? && att.mime_type
|
211
|
+
|
212
|
+
# Skip if extension not desired
|
213
|
+
next unless wanted_exts.include?(ext.downcase)
|
214
|
+
|
215
|
+
safe_base = sanitize_filename(base)
|
216
|
+
fname = "#{safe_base}_#{date_str}_#{time_str}.#{ext}"
|
217
|
+
path = File.join(output_folder, fname)
|
218
|
+
|
219
|
+
# Ensure uniqueness if file already exists
|
220
|
+
path = uniquify_path(path)
|
221
|
+
|
222
|
+
# Write decoded content
|
223
|
+
File.open(path, "wb") { |f| f.write(att.body.decoded) }
|
224
|
+
written += 1
|
225
|
+
rescue => e
|
226
|
+
puts "Failed to save attachment '#{att&.filename}': #{e.message}"
|
227
|
+
end
|
228
|
+
end
|
229
|
+
|
230
|
+
written
|
231
|
+
rescue => e
|
232
|
+
puts "Error processing attachment block: #{e.message}"
|
233
|
+
0
|
234
|
+
end
|
235
|
+
|
236
|
+
# Minimal MIME→extension mapping; extend as needed.
|
237
|
+
def mime_to_ext(mime)
|
238
|
+
map = {
|
239
|
+
'application/pdf' => 'pdf',
|
240
|
+
'image/jpeg' => 'jpg',
|
241
|
+
'image/jpg' => 'jpg',
|
242
|
+
'image/png' => 'png',
|
243
|
+
'image/gif' => 'gif',
|
244
|
+
'text/plain' => 'txt',
|
245
|
+
'application/zip' => 'zip',
|
246
|
+
'application/vnd.openxmlformats-officedocument.wordprocessingml.document' => 'docx',
|
247
|
+
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' => 'xlsx',
|
248
|
+
'application/msword' => 'doc',
|
249
|
+
'application/vnd.ms-excel' => 'xls'
|
250
|
+
}
|
251
|
+
map[mime] || 'bin'
|
252
|
+
end
|
253
|
+
|
254
|
+
# If a path exists, append an incrementing suffix before the extension.
|
255
|
+
def uniquify_path(path)
|
256
|
+
return path unless File.exist?(path)
|
257
|
+
dir = File.dirname(path)
|
258
|
+
base = File.basename(path, ".*")
|
259
|
+
ext = File.extname(path)
|
260
|
+
i = 1
|
261
|
+
new_path = File.join(dir, "#{base}_#{i}#{ext}")
|
262
|
+
while File.exist?(new_path)
|
263
|
+
i += 1
|
264
|
+
new_path = File.join(dir, "#{base}_#{i}#{ext}")
|
265
|
+
end
|
266
|
+
new_path
|
267
|
+
end
|
148
268
|
end
|
149
269
|
|
150
270
|
# The EmailStatistics class is responsible for gathering and writing statistics related to emails.
|
@@ -214,3 +334,8 @@ class EmailStatistics
|
|
214
334
|
end
|
215
335
|
end
|
216
336
|
end
|
337
|
+
|
338
|
+
# --- Usage example ---
|
339
|
+
# parser = Mbox2CSV::MboxParser.new("inbox.mbox", "emails.csv", "sender_stats.csv", "recipient_stats.csv")
|
340
|
+
# parser.parse
|
341
|
+
# parser.extract_attachments(extract: true, filetypes: %w[pdf jpg], output_folder: "exports")
|
metadata
CHANGED
@@ -1,14 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: mbox2csv
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- firefly-cpp
|
8
|
-
autorequire:
|
9
8
|
bindir: bin
|
10
9
|
cert_chain: []
|
11
|
-
date:
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
12
11
|
dependencies:
|
13
12
|
- !ruby/object:Gem::Dependency
|
14
13
|
name: base64
|
@@ -66,7 +65,6 @@ dependencies:
|
|
66
65
|
- - "~>"
|
67
66
|
- !ruby/object:Gem::Version
|
68
67
|
version: '1.11'
|
69
|
-
description:
|
70
68
|
email:
|
71
69
|
- iztok@iztok-jr-fister.eu
|
72
70
|
executables: []
|
@@ -83,7 +81,6 @@ metadata:
|
|
83
81
|
homepage_uri: https://github.com/firefly-cpp/mbox2csv
|
84
82
|
source_code_uri: https://github.com/firefly-cpp/mbox2csv
|
85
83
|
changelog_uri: https://github.com/firefly-cpp/mbox2csv
|
86
|
-
post_install_message:
|
87
84
|
rdoc_options: []
|
88
85
|
require_paths:
|
89
86
|
- lib
|
@@ -98,8 +95,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
98
95
|
- !ruby/object:Gem::Version
|
99
96
|
version: '0'
|
100
97
|
requirements: []
|
101
|
-
rubygems_version: 3.
|
102
|
-
signing_key:
|
98
|
+
rubygems_version: 3.6.7
|
103
99
|
specification_version: 4
|
104
100
|
summary: Parse MBOX files and export email data into CSV format
|
105
101
|
test_files: []
|