mbox2csv 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE +9 -0
- data/README.md +29 -0
- data/lib/mbox2csv.rb +179 -0
- metadata +91 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: bb21771471de4cd41fb5352229f53e8907f948fb146054fb4b050287e0f51116
|
4
|
+
data.tar.gz: 8436988c043147b78301f7a5b4e3fe521f1a15464332a2d00943b4d9eca37762
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: c186e0bbba46cf3ee609794beffb67a9ed0a08605c3a9a165299d33ac6a065f4807d38b5835917dcc51977abc4e676a544cfef75d5d920ad99dfdadd8f53c2ef
|
7
|
+
data.tar.gz: 8a70e8871a80a7213d948de25d7310febaeb63b360d672c5546a45b69057f6e66de8831cf60e1b8941f839f0ec5850a1bc41f6cf2a598a4ac54e9028853b4d58
|
data/LICENSE
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2024 firefly-cpp
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
6
|
+
|
7
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
8
|
+
|
9
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# mbox2csv -- Parse MBOX files and export email data into CSV format
|
2
|
+
|
3
|
+
## About
|
4
|
+
|
5
|
+
mbox2csv is a Ruby gem that provides a simple way to parse MBOX files and export email data into CSV format. It also generates email statistics that are useful for data mining tasks, such as the number of emails per sender and per recipient and the average body length per sender. This makes it well suited to analyzing email datasets or processing email archives.
|
6
|
+
|
7
|
+
## 📦 Installation
|
8
|
+
|
9
|
+
```sh
|
10
|
+
$ gem install mbox2csv
|
11
|
+
```
|
12
|
+
|
13
|
+
## 🚀 Basic run example
|
14
|
+
|
15
|
+
```ruby
|
16
|
+
require 'mbox2csv'
|
17
|
+
|
18
|
+
# Define file paths
|
19
|
+
mbox_file = '/path/to/the/INBOX_file'
|
20
|
+
all_emails = 'emails.csv'
|
21
|
+
sender_stats_all_emails = 'email_statistics.csv'
|
22
|
+
recipient_stats_all_emails = 'recipient_statistics.csv'
|
23
|
+
|
24
|
+
# Initialize the parser with the file paths
|
25
|
+
parser = Mbox2CSV::MboxParser.new(mbox_file, all_emails, sender_stats_all_emails, recipient_stats_all_emails)
|
26
|
+
|
27
|
+
# Parse the MBOX file, save email data, and generate statistics
|
28
|
+
parser.parse
|
29
|
+
```
|
data/lib/mbox2csv.rb
ADDED
@@ -0,0 +1,179 @@
|
|
1
|
+
require 'base64'
|
2
|
+
require 'csv'
|
3
|
+
require 'mail'
|
4
|
+
|
5
|
+
module Mbox2CSV
|
6
|
+
# Parses an MBOX file, writes one CSV row per message and feeds every
# message into an EmailStatistics instance for sender/recipient statistics.
class MboxParser
  # Initializes the MboxParser with file paths for the MBOX file, the output CSV file,
  # and the CSV files for sender and recipient statistics.
  #
  # @param [String] mbox_file Path to the MBOX file to be parsed.
  # @param [String] csv_file Path to the output CSV file where parsed email data will be saved.
  # @param [String] stats_csv_file Path to the output CSV file where sender statistics will be saved.
  # @param [String] recipient_stats_csv_file Path to the output CSV file where recipient statistics will be saved.
  def initialize(mbox_file, csv_file, stats_csv_file, recipient_stats_csv_file)
    @mbox_file = mbox_file
    @csv_file = csv_file
    @statistics = EmailStatistics.new
    @stats_csv_file = stats_csv_file
    @recipient_stats_csv_file = recipient_stats_csv_file
  end

  # Parses the MBOX file and writes the email data to the configured CSV file,
  # then saves sender and recipient statistics to their own CSV files.
  #
  # Any StandardError raised along the way is reported on stdout instead of
  # being propagated to the caller.
  def parse
    CSV.open(@csv_file, 'w') do |csv|
      # Write CSV header
      csv << ['From', 'To', 'Subject', 'Date', 'Body']

      File.open(@mbox_file, 'r') do |mbox|
        # Unary plus guarantees a mutable buffer even if string literals are frozen.
        buffer = +''
        mbox.each_line do |line|
          # A line starting with "From " opens the next message, so the
          # message collected so far is complete and can be flushed.
          if line.start_with?('From ')
            process_email_block(buffer, csv) unless buffer.empty?
            buffer = +''
          end
          buffer << line
        end
        # The last message has no following separator line.
        process_email_block(buffer, csv) unless buffer.empty?
      end
    end
    puts "Parsing completed. Data saved to #{@csv_file}"

    # Save and print statistics after parsing
    @statistics.save_sender_statistics(@stats_csv_file)
    @statistics.save_recipient_statistics(@recipient_stats_csv_file)
  rescue StandardError => e
    puts "Error processing MBOX file: #{e.message}"
  end

  private

  # Processes an individual email block from the MBOX file, extracts the email fields,
  # and writes them to the CSV. Also records the email for statistics.
  #
  # Recipients are handed to the statistics as a list so that every address is
  # counted on its own; the CSV row still contains the comma-joined form.
  # Errors are reported on stdout and the block is skipped.
  #
  # @param [String] buffer The email block from the MBOX file.
  # @param [CSV] csv The CSV object where email data is written.
  def process_email_block(buffer, csv)
    mail = Mail.read_from_string(buffer)

    recipients = header_addresses(mail.to)
    from = header_addresses(mail.from).join(', ')
    to = recipients.join(', ')
    subject = ensure_utf8(mail.subject, 'UTF-8')
    date = ensure_utf8(mail.date&.to_s, 'UTF-8')

    body = decode_body(mail)

    # Write to CSV
    csv << [from, to, subject, date, body]

    # Record email for statistics
    @statistics.record_email(from, recipients, body.length)
  rescue StandardError => e
    puts "Error processing email block: #{e.message}"
  end

  # Normalizes an address header value into a list of UTF-8 strings.
  # Array() makes nil, a single string and a list of addresses all work.
  #
  # @param [Array<String>, String, nil] value The value of a From/To header.
  # @return [Array<String>] The addresses as UTF-8 strings (possibly empty).
  def header_addresses(value)
    Array(value).map { |address| ensure_utf8(address.to_s, 'UTF-8') }
  end

  # Returns the message body as UTF-8 text.
  #
  # For multipart messages the text part is preferred over the HTML part.
  # `body.decoded` has already reversed the Content-Transfer-Encoding, so no
  # further Base64 / quoted-printable decoding is applied here (doing it again
  # corrupts the text). The charset is taken from the part that supplied the
  # body, because a multipart container usually declares none of its own.
  #
  # @param [Mail] mail The mail object to decode.
  # @return [String] The decoded email body ('' when there is no usable part).
  def decode_body(mail)
    part = mail.multipart? ? (mail.text_part || mail.html_part) : mail
    return '' if part.nil?

    ensure_utf8(part.body&.decoded, part.charset || 'UTF-8')
  end

  # Converts text to UTF-8 encoding, replacing invalid or unconvertible characters with '?'.
  #
  # The input string is never modified. Unknown charset labels fall back to
  # UTF-8 instead of raising, and charsets Ruby cannot transcode from are
  # handled by scrubbing the raw bytes as UTF-8.
  #
  # @param [String, nil] text The input text.
  # @param [String, nil] charset The character set of the input text.
  # @return [String] UTF-8 encoded text ('' when text is nil).
  def ensure_utf8(text, charset)
    return '' if text.nil?

    source = text.dup
    source.force_encoding(resolve_encoding(charset)) if charset
    source.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
  rescue EncodingError
    # Raised e.g. as Encoding::ConverterNotFoundError for encodings without a converter.
    text.dup.force_encoding('UTF-8').scrub('?')
  end

  # Looks up the Encoding for a charset label.
  #
  # @param [String] charset The charset label taken from the message.
  # @return [Encoding] The matching encoding, or UTF-8 when the label is unknown.
  def resolve_encoding(charset)
    Encoding.find(charset.to_s)
  rescue ArgumentError
    Encoding::UTF_8
  end
end
|
112
|
+
|
113
|
+
# The EmailStatistics class gathers and writes statistics about emails.
# It tracks how many emails each sender sent, how many emails each recipient
# received, and the average email body length per sender.
class EmailStatistics
  def initialize
    @sender_counts = Hash.new(0) # Keeps count of emails per sender
    @recipient_counts = Hash.new(0) # Keeps count of emails per recipient
    @body_lengths = Hash.new { |hash, key| hash[key] = [] } # Stores body lengths per sender
  end

  # Records an email's sender, recipients, and body length for statistical purposes.
  #
  # Emails without a sender (nil or empty) are ignored entirely. Recipients may
  # be given as a list or as a comma-separated string; every address is counted
  # on its own and blank entries are skipped.
  #
  # @param [String, nil] from The sender of the email.
  # @param [String, Array<String>, nil] to The recipient(s) of the email.
  # @param [Integer] body_length The length of the email body in characters.
  def record_email(from, to, body_length)
    return if from.nil? || from.empty?

    @sender_counts[from] += 1
    @body_lengths[from] << body_length

    split_recipients(to).each { |recipient| @recipient_counts[recipient] += 1 }
  end

  # Saves sender statistics to a CSV file and prints them to the console.
  # Senders are ordered by descending email count.
  #
  # @param [String] csv_filename The path to the output CSV file for sender statistics.
  def save_sender_statistics(csv_filename)
    # Compute each row once and reuse it for both the CSV and the console.
    rows = @sender_counts.sort_by { |_sender, count| -count }.map do |sender, count|
      lengths = @body_lengths[sender]
      [sender, count, lengths.sum.fdiv(lengths.size).round(2)]
    end

    CSV.open(csv_filename, 'w') do |csv|
      csv << ['Sender', 'Email Count', 'Average Body Length (chars)']
      rows.each { |row| csv << row }
    end

    puts "Sender Email Statistics:"
    rows.each do |sender, count, avg_length|
      puts "#{sender}: #{count} emails, Average body length: #{avg_length} chars"
    end
  end

  # Saves recipient statistics to a CSV file and prints them to the console.
  # Recipients are ordered by descending email count.
  #
  # @param [String] csv_filename The path to the output CSV file for recipient statistics.
  def save_recipient_statistics(csv_filename)
    sorted_recipients = @recipient_counts.sort_by { |_recipient, count| -count }

    CSV.open(csv_filename, 'w') do |csv|
      csv << ['Recipient', 'Email Count']
      sorted_recipients.each { |row| csv << row }
    end

    puts "\nRecipient Email Statistics:"
    sorted_recipients.each do |recipient, count|
      puts "#{recipient}: #{count} emails"
    end
  end

  private

  # Expands the recipient argument into individual, non-blank addresses.
  #
  # @param [String, Array<String>, nil] to A list or comma-separated string of recipients.
  # @return [Array<String>] One entry per recipient address.
  def split_recipients(to)
    Array(to).flat_map { |entry| entry.to_s.split(',') }.map(&:strip).reject(&:empty?)
  end
end
|
179
|
+
end
|
metadata
ADDED
@@ -0,0 +1,91 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: mbox2csv
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- firefly-cpp
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2024-09-07 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: base64
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 0.2.0
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: 0.2.0
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: csv
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '3.3'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '3.3'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: mail
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 2.8.1
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 2.8.1
|
55
|
+
description:
|
56
|
+
email:
|
57
|
+
- iztok@iztok-jr-fister.eu
|
58
|
+
executables: []
|
59
|
+
extensions: []
|
60
|
+
extra_rdoc_files: []
|
61
|
+
files:
|
62
|
+
- LICENSE
|
63
|
+
- README.md
|
64
|
+
- lib/mbox2csv.rb
|
65
|
+
homepage: https://codeberg.org/firefly-cpp/mbox2csv
|
66
|
+
licenses:
|
67
|
+
- MIT
|
68
|
+
metadata:
|
69
|
+
homepage_uri: https://codeberg.org/firefly-cpp/mbox2csv
|
70
|
+
source_code_uri: https://codeberg.org/firefly-cpp/mbox2csv
|
71
|
+
changelog_uri: https://codeberg.org/firefly-cpp/mbox2csv
|
72
|
+
post_install_message:
|
73
|
+
rdoc_options: []
|
74
|
+
require_paths:
|
75
|
+
- lib
|
76
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
77
|
+
requirements:
|
78
|
+
- - ">="
|
79
|
+
- !ruby/object:Gem::Version
|
80
|
+
version: 2.6.0
|
81
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
82
|
+
requirements:
|
83
|
+
- - ">="
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
version: '0'
|
86
|
+
requirements: []
|
87
|
+
rubygems_version: 3.5.11
|
88
|
+
signing_key:
|
89
|
+
specification_version: 4
|
90
|
+
summary: Parse MBOX files and export email data into CSV format
|
91
|
+
test_files: []
|