mbox2csv 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (5) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +9 -0
  3. data/README.md +29 -0
  4. data/lib/mbox2csv.rb +179 -0
  5. metadata +91 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: bb21771471de4cd41fb5352229f53e8907f948fb146054fb4b050287e0f51116
4
+ data.tar.gz: 8436988c043147b78301f7a5b4e3fe521f1a15464332a2d00943b4d9eca37762
5
+ SHA512:
6
+ metadata.gz: c186e0bbba46cf3ee609794beffb67a9ed0a08605c3a9a165299d33ac6a065f4807d38b5835917dcc51977abc4e676a544cfef75d5d920ad99dfdadd8f53c2ef
7
+ data.tar.gz: 8a70e8871a80a7213d948de25d7310febaeb63b360d672c5546a45b69057f6e66de8831cf60e1b8941f839f0ec5850a1bc41f6cf2a598a4ac54e9028853b4d58
data/LICENSE ADDED
@@ -0,0 +1,9 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 firefly-cpp
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6
+
7
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8
+
9
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,29 @@
1
+ # mbox2csv -- Parse MBOX files and export email data into CSV format
2
+
3
+ ## About
4
+
5
+ mbox2csv is a Ruby gem that provides a simple way to parse MBOX files and export email data into CSV format. It also generates email statistics that are useful for data mining tasks, such as the number of emails sent by each sender, the number of emails received by each recipient, and the average body length per sender. This makes it well suited to analyzing email datasets or processing email archives.
6
+
7
+ ## 📦 Installation
8
+
9
+ ```sh
10
+ $ gem install mbox2csv
11
+ ```
12
+
13
+ ## 🚀 Basic run example
14
+
15
+ ```ruby
16
+ require 'mbox2csv'
17
+
18
+ # Define file paths
19
+ mbox_file = '/path/to/the/INBOX_file'
20
+ all_emails = 'emails.csv'
21
+ sender_stats_all_emails = 'email_statistics.csv'
22
+ recipient_stats_all_emails = 'recipient_statistics.csv'
23
+
24
+ # Initialize the parser with the file paths
25
+ parser = Mbox2CSV::MboxParser.new(mbox_file, all_emails, sender_stats_all_emails, recipient_stats_all_emails)
26
+
27
+ # Parse the MBOX file, save email data, and generate statistics
28
+ parser.parse
29
+ ```
data/lib/mbox2csv.rb ADDED
@@ -0,0 +1,179 @@
1
+ require 'base64'
2
+ require 'csv'
3
+ require 'mail'
4
+
5
module Mbox2CSV
  # Reads an MBOX file message by message, writes one CSV row per email
  # (From, To, Subject, Date, Body) and feeds every email into an
  # EmailStatistics collector whose reports are written once parsing finishes.
  class MboxParser
    # Header row written at the top of the email CSV file.
    CSV_HEADER = ['From', 'To', 'Subject', 'Date', 'Body'].freeze

    # Stores the input/output paths and creates an empty statistics collector.
    #
    # @param [String] mbox_file Path to the MBOX file to be parsed.
    # @param [String] csv_file Path to the output CSV file where parsed email data will be saved.
    # @param [String] stats_csv_file Path to the output CSV file where sender statistics will be saved.
    # @param [String] recipient_stats_csv_file Path to the output CSV file where recipient statistics will be saved.
    def initialize(mbox_file, csv_file, stats_csv_file, recipient_stats_csv_file)
      @mbox_file = mbox_file
      @csv_file = csv_file
      @statistics = EmailStatistics.new
      @stats_csv_file = stats_csv_file
      @recipient_stats_csv_file = recipient_stats_csv_file
    end

    # Parses the MBOX file and writes the email data to the configured CSV file,
    # then saves (and prints) the sender and recipient statistics.
    #
    # Any StandardError raised along the way is reported on standard output
    # instead of being propagated to the caller.
    def parse
      CSV.open(@csv_file, 'w') do |csv|
        csv << CSV_HEADER

        File.open(@mbox_file, 'r') do |mbox|
          each_email_block(mbox) { |block| process_email_block(block, csv) }
        end
      end
      puts "Parsing completed. Data saved to #{@csv_file}"

      # Save and print statistics after parsing
      @statistics.save_sender_statistics(@stats_csv_file)
      @statistics.save_recipient_statistics(@recipient_stats_csv_file)
    rescue => e
      puts "Error processing MBOX file: #{e.message}"
    end

    private

    # Splits the stream into raw messages. A line starting with "From " opens a
    # new message; everything up to the next such line belongs to that message.
    #
    # @param [#each_line] io The opened MBOX stream.
    # @yieldparam [String] block The raw text of one message.
    def each_email_block(io)
      buffer = +''
      io.each_line do |line|
        if line.start_with?('From ') && !buffer.empty?
          yield buffer
          buffer = +''
        end
        buffer << line
      end
      yield buffer unless buffer.empty? # last message in the file
    end

    # Processes one raw message: extracts the fields, appends a CSV row and
    # records the email in the statistics. A failure is reported on standard
    # output and only affects this message.
    #
    # @param [String] buffer The email block from the MBOX file.
    # @param [CSV] csv The CSV object where email data is written.
    def process_email_block(buffer, csv)
      mail = Mail.read_from_string(buffer)

      # Array() tolerates a missing header (nil) as well as a single value.
      senders    = Array(mail.from).map { |address| ensure_utf8(address.to_s, 'UTF-8') }
      recipients = Array(mail.to).map { |address| ensure_utf8(address.to_s, 'UTF-8') }

      from    = senders.join(', ')
      to      = recipients.join(', ')
      subject = ensure_utf8(mail.subject.to_s, 'UTF-8')
      date    = ensure_utf8(mail.date.to_s, 'UTF-8')
      body    = decode_body(mail)

      csv << [from, to, subject, date, body]

      # Hand over the individual recipients (not the joined "To" column) so
      # that every address gets its own counter.
      @statistics.record_email(from, recipients, body.length)
    rescue => e
      puts "Error processing email block: #{e.message}"
    end

    # Returns the message text as UTF-8.
    #
    # For multipart messages the plain-text part is preferred and the HTML part
    # is the fallback; a multipart message with neither yields ''.
    # The transfer encoding (base64, quoted-printable, ...) is NOT undone here:
    # per the mail gem's documentation, Mail::Body#decoded already returns the
    # transfer-decoded content, and decoding it a second time corrupts the text.
    #
    # @param [Mail] mail The mail object to decode.
    # @return [String] The decoded email body.
    def decode_body(mail)
      source = mail.multipart? ? (mail.text_part || mail.html_part) : mail

      # The charset of the selected part wins over the one of the envelope.
      charset = source&.charset || mail.charset || 'UTF-8'
      ensure_utf8(source&.body&.decoded, charset)
    end

    # Converts text to UTF-8, replacing invalid or unconvertible characters with '?'.
    # The input string is never modified, so frozen strings are accepted.
    #
    # @param [String, nil] text The input text.
    # @param [String, nil] charset The character set of the input text; when nil
    #   the text's current encoding is used, when unknown UTF-8 is assumed.
    # @return [String] UTF-8 encoded text ('' for nil input).
    def ensure_utf8(text, charset)
      return '' if text.nil?

      source = text.dup
      source.force_encoding(find_encoding(charset)) if charset
      source.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
    rescue Encoding::ConverterNotFoundError
      # Known encoding without a converter to UTF-8: keep what is valid UTF-8.
      source.force_encoding('UTF-8').scrub('?')
    end

    # Looks up an encoding by name, falling back to UTF-8 for unknown labels.
    #
    # @param [String] charset The charset label taken from the email.
    # @return [Encoding]
    def find_encoding(charset)
      Encoding.find(charset.to_s)
    rescue ArgumentError
      Encoding::UTF_8
    end
  end

  # Gathers statistics about emails and writes them to CSV files.
  # It tracks how many emails each sender sent, how many emails each recipient
  # received, and the body length of every email per sender (for the average).
  class EmailStatistics
    def initialize
      @sender_counts = Hash.new(0) # emails per sender
      @recipient_counts = Hash.new(0) # emails per recipient
      @body_lengths = Hash.new { |hash, key| hash[key] = [] } # body lengths per sender
    end

    # Records an email's sender, recipients, and body length.
    # Emails without a sender (nil or '') are ignored entirely.
    #
    # @param [String] from The sender of the email.
    # @param [String, Array<String>] to The recipient(s) of the email; a String
    #   may hold several comma-separated addresses. Blank entries are skipped.
    # @param [Integer] body_length The length of the email body in characters.
    def record_email(from, to, body_length)
      return if from.nil? || from.empty?

      @sender_counts[from] += 1
      @body_lengths[from] << body_length

      split_recipients(to).each { |recipient| @recipient_counts[recipient] += 1 }
    end

    # Saves sender statistics to a CSV file and prints them to the console.
    # Rows are ordered by email count, highest first.
    #
    # @param [String] csv_filename The path to the output CSV file for sender statistics.
    def save_sender_statistics(csv_filename)
      rows = sender_rows

      CSV.open(csv_filename, 'w') do |csv|
        csv << ['Sender', 'Email Count', 'Average Body Length (chars)']
        rows.each { |row| csv << row }
      end

      puts "Sender Email Statistics:"
      rows.each do |sender, count, avg_length|
        puts "#{sender}: #{count} emails, Average body length: #{avg_length} chars"
      end
    end

    # Saves recipient statistics to a CSV file and prints them to the console.
    # Rows are ordered by email count, highest first.
    #
    # @param [String] csv_filename The path to the output CSV file for recipient statistics.
    def save_recipient_statistics(csv_filename)
      sorted_recipients = @recipient_counts.sort_by { |_recipient, count| -count }

      CSV.open(csv_filename, 'w') do |csv|
        csv << ['Recipient', 'Email Count']
        sorted_recipients.each { |row| csv << row }
      end

      puts "\nRecipient Email Statistics:"
      sorted_recipients.each do |recipient, count|
        puts "#{recipient}: #{count} emails"
      end
    end

    private

    # Builds [sender, count, average body length] rows, busiest sender first.
    # The average is rounded to two decimals.
    def sender_rows
      @sender_counts.sort_by { |_sender, count| -count }.map do |sender, count|
        lengths = @body_lengths[sender]
        [sender, count, (lengths.sum / lengths.size.to_f).round(2)]
      end
    end

    # Turns a String or Array of recipients into a flat list of non-blank addresses.
    def split_recipients(to)
      Array(to).flat_map { |entry| entry.to_s.split(',') }.map(&:strip).reject(&:empty?)
    end
  end
end
metadata ADDED
@@ -0,0 +1,91 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: mbox2csv
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - firefly-cpp
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2024-09-07 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: base64
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 0.2.0
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: 0.2.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: csv
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '3.3'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '3.3'
41
+ - !ruby/object:Gem::Dependency
42
+ name: mail
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: 2.8.1
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: 2.8.1
55
+ description:
56
+ email:
57
+ - iztok@iztok-jr-fister.eu
58
+ executables: []
59
+ extensions: []
60
+ extra_rdoc_files: []
61
+ files:
62
+ - LICENSE
63
+ - README.md
64
+ - lib/mbox2csv.rb
65
+ homepage: https://codeberg.org/firefly-cpp/mbox2csv
66
+ licenses:
67
+ - MIT
68
+ metadata:
69
+ homepage_uri: https://codeberg.org/firefly-cpp/mbox2csv
70
+ source_code_uri: https://codeberg.org/firefly-cpp/mbox2csv
71
+ changelog_uri: https://codeberg.org/firefly-cpp/mbox2csv
72
+ post_install_message:
73
+ rdoc_options: []
74
+ require_paths:
75
+ - lib
76
+ required_ruby_version: !ruby/object:Gem::Requirement
77
+ requirements:
78
+ - - ">="
79
+ - !ruby/object:Gem::Version
80
+ version: 2.6.0
81
+ required_rubygems_version: !ruby/object:Gem::Requirement
82
+ requirements:
83
+ - - ">="
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ requirements: []
87
+ rubygems_version: 3.5.11
88
+ signing_key:
89
+ specification_version: 4
90
+ summary: Parse MBOX files and export email data into CSV format
91
+ test_files: []