mbox-extractor 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
main.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
import mailbox
|
|
2
|
+
import os
|
|
3
|
+
import re
|
|
4
|
+
import hashlib
|
|
5
|
+
from email import policy
|
|
6
|
+
from email.parser import BytesParser
|
|
7
|
+
import argparse
|
|
8
|
+
from tqdm import tqdm
|
|
9
|
+
|
|
10
|
+
def sanitize_filename(filename):
    """Return ``filename`` reduced to a name usable as a single path component.

    Path separators, the punctuation ``* ? : " < > |`` and ASCII control
    characters are each replaced by an underscore, every ``..`` sequence is
    removed, and the result is passed through ``os.path.basename``.

    Args:
        filename: Attachment name as found in the e-mail (a ``str``).

    Returns:
        The sanitized name. It may be empty if nothing remains after
        sanitizing (for example when the input is ``".."``).
    """
    # Control characters are included because a NUL byte makes open() raise
    # ValueError, and newlines/tabs would otherwise end up in file names.
    filename = re.sub(r'[\\/*?:"<>|\x00-\x1f\x7f]', "_", filename)
    filename = filename.replace("..", "")  # extra safety
    return os.path.basename(filename)
|
|
15
|
+
|
|
16
|
+
def get_unique_filepath(output_dir, filename):
    """Join ``output_dir`` and ``filename`` into a single path.

    Kept only as a thin wrapper: name collisions are handled by the
    MD5-based suffix, so this function is no longer needed.
    """
    target_path = os.path.join(output_dir, filename)
    return target_path
|
|
19
|
+
|
|
20
|
+
def find_mbox_files(root_path):
    """Yield paths to all .mbox files at or under ``root_path``.

    Args:
        root_path: Either a directory, which is searched recursively, or the
            path of a single file.

    Yields:
        The path of every file whose name ends in ``.mbox`` (compared
        case-insensitively). A ``root_path`` that is itself such a file is
        yielded as-is; a path that does not exist yields nothing.
    """
    # os.walk() produces nothing for a plain file, so a directly named
    # mailbox has to be handled before walking.
    if os.path.isfile(root_path):
        if root_path.lower().endswith('.mbox'):
            yield root_path
        return

    for dirpath, _, filenames in os.walk(root_path):
        for fname in filenames:
            if fname.lower().endswith('.mbox'):
                yield os.path.join(dirpath, fname)
|
|
26
|
+
|
|
27
|
+
def _attachment_bytes(part):
    """Return the bytes to write for attachment ``part``, or None if it has none."""
    payload = part.get_payload(decode=True)
    if payload is not None:
        return payload
    # get_payload(decode=True) returns None for multipart parts. An attached
    # e-mail (message/rfc822) is one of those: its payload is a list holding
    # the embedded message, which is serialized back to bytes here.
    if part.get_content_type() == "message/rfc822":
        return part.get_payload(0).as_bytes()
    return None


def extract_attachments(mbox_file, output_dir):
    """Write every named attachment found in ``mbox_file`` into ``output_dir``.

    Each attachment is saved as ``<name>_<digest><ext>``, where ``<name>`` and
    ``<ext>`` come from the sanitized attachment file name and ``<digest>`` is
    the first 8 hex characters of the MD5 of its content, so attachments that
    share a name but differ in content do not overwrite each other.
    Attachments without a file name, or without any extractable content, are
    skipped. Progress and a final count are printed to stdout.

    Args:
        mbox_file: Path of the mbox file to read.
        output_dir: Directory to write attachments to; created if missing.

    Raises:
        mailbox.NoSuchMailboxError: If ``mbox_file`` does not exist.
    """
    os.makedirs(output_dir, exist_ok=True)

    print(f"Starting extraction for: {mbox_file}")
    # create=False: this tool only reads, so a missing path must not be
    # turned into a new empty mailbox file.
    mbox = mailbox.mbox(
        mbox_file,
        factory=lambda f: BytesParser(policy=policy.default).parse(f),
        create=False,
    )
    attachment_count = 0

    try:
        # Get total number of messages for tqdm progress bar
        total_msgs = len(mbox)
        for message in tqdm(mbox, total=total_msgs, desc=f"Extracting {os.path.basename(mbox_file)}"):
            for part in message.iter_attachments():
                filename = part.get_filename()
                if not filename:
                    continue

                payload = _attachment_bytes(part)
                if payload is None:
                    # Nothing to hash or write for this part.
                    continue

                clean_name = sanitize_filename(filename)
                # Append short MD5 digest to filename for uniqueness. The
                # digest is a content fingerprint, not a security measure.
                digest = hashlib.md5(payload, usedforsecurity=False).hexdigest()[:8]
                base, ext = os.path.splitext(clean_name)
                unique_name = f"{base}_{digest}{ext}"
                safe_path = os.path.join(output_dir, unique_name)

                # Save file
                with open(safe_path, 'wb') as f:
                    f.write(payload)
                attachment_count += 1
    finally:
        # Release the underlying file handle even if extraction fails midway.
        mbox.close()

    print(f"Extracted {attachment_count} attachments to '{output_dir}'.")
|
|
55
|
+
|
|
56
|
+
def main():
    """Command-line entry point.

    Parses a single positional ``path`` argument, finds the .mbox files it
    refers to, and extracts the attachments of each one into a directory
    whose path is the mbox path with its extension removed. Prints a notice
    when no .mbox file is found, so the run never ends silently.
    """
    parser = argparse.ArgumentParser(
        description="Recursively extract attachments from all .mbox files under a given path."
    )
    parser.add_argument(
        "path",
        help="Root directory or file to search for .mbox files"
    )
    args = parser.parse_args()
    root_path = args.path

    # Collect the paths before extracting anything, so the files written
    # during extraction cannot influence which mailboxes are processed.
    mbox_paths = list(find_mbox_files(root_path))
    if not mbox_paths:
        print(f"No .mbox files found under: {root_path}")
        return

    for mbox_path in mbox_paths:
        mbox_dir = os.path.splitext(mbox_path)[0]
        print(f"Found mbox: {mbox_path} -> extracting to {mbox_dir}")
        extract_attachments(mbox_path, mbox_dir)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: mbox-extractor
|
|
3
|
+
Version: 0.1.4
|
|
4
|
+
Summary: Recursively extract attachments from .mbox files
|
|
5
|
+
Project-URL: Homepage, https://github.com/tsilva/mbox-extractor
|
|
6
|
+
Project-URL: Repository, https://github.com/tsilva/mbox-extractor
|
|
7
|
+
Author-email: Tiago Silva <eng.tiago.silva@gmail.com>
|
|
8
|
+
License: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Topic :: Communications :: Email
|
|
14
|
+
Requires-Python: >=3.12
|
|
15
|
+
Requires-Dist: tqdm
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
|
|
18
|
+
<div align="center">
|
|
19
|
+
<img src="logo.png" alt="mbox-extractor" width="512"/>
|
|
20
|
+
|
|
21
|
+
# mbox-extractor
|
|
22
|
+
|
|
23
|
+
[License: MIT](LICENSE)
|
|
24
|
+
[Python 3.12+](https://python.org)
|
|
25
|
+
|
|
26
|
+
**📬 Recursively extract all attachments from .mbox email archives with a single command**
|
|
27
|
+
|
|
28
|
+
</div>
|
|
29
|
+
|
|
30
|
+
## Features
|
|
31
|
+
|
|
32
|
+
- **Recursive scanning** - Finds all `.mbox` files in any directory tree
|
|
33
|
+
- **Safe filenames** - Sanitizes attachment names, removing illegal characters
|
|
34
|
+
- **No duplicates** - Uses content-based hashing to prevent overwrites
|
|
35
|
+
- **Progress display** - Visual progress bar for large mailboxes
|
|
36
|
+
|
|
37
|
+
## Quick Start
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
uv tool install mbox-extractor
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
mbox-extractor /path/to/emails
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Installation
|
|
48
|
+
|
|
49
|
+
### Using uv (recommended)
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
uv tool install mbox-extractor
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### Using pip
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
pip install mbox-extractor
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### From source
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
git clone https://github.com/tsilva/mbox-extractor.git
|
|
65
|
+
cd mbox-extractor
|
|
66
|
+
uv tool install .
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Usage
|
|
70
|
+
|
|
71
|
+
Extract all attachments from `.mbox` files under a directory:
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
mbox-extractor /path/to/search
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
Attachments from each `.mbox` file are saved to a folder next to it, named after the file without its `.mbox` extension:
|
|
78
|
+
|
|
79
|
+
```
|
|
80
|
+
Found mbox: /emails/archive.mbox -> extracting to /emails/archive
|
|
81
|
+
Extracting archive.mbox: 100%|████████████████████| 500/500 [00:10<00:00, 48.5it/s]
|
|
82
|
+
Extracted 42 attachments to '/emails/archive'.
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
### How It Works
|
|
86
|
+
|
|
87
|
+
1. Recursively scans the given path for `.mbox` files
|
|
88
|
+
2. Opens each mailbox and iterates through all messages
|
|
89
|
+
3. Extracts attachments with sanitized, unique filenames
|
|
90
|
+
4. Saves them to a folder named after the source `.mbox` file
|
|
91
|
+
|
|
92
|
+
Filenames are made unique by appending an 8-character MD5 hash of the file content, preventing overwrites when multiple attachments share the same name.
|
|
93
|
+
|
|
94
|
+
## Requirements
|
|
95
|
+
|
|
96
|
+
- Python 3.12+
|
|
97
|
+
- tqdm (installed automatically)
|
|
98
|
+
|
|
99
|
+
## License
|
|
100
|
+
|
|
101
|
+
[MIT](LICENSE)
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
main.py,sha256=TDk4cMcNkW7IZvz2kr-fQLUK4gJH-PIUmBnJI9oRmOM,2742
|
|
2
|
+
mbox_extractor-0.1.4.dist-info/METADATA,sha256=D_rLzJn1mf6Lw9fD6sie5BGHnjOkTPbaDYNaJ0WvLDU,2581
|
|
3
|
+
mbox_extractor-0.1.4.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
4
|
+
mbox_extractor-0.1.4.dist-info/entry_points.txt,sha256=OYHWL_RW3Ng13Fwb9dfggI36Iwl52SJP4qUkUgEhJys,45
|
|
5
|
+
mbox_extractor-0.1.4.dist-info/licenses/LICENSE,sha256=gTrdDdqFDu7VtWezebCYIHmebmc-XTVxqWTZGKvDux0,1068
|
|
6
|
+
mbox_extractor-0.1.4.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Tiago Silva
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|