mbox-extractor 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
main.py ADDED
@@ -0,0 +1,72 @@
1
+ import mailbox
2
+ import os
3
+ import re
4
+ import hashlib
5
+ from email import policy
6
+ from email.parser import BytesParser
7
+ import argparse
8
+ from tqdm import tqdm
9
+
10
def sanitize_filename(filename):
    """Return a flat, filesystem-safe version of an attachment filename.

    Path separators, characters that are illegal on common filesystems and
    ASCII control characters are each replaced with "_". Control characters
    matter because an embedded NUL makes ``open()`` raise ``ValueError``.
    Any ".." sequence is then removed so the result cannot point at a
    parent directory.

    Args:
        filename: Attachment name as reported by the email part.

    Returns:
        The sanitized name without directory components. The result may be
        an empty string when nothing usable remains (e.g. for "..").
    """
    # Separators, reserved characters and control characters (0x00-0x1f,
    # 0x7f) all become underscores, keeping the name a single path component.
    filename = re.sub(r'[\\/*?:"<>|\x00-\x1f\x7f]', "_", filename)
    filename = filename.replace("..", "")  # extra safety
    return os.path.basename(filename)
15
+
16
def get_unique_filepath(output_dir, filename):
    """Join ``output_dir`` and ``filename`` into one path.

    No disambiguation happens here: the helper is retained only as a plain
    join now that name clashes are resolved by the MD5-based suffix.
    """
    joined_path = os.path.join(output_dir, filename)
    return joined_path
19
+
20
def find_mbox_files(root_path):
    """Yield paths to all .mbox files at or under root_path.

    Args:
        root_path: Either a directory to search recursively, or the path of
            a single file.

    Yields:
        The path of every file whose name ends with ".mbox"
        (case-insensitive). When root_path is itself such a file, it is
        yielded as given. Nothing is yielded for a nonexistent path or for
        a file with a different extension.
    """
    # os.walk() produces nothing for a regular file, so a single mbox path
    # passed on the command line has to be handled explicitly.
    if os.path.isfile(root_path):
        if root_path.lower().endswith('.mbox'):
            yield root_path
        return

    for dirpath, _, filenames in os.walk(root_path):
        for fname in filenames:
            if fname.lower().endswith('.mbox'):
                yield os.path.join(dirpath, fname)
26
+
27
def extract_attachments(mbox_file, output_dir):
    """Extract the named attachments of one mbox file into output_dir.

    Each attachment is written as ``<base>_<digest><ext>``, where ``digest``
    is the first 8 hex characters of the MD5 of the decoded content. Two
    attachments that share a name but differ in content therefore get
    different files, while identical content maps to the same file.

    Args:
        mbox_file: Path of the mbox file to read.
        output_dir: Directory receiving the attachments; created if missing.

    Attachment parts without a filename are ignored, as are parts for which
    no decoded payload is available (``get_payload(decode=True)`` returns
    ``None`` for multipart containers such as ``message/rfc822``).
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    print(f"Starting extraction for: {mbox_file}")
    mbox = mailbox.mbox(mbox_file, factory=lambda f: BytesParser(policy=policy.default).parse(f))
    attachment_count = 0

    try:
        # Get total number of messages for tqdm progress bar
        total_msgs = len(mbox)
        progress = tqdm(mbox, total=total_msgs, desc=f"Extracting {os.path.basename(mbox_file)}")
        for message in progress:
            for part in message.iter_attachments():
                filename = part.get_filename()
                if not filename:
                    continue

                payload = part.get_payload(decode=True)
                if payload is None:
                    # Multipart attachment: there are no bytes to hash or
                    # write, and md5(None) would raise TypeError.
                    continue

                clean_name = sanitize_filename(filename)
                # Append short MD5 digest to filename for uniqueness. The
                # hash is only a content fingerprint, so mark it as
                # non-security use (keeps it usable on FIPS-restricted builds).
                digest = hashlib.md5(payload, usedforsecurity=False).hexdigest()[:8]
                base, ext = os.path.splitext(clean_name)
                unique_name = f"{base}_{digest}{ext}"
                safe_path = os.path.join(output_dir, unique_name)

                # Save file
                with open(safe_path, 'wb') as f:
                    f.write(payload)
                attachment_count += 1
    finally:
        # Release the underlying file handle even if extraction fails.
        mbox.close()

    print(f"Extracted {attachment_count} attachments to '{output_dir}'.")
55
+
56
def main():
    """Command-line entry point: extract attachments from every mbox found.

    Parses a single positional ``path`` argument, locates the .mbox files
    for it via ``find_mbox_files`` and extracts each one into a directory
    named after the mbox file without its extension.

    Exits through ``parser.error`` (status 2) when the path does not exist,
    and prints a notice when no .mbox file is found, so neither situation
    passes silently.
    """
    parser = argparse.ArgumentParser(
        description="Recursively extract attachments from all .mbox files under a given path."
    )
    parser.add_argument(
        "path",
        help="Root directory or file to search for .mbox files"
    )
    args = parser.parse_args()
    root_path = args.path

    if not os.path.exists(root_path):
        parser.error(f"path does not exist: {root_path}")

    found_any = False
    for mbox_path in find_mbox_files(root_path):
        found_any = True
        mbox_dir = os.path.splitext(mbox_path)[0]
        if mbox_dir == mbox_path:
            # A file named exactly ".mbox" has no extension to strip; use a
            # distinct directory so output is not written "inside" the file.
            mbox_dir = mbox_path + "_attachments"
        print(f"Found mbox: {mbox_path} -> extracting to {mbox_dir}")
        extract_attachments(mbox_path, mbox_dir)

    if not found_any:
        print(f"No .mbox files found under: {root_path}")
70
+
71
# Run the command-line interface only when this file is executed as a
# script; importing the module must not trigger an extraction.
if __name__ == "__main__":
    main()
@@ -0,0 +1,101 @@
1
+ Metadata-Version: 2.4
2
+ Name: mbox-extractor
3
+ Version: 0.1.4
4
+ Summary: Recursively extract attachments from .mbox files
5
+ Project-URL: Homepage, https://github.com/tsilva/mbox-extractor
6
+ Project-URL: Repository, https://github.com/tsilva/mbox-extractor
7
+ Author-email: Tiago Silva <eng.tiago.silva@gmail.com>
8
+ License: MIT
9
+ License-File: LICENSE
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Operating System :: OS Independent
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Topic :: Communications :: Email
14
+ Requires-Python: >=3.12
15
+ Requires-Dist: tqdm
16
+ Description-Content-Type: text/markdown
17
+
18
+ <div align="center">
19
+ <img src="logo.png" alt="mbox-extractor" width="512"/>
20
+
21
+ # mbox-extractor
22
+
23
+ [![License](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
24
+ [![Python](https://img.shields.io/badge/python-3.12+-blue.svg)](https://python.org)
25
+
26
+ **📬 Recursively extract all attachments from .mbox email archives with a single command**
27
+
28
+ </div>
29
+
30
+ ## Features
31
+
32
+ - **Recursive scanning** - Finds all `.mbox` files in any directory tree
33
+ - **Safe filenames** - Sanitizes attachment names, removing illegal characters
34
+ - **No duplicates** - Uses content-based hashing to prevent overwrites
35
+ - **Progress display** - Visual progress bar for large mailboxes
36
+
37
+ ## Quick Start
38
+
39
+ ```bash
40
+ uv tool install mbox-extractor
41
+ ```
42
+
43
+ ```bash
44
+ mbox-extractor /path/to/emails
45
+ ```
46
+
47
+ ## Installation
48
+
49
+ ### Using uv (recommended)
50
+
51
+ ```bash
52
+ uv tool install mbox-extractor
53
+ ```
54
+
55
+ ### Using pip
56
+
57
+ ```bash
58
+ pip install mbox-extractor
59
+ ```
60
+
61
+ ### From source
62
+
63
+ ```bash
64
+ git clone https://github.com/tsilva/mbox-extractor.git
65
+ cd mbox-extractor
66
+ uv tool install .
67
+ ```
68
+
69
+ ## Usage
70
+
71
+ Extract all attachments from `.mbox` files under a directory:
72
+
73
+ ```bash
74
+ mbox-extractor /path/to/search
75
+ ```
76
+
77
+ Attachments from each `.mbox` file are saved to a folder with the same name:
78
+
79
+ ```
80
+ Found mbox: /emails/archive.mbox -> extracting to /emails/archive
81
+ Extracting archive.mbox: 100%|████████████████████| 500/500 [00:10<00:00, 48.5it/s]
82
+ Extracted 42 attachments to '/emails/archive'.
83
+ ```
84
+
85
+ ### How It Works
86
+
87
+ 1. Recursively scans the given path for `.mbox` files
88
+ 2. Opens each mailbox and iterates through all messages
89
+ 3. Extracts attachments with sanitized, unique filenames
90
+ 4. Saves them to a folder named after the source `.mbox` file
91
+
92
+ Filenames are made unique by appending the first 8 hexadecimal characters of the MD5 hash of the file content, preventing overwrites when different attachments share the same name.
93
+
94
+ ## Requirements
95
+
96
+ - Python 3.12+
97
+ - tqdm (installed automatically)
98
+
99
+ ## License
100
+
101
+ [MIT](LICENSE)
@@ -0,0 +1,6 @@
1
+ main.py,sha256=TDk4cMcNkW7IZvz2kr-fQLUK4gJH-PIUmBnJI9oRmOM,2742
2
+ mbox_extractor-0.1.4.dist-info/METADATA,sha256=D_rLzJn1mf6Lw9fD6sie5BGHnjOkTPbaDYNaJ0WvLDU,2581
3
+ mbox_extractor-0.1.4.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
4
+ mbox_extractor-0.1.4.dist-info/entry_points.txt,sha256=OYHWL_RW3Ng13Fwb9dfggI36Iwl52SJP4qUkUgEhJys,45
5
+ mbox_extractor-0.1.4.dist-info/licenses/LICENSE,sha256=gTrdDdqFDu7VtWezebCYIHmebmc-XTVxqWTZGKvDux0,1068
6
+ mbox_extractor-0.1.4.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.28.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ mbox-extractor = main:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Tiago Silva
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.