mbox-extractor 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mbox_extractor-0.1.4/.github/dependabot.yml +8 -0
- mbox_extractor-0.1.4/.github/workflows/release.yml +49 -0
- mbox_extractor-0.1.4/.gitignore +220 -0
- mbox_extractor-0.1.4/CLAUDE.md +66 -0
- mbox_extractor-0.1.4/LICENSE +21 -0
- mbox_extractor-0.1.4/PKG-INFO +101 -0
- mbox_extractor-0.1.4/README.md +84 -0
- mbox_extractor-0.1.4/logo.png +0 -0
- mbox_extractor-0.1.4/main.py +72 -0
- mbox_extractor-0.1.4/pyproject.toml +33 -0
- mbox_extractor-0.1.4/uv.lock +35 -0
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
name: Release
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
|
|
7
|
+
permissions:
|
|
8
|
+
contents: write
|
|
9
|
+
id-token: write
|
|
10
|
+
|
|
11
|
+
jobs:
|
|
12
|
+
release:
|
|
13
|
+
runs-on: ubuntu-latest
|
|
14
|
+
outputs:
|
|
15
|
+
created: ${{ steps.check.outputs.exists == 'false' }}
|
|
16
|
+
steps:
|
|
17
|
+
- uses: actions/checkout@v4
|
|
18
|
+
- name: Get version
|
|
19
|
+
id: version
|
|
20
|
+
run: |
|
|
21
|
+
VERSION=$(python -c "import tomllib; print(tomllib.load(open('pyproject.toml','rb'))['project']['version'])")
|
|
22
|
+
echo "version=$VERSION" >> "$GITHUB_OUTPUT"
|
|
23
|
+
echo "tag=v$VERSION" >> "$GITHUB_OUTPUT"
|
|
24
|
+
- name: Check if tag exists
|
|
25
|
+
id: check
|
|
26
|
+
run: |
|
|
27
|
+
if gh release view "${{ steps.version.outputs.tag }}" &>/dev/null; then
|
|
28
|
+
echo "exists=true" >> "$GITHUB_OUTPUT"
|
|
29
|
+
else
|
|
30
|
+
echo "exists=false" >> "$GITHUB_OUTPUT"
|
|
31
|
+
fi
|
|
32
|
+
env:
|
|
33
|
+
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
|
34
|
+
- name: Create release
|
|
35
|
+
if: steps.check.outputs.exists == 'false'
|
|
36
|
+
run: gh release create "${{ steps.version.outputs.tag }}" --title "${{ steps.version.outputs.tag }}" --generate-notes
|
|
37
|
+
env:
|
|
38
|
+
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
|
39
|
+
|
|
40
|
+
publish:
|
|
41
|
+
needs: release
|
|
42
|
+
if: needs.release.outputs.created == 'true'
|
|
43
|
+
runs-on: ubuntu-latest
|
|
44
|
+
environment: pypi
|
|
45
|
+
steps:
|
|
46
|
+
- uses: actions/checkout@v4
|
|
47
|
+
- uses: astral-sh/setup-uv@v4
|
|
48
|
+
- run: uv build
|
|
49
|
+
- uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
share/python-wheels/
|
|
24
|
+
*.egg-info/
|
|
25
|
+
.installed.cfg
|
|
26
|
+
*.egg
|
|
27
|
+
MANIFEST
|
|
28
|
+
|
|
29
|
+
# PyInstaller
|
|
30
|
+
# Usually these files are written by a python script from a template
|
|
31
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
32
|
+
*.manifest
|
|
33
|
+
*.spec
|
|
34
|
+
|
|
35
|
+
# Installer logs
|
|
36
|
+
pip-log.txt
|
|
37
|
+
pip-delete-this-directory.txt
|
|
38
|
+
|
|
39
|
+
# Unit test / coverage reports
|
|
40
|
+
htmlcov/
|
|
41
|
+
.tox/
|
|
42
|
+
.nox/
|
|
43
|
+
.coverage
|
|
44
|
+
.coverage.*
|
|
45
|
+
.cache
|
|
46
|
+
nosetests.xml
|
|
47
|
+
coverage.xml
|
|
48
|
+
*.cover
|
|
49
|
+
*.py,cover
|
|
50
|
+
.hypothesis/
|
|
51
|
+
.pytest_cache/
|
|
52
|
+
cover/
|
|
53
|
+
|
|
54
|
+
# Translations
|
|
55
|
+
*.mo
|
|
56
|
+
*.pot
|
|
57
|
+
|
|
58
|
+
# Django stuff:
|
|
59
|
+
*.log
|
|
60
|
+
local_settings.py
|
|
61
|
+
db.sqlite3
|
|
62
|
+
db.sqlite3-journal
|
|
63
|
+
|
|
64
|
+
# Flask stuff:
|
|
65
|
+
instance/
|
|
66
|
+
.webassets-cache
|
|
67
|
+
|
|
68
|
+
# Scrapy stuff:
|
|
69
|
+
.scrapy
|
|
70
|
+
|
|
71
|
+
# Sphinx documentation
|
|
72
|
+
docs/_build/
|
|
73
|
+
|
|
74
|
+
# PyBuilder
|
|
75
|
+
.pybuilder/
|
|
76
|
+
target/
|
|
77
|
+
|
|
78
|
+
# Jupyter Notebook
|
|
79
|
+
.ipynb_checkpoints
|
|
80
|
+
|
|
81
|
+
# IPython
|
|
82
|
+
profile_default/
|
|
83
|
+
ipython_config.py
|
|
84
|
+
|
|
85
|
+
# pyenv
|
|
86
|
+
# For a library or package, you might want to ignore these files since the code is
|
|
87
|
+
# intended to run in multiple environments; otherwise, check them in:
|
|
88
|
+
# .python-version
|
|
89
|
+
|
|
90
|
+
# pipenv
|
|
91
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
92
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
93
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
94
|
+
# install all needed dependencies.
|
|
95
|
+
#Pipfile.lock
|
|
96
|
+
|
|
97
|
+
# UV
|
|
98
|
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
|
99
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
100
|
+
# commonly ignored for libraries.
|
|
101
|
+
#uv.lock
|
|
102
|
+
|
|
103
|
+
# poetry
|
|
104
|
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
|
105
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
106
|
+
# commonly ignored for libraries.
|
|
107
|
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
|
108
|
+
#poetry.lock
|
|
109
|
+
|
|
110
|
+
# pdm
|
|
111
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
|
112
|
+
#pdm.lock
|
|
113
|
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
|
114
|
+
# in version control.
|
|
115
|
+
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
|
|
116
|
+
.pdm.toml
|
|
117
|
+
.pdm-python
|
|
118
|
+
.pdm-build/
|
|
119
|
+
|
|
120
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
|
121
|
+
__pypackages__/
|
|
122
|
+
|
|
123
|
+
# Celery stuff
|
|
124
|
+
celerybeat-schedule
|
|
125
|
+
celerybeat.pid
|
|
126
|
+
|
|
127
|
+
# SageMath parsed files
|
|
128
|
+
*.sage.py
|
|
129
|
+
|
|
130
|
+
# Environments
|
|
131
|
+
.env
|
|
132
|
+
.venv
|
|
133
|
+
env/
|
|
134
|
+
venv/
|
|
135
|
+
ENV/
|
|
136
|
+
env.bak/
|
|
137
|
+
venv.bak/
|
|
138
|
+
|
|
139
|
+
# Spyder project settings
|
|
140
|
+
.spyderproject
|
|
141
|
+
.spyproject
|
|
142
|
+
|
|
143
|
+
# Rope project settings
|
|
144
|
+
.ropeproject
|
|
145
|
+
|
|
146
|
+
# mkdocs documentation
|
|
147
|
+
/site
|
|
148
|
+
|
|
149
|
+
# mypy
|
|
150
|
+
.mypy_cache/
|
|
151
|
+
.dmypy.json
|
|
152
|
+
dmypy.json
|
|
153
|
+
|
|
154
|
+
# Pyre type checker
|
|
155
|
+
.pyre/
|
|
156
|
+
|
|
157
|
+
# pytype static type analyzer
|
|
158
|
+
.pytype/
|
|
159
|
+
|
|
160
|
+
# Cython debug symbols
|
|
161
|
+
cython_debug/
|
|
162
|
+
|
|
163
|
+
# PyCharm
|
|
164
|
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
|
165
|
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
|
166
|
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
|
167
|
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
|
168
|
+
#.idea/
|
|
169
|
+
|
|
170
|
+
# Ruff stuff:
|
|
171
|
+
.ruff_cache/
|
|
172
|
+
|
|
173
|
+
# PyPI configuration file
|
|
174
|
+
.pypirc
|
|
175
|
+
|
|
176
|
+
# Sensitive data - credentials and secrets
|
|
177
|
+
.env.*
|
|
178
|
+
.env.local
|
|
179
|
+
.env.*.local
|
|
180
|
+
*.pem
|
|
181
|
+
*.key
|
|
182
|
+
*.p12
|
|
183
|
+
*.pfx
|
|
184
|
+
credentials.json
|
|
185
|
+
secrets.json
|
|
186
|
+
service-account*.json
|
|
187
|
+
*-credentials.json
|
|
188
|
+
.aws/
|
|
189
|
+
.ssh/
|
|
190
|
+
*.gpg
|
|
191
|
+
|
|
192
|
+
# API keys and tokens often in these
|
|
193
|
+
config.local.*
|
|
194
|
+
*.secret
|
|
195
|
+
.secrets/
|
|
196
|
+
|
|
197
|
+
# Logs (may contain sensitive data)
|
|
198
|
+
logs/
|
|
199
|
+
npm-debug.log*
|
|
200
|
+
yarn-debug.log*
|
|
201
|
+
yarn-error.log*
|
|
202
|
+
|
|
203
|
+
# Build artifacts and caches
|
|
204
|
+
.venv/
|
|
205
|
+
node_modules/
|
|
206
|
+
.npm/
|
|
207
|
+
|
|
208
|
+
# OS files
|
|
209
|
+
.DS_Store
|
|
210
|
+
Thumbs.db
|
|
211
|
+
|
|
212
|
+
# IDE
|
|
213
|
+
.idea/
|
|
214
|
+
.vscode/
|
|
215
|
+
*.swp
|
|
216
|
+
*.swo
|
|
217
|
+
|
|
218
|
+
# Claude Code
|
|
219
|
+
.claude-sandbox.json
|
|
220
|
+
*.code-workspace
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
# CLAUDE.md
|
|
2
|
+
|
|
3
|
+
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
|
4
|
+
|
|
5
|
+
## Project Overview
|
|
6
|
+
|
|
7
|
+
mbox-extractor is a command-line tool that recursively scans directories for `.mbox` email archive files and extracts all attachments. It's a single-file Python application with minimal dependencies.
|
|
8
|
+
|
|
9
|
+
## Installation & Setup
|
|
10
|
+
|
|
11
|
+
Install the tool using uv:
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
uv tool install .
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Running the Tool
|
|
18
|
+
|
|
19
|
+
Extract attachments from all `.mbox` files in a directory:
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
mbox-extractor /path/to/search
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
Test during development by running the main.py module directly:
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
python main.py /path/to/test/directory
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Architecture
|
|
32
|
+
|
|
33
|
+
This is a single-file application (`main.py`) with a simple processing pipeline:
|
|
34
|
+
|
|
35
|
+
1. **File Discovery** (`find_mbox_files`): Recursively walks directory tree to find `.mbox` files
|
|
36
|
+
2. **Attachment Extraction** (`extract_attachments`): For each `.mbox` file:
|
|
37
|
+
- Opens it as a mailbox using Python's `mailbox.mbox`
|
|
38
|
+
- Iterates through all messages with a progress bar
|
|
39
|
+
- Extracts attachments from each message
|
|
40
|
+
- Saves attachments with sanitized, unique filenames
|
|
41
|
+
3. **Filename Handling**:
|
|
42
|
+
- `sanitize_filename`: Removes illegal characters for safe filesystem names
|
|
43
|
+
- Uniqueness is ensured by appending an 8-character MD5 hash of the file content
|
|
44
|
+
|
|
45
|
+
**Key Design Points:**
|
|
46
|
+
- Each `.mbox` file's attachments are saved to a folder with the same name as the `.mbox` file (without extension)
|
|
47
|
+
- Filenames are made unique using content-based hashing (MD5) to prevent overwriting duplicate filenames
|
|
48
|
+
- Progress display uses `tqdm` for visual feedback on large mailboxes
|
|
49
|
+
|
|
50
|
+
## Dependencies
|
|
51
|
+
|
|
52
|
+
- **tqdm**: Progress bar display
|
|
53
|
+
- Python standard library: `mailbox`, `email`, `hashlib`, `argparse`, `os`, `re`
|
|
54
|
+
|
|
55
|
+
## Project Configuration
|
|
56
|
+
|
|
57
|
+
- Built with `hatchling` as the build backend
|
|
58
|
+
- Entry point: `mbox-extractor` command → `main:main`
|
|
59
|
+
- Python 3.12+ required (`requires-python = ">=3.12"` in `pyproject.toml`)
|
|
60
|
+
- The `main.py` file is packaged in the wheel distribution
|
|
61
|
+
|
|
62
|
+
## Important Notes
|
|
63
|
+
|
|
64
|
+
- README.md must be kept up to date with any significant project changes
|
|
65
|
+
- The tool uses `email.policy.default` for modern email parsing
|
|
66
|
+
- The `get_unique_filepath` function at main.py:16 is deprecated (MD5-based uniqueness replaced it)
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Tiago Silva
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: mbox-extractor
|
|
3
|
+
Version: 0.1.4
|
|
4
|
+
Summary: Recursively extract attachments from .mbox files
|
|
5
|
+
Project-URL: Homepage, https://github.com/tsilva/mbox-extractor
|
|
6
|
+
Project-URL: Repository, https://github.com/tsilva/mbox-extractor
|
|
7
|
+
Author-email: Tiago Silva <eng.tiago.silva@gmail.com>
|
|
8
|
+
License: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Topic :: Communications :: Email
|
|
14
|
+
Requires-Python: >=3.12
|
|
15
|
+
Requires-Dist: tqdm
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
|
|
18
|
+
<div align="center">
|
|
19
|
+
<img src="logo.png" alt="mbox-extractor" width="512"/>
|
|
20
|
+
|
|
21
|
+
# mbox-extractor
|
|
22
|
+
|
|
23
|
+
[](LICENSE)
|
|
24
|
+
[](https://python.org)
|
|
25
|
+
|
|
26
|
+
**📬 Recursively extract all attachments from .mbox email archives with a single command**
|
|
27
|
+
|
|
28
|
+
</div>
|
|
29
|
+
|
|
30
|
+
## Features
|
|
31
|
+
|
|
32
|
+
- **Recursive scanning** - Finds all `.mbox` files in any directory tree
|
|
33
|
+
- **Safe filenames** - Sanitizes attachment names, removing illegal characters
|
|
34
|
+
- **No duplicates** - Uses content-based hashing to prevent overwrites
|
|
35
|
+
- **Progress display** - Visual progress bar for large mailboxes
|
|
36
|
+
|
|
37
|
+
## Quick Start
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
uv tool install mbox-extractor
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
mbox-extractor /path/to/emails
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Installation
|
|
48
|
+
|
|
49
|
+
### Using uv (recommended)
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
uv tool install mbox-extractor
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### Using pip
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
pip install mbox-extractor
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### From source
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
git clone https://github.com/tsilva/mbox-extractor.git
|
|
65
|
+
cd mbox-extractor
|
|
66
|
+
uv tool install .
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Usage
|
|
70
|
+
|
|
71
|
+
Extract all attachments from `.mbox` files under a directory:
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
mbox-extractor /path/to/search
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
Attachments from each `.mbox` file are saved to a folder with the same name:
|
|
78
|
+
|
|
79
|
+
```
|
|
80
|
+
Found mbox: /emails/archive.mbox -> extracting to /emails/archive
|
|
81
|
+
Extracting archive.mbox: 100%|████████████████████| 500/500 [00:10<00:00, 48.5it/s]
|
|
82
|
+
Extracted 42 attachments to '/emails/archive'.
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
### How It Works
|
|
86
|
+
|
|
87
|
+
1. Recursively scans the given path for `.mbox` files
|
|
88
|
+
2. Opens each mailbox and iterates through all messages
|
|
89
|
+
3. Extracts attachments with sanitized, unique filenames
|
|
90
|
+
4. Saves them to a folder named after the source `.mbox` file
|
|
91
|
+
|
|
92
|
+
Filenames are made unique by appending an 8-character MD5 hash of the file content, preventing overwrites when multiple attachments share the same name.
|
|
93
|
+
|
|
94
|
+
## Requirements
|
|
95
|
+
|
|
96
|
+
- Python 3.12+
|
|
97
|
+
- tqdm (installed automatically)
|
|
98
|
+
|
|
99
|
+
## License
|
|
100
|
+
|
|
101
|
+
[MIT](LICENSE)
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
<img src="logo.png" alt="mbox-extractor" width="512"/>
|
|
3
|
+
|
|
4
|
+
# mbox-extractor
|
|
5
|
+
|
|
6
|
+
[](LICENSE)
|
|
7
|
+
[](https://python.org)
|
|
8
|
+
|
|
9
|
+
**📬 Recursively extract all attachments from .mbox email archives with a single command**
|
|
10
|
+
|
|
11
|
+
</div>
|
|
12
|
+
|
|
13
|
+
## Features
|
|
14
|
+
|
|
15
|
+
- **Recursive scanning** - Finds all `.mbox` files in any directory tree
|
|
16
|
+
- **Safe filenames** - Sanitizes attachment names, removing illegal characters
|
|
17
|
+
- **No duplicates** - Uses content-based hashing to prevent overwrites
|
|
18
|
+
- **Progress display** - Visual progress bar for large mailboxes
|
|
19
|
+
|
|
20
|
+
## Quick Start
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
uv tool install mbox-extractor
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
mbox-extractor /path/to/emails
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Installation
|
|
31
|
+
|
|
32
|
+
### Using uv (recommended)
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
uv tool install mbox-extractor
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
### Using pip
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
pip install mbox-extractor
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
### From source
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
git clone https://github.com/tsilva/mbox-extractor.git
|
|
48
|
+
cd mbox-extractor
|
|
49
|
+
uv tool install .
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Usage
|
|
53
|
+
|
|
54
|
+
Extract all attachments from `.mbox` files under a directory:
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
mbox-extractor /path/to/search
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
Attachments from each `.mbox` file are saved to a folder with the same name:
|
|
61
|
+
|
|
62
|
+
```
|
|
63
|
+
Found mbox: /emails/archive.mbox -> extracting to /emails/archive
|
|
64
|
+
Extracting archive.mbox: 100%|████████████████████| 500/500 [00:10<00:00, 48.5it/s]
|
|
65
|
+
Extracted 42 attachments to '/emails/archive'.
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### How It Works
|
|
69
|
+
|
|
70
|
+
1. Recursively scans the given path for `.mbox` files
|
|
71
|
+
2. Opens each mailbox and iterates through all messages
|
|
72
|
+
3. Extracts attachments with sanitized, unique filenames
|
|
73
|
+
4. Saves them to a folder named after the source `.mbox` file
|
|
74
|
+
|
|
75
|
+
Filenames are made unique by appending an 8-character MD5 hash of the file content, preventing overwrites when multiple attachments share the same name.
|
|
76
|
+
|
|
77
|
+
## Requirements
|
|
78
|
+
|
|
79
|
+
- Python 3.12+
|
|
80
|
+
- tqdm (installed automatically)
|
|
81
|
+
|
|
82
|
+
## License
|
|
83
|
+
|
|
84
|
+
[MIT](LICENSE)
|
|
Binary file
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
import mailbox
|
|
2
|
+
import os
|
|
3
|
+
import re
|
|
4
|
+
import hashlib
|
|
5
|
+
from email import policy
|
|
6
|
+
from email.parser import BytesParser
|
|
7
|
+
import argparse
|
|
8
|
+
from tqdm import tqdm
|
|
9
|
+
|
|
10
|
+
def sanitize_filename(filename):
    """Return a directory-free, filesystem-safe version of an attachment name.

    The name comes from an email header and is treated as untrusted input.

    Args:
        filename: Attachment file name as reported by the message part.

    Returns:
        The cleaned name, or ``"attachment"`` when nothing usable is left
        after cleaning (for example when the input was ``""`` or ``".."``).
    """
    # Path separators, characters rejected by common filesystems, and ASCII
    # control characters all become "_". Control characters matter because an
    # embedded NUL byte makes open() raise ValueError.
    filename = re.sub(r'[\\/*?:"<>|\x00-\x1f\x7f]', "_", filename)
    # Drop parent-directory markers as extra protection against traversal.
    filename = filename.replace("..", "")
    # A name consisting only of ".." sequences collapses to "", so fall back
    # to a generic stem instead of returning an empty file name.
    return os.path.basename(filename) or "attachment"
|
|
15
|
+
|
|
16
|
+
def get_unique_filepath(output_dir, filename):
    """Return ``filename`` joined onto ``output_dir``.

    Kept only as a pass-through: name collisions are now avoided by the
    MD5-based suffix added during extraction, so no uniqueness logic
    remains here.
    """
    joined_path = os.path.join(output_dir, filename)
    return joined_path
|
|
19
|
+
|
|
20
|
+
def find_mbox_files(root_path):
    """Yield paths to every .mbox file at or under ``root_path``.

    Args:
        root_path: A directory to search recursively, or the path of a single
            file. The ``.mbox`` suffix is matched case-insensitively.

    Yields:
        The path of each matching file. For a directory, paths are built by
        joining the walked directory with the file name; for a single file,
        ``root_path`` itself is yielded when it has the ``.mbox`` suffix.
        Nothing is yielded for a missing path or a non-mbox file.
    """
    # os.walk() produces nothing for a plain file, so a direct .mbox argument
    # has to be handled explicitly.
    if os.path.isfile(root_path):
        if os.fspath(root_path).lower().endswith('.mbox'):
            yield root_path
        return

    for dirpath, _, filenames in os.walk(root_path):
        for fname in filenames:
            if fname.lower().endswith('.mbox'):
                yield os.path.join(dirpath, fname)
|
|
26
|
+
|
|
27
|
+
def extract_attachments(mbox_file, output_dir):
    """Write every named attachment found in ``mbox_file`` into ``output_dir``.

    Each attachment is saved as ``<stem>_<digest><ext>``, where ``<digest>``
    is the first 8 hex characters of the MD5 of the decoded payload, so
    attachments that share a name but differ in content do not overwrite
    each other.

    Args:
        mbox_file: Path of the mbox archive to read.
        output_dir: Directory that receives the attachments; created if it
            does not exist.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    print(f"Starting extraction for: {mbox_file}")
    mbox = mailbox.mbox(mbox_file, factory=lambda f: BytesParser(policy=policy.default).parse(f))
    attachment_count = 0

    try:
        # Get total number of messages for tqdm progress bar
        total_msgs = len(mbox)
        for message in tqdm(mbox, total=total_msgs, desc=f"Extracting {os.path.basename(mbox_file)}"):
            for part in message.iter_attachments():
                filename = part.get_filename()
                if not filename:
                    continue

                payload = part.get_payload(decode=True)
                if payload is None:
                    # get_payload(decode=True) returns None for multipart
                    # parts (e.g. an attached message); there are no bytes
                    # to hash or write, so skip instead of crashing in md5().
                    continue

                clean_name = sanitize_filename(filename)
                # Append short MD5 digest to filename for uniqueness. The
                # hash is only a content fingerprint, not a security
                # measure, hence usedforsecurity=False.
                digest = hashlib.md5(payload, usedforsecurity=False).hexdigest()[:8]
                base, ext = os.path.splitext(clean_name)
                unique_name = f"{base}_{digest}{ext}"
                safe_path = os.path.join(output_dir, unique_name)

                # Save file
                with open(safe_path, 'wb') as f:
                    f.write(payload)
                attachment_count += 1
    finally:
        # Release the file handle held by the mailbox even if parsing fails.
        mbox.close()

    print(f"Extracted {attachment_count} attachments to '{output_dir}'.")
|
|
55
|
+
|
|
56
|
+
def main():
    """Command-line entry point: extract attachments from every .mbox found.

    Parses a single positional ``path`` argument, then extracts the
    attachments of each discovered mbox file into a sibling directory named
    after the mbox file without its extension. Exits through
    ``parser.error`` when the path does not exist, and prints a notice when
    no mbox files are found, so an empty run is never silent.
    """
    parser = argparse.ArgumentParser(
        description="Recursively extract attachments from all .mbox files under a given path."
    )
    parser.add_argument(
        "path",
        help="Root directory or file to search for .mbox files"
    )
    args = parser.parse_args()
    root_path = args.path

    # A mistyped path would otherwise produce no output at all.
    if not os.path.exists(root_path):
        parser.error(f"path does not exist: {root_path}")

    found_any = False
    for mbox_path in find_mbox_files(root_path):
        found_any = True
        # Output directory: the mbox path with its extension removed.
        mbox_dir = os.path.splitext(mbox_path)[0]
        print(f"Found mbox: {mbox_path} -> extracting to {mbox_dir}")
        extract_attachments(mbox_path, mbox_dir)

    if not found_any:
        print(f"No .mbox files found under {root_path}")


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "mbox-extractor"
|
|
7
|
+
version = "0.1.4"
|
|
8
|
+
description = "Recursively extract attachments from .mbox files"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = { text = "MIT" }
|
|
11
|
+
authors = [
|
|
12
|
+
{ name = "Tiago Silva", email = "eng.tiago.silva@gmail.com" }
|
|
13
|
+
]
|
|
14
|
+
dependencies = [
|
|
15
|
+
"tqdm"
|
|
16
|
+
]
|
|
17
|
+
requires-python = ">=3.12"
|
|
18
|
+
classifiers = [
|
|
19
|
+
"Programming Language :: Python :: 3",
|
|
20
|
+
"License :: OSI Approved :: MIT License",
|
|
21
|
+
"Operating System :: OS Independent",
|
|
22
|
+
"Topic :: Communications :: Email",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
[project.urls]
|
|
26
|
+
Homepage = "https://github.com/tsilva/mbox-extractor"
|
|
27
|
+
Repository = "https://github.com/tsilva/mbox-extractor"
|
|
28
|
+
|
|
29
|
+
[project.scripts]
|
|
30
|
+
mbox-extractor = "main:main"
|
|
31
|
+
|
|
32
|
+
[tool.hatch.build.targets.wheel]
|
|
33
|
+
include = ["main.py"]
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
version = 1
|
|
2
|
+
revision = 3
|
|
3
|
+
requires-python = ">=3.12"
|
|
4
|
+
|
|
5
|
+
[[package]]
|
|
6
|
+
name = "colorama"
|
|
7
|
+
version = "0.4.6"
|
|
8
|
+
source = { registry = "https://pypi.org/simple" }
|
|
9
|
+
sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" }
|
|
10
|
+
wheels = [
|
|
11
|
+
{ url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
|
|
12
|
+
]
|
|
13
|
+
|
|
14
|
+
[[package]]
|
|
15
|
+
name = "mbox-extractor"
|
|
16
|
+
version = "0.1.1"
|
|
17
|
+
source = { editable = "." }
|
|
18
|
+
dependencies = [
|
|
19
|
+
{ name = "tqdm" },
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
[package.metadata]
|
|
23
|
+
requires-dist = [{ name = "tqdm" }]
|
|
24
|
+
|
|
25
|
+
[[package]]
|
|
26
|
+
name = "tqdm"
|
|
27
|
+
version = "4.67.1"
|
|
28
|
+
source = { registry = "https://pypi.org/simple" }
|
|
29
|
+
dependencies = [
|
|
30
|
+
{ name = "colorama", marker = "sys_platform == 'win32'" },
|
|
31
|
+
]
|
|
32
|
+
sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737, upload-time = "2024-11-24T20:12:22.481Z" }
|
|
33
|
+
wheels = [
|
|
34
|
+
{ url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload-time = "2024-11-24T20:12:19.698Z" },
|
|
35
|
+
]
|