mail-parser 4.2.1__tar.gz → 4.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mail_parser-4.2.1 → mail_parser-4.4.0}/.github/workflows/main.yml +2 -1
- mail_parser-4.4.0/CLAUDE.md +97 -0
- mail_parser-4.2.1/README.md → mail_parser-4.4.0/PKG-INFO +61 -10
- mail_parser-4.2.1/PKG-INFO → mail_parser-4.4.0/README.md +34 -35
- {mail_parser-4.2.1 → mail_parser-4.4.0}/pyproject.toml +3 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/src/mailparser/core.py +38 -11
- {mail_parser-4.2.1 → mail_parser-4.4.0}/src/mailparser/utils.py +103 -10
- {mail_parser-4.2.1 → mail_parser-4.4.0}/src/mailparser/version.py +1 -1
- {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/test_mail_parser.py +148 -1
- {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/test_main.py +11 -4
- {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/test_utils.py +173 -18
- mail_parser-4.4.0/uv.lock +2529 -0
- mail_parser-4.2.1/uv.lock +0 -1322
- {mail_parser-4.2.1 → mail_parser-4.4.0}/.github/FUNDING.yml +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/.github/copilot-instructions.md +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/.github/instructions/containerization-docker-best-practices.instructions.md +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/.github/instructions/github-actions-ci-cd-best-practices.instructions.md +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/.github/instructions/markdown.instructions.md +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/.github/instructions/python.instructions.md +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/.gitignore +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/.markdownlint.json +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/.pre-commit-config.yaml +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/Dockerfile +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/LICENSE.txt +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/Makefile +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/NOTICE.txt +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/docker-compose.yml +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/docs/images/Bitcoin SpamScope.jpg +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/src/mailparser/__init__.py +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/src/mailparser/__main__.py +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/src/mailparser/const.py +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/src/mailparser/exceptions.py +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_malformed_1 +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_malformed_2 +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_malformed_3 +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_outlook_1 +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_test_1 +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_test_10 +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_test_11 +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_test_12 +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_test_13 +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_test_14 +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_test_15 +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_test_16 +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_test_17 +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_test_18 +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_test_19 +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_test_2 +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_test_3 +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_test_4 +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_test_5 +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_test_6 +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_test_7 +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_test_8 +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_test_9 +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/test_improved_received_patterns.py +0 -0
- {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/test_received_corpus.py +0 -0
|
@@ -29,7 +29,7 @@ jobs:
|
|
|
29
29
|
curl -LsSf https://astral.sh/uv/install.sh | sh
|
|
30
30
|
sudo apt-get -qq update
|
|
31
31
|
sudo apt-get install -y libemail-outlook-message-perl
|
|
32
|
-
uv sync
|
|
32
|
+
uv sync --all-extras
|
|
33
33
|
export PERL_MM_USE_DEFAULT=1
|
|
34
34
|
sudo cpan -f -i Email::Outlook::Message
|
|
35
35
|
|
|
@@ -41,6 +41,7 @@ jobs:
|
|
|
41
41
|
uv run mail-parser -v
|
|
42
42
|
uv run mail-parser -h
|
|
43
43
|
uv run mail-parser -f tests/mails/mail_malformed_3 -j
|
|
44
|
+
uv run mail-parser -f tests/mails/mail_outlook_1 -o -j
|
|
44
45
|
cat tests/mails/mail_malformed_3 | uv run mail-parser -k -j
|
|
45
46
|
|
|
46
47
|
- name: Run pre-commit
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
# CLAUDE.md
|
|
2
|
+
|
|
3
|
+
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
|
4
|
+
|
|
5
|
+
## Engineering Principles
|
|
6
|
+
|
|
7
|
+
- **DRY** — extract repeated logic; no copy-paste code.
|
|
8
|
+
- **KISS** — simplest solution that works; no clever tricks.
|
|
9
|
+
- **YAGNI** — no code for hypothetical future needs.
|
|
10
|
+
- **SRP** — one function/class, one responsibility.
|
|
11
|
+
- **Fail fast** — validate at boundaries; trust internal code.
|
|
12
|
+
- No dead code, commented-out blocks, or unused imports.
|
|
13
|
+
- Every third-party import must have an explicit entry in `pyproject.toml`. Use extras when the
|
|
14
|
+
runtime feature requires them (e.g. `elasticsearch[async]`).
|
|
15
|
+
|
|
16
|
+
## Python Style
|
|
17
|
+
|
|
18
|
+
- Follow PEP 8. Use 4 spaces, never tabs.
|
|
19
|
+
- Lines: 79 chars for code, 72 for comments/docstrings.
|
|
20
|
+
- `snake_case` for functions/variables, `CapWords` for classes, `UPPER_CASE` for constants.
|
|
21
|
+
- Imports order: stdlib → third-party → local.
|
|
22
|
+
- Add docstrings to all public functions, methods, classes, and modules (params, return values,
|
|
23
|
+
exceptions).
|
|
24
|
+
|
|
25
|
+
## Commands
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
uv sync # install all dependencies
|
|
29
|
+
uv run pytest # run all tests
|
|
30
|
+
uv run pytest tests/test_mail_parser.py::TestMailParser::test_name # single test
|
|
31
|
+
uv run ruff check . # lint
|
|
32
|
+
uv run ruff format . # format
|
|
33
|
+
make check # lint + tests
|
|
34
|
+
make build # clean + build wheel/sdist
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Architecture
|
|
38
|
+
|
|
39
|
+
**Package layout**: `src/mailparser/` (src layout, no runtime deps — stdlib only).
|
|
40
|
+
|
|
41
|
+
### Module responsibilities
|
|
42
|
+
|
|
43
|
+
| Module | Role |
|
|
44
|
+
| --------------- | --------------------------------------------------------------------- |
|
|
45
|
+
| `core.py` | `MailParser` class + `parse_from_*` factory functions |
|
|
46
|
+
| `utils.py` | Stateless parsing helpers (address, received headers, date, encoding) |
|
|
47
|
+
| `const.py` | Compiled regexes, header sets, clause splitter |
|
|
48
|
+
| `exceptions.py` | Custom exception hierarchy |
|
|
49
|
+
| `__main__.py` | CLI entry point (`mail-parser` command) |
|
|
50
|
+
| `__init__.py` | Re-exports public API |
|
|
51
|
+
|
|
52
|
+
### Key design patterns
|
|
53
|
+
|
|
54
|
+
**Dynamic attribute access via `__getattr__`**: `MailParser` exposes every header dynamically.
|
|
55
|
+
Three access modes are supported for any attribute `X`:
|
|
56
|
+
|
|
57
|
+
- `parser.X` → Python object (str or list)
|
|
58
|
+
- `parser.X_json` → JSON string
|
|
59
|
+
- `parser.X_raw` → raw JSON via `message.get_all()`
|
|
60
|
+
|
|
61
|
+
Address headers listed in `const.ADDRESSES_HEADERS` (`from`, `to`, `cc`, `bcc`, `reply-to`,
|
|
62
|
+
`delivered-to`) return `list[tuple[str, str]]` (display_name, email) instead of plain strings.
|
|
63
|
+
|
|
64
|
+
**RFC non-compliance fallback in `get_addresses()`**: Python's
|
|
65
|
+
`email.utils.getaddresses(strict=True)` (hardened against CVE-2023-27043) rejects headers where
|
|
66
|
+
the display name contains `@`. Since this is a forensics tool, `utils.py` applies
|
|
67
|
+
`_ADDR_FALLBACK_RE` whenever strict parsing returns empty results — always surfacing what is
|
|
68
|
+
actually in the header.
|
|
69
|
+
|
|
70
|
+
**Received header parsing**: `utils.receiveds_parsing()` tokenizes on RFC 5321 clause keywords
|
|
71
|
+
(`from`, `by`, `via`, `with`, `id`, `for`, `envelope-from`) using `const._CLAUSE_SPLITTER`.
|
|
72
|
+
Output list is ordered first-hop first. Unparseable headers fall back to `{"raw": ...}`.
|
|
73
|
+
|
|
74
|
+
**Defect detection**: During `parse()`, every MIME part is walked and `_append_defects()` records
|
|
75
|
+
RFC violations. `EPILOGUE_DEFECTS` triggers special epilogue extraction to recover hidden payloads
|
|
76
|
+
in malformed boundaries.
|
|
77
|
+
|
|
78
|
+
**Outlook support**: `parse_from_file_msg()` shells out to the system `msgconvert` Perl tool
|
|
79
|
+
(`libemail-outlook-message-perl`) to convert `.msg` → `.eml`, then parses normally.
|
|
80
|
+
|
|
81
|
+
**Partial vs full mail**: `_make_mail(complete=True)` includes all headers found in the message;
|
|
82
|
+
`complete=False` restricts to `const.ADDRESSES_HEADERS | const.OTHERS_PARTS` (the "main" headers).
|
|
83
|
+
Accessible as `parser.mail` / `parser.mail_partial`.
|
|
84
|
+
|
|
85
|
+
### Adding new headers
|
|
86
|
+
|
|
87
|
+
To make a header always appear in partial output, add its lowercase name to `OTHERS_PARTS` in
|
|
88
|
+
`const.py`. Address-type headers (returning parsed name/email tuples) go in `ADDRESSES_HEADERS`.
|
|
89
|
+
|
|
90
|
+
## Workflow
|
|
91
|
+
|
|
92
|
+
After every change:
|
|
93
|
+
|
|
94
|
+
1. Add/update unit tests to cover the change.
|
|
95
|
+
1. Update README.md if the change affects usage, API, or setup.
|
|
96
|
+
1. Stage changes and run pre-commit; fix all reported issues before proceeding.
|
|
97
|
+
1. Run full test suite; fix all failures before reporting done.
|
|
@@ -1,3 +1,30 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: mail-parser
|
|
3
|
+
Version: 4.4.0
|
|
4
|
+
Summary: A tool that parses emails by enhancing the Python standard library, extracting all details into a comprehensive object.
|
|
5
|
+
Author-email: Fedele Mantuano <mantuano.fedele@gmail.com>
|
|
6
|
+
Maintainer-email: Fedele Mantuano <mantuano.fedele@gmail.com>
|
|
7
|
+
License-Expression: Apache-2.0
|
|
8
|
+
License-File: LICENSE.txt
|
|
9
|
+
License-File: NOTICE.txt
|
|
10
|
+
Keywords: email,forensics,mail,malware,parser,phishing,security,spam,threat detection
|
|
11
|
+
Classifier: Natural Language :: English
|
|
12
|
+
Classifier: Operating System :: MacOS
|
|
13
|
+
Classifier: Operating System :: Microsoft :: Windows
|
|
14
|
+
Classifier: Operating System :: Unix
|
|
15
|
+
Classifier: Programming Language :: Python
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
23
|
+
Requires-Python: <3.15,>=3.9
|
|
24
|
+
Provides-Extra: outlook
|
|
25
|
+
Requires-Dist: extract-msg>=0.54; extra == 'outlook'
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
|
|
1
28
|
[](https://pypi.org/project/mail-parser/)
|
|
2
29
|
[](https://coveralls.io/github/SpamScope/mail-parser?branch=develop)
|
|
3
30
|
[](https://pypistats.org/packages/mail-parser)
|
|
@@ -36,20 +63,44 @@ formats, making it versatile for diverse email ecosystems.
|
|
|
36
63
|
**⚡ Production-Ready**: Trusted by security professionals and developers worldwide, with extensive
|
|
37
64
|
test coverage and proven reliability in high-stakes environments.
|
|
38
65
|
|
|
39
|
-
|
|
40
|
-
this functionality on Debian-based systems, simply install the required system package:
|
|
66
|
+
mail-parser is fully compatible with Python 3, ensuring modern performance and reliability.
|
|
41
67
|
|
|
42
|
-
|
|
43
|
-
apt-get install libemail-outlook-message-perl
|
|
44
|
-
```
|
|
68
|
+
## Parsing Outlook `.msg` files
|
|
45
69
|
|
|
46
|
-
|
|
70
|
+
mail-parser converts Outlook `.msg` files to standard `.eml` before parsing.
|
|
71
|
+
Two conversion backends are supported:
|
|
47
72
|
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
```
|
|
73
|
+
1. **`extract-msg` (recommended, pure Python).** No external tools required.
|
|
74
|
+
Install the optional extra:
|
|
51
75
|
|
|
52
|
-
|
|
76
|
+
```bash
|
|
77
|
+
pip install mail-parser[outlook]
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
1. **`msgconvert` (deprecated, external Perl tool).** Requires the
|
|
81
|
+
`libemail-outlook-message-perl` system package:
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
apt-get install libemail-outlook-message-perl # Debian-based systems
|
|
85
|
+
apt-cache show libemail-outlook-message-perl # package details
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
**Backend precedence:** when `extract-msg` is installed it is used first.
|
|
89
|
+
Only when it is *not* available does mail-parser fall back to the `msgconvert`
|
|
90
|
+
external tool, logging a deprecation warning. If neither backend is available,
|
|
91
|
+
`parse_from_file_msg()` raises `MailParserOSError` telling you to install
|
|
92
|
+
either backend.
|
|
93
|
+
|
|
94
|
+
> **⚠️ Deprecated:** the `msgconvert` external-tool backend is deprecated and
|
|
95
|
+
> will be removed in a future release. Migrate to the pure-Python backend with
|
|
96
|
+
> `pip install mail-parser[outlook]`.
|
|
97
|
+
|
|
98
|
+
**💥 BREAKING CHANGE:** the default `.msg` conversion backend changed.
|
|
99
|
+
When `extract-msg` is installed it is now preferred over `msgconvert`. The two
|
|
100
|
+
converters produce different intermediate `.eml` output, so some parsed fields
|
|
101
|
+
(header ordering, encoding edge cases, attachment naming) can differ from the
|
|
102
|
+
previous `msgconvert`-only behavior. Downstream code asserting on exact
|
|
103
|
+
`.msg`-derived output may need updating.
|
|
53
104
|
|
|
54
105
|
# Apache 2 Open Source License
|
|
55
106
|
|
|
@@ -1,28 +1,3 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: mail-parser
|
|
3
|
-
Version: 4.2.1
|
|
4
|
-
Summary: A tool that parses emails by enhancing the Python standard library, extracting all details into a comprehensive object.
|
|
5
|
-
Author-email: Fedele Mantuano <mantuano.fedele@gmail.com>
|
|
6
|
-
Maintainer-email: Fedele Mantuano <mantuano.fedele@gmail.com>
|
|
7
|
-
License-Expression: Apache-2.0
|
|
8
|
-
License-File: LICENSE.txt
|
|
9
|
-
License-File: NOTICE.txt
|
|
10
|
-
Keywords: email,forensics,mail,malware,parser,phishing,security,spam,threat detection
|
|
11
|
-
Classifier: Natural Language :: English
|
|
12
|
-
Classifier: Operating System :: MacOS
|
|
13
|
-
Classifier: Operating System :: Microsoft :: Windows
|
|
14
|
-
Classifier: Operating System :: Unix
|
|
15
|
-
Classifier: Programming Language :: Python
|
|
16
|
-
Classifier: Programming Language :: Python :: 3
|
|
17
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
18
|
-
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
-
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
-
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
-
Classifier: Programming Language :: Python :: 3.13
|
|
22
|
-
Classifier: Programming Language :: Python :: 3.14
|
|
23
|
-
Requires-Python: <3.15,>=3.9
|
|
24
|
-
Description-Content-Type: text/markdown
|
|
25
|
-
|
|
26
1
|
[](https://pypi.org/project/mail-parser/)
|
|
27
2
|
[](https://coveralls.io/github/SpamScope/mail-parser?branch=develop)
|
|
28
3
|
[](https://pypistats.org/packages/mail-parser)
|
|
@@ -61,20 +36,44 @@ formats, making it versatile for diverse email ecosystems.
|
|
|
61
36
|
**⚡ Production-Ready**: Trusted by security professionals and developers worldwide, with extensive
|
|
62
37
|
test coverage and proven reliability in high-stakes environments.
|
|
63
38
|
|
|
64
|
-
|
|
65
|
-
this functionality on Debian-based systems, simply install the required system package:
|
|
39
|
+
mail-parser is fully compatible with Python 3, ensuring modern performance and reliability.
|
|
66
40
|
|
|
67
|
-
|
|
68
|
-
apt-get install libemail-outlook-message-perl
|
|
69
|
-
```
|
|
41
|
+
## Parsing Outlook `.msg` files
|
|
70
42
|
|
|
71
|
-
|
|
43
|
+
mail-parser converts Outlook `.msg` files to standard `.eml` before parsing.
|
|
44
|
+
Two conversion backends are supported:
|
|
72
45
|
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
```
|
|
46
|
+
1. **`extract-msg` (recommended, pure Python).** No external tools required.
|
|
47
|
+
Install the optional extra:
|
|
76
48
|
|
|
77
|
-
|
|
49
|
+
```bash
|
|
50
|
+
pip install mail-parser[outlook]
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
1. **`msgconvert` (deprecated, external Perl tool).** Requires the
|
|
54
|
+
`libemail-outlook-message-perl` system package:
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
apt-get install libemail-outlook-message-perl # Debian-based systems
|
|
58
|
+
apt-cache show libemail-outlook-message-perl # package details
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
**Backend precedence:** when `extract-msg` is installed it is used first.
|
|
62
|
+
Only when it is *not* available does mail-parser fall back to the `msgconvert`
|
|
63
|
+
external tool, logging a deprecation warning. If neither backend is available,
|
|
64
|
+
`parse_from_file_msg()` raises `MailParserOSError` telling you to install
|
|
65
|
+
either backend.
|
|
66
|
+
|
|
67
|
+
> **⚠️ Deprecated:** the `msgconvert` external-tool backend is deprecated and
|
|
68
|
+
> will be removed in a future release. Migrate to the pure-Python backend with
|
|
69
|
+
> `pip install mail-parser[outlook]`.
|
|
70
|
+
|
|
71
|
+
**💥 BREAKING CHANGE:** the default `.msg` conversion backend changed.
|
|
72
|
+
When `extract-msg` is installed it is now preferred over `msgconvert`. The two
|
|
73
|
+
converters produce different intermediate `.eml` output, so some parsed fields
|
|
74
|
+
(header ordering, encoding edge cases, attachment naming) can differ from the
|
|
75
|
+
previous `msgconvert`-only behavior. Downstream code asserting on exact
|
|
76
|
+
`.msg`-derived output may need updating.
|
|
78
77
|
|
|
79
78
|
# Apache 2 Open Source License
|
|
80
79
|
|
|
@@ -18,6 +18,7 @@ limitations under the License.
|
|
|
18
18
|
|
|
19
19
|
import base64
|
|
20
20
|
import email
|
|
21
|
+
import importlib.util
|
|
21
22
|
import ipaddress
|
|
22
23
|
import json
|
|
23
24
|
import logging
|
|
@@ -27,6 +28,7 @@ from mailparser.const import ADDRESSES_HEADERS, EPILOGUE_DEFECTS, REGXIP, REGXIP
|
|
|
27
28
|
from mailparser.utils import (
|
|
28
29
|
convert_mail_date,
|
|
29
30
|
decode_header_part,
|
|
31
|
+
extract_msg_convert,
|
|
30
32
|
find_between,
|
|
31
33
|
get_addresses,
|
|
32
34
|
get_header,
|
|
@@ -186,14 +188,32 @@ class MailParser:
|
|
|
186
188
|
Init a new object from an Outlook message file,
|
|
187
189
|
mime type: application/vnd.ms-outlook
|
|
188
190
|
|
|
191
|
+
Conversion backend precedence:
|
|
192
|
+
1. ``extract-msg`` (pure Python, optional ``outlook`` extra).
|
|
193
|
+
2. ``msgconvert`` external Perl tool — **deprecated** fallback,
|
|
194
|
+
used only when ``extract-msg`` is not installed.
|
|
195
|
+
|
|
189
196
|
Args:
|
|
190
197
|
fp (string): file path of raw Outlook email
|
|
191
198
|
|
|
192
199
|
Returns:
|
|
193
200
|
Instance of MailParser
|
|
201
|
+
|
|
202
|
+
Raises:
|
|
203
|
+
MailParserOSError: if no conversion backend is available
|
|
194
204
|
"""
|
|
195
205
|
log.debug("Parsing email from file Outlook")
|
|
196
|
-
|
|
206
|
+
|
|
207
|
+
if importlib.util.find_spec("extract_msg") is not None:
|
|
208
|
+
f, _ = extract_msg_convert(fp)
|
|
209
|
+
else:
|
|
210
|
+
log.warning(
|
|
211
|
+
"msgconvert backend is deprecated and will be removed "
|
|
212
|
+
"in a future release. Install the pure-Python Outlook "
|
|
213
|
+
"support with 'pip install mail-parser[outlook]'."
|
|
214
|
+
)
|
|
215
|
+
f, _ = msgconvert(fp)
|
|
216
|
+
|
|
197
217
|
return cls.from_file(f, True)
|
|
198
218
|
|
|
199
219
|
@classmethod
|
|
@@ -429,22 +449,29 @@ class MailParser:
|
|
|
429
449
|
else:
|
|
430
450
|
log.debug(f"Email part {i!r} is not an attachment")
|
|
431
451
|
|
|
432
|
-
# Get the payload using get_payload method with decode=True
|
|
433
|
-
# As Python truly decodes only 'base64',
|
|
434
|
-
# 'quoted-printable', 'x-uuencode',
|
|
435
|
-
# 'uuencode', 'uue', 'x-uue'
|
|
436
|
-
# And for other encodings it breaks the characters so
|
|
437
|
-
# we need to decode them with encoding python is applying
|
|
438
|
-
# To maintain the characters
|
|
439
452
|
payload = p.get_payload(decode=True)
|
|
440
453
|
cte = p.get("Content-Transfer-Encoding")
|
|
441
454
|
if cte:
|
|
442
455
|
cte = cte.lower()
|
|
443
456
|
|
|
444
457
|
if not cte or cte in ["7bit", "8bit"]:
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
458
|
+
# message_from_bytes stores non-ASCII body bytes via
|
|
459
|
+
# ascii+surrogateescape, producing surrogates in the
|
|
460
|
+
# payload string. message_from_string stores a proper
|
|
461
|
+
# Unicode str (no surrogates). Detect which case we
|
|
462
|
+
# have via get_payload(decode=False) and decode
|
|
463
|
+
# accordingly so the declared charset is honoured.
|
|
464
|
+
raw_str = p.get_payload(decode=False)
|
|
465
|
+
if isinstance(raw_str, str):
|
|
466
|
+
try:
|
|
467
|
+
# Raises if surrogates present (from_bytes path)
|
|
468
|
+
raw_str.encode("utf-8")
|
|
469
|
+
payload = raw_str
|
|
470
|
+
except UnicodeEncodeError:
|
|
471
|
+
# Recover original bytes then decode with charset
|
|
472
|
+
orig_bytes = raw_str.encode("ascii", "surrogateescape")
|
|
473
|
+
payload = ported_string(orig_bytes, encoding=charset)
|
|
474
|
+
else:
|
|
448
475
|
payload = ported_string(payload, encoding=charset)
|
|
449
476
|
else:
|
|
450
477
|
payload = ported_string(payload, encoding=charset)
|
|
@@ -16,6 +16,8 @@ See the License for the specific language governing permissions and
|
|
|
16
16
|
limitations under the License.
|
|
17
17
|
"""
|
|
18
18
|
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
19
21
|
import base64
|
|
20
22
|
import datetime
|
|
21
23
|
import email
|
|
@@ -75,7 +77,9 @@ _ADDR_FALLBACK_RE = re.compile(
|
|
|
75
77
|
)
|
|
76
78
|
|
|
77
79
|
|
|
78
|
-
def get_addresses(
|
|
80
|
+
def get_addresses(
|
|
81
|
+
raw_header: str | email.header.Header | None,
|
|
82
|
+
) -> list[tuple[str, str]]:
|
|
79
83
|
"""
|
|
80
84
|
Parse email addresses from a raw address header with a fallback for
|
|
81
85
|
RFC-non-compliant but real-world-common formats.
|
|
@@ -100,13 +104,36 @@ def get_addresses(raw_header):
|
|
|
100
104
|
that was actually present in the header.
|
|
101
105
|
|
|
102
106
|
Args:
|
|
103
|
-
raw_header (str): raw value of an
|
|
104
|
-
(e.g. ``From``, ``To``, ``CC`` …)
|
|
107
|
+
raw_header (str | email.header.Header | None): raw value of an
|
|
108
|
+
address header (e.g. ``From``, ``To``, ``CC`` …). Accepts a
|
|
109
|
+
plain ``str``, an ``email.header.Header`` instance (returned
|
|
110
|
+
by ``email.message.Message.get`` for headers containing
|
|
111
|
+
RFC 2047 encoded-words such as non-ASCII display names), or
|
|
112
|
+
``None``.
|
|
105
113
|
|
|
106
114
|
Returns:
|
|
107
115
|
list[tuple[str, str]]: list of ``(display_name, email_addr)`` tuples.
|
|
108
116
|
``display_name`` is an empty string when absent.
|
|
109
117
|
"""
|
|
118
|
+
# ``Message.get(name)`` returns an ``email.header.Header`` for any header
|
|
119
|
+
# whose value contains RFC 2047 encoded-words (typical for non-ASCII
|
|
120
|
+
# display names like ``=?utf-8?q?=C3=81rp=C3=A1d?=``). ``Header`` does
|
|
121
|
+
# not implement string methods such as ``.strip()`` and is not a valid
|
|
122
|
+
# input to ``email.utils.getaddresses``.
|
|
123
|
+
#
|
|
124
|
+
# Important: decode ``Header`` values into a plain parseable string first.
|
|
125
|
+
# In practice, strict address parsing can treat raw encoded-word tokens like
|
|
126
|
+
# ``=?unknown-8bit?...?=`` as the *address* itself, producing output such as
|
|
127
|
+
# ``To: =?unknown-8bit?...?=``. Decoding first gives
|
|
128
|
+
# ``Álpám Longsom <recipient@example.com>`` so getaddresses() can split
|
|
129
|
+
# name/address correctly.
|
|
130
|
+
if raw_header is None:
|
|
131
|
+
return []
|
|
132
|
+
if isinstance(raw_header, email.header.Header):
|
|
133
|
+
raw_header = decode_header_part(raw_header.encode())
|
|
134
|
+
elif not isinstance(raw_header, str):
|
|
135
|
+
raw_header = str(raw_header)
|
|
136
|
+
|
|
110
137
|
parsed = email.utils.getaddresses([raw_header], strict=True)
|
|
111
138
|
|
|
112
139
|
# If every result from the strict parser has an empty address — while the
|
|
@@ -119,7 +146,7 @@ def get_addresses(raw_header):
|
|
|
119
146
|
results.append((m.group(1).strip(), m.group(2).strip()))
|
|
120
147
|
elif m.group(4): # Any Name <email> (incl. email-as-display-name)
|
|
121
148
|
results.append((m.group(3).strip(), m.group(4).strip()))
|
|
122
|
-
elif m.group(5): # bare email
|
|
149
|
+
elif m.group(5): # bare email # pragma: no branch
|
|
123
150
|
results.append(("", m.group(5).strip()))
|
|
124
151
|
if results:
|
|
125
152
|
log.debug(
|
|
@@ -291,6 +318,66 @@ def fingerprints(data):
|
|
|
291
318
|
return hashes(md5, sha1, sha256, sha512)
|
|
292
319
|
|
|
293
320
|
|
|
321
|
+
def _new_outlook_tempfile():
|
|
322
|
+
"""
|
|
323
|
+
Create an empty temporary file to hold a converted Outlook email.
|
|
324
|
+
|
|
325
|
+
The OS-level file handle is closed immediately; callers write to the
|
|
326
|
+
returned path with their own handle (a subprocess ``--outfile`` for
|
|
327
|
+
``msgconvert`` or a plain ``open`` for the pure-Python backend).
|
|
328
|
+
|
|
329
|
+
Returns:
|
|
330
|
+
str: path of the new temporary ``.eml`` file
|
|
331
|
+
"""
|
|
332
|
+
handle, path = tempfile.mkstemp(prefix="outlook_")
|
|
333
|
+
os.close(handle)
|
|
334
|
+
return path
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
def extract_msg_convert(fp):
|
|
338
|
+
"""
|
|
339
|
+
Convert an Outlook ``.msg`` file to ``.eml`` using the pure-Python
|
|
340
|
+
``extract-msg`` library (no external Perl tool required).
|
|
341
|
+
|
|
342
|
+
The ``extract_msg`` import is performed lazily inside this function so
|
|
343
|
+
that the package keeps importing with zero runtime dependencies when
|
|
344
|
+
the optional ``outlook`` extra is not installed.
|
|
345
|
+
|
|
346
|
+
Args:
|
|
347
|
+
fp (string): file path of the Outlook ``.msg`` mail
|
|
348
|
+
|
|
349
|
+
Returns:
|
|
350
|
+
tuple: ``(eml_path, info)`` where ``eml_path`` is the path of the
|
|
351
|
+
converted ``.eml`` file and ``info`` is a short descriptive string
|
|
352
|
+
|
|
353
|
+
Raises:
|
|
354
|
+
ImportError: if the ``extract-msg`` library is not installed
|
|
355
|
+
MailParserOSError: if the ``.msg`` is not a convertible email
|
|
356
|
+
message (e.g. a contact or calendar item)
|
|
357
|
+
"""
|
|
358
|
+
import extract_msg # lazy: keep package import stdlib-only
|
|
359
|
+
|
|
360
|
+
log.debug("Started converting Outlook email with extract-msg")
|
|
361
|
+
msg = extract_msg.openMsg(fp)
|
|
362
|
+
try:
|
|
363
|
+
# openMsg() may return a non-email MSGFile (contact, calendar,
|
|
364
|
+
# task...) which cannot be rendered as an email message.
|
|
365
|
+
as_email_message = getattr(msg, "asEmailMessage", None)
|
|
366
|
+
if as_email_message is None:
|
|
367
|
+
raise MailParserOSError(
|
|
368
|
+
f"Outlook file {fp!r} is not a convertible email "
|
|
369
|
+
f"message (type {type(msg).__name__})"
|
|
370
|
+
)
|
|
371
|
+
eml = as_email_message()
|
|
372
|
+
info = f"{eml.get('From', '')} | {eml.get('Subject', '')}".strip()
|
|
373
|
+
temp = _new_outlook_tempfile()
|
|
374
|
+
with open(temp, "wb") as f:
|
|
375
|
+
f.write(eml.as_bytes())
|
|
376
|
+
return temp, info
|
|
377
|
+
finally:
|
|
378
|
+
msg.close()
|
|
379
|
+
|
|
380
|
+
|
|
294
381
|
def msgconvert(email):
|
|
295
382
|
"""
|
|
296
383
|
Exec msgconvert tool, to convert msg Outlook
|
|
@@ -302,9 +389,12 @@ def msgconvert(email):
|
|
|
302
389
|
Returns:
|
|
303
390
|
tuple with file path of mail converted and
|
|
304
391
|
standard output data (str)
|
|
392
|
+
|
|
393
|
+
Raises:
|
|
394
|
+
MailParserOSError: if the ``msgconvert`` tool is not installed
|
|
305
395
|
"""
|
|
306
396
|
log.debug("Started converting Outlook email")
|
|
307
|
-
|
|
397
|
+
temp = _new_outlook_tempfile()
|
|
308
398
|
command = ["msgconvert", "--outfile", temp, email]
|
|
309
399
|
|
|
310
400
|
try:
|
|
@@ -316,7 +406,13 @@ def msgconvert(email):
|
|
|
316
406
|
)
|
|
317
407
|
|
|
318
408
|
except OSError as e:
|
|
319
|
-
message =
|
|
409
|
+
message = (
|
|
410
|
+
"Cannot convert Outlook .msg: no conversion backend "
|
|
411
|
+
"available. Install pure-Python support with "
|
|
412
|
+
"'pip install mail-parser[outlook]', or install the "
|
|
413
|
+
"'msgconvert' Perl tool "
|
|
414
|
+
f"(libemail-outlook-message-perl). {e!r}"
|
|
415
|
+
)
|
|
320
416
|
log.exception(message)
|
|
321
417
|
raise MailParserOSError(message)
|
|
322
418
|
|
|
@@ -324,9 +420,6 @@ def msgconvert(email):
|
|
|
324
420
|
stdoutdata, _ = out.communicate()
|
|
325
421
|
return temp, stdoutdata.decode("utf-8").strip()
|
|
326
422
|
|
|
327
|
-
finally:
|
|
328
|
-
os.close(temph)
|
|
329
|
-
|
|
330
423
|
|
|
331
424
|
def parse_received(received):
|
|
332
425
|
"""
|
|
@@ -447,7 +540,7 @@ def receiveds_parsing(receiveds):
|
|
|
447
540
|
|
|
448
541
|
log.debug("len(receiveds) %s, len(parsed) %s" % (len(receiveds), len(parsed)))
|
|
449
542
|
|
|
450
|
-
if len(receiveds) != len(parsed):
|
|
543
|
+
if len(receiveds) != len(parsed): # pragma: no cover
|
|
451
544
|
# something really bad happened,
|
|
452
545
|
# so just return raw receiveds with hop indices
|
|
453
546
|
log.error(
|