mail-parser 4.2.0__tar.gz → 4.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mail_parser-4.3.0/CLAUDE.md +103 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/PKG-INFO +1 -1
- {mail_parser-4.2.0 → mail_parser-4.3.0}/src/mailparser/core.py +30 -15
- {mail_parser-4.2.0 → mail_parser-4.3.0}/src/mailparser/utils.py +114 -1
- {mail_parser-4.2.0 → mail_parser-4.3.0}/src/mailparser/version.py +1 -1
- mail_parser-4.3.0/tests/mails/mail_test_19 +12 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/tests/test_mail_parser.py +217 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/tests/test_utils.py +186 -18
- {mail_parser-4.2.0 → mail_parser-4.3.0}/.github/FUNDING.yml +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/.github/copilot-instructions.md +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/.github/instructions/containerization-docker-best-practices.instructions.md +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/.github/instructions/github-actions-ci-cd-best-practices.instructions.md +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/.github/instructions/markdown.instructions.md +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/.github/instructions/python.instructions.md +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/.github/workflows/main.yml +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/.gitignore +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/.markdownlint.json +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/.pre-commit-config.yaml +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/Dockerfile +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/LICENSE.txt +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/Makefile +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/NOTICE.txt +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/README.md +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/docker-compose.yml +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/docs/images/Bitcoin SpamScope.jpg +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/pyproject.toml +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/src/mailparser/__init__.py +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/src/mailparser/__main__.py +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/src/mailparser/const.py +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/src/mailparser/exceptions.py +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/tests/mails/mail_malformed_1 +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/tests/mails/mail_malformed_2 +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/tests/mails/mail_malformed_3 +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/tests/mails/mail_outlook_1 +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/tests/mails/mail_test_1 +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/tests/mails/mail_test_10 +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/tests/mails/mail_test_11 +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/tests/mails/mail_test_12 +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/tests/mails/mail_test_13 +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/tests/mails/mail_test_14 +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/tests/mails/mail_test_15 +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/tests/mails/mail_test_16 +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/tests/mails/mail_test_17 +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/tests/mails/mail_test_18 +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/tests/mails/mail_test_2 +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/tests/mails/mail_test_3 +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/tests/mails/mail_test_4 +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/tests/mails/mail_test_5 +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/tests/mails/mail_test_6 +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/tests/mails/mail_test_7 +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/tests/mails/mail_test_8 +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/tests/mails/mail_test_9 +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/tests/test_improved_received_patterns.py +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/tests/test_main.py +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/tests/test_received_corpus.py +0 -0
- {mail_parser-4.2.0 → mail_parser-4.3.0}/uv.lock +0 -0
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
# CLAUDE.md
|
|
2
|
+
|
|
3
|
+
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
|
4
|
+
|
|
5
|
+
## Engineering Principles
|
|
6
|
+
|
|
7
|
+
- **DRY** — extract repeated logic; no copy-paste code.
|
|
8
|
+
- **KISS** — simplest solution that works; no clever tricks.
|
|
9
|
+
- **YAGNI** — no code for hypothetical future needs.
|
|
10
|
+
- **SRP** — one function/class, one responsibility.
|
|
11
|
+
- **Fail fast** — validate at boundaries; trust internal code.
|
|
12
|
+
- No dead code, commented-out blocks, or unused imports.
|
|
13
|
+
- Every third-party import must have an explicit entry in `pyproject.toml`. Use extras when the
|
|
14
|
+
runtime feature requires them (e.g. `elasticsearch[async]`).
|
|
15
|
+
|
|
16
|
+
## Python Style
|
|
17
|
+
|
|
18
|
+
- Follow PEP 8. Use 4 spaces, never tabs.
|
|
19
|
+
- Lines: 79 chars for code, 72 for comments/docstrings.
|
|
20
|
+
- `snake_case` for functions/variables, `CapWords` for classes, `UPPER_CASE` for constants.
|
|
21
|
+
- Imports order: stdlib → third-party → local.
|
|
22
|
+
- Add docstrings to all public functions, methods, classes, and modules (params, return values,
|
|
23
|
+
exceptions).
|
|
24
|
+
|
|
25
|
+
## Commands
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
uv sync # install all dependencies
|
|
29
|
+
uv run pytest # run all tests
|
|
30
|
+
uv run pytest tests/test_mail_parser.py::TestMailParser::test_name # single test
|
|
31
|
+
uv run ruff check . # lint
|
|
32
|
+
uv run ruff format . # format
|
|
33
|
+
make check # lint + tests
|
|
34
|
+
make build # clean + build wheel/sdist
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Architecture
|
|
38
|
+
|
|
39
|
+
**Package layout**: `src/mailparser/` (src layout, no runtime deps — stdlib only).
|
|
40
|
+
|
|
41
|
+
### Module responsibilities
|
|
42
|
+
|
|
43
|
+
| Module | Role |
|
|
44
|
+
| --------------- | --------------------------------------------------------------------- |
|
|
45
|
+
| `core.py` | `MailParser` class + `parse_from_*` factory functions |
|
|
46
|
+
| `utils.py` | Stateless parsing helpers (address, received headers, date, encoding) |
|
|
47
|
+
| `const.py` | Compiled regexes, header sets, clause splitter |
|
|
48
|
+
| `exceptions.py` | Custom exception hierarchy |
|
|
49
|
+
| `__main__.py` | CLI entry point (`mail-parser` command) |
|
|
50
|
+
| `__init__.py` | Re-exports public API |
|
|
51
|
+
|
|
52
|
+
### Key design patterns
|
|
53
|
+
|
|
54
|
+
**Dynamic attribute access via `__getattr__`**: `MailParser` exposes every header dynamically.
|
|
55
|
+
Three access modes are supported for any attribute `X`:
|
|
56
|
+
|
|
57
|
+
- `parser.X` → Python object (str or list)
|
|
58
|
+
- `parser.X_json` → JSON string
|
|
59
|
+
- `parser.X_raw` → raw JSON via `message.get_all()`
|
|
60
|
+
|
|
61
|
+
Address headers listed in `const.ADDRESSES_HEADERS` (`from`, `to`, `cc`, `bcc`, `reply-to`,
|
|
62
|
+
`delivered-to`) return `list[tuple[str, str]]` (display_name, email) instead of plain strings.
|
|
63
|
+
|
|
64
|
+
**RFC non-compliance fallback in `get_addresses()`**: Python's
|
|
65
|
+
`email.utils.getaddresses(strict=True)` (hardened against CVE-2023-27043) rejects headers where
|
|
66
|
+
the display name contains `@`. Since this is a forensics tool, `utils.py` applies
|
|
67
|
+
`_ADDR_FALLBACK_RE` whenever strict parsing returns empty results — always surfacing what is
|
|
68
|
+
actually in the header.
|
|
69
|
+
|
|
70
|
+
**Received header parsing**: `utils.receiveds_parsing()` tokenizes on RFC 5321 clause keywords
|
|
71
|
+
(`from`, `by`, `via`, `with`, `id`, `for`, `envelope-from`) using `const._CLAUSE_SPLITTER`.
|
|
72
|
+
Output list is ordered first-hop first. Unparseable headers fall back to `{"raw": ...}`.
|
|
73
|
+
|
|
74
|
+
**Defect detection**: During `parse()`, every MIME part is walked and `_append_defects()` records
|
|
75
|
+
RFC violations. `EPILOGUE_DEFECTS` triggers special epilogue extraction to recover hidden payloads
|
|
76
|
+
in malformed boundaries.
|
|
77
|
+
|
|
78
|
+
**Outlook support**: `parse_from_file_msg()` shells out to the system `msgconvert` Perl tool
|
|
79
|
+
(`libemail-outlook-message-perl`) to convert `.msg` → `.eml`, then parses normally.
|
|
80
|
+
|
|
81
|
+
**Partial vs full mail**: `_make_mail(complete=True)` includes all headers found in the message;
|
|
82
|
+
`complete=False` restricts to `const.ADDRESSES_HEADERS | const.OTHERS_PARTS` (the "main" headers).
|
|
83
|
+
Accessible as `parser.mail` / `parser.mail_partial`.
|
|
84
|
+
|
|
85
|
+
### Adding new headers
|
|
86
|
+
|
|
87
|
+
To make a header always appear in partial output, add its lowercase name to `OTHERS_PARTS` in
|
|
88
|
+
`const.py`. Address-type headers (returning parsed name/email tuples) go in `ADDRESSES_HEADERS`.
|
|
89
|
+
|
|
90
|
+
## Workflow
|
|
91
|
+
|
|
92
|
+
After every change:
|
|
93
|
+
|
|
94
|
+
1. Add/update unittests to cover the change.
|
|
95
|
+
1. Update README.md if the change affects usage, API, or setup.
|
|
96
|
+
1. Stage changes and run pre-commit; fix all reported issues before proceeding.
|
|
97
|
+
1. Run full test suite; fix all failures before reporting done.
|
|
98
|
+
|
|
99
|
+
### Test fixtures
|
|
100
|
+
|
|
101
|
+
Raw email files in `tests/mails/` are the fixtures. `mail_malformed_*` files exercise defect
|
|
102
|
+
detection; `mail_outlook_*` require `msgconvert` installed. Tests that need the Outlook tool should
|
|
103
|
+
be marked `@pytest.mark.integration`.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mail-parser
|
|
3
|
-
Version: 4.2.0
|
|
3
|
+
Version: 4.3.0
|
|
4
4
|
Summary: A tool that parses emails by enhancing the Python standard library, extracting all details into a comprehensive object.
|
|
5
5
|
Author-email: Fedele Mantuano <mantuano.fedele@gmail.com>
|
|
6
6
|
Maintainer-email: Fedele Mantuano <mantuano.fedele@gmail.com>
|
|
@@ -18,7 +18,6 @@ limitations under the License.
|
|
|
18
18
|
|
|
19
19
|
import base64
|
|
20
20
|
import email
|
|
21
|
-
import email.utils
|
|
22
21
|
import ipaddress
|
|
23
22
|
import json
|
|
24
23
|
import logging
|
|
@@ -29,6 +28,7 @@ from mailparser.utils import (
|
|
|
29
28
|
convert_mail_date,
|
|
30
29
|
decode_header_part,
|
|
31
30
|
find_between,
|
|
31
|
+
get_addresses,
|
|
32
32
|
get_header,
|
|
33
33
|
get_mail_keys,
|
|
34
34
|
get_to_domains,
|
|
@@ -429,22 +429,29 @@ class MailParser:
|
|
|
429
429
|
else:
|
|
430
430
|
log.debug(f"Email part {i!r} is not an attachment")
|
|
431
431
|
|
|
432
|
-
# Get the payload using get_payload method with decode=True
|
|
433
|
-
# As Python truly decodes only 'base64',
|
|
434
|
-
# 'quoted-printable', 'x-uuencode',
|
|
435
|
-
# 'uuencode', 'uue', 'x-uue'
|
|
436
|
-
# And for other encodings it breaks the characters so
|
|
437
|
-
# we need to decode them with encoding python is appying
|
|
438
|
-
# To maintain the characters
|
|
439
432
|
payload = p.get_payload(decode=True)
|
|
440
433
|
cte = p.get("Content-Transfer-Encoding")
|
|
441
434
|
if cte:
|
|
442
435
|
cte = cte.lower()
|
|
443
436
|
|
|
444
437
|
if not cte or cte in ["7bit", "8bit"]:
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
438
|
+
# message_from_bytes stores non-ASCII body bytes via
|
|
439
|
+
# ascii+surrogateescape, producing surrogates in the
|
|
440
|
+
# payload string. message_from_string stores a proper
|
|
441
|
+
# Unicode str (no surrogates). Detect which case we
|
|
442
|
+
# have via get_payload(decode=False) and decode
|
|
443
|
+
# accordingly so the declared charset is honoured.
|
|
444
|
+
raw_str = p.get_payload(decode=False)
|
|
445
|
+
if isinstance(raw_str, str):
|
|
446
|
+
try:
|
|
447
|
+
# Raises if surrogates present (from_bytes path)
|
|
448
|
+
raw_str.encode("utf-8")
|
|
449
|
+
payload = raw_str
|
|
450
|
+
except UnicodeEncodeError:
|
|
451
|
+
# Recover original bytes then decode with charset
|
|
452
|
+
orig_bytes = raw_str.encode("ascii", "surrogateescape")
|
|
453
|
+
payload = ported_string(orig_bytes, encoding=charset)
|
|
454
|
+
else:
|
|
448
455
|
payload = ported_string(payload, encoding=charset)
|
|
449
456
|
else:
|
|
450
457
|
payload = ported_string(payload, encoding=charset)
|
|
@@ -569,10 +576,17 @@ class MailParser:
|
|
|
569
576
|
# object headers
|
|
570
577
|
elif name_header in ADDRESSES_HEADERS:
|
|
571
578
|
raw_header = self.message.get(name_header, "") if self.message else ""
|
|
572
|
-
#
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
#
|
|
579
|
+
# Parse addresses. RFC 5322 §3.4 does not allow unquoted "@" in
|
|
580
|
+
# display names, so a strict parser correctly rejects headers like
|
|
581
|
+
# From: alice@example.com <bob@example.com>
|
|
582
|
+
# and returns ('', ''). mail-parser is a security/forensics tool,
|
|
583
|
+
# not an MTA: hiding addresses from analysts is worse than accepting
|
|
584
|
+
# non-conforming input. get_addresses() applies a regex fallback
|
|
585
|
+
# when strict parsing yields only empty results — see its docstring
|
|
586
|
+
# in utils.py for the full rationale.
|
|
587
|
+
parsed_addresses = get_addresses(raw_header)
|
|
588
|
+
|
|
589
|
+
# decoded addresses — skip entries with no address (absent header)
|
|
576
590
|
return [
|
|
577
591
|
(
|
|
578
592
|
(
|
|
@@ -583,6 +597,7 @@ class MailParser:
|
|
|
583
597
|
email_addr,
|
|
584
598
|
)
|
|
585
599
|
for name, email_addr in parsed_addresses
|
|
600
|
+
if email_addr
|
|
586
601
|
]
|
|
587
602
|
|
|
588
603
|
# others headers
|
|
@@ -16,9 +16,12 @@ See the License for the specific language governing permissions and
|
|
|
16
16
|
limitations under the License.
|
|
17
17
|
"""
|
|
18
18
|
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
19
21
|
import base64
|
|
20
22
|
import datetime
|
|
21
23
|
import email
|
|
24
|
+
import email.header
|
|
22
25
|
import email.utils
|
|
23
26
|
import functools
|
|
24
27
|
import hashlib
|
|
@@ -49,6 +52,113 @@ from mailparser.exceptions import MailParserOSError, MailParserReceivedParsingEr
|
|
|
49
52
|
|
|
50
53
|
log = logging.getLogger(__name__)
|
|
51
54
|
|
|
55
|
+
# ---------------------------------------------------------------------------
|
|
56
|
+
# RFC 5322 address parsing — fallback for non-compliant display names
|
|
57
|
+
# ---------------------------------------------------------------------------
|
|
58
|
+
# RFC 5322 §3.4 defines the display-name as a "phrase", which must not contain
|
|
59
|
+
# unquoted special characters such as "@". A header like
|
|
60
|
+
#
|
|
61
|
+
# From: alice@example.com <bob@example.com>
|
|
62
|
+
#
|
|
63
|
+
# is therefore *technically non-conforming*: the display name contains an
|
|
64
|
+
# unquoted "@". Python's ``email.utils.getaddresses`` with ``strict=True``
|
|
65
|
+
# (hardened against CVE-2023-27043) correctly rejects this and returns
|
|
66
|
+
# ``[('', '')]``, leaving the real address invisible.
|
|
67
|
+
#
|
|
68
|
+
# mail-parser is a security / forensics tool, not an MTA. Silently hiding an
|
|
69
|
+
# address because its display-name looks like an e-mail address defeats the
|
|
70
|
+
# purpose of the tool — analysts *need* to see those values. We therefore
|
|
71
|
+
# bypass strict compliance with a regex fallback whenever strict parsing yields
|
|
72
|
+
# an empty address, always surfacing the value that is actually in the header.
|
|
73
|
+
_ADDR_FALLBACK_RE = re.compile(
|
|
74
|
+
r'"([^"]*?)"\s*<([^>]+)>' # "Quoted Name" <email@addr>
|
|
75
|
+
r"|([^<,]*?)\s*<([^>]+)>" # Any Name <email@addr> (incl. email-as-name)
|
|
76
|
+
r"|([^\s,<>]+@[^\s,<>]+)" # bare email@addr
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def get_addresses(
|
|
81
|
+
raw_header: str | email.header.Header | None,
|
|
82
|
+
) -> list[tuple[str, str]]:
|
|
83
|
+
"""
|
|
84
|
+
Parse email addresses from a raw address header with a fallback for
|
|
85
|
+
RFC-non-compliant but real-world-common formats.
|
|
86
|
+
|
|
87
|
+
RFC 5322 §3.4 requires the display name (phrase) before an angle-bracket
|
|
88
|
+
address to consist only of printable ASCII characters that are *not*
|
|
89
|
+
special. The ``@`` character is special, so a header such as::
|
|
90
|
+
|
|
91
|
+
From: alice@example.com <bob@example.com>
|
|
92
|
+
|
|
93
|
+
is technically non-conforming because the display name contains an
|
|
94
|
+
unquoted ``@``. Python's ``email.utils.getaddresses`` with
|
|
95
|
+
``strict=True`` (hardened against CVE-2023-27043) correctly returns
|
|
96
|
+
``[('', '')]`` for this input, making the real sender invisible.
|
|
97
|
+
|
|
98
|
+
mail-parser is a *security / forensics* tool, not an MTA. Silently
|
|
99
|
+
discarding an address because its display name happens to look like an
|
|
100
|
+
e-mail address would hide relevant forensic information from analysts —
|
|
101
|
+
the very opposite of what the tool is for. We therefore bypass strict
|
|
102
|
+
RFC compliance by applying a regex-based fallback whenever the strict
|
|
103
|
+
parser yields only empty addresses, so that analysts always see the value
|
|
104
|
+
that was actually present in the header.
|
|
105
|
+
|
|
106
|
+
Args:
|
|
107
|
+
raw_header (str | email.header.Header | None): raw value of an
|
|
108
|
+
address header (e.g. ``From``, ``To``, ``CC`` …). Accepts a
|
|
109
|
+
plain ``str``, an ``email.header.Header`` instance (returned
|
|
110
|
+
by ``email.message.Message.get`` for headers containing
|
|
111
|
+
RFC 2047 encoded-words such as non-ASCII display names), or
|
|
112
|
+
``None``.
|
|
113
|
+
|
|
114
|
+
Returns:
|
|
115
|
+
list[tuple[str, str]]: list of ``(display_name, email_addr)`` tuples.
|
|
116
|
+
``display_name`` is an empty string when absent.
|
|
117
|
+
"""
|
|
118
|
+
# ``Message.get(name)`` returns an ``email.header.Header`` for any header
|
|
119
|
+
# whose value contains RFC 2047 encoded-words (typical for non-ASCII
|
|
120
|
+
# display names like ``=?utf-8?q?=C3=81rp=C3=A1d?=``). ``Header`` does
|
|
121
|
+
# not implement string methods such as ``.strip()`` and is not a valid
|
|
122
|
+
# input to ``email.utils.getaddresses``.
|
|
123
|
+
#
|
|
124
|
+
# Important: decode ``Header`` values into a plain parseable string first.
|
|
125
|
+
# In practice, strict address parsing can treat raw encoded-word tokens like
|
|
126
|
+
# ``=?unknown-8bit?...?=`` as the *address* itself, producing output such as
|
|
127
|
+
# ``To: =?unknown-8bit?...?=``. Decoding first gives
|
|
128
|
+
# ``Álpám Longsom <recipient@example.com>`` so getaddresses() can split
|
|
129
|
+
# name/address correctly.
|
|
130
|
+
if raw_header is None:
|
|
131
|
+
return []
|
|
132
|
+
if isinstance(raw_header, email.header.Header):
|
|
133
|
+
raw_header = decode_header_part(raw_header.encode())
|
|
134
|
+
elif not isinstance(raw_header, str):
|
|
135
|
+
raw_header = str(raw_header)
|
|
136
|
+
|
|
137
|
+
parsed = email.utils.getaddresses([raw_header], strict=True)
|
|
138
|
+
|
|
139
|
+
# If every result from the strict parser has an empty address — while the
|
|
140
|
+
# raw header is non-empty — fall back to regex extraction so that the
|
|
141
|
+
# actual address values are not silently lost.
|
|
142
|
+
if raw_header.strip() and all(not addr for _, addr in parsed):
|
|
143
|
+
results = []
|
|
144
|
+
for m in _ADDR_FALLBACK_RE.finditer(raw_header):
|
|
145
|
+
if m.group(2): # "Quoted Name" <email>
|
|
146
|
+
results.append((m.group(1).strip(), m.group(2).strip()))
|
|
147
|
+
elif m.group(4): # Any Name <email> (incl. email-as-display-name)
|
|
148
|
+
results.append((m.group(3).strip(), m.group(4).strip()))
|
|
149
|
+
elif m.group(5): # bare email # pragma: no branch
|
|
150
|
+
results.append(("", m.group(5).strip()))
|
|
151
|
+
if results:
|
|
152
|
+
log.debug(
|
|
153
|
+
"Strict address parsing yielded empty results for %r; "
|
|
154
|
+
"regex fallback recovered %d address(es)",
|
|
155
|
+
raw_header,
|
|
156
|
+
len(results),
|
|
157
|
+
)
|
|
158
|
+
return results
|
|
159
|
+
|
|
160
|
+
return parsed
|
|
161
|
+
|
|
52
162
|
|
|
53
163
|
def custom_log(level="WARNING", name=None): # pragma: no cover
|
|
54
164
|
"""
|
|
@@ -107,6 +217,9 @@ def ported_string(raw_data, encoding="utf-8", errors="ignore"):
|
|
|
107
217
|
if not raw_data:
|
|
108
218
|
return str()
|
|
109
219
|
|
|
220
|
+
if isinstance(raw_data, email.header.Header):
|
|
221
|
+
return str(raw_data)
|
|
222
|
+
|
|
110
223
|
if isinstance(raw_data, str):
|
|
111
224
|
return raw_data
|
|
112
225
|
|
|
@@ -361,7 +474,7 @@ def receiveds_parsing(receiveds):
|
|
|
361
474
|
|
|
362
475
|
log.debug("len(receiveds) %s, len(parsed) %s" % (len(receiveds), len(parsed)))
|
|
363
476
|
|
|
364
|
-
if len(receiveds) != len(parsed):
|
|
477
|
+
if len(receiveds) != len(parsed): # pragma: no cover
|
|
365
478
|
# something really bad happened,
|
|
366
479
|
# so just return raw receiveds with hop indices
|
|
367
480
|
log.error(
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
From: alice@example.com <bob@example.com>
|
|
2
|
+
To: "Charlie Brown" <charlie@example.com>, dave@example.com
|
|
3
|
+
CC: eve@example.com <frank@example.com>
|
|
4
|
+
Reply-To: henry@example.com <ivan@example.com>
|
|
5
|
+
Subject: Test email with email address as display name
|
|
6
|
+
Message-ID: <test-email-as-name@example.com>
|
|
7
|
+
Date: Mon, 01 Jan 2024 12:00:00 +0000
|
|
8
|
+
MIME-Version: 1.0
|
|
9
|
+
Content-Type: text/plain; charset=utf-8
|
|
10
|
+
|
|
11
|
+
This email tests parsing of address headers where the display name is itself
|
|
12
|
+
an email address (RFC non-compliant but common in real-world mail).
|
|
@@ -29,6 +29,7 @@ import mailparser
|
|
|
29
29
|
from mailparser.utils import (
|
|
30
30
|
convert_mail_date,
|
|
31
31
|
fingerprints,
|
|
32
|
+
get_addresses,
|
|
32
33
|
get_header,
|
|
33
34
|
get_mail_keys,
|
|
34
35
|
get_to_domains,
|
|
@@ -62,6 +63,7 @@ mail_test_15 = os.path.join(base_path, "mails", "mail_test_15")
|
|
|
62
63
|
mail_test_16 = os.path.join(base_path, "mails", "mail_test_16")
|
|
63
64
|
mail_test_17 = os.path.join(base_path, "mails", "mail_test_17")
|
|
64
65
|
mail_test_18 = os.path.join(base_path, "mails", "mail_test_18")
|
|
66
|
+
mail_test_19 = os.path.join(base_path, "mails", "mail_test_19")
|
|
65
67
|
mail_malformed_1 = os.path.join(base_path, "mails", "mail_malformed_1")
|
|
66
68
|
mail_malformed_2 = os.path.join(base_path, "mails", "mail_malformed_2")
|
|
67
69
|
mail_malformed_3 = os.path.join(base_path, "mails", "mail_malformed_3")
|
|
@@ -1084,3 +1086,218 @@ This is plain text with 8bit encoding."""
|
|
|
1084
1086
|
mail = mailparser.parse_from_string(raw_mail)
|
|
1085
1087
|
# Should have parsed successfully and body contains the text
|
|
1086
1088
|
self.assertIn("hello", mail.body)
|
|
1089
|
+
|
|
1090
|
+
def _make_utf8_raw(self, cte_header=""):
|
|
1091
|
+
"""Return a minimal raw email bytes with UTF-8 body (em-dash U+2014)."""
|
|
1092
|
+
cte_line = f"Content-Transfer-Encoding: {cte_header}\n" if cte_header else ""
|
|
1093
|
+
raw = (
|
|
1094
|
+
"From: a@example.com\n"
|
|
1095
|
+
"To: b@example.com\n"
|
|
1096
|
+
"Subject: probe\n"
|
|
1097
|
+
f"Content-Type: text/plain; charset=utf-8\n"
|
|
1098
|
+
"MIME-Version: 1.0\n"
|
|
1099
|
+
f"{cte_line}"
|
|
1100
|
+
"\n"
|
|
1101
|
+
"Hello — world\n"
|
|
1102
|
+
)
|
|
1103
|
+
return raw.encode("utf-8")
|
|
1104
|
+
|
|
1105
|
+
def test_utf8_body_from_bytes_no_cte(self):
|
|
1106
|
+
"""parse_from_bytes must not mojibake UTF-8 body with no CTE (issue #152)."""
|
|
1107
|
+
raw = self._make_utf8_raw()
|
|
1108
|
+
mail = mailparser.parse_from_bytes(raw)
|
|
1109
|
+
self.assertEqual(mail.text_plain[0], "Hello — world\n")
|
|
1110
|
+
|
|
1111
|
+
def test_utf8_body_from_bytes_cte_8bit(self):
|
|
1112
|
+
"""parse_from_bytes must not mojibake UTF-8 body with CTE: 8bit (issue #152)."""
|
|
1113
|
+
raw = self._make_utf8_raw("8bit")
|
|
1114
|
+
mail = mailparser.parse_from_bytes(raw)
|
|
1115
|
+
self.assertEqual(mail.text_plain[0], "Hello — world\n")
|
|
1116
|
+
|
|
1117
|
+
def test_utf8_body_from_bytes_cte_7bit(self):
|
|
1118
|
+
"""parse_from_bytes must not mojibake ASCII body with CTE: 7bit (issue #152)."""
|
|
1119
|
+
raw = (
|
|
1120
|
+
b"From: a@example.com\n"
|
|
1121
|
+
b"To: b@example.com\n"
|
|
1122
|
+
b"Content-Type: text/plain; charset=utf-8\n"
|
|
1123
|
+
b"Content-Transfer-Encoding: 7bit\n"
|
|
1124
|
+
b"\n"
|
|
1125
|
+
b"Hello world\n"
|
|
1126
|
+
)
|
|
1127
|
+
mail = mailparser.parse_from_bytes(raw)
|
|
1128
|
+
self.assertEqual(mail.text_plain[0], "Hello world\n")
|
|
1129
|
+
|
|
1130
|
+
def test_utf8_body_from_bytes_matches_from_string(self):
|
|
1131
|
+
"""parse_from_bytes and parse_from_string: identical text_plain (issue #152)."""
|
|
1132
|
+
raw_bytes = self._make_utf8_raw()
|
|
1133
|
+
raw_str = raw_bytes.decode("utf-8")
|
|
1134
|
+
mail_b = mailparser.parse_from_bytes(raw_bytes)
|
|
1135
|
+
mail_s = mailparser.parse_from_string(raw_str)
|
|
1136
|
+
self.assertEqual(mail_b.text_plain[0], mail_s.text_plain[0])
|
|
1137
|
+
|
|
1138
|
+
def test_utf8_body_from_bytes_8bit_matches_from_string(self):
|
|
1139
|
+
"""parse_from_bytes (8bit CTE) matches parse_from_string (issue #152)."""
|
|
1140
|
+
raw_bytes = self._make_utf8_raw("8bit")
|
|
1141
|
+
raw_str = raw_bytes.decode("utf-8")
|
|
1142
|
+
mail_b = mailparser.parse_from_bytes(raw_bytes)
|
|
1143
|
+
mail_s = mailparser.parse_from_string(raw_str)
|
|
1144
|
+
self.assertEqual(mail_b.text_plain[0], mail_s.text_plain[0])
|
|
1145
|
+
|
|
1146
|
+
|
|
1147
|
+
class TestEmailAsDisplayName(unittest.TestCase):
|
|
1148
|
+
"""
|
|
1149
|
+
Tests for address parsing when the display name is itself an email address.
|
|
1150
|
+
|
|
1151
|
+
RFC 5322 §3.4 forbids unquoted "@" in the display-name phrase, so a header
|
|
1152
|
+
like ``From: alice@example.com <bob@example.com>`` is technically
|
|
1153
|
+
non-conforming. Python's strict parser (CVE-2023-27043 hardening) returns
|
|
1154
|
+
``[('', '')]`` for such input, which would silently hide the real sender.
|
|
1155
|
+
|
|
1156
|
+
mail-parser is a security/forensics tool: it intentionally bypasses this
|
|
1157
|
+
strict compliance and applies a regex fallback so that analysts always see
|
|
1158
|
+
the address values that are actually present in the header.
|
|
1159
|
+
"""
|
|
1160
|
+
|
|
1161
|
+
def test_from_email_as_display_name(self):
|
|
1162
|
+
"""From header with an email address as display name is parsed correctly."""
|
|
1163
|
+
mail = mailparser.parse_from_file(mail_test_19)
|
|
1164
|
+
result = mail.from_
|
|
1165
|
+
self.assertIsInstance(result, list)
|
|
1166
|
+
self.assertEqual(len(result), 1)
|
|
1167
|
+
name, addr = result[0]
|
|
1168
|
+
self.assertEqual(addr, "bob@example.com")
|
|
1169
|
+
self.assertEqual(name, "alice@example.com")
|
|
1170
|
+
|
|
1171
|
+
def test_cc_email_as_display_name(self):
|
|
1172
|
+
"""CC header with an email address as display name is parsed correctly."""
|
|
1173
|
+
mail = mailparser.parse_from_file(mail_test_19)
|
|
1174
|
+
result = mail.cc
|
|
1175
|
+
self.assertIsInstance(result, list)
|
|
1176
|
+
self.assertEqual(len(result), 1)
|
|
1177
|
+
name, addr = result[0]
|
|
1178
|
+
self.assertEqual(addr, "frank@example.com")
|
|
1179
|
+
self.assertEqual(name, "eve@example.com")
|
|
1180
|
+
|
|
1181
|
+
def test_reply_to_email_as_display_name(self):
|
|
1182
|
+
"""Reply-To header with an email address as display name is parsed correctly."""
|
|
1183
|
+
mail = mailparser.parse_from_file(mail_test_19)
|
|
1184
|
+
result = mail.reply_to
|
|
1185
|
+
self.assertIsInstance(result, list)
|
|
1186
|
+
self.assertEqual(len(result), 1)
|
|
1187
|
+
name, addr = result[0]
|
|
1188
|
+
self.assertEqual(addr, "ivan@example.com")
|
|
1189
|
+
self.assertEqual(name, "henry@example.com")
|
|
1190
|
+
|
|
1191
|
+
def test_to_mixed_addresses(self):
|
|
1192
|
+
"""To header with a mix of quoted name and bare address is parsed correctly."""
|
|
1193
|
+
mail = mailparser.parse_from_file(mail_test_19)
|
|
1194
|
+
result = mail.to
|
|
1195
|
+
self.assertIsInstance(result, list)
|
|
1196
|
+
self.assertEqual(len(result), 2)
|
|
1197
|
+
# "Charlie Brown" <charlie@example.com>
|
|
1198
|
+
name0, addr0 = result[0]
|
|
1199
|
+
self.assertEqual(addr0, "charlie@example.com")
|
|
1200
|
+
self.assertEqual(name0, "Charlie Brown")
|
|
1201
|
+
# dave@example.com (bare address, no display name)
|
|
1202
|
+
name1, addr1 = result[1]
|
|
1203
|
+
self.assertEqual(addr1, "dave@example.com")
|
|
1204
|
+
self.assertEqual(name1, "")
|
|
1205
|
+
|
|
1206
|
+
# ------------------------------------------------------------------
|
|
1207
|
+
# Edge-case tests via parse_from_string (no additional mail files needed)
|
|
1208
|
+
# ------------------------------------------------------------------
|
|
1209
|
+
|
|
1210
|
+
def test_same_email_as_name_and_address_suppresses_name(self):
|
|
1211
|
+
"""When display name == address, name is suppressed to empty string.
|
|
1212
|
+
|
|
1213
|
+
This covers the case ``From: bob@example.com <bob@example.com>`` which
|
|
1214
|
+
is both RFC non-compliant (unquoted @) AND redundant. After the regex
|
|
1215
|
+
fallback recovers the address, the existing name-suppression logic
|
|
1216
|
+
(decoded_name == email_addr → "") must still fire correctly.
|
|
1217
|
+
"""
|
|
1218
|
+
mail = mailparser.parse_from_string(
|
|
1219
|
+
"From: bob@example.com <bob@example.com>\nSubject: x\n\nBody"
|
|
1220
|
+
)
|
|
1221
|
+
result = mail.from_
|
|
1222
|
+
self.assertEqual(len(result), 1)
|
|
1223
|
+
name, addr = result[0]
|
|
1224
|
+
self.assertEqual(addr, "bob@example.com")
|
|
1225
|
+
self.assertEqual(name, "")
|
|
1226
|
+
|
|
1227
|
+
def test_quoted_email_as_display_name(self):
|
|
1228
|
+
"""Properly quoted email-as-name (RFC-compliant) is parsed by strict parser."""
|
|
1229
|
+
mail = mailparser.parse_from_string(
|
|
1230
|
+
'From: "alice@example.com" <bob@example.com>\nSubject: x\n\nBody'
|
|
1231
|
+
)
|
|
1232
|
+
result = mail.from_
|
|
1233
|
+
self.assertEqual(len(result), 1)
|
|
1234
|
+
name, addr = result[0]
|
|
1235
|
+
self.assertEqual(addr, "bob@example.com")
|
|
1236
|
+
self.assertEqual(name, "alice@example.com")
|
|
1237
|
+
|
|
1238
|
+
def test_standard_display_name_unchanged(self):
|
|
1239
|
+
"""Standard ``Name <email>`` format still works correctly (no regression)."""
|
|
1240
|
+
mail = mailparser.parse_from_string(
|
|
1241
|
+
"From: Alice Smith <alice@example.com>\nSubject: x\n\nBody"
|
|
1242
|
+
)
|
|
1243
|
+
result = mail.from_
|
|
1244
|
+
self.assertEqual(len(result), 1)
|
|
1245
|
+
name, addr = result[0]
|
|
1246
|
+
self.assertEqual(addr, "alice@example.com")
|
|
1247
|
+
self.assertEqual(name, "Alice Smith")
|
|
1248
|
+
|
|
1249
|
+
def test_bare_address_no_display_name(self):
|
|
1250
|
+
"""Bare address with no display name returns empty name (no regression)."""
|
|
1251
|
+
mail = mailparser.parse_from_string(
|
|
1252
|
+
"From: alice@example.com\nSubject: x\n\nBody"
|
|
1253
|
+
)
|
|
1254
|
+
result = mail.from_
|
|
1255
|
+
self.assertEqual(len(result), 1)
|
|
1256
|
+
name, addr = result[0]
|
|
1257
|
+
self.assertEqual(addr, "alice@example.com")
|
|
1258
|
+
self.assertEqual(name, "")
|
|
1259
|
+
|
|
1260
|
+
def test_empty_header_returns_empty_list(self):
|
|
1261
|
+
"""A missing address header returns [] — absent headers must not appear."""
|
|
1262
|
+
mail = mailparser.parse_from_string("Subject: x\n\nBody")
|
|
1263
|
+
# Python's getaddresses("") yields [('', '')], but we filter out entries
|
|
1264
|
+
# with an empty address so that absent headers are not included in the
|
|
1265
|
+
# parsed mail object.
|
|
1266
|
+
self.assertEqual(mail.from_, [])
|
|
1267
|
+
|
|
1268
|
+
# ------------------------------------------------------------------
|
|
1269
|
+
# Unit tests for get_addresses() helper directly
|
|
1270
|
+
# ------------------------------------------------------------------
|
|
1271
|
+
|
|
1272
|
+
def test_get_addresses_email_as_name(self):
|
|
1273
|
+
"""get_addresses() fallback recovers address when display name is an email."""
|
|
1274
|
+
result = get_addresses("alice@example.com <bob@example.com>")
|
|
1275
|
+
self.assertEqual(result, [("alice@example.com", "bob@example.com")])
|
|
1276
|
+
|
|
1277
|
+
def test_get_addresses_standard_format(self):
|
|
1278
|
+
"""get_addresses() strict path handles normal ``Name <email>`` correctly."""
|
|
1279
|
+
result = get_addresses("Alice Smith <alice@example.com>")
|
|
1280
|
+
self.assertEqual(result, [("Alice Smith", "alice@example.com")])
|
|
1281
|
+
|
|
1282
|
+
def test_get_addresses_bare_email(self):
|
|
1283
|
+
"""get_addresses() handles bare email address with no display name."""
|
|
1284
|
+
result = get_addresses("alice@example.com")
|
|
1285
|
+
self.assertEqual(result, [("", "alice@example.com")])
|
|
1286
|
+
|
|
1287
|
+
def test_get_addresses_empty_header(self):
|
|
1288
|
+
"""get_addresses() on empty string returns [('', '')] — raw Python lib result.
|
|
1289
|
+
|
|
1290
|
+
The ('', '') entry is filtered out in __getattr__ (core.py) so that
|
|
1291
|
+
absent headers do not appear in the parsed mail output.
|
|
1292
|
+
"""
|
|
1293
|
+
result = get_addresses("")
|
|
1294
|
+
self.assertEqual(result, [("", "")])
|
|
1295
|
+
|
|
1296
|
+
def test_get_addresses_multiple_with_email_as_name(self):
|
|
1297
|
+
"""get_addresses() fallback handles multiple addresses when all fail strict."""
|
|
1298
|
+
result = get_addresses(
|
|
1299
|
+
"alice@example.com <bob@example.com>, eve@example.com <frank@example.com>"
|
|
1300
|
+
)
|
|
1301
|
+
self.assertEqual(len(result), 2)
|
|
1302
|
+
self.assertEqual(result[0], ("alice@example.com", "bob@example.com"))
|
|
1303
|
+
self.assertEqual(result[1], ("eve@example.com", "frank@example.com"))
|
|
@@ -26,6 +26,7 @@ from mailparser.exceptions import MailParserOSError, MailParserReceivedParsingEr
|
|
|
26
26
|
from mailparser.utils import (
|
|
27
27
|
decode_header_part,
|
|
28
28
|
find_between,
|
|
29
|
+
get_addresses,
|
|
29
30
|
msgconvert,
|
|
30
31
|
parse_received,
|
|
31
32
|
ported_open,
|
|
@@ -240,23 +241,14 @@ class TestUtilsEdgeCases(unittest.TestCase):
|
|
|
240
241
|
self.assertEqual(len(result.sha256), 64)
|
|
241
242
|
self.assertEqual(len(result.sha512), 128)
|
|
242
243
|
|
|
243
|
-
def
|
|
244
|
-
"""
|
|
245
|
-
# This tests the error branch when multiple matches are found
|
|
246
|
-
# We need a received header that triggers duplicate matches
|
|
244
|
+
def test_parse_received_with_multiple_from_clauses(self):
|
|
245
|
+
"""parse_received keeps only first 'from' clause (RFC 5321 one-from rule)"""
|
|
247
246
|
received = (
|
|
248
247
|
"from server.example.com from server2.example.com by mail.example.com"
|
|
249
248
|
)
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
try:
|
|
254
|
-
result = parse_received(received)
|
|
255
|
-
# If it succeeds, it should return a dict
|
|
256
|
-
self.assertIsInstance(result, dict)
|
|
257
|
-
except MailParserReceivedParsingError as e:
|
|
258
|
-
# This is the expected path for multiple matches
|
|
259
|
-
self.assertIn("More than one match", str(e))
|
|
249
|
+
result = parse_received(received)
|
|
250
|
+
self.assertIsInstance(result, dict)
|
|
251
|
+
self.assertEqual(result["from"], "server.example.com")
|
|
260
252
|
|
|
261
253
|
def test_get_to_domains_with_keyerror(self):
|
|
262
254
|
"""Test get_to_domains handles KeyError gracefully"""
|
|
@@ -442,10 +434,8 @@ class TestUtilsEdgeCases(unittest.TestCase):
|
|
|
442
434
|
|
|
443
435
|
result = receiveds_format(parsed)
|
|
444
436
|
self.assertIsInstance(result, list)
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
self.assertIn("date_utc", result[0])
|
|
448
|
-
self.assertIn("delay", result[1])
|
|
437
|
+
self.assertIn("date_utc", result[0])
|
|
438
|
+
self.assertIn("delay", result[1])
|
|
449
439
|
|
|
450
440
|
def test_receiveds_format_with_invalid_date(self):
|
|
451
441
|
"""Test receiveds_format handles invalid dates"""
|
|
@@ -587,6 +577,118 @@ class TestUtilsEdgeCases(unittest.TestCase):
|
|
|
587
577
|
# But should have a valid date itself
|
|
588
578
|
self.assertIsNotNone(result[1].get("date_utc"))
|
|
589
579
|
|
|
580
|
+
def test_ported_string_handles_header_object(self):
|
|
581
|
+
"""
|
|
582
|
+
Test that ported_string can accept an email.header.Header object
|
|
583
|
+
and return a decoded string without crashing.
|
|
584
|
+
"""
|
|
585
|
+
from email.header import Header
|
|
586
|
+
|
|
587
|
+
raw_val = 'attachment;\r\nfilename="Just a text – 2026.pdf'
|
|
588
|
+
header_obj = Header(raw_val, charset="utf-8")
|
|
589
|
+
result = ported_string(header_obj)
|
|
590
|
+
self.assertIsInstance(result, str)
|
|
591
|
+
self.assertEqual(result, raw_val)
|
|
592
|
+
|
|
593
|
+
def test_get_addresses_handles_header_object(self):
|
|
594
|
+
"""
|
|
595
|
+
Test that get_addresses accepts an email.header.Header instance
|
|
596
|
+
without raising AttributeError on `.strip()`.
|
|
597
|
+
|
|
598
|
+
Regression for the case where Message.get(name) returns a Header
|
|
599
|
+
for address headers containing RFC 2047 encoded-words (e.g.
|
|
600
|
+
non-ASCII display names like ``=?utf-8?q?=C3=81lp=C3=A1m_Longsom?=``).
|
|
601
|
+
Before the fix, this raised:
|
|
602
|
+
|
|
603
|
+
AttributeError: 'Header' object has no attribute 'strip'
|
|
604
|
+
"""
|
|
605
|
+
from email.header import Header
|
|
606
|
+
|
|
607
|
+
header_obj = Header("Álpám Longsom", charset="utf-8")
|
|
608
|
+
header_obj.append(" <recipient@example.com>", charset="us-ascii")
|
|
609
|
+
result = get_addresses(header_obj)
|
|
610
|
+
self.assertIsInstance(result, list)
|
|
611
|
+
self.assertEqual(len(result), 1)
|
|
612
|
+
display_name, addr = result[0]
|
|
613
|
+
self.assertEqual(addr, "recipient@example.com")
|
|
614
|
+
# get_addresses is expected to return the display name already decoded
|
|
615
|
+
# to Unicode here; core.py also applies decode_header_part to names.
|
|
616
|
+
self.assertEqual(display_name, "Álpám Longsom")
|
|
617
|
+
|
|
618
|
+
def test_get_addresses_handles_unknown_8bit_header_object(self):
|
|
619
|
+
"""
|
|
620
|
+
Regression for real-world unknown-8bit encoded-word headers.
|
|
621
|
+
Address parsing must not treat the encoded-word token itself as
|
|
622
|
+
the address.
|
|
623
|
+
"""
|
|
624
|
+
from email.header import Header
|
|
625
|
+
|
|
626
|
+
encoded_name = "=?unknown-8bit?b?w4FscMOhbSBMb25nc29t?="
|
|
627
|
+
header_obj = Header()
|
|
628
|
+
header_obj.append(encoded_name, charset="us-ascii")
|
|
629
|
+
header_obj.append(" <recipient@example.com>", charset="us-ascii")
|
|
630
|
+
|
|
631
|
+
result = get_addresses(header_obj)
|
|
632
|
+
self.assertEqual(result, [("Álpám Longsom", "recipient@example.com")])
|
|
633
|
+
|
|
634
|
+
def test_get_addresses_handles_none(self):
|
|
635
|
+
"""
|
|
636
|
+
Test that get_addresses returns an empty list when given None,
|
|
637
|
+
rather than crashing on attribute access.
|
|
638
|
+
"""
|
|
639
|
+
self.assertEqual(get_addresses(None), [])
|
|
640
|
+
|
|
641
|
+
def test_get_addresses_plain_string_unchanged(self):
|
|
642
|
+
"""
|
|
643
|
+
Test that the existing plain-string path still works. This guards
|
|
644
|
+
against accidentally regressing the common case while adding
|
|
645
|
+
Header / None handling.
|
|
646
|
+
"""
|
|
647
|
+
result = get_addresses("Plain Name <plain@example.com>")
|
|
648
|
+
self.assertEqual(result, [("Plain Name", "plain@example.com")])
|
|
649
|
+
|
|
650
|
+
def test_mailparser_from_bytes_preserves_unicode_display_name(self):
|
|
651
|
+
"""
|
|
652
|
+
Regression: Header objects from Message.get(name) must round-trip
|
|
653
|
+
through get_addresses() without introducing replacement characters.
|
|
654
|
+
|
|
655
|
+
The parser should expose the decoded Unicode display name on
|
|
656
|
+
MailParser.to.
|
|
657
|
+
"""
|
|
658
|
+
from mailparser.core import MailParser
|
|
659
|
+
|
|
660
|
+
raw_email = (
|
|
661
|
+
b"From: Sender <sender@example.com>\r\n"
|
|
662
|
+
b"To: =?utf-8?b?w4FscMOhbSBMb25nc29t?= <recipient@example.com>\r\n"
|
|
663
|
+
b"Subject: Test\r\n"
|
|
664
|
+
b"Date: Tue, 12 May 2026 18:00:00 +0000\r\n"
|
|
665
|
+
b"Content-Type: text/plain; charset=utf-8\r\n"
|
|
666
|
+
b"\r\n"
|
|
667
|
+
b"hello\r\n"
|
|
668
|
+
)
|
|
669
|
+
|
|
670
|
+
mail = MailParser.from_bytes(raw_email)
|
|
671
|
+
self.assertEqual(mail.to, [("Álpám Longsom", "recipient@example.com")])
|
|
672
|
+
|
|
673
|
+
def test_mailparser_from_bytes_unknown_8bit_display_name(self):
|
|
674
|
+
"""
|
|
675
|
+
End-to-end regression for unknown-8bit encoded-word in To header.
|
|
676
|
+
"""
|
|
677
|
+
from mailparser.core import MailParser
|
|
678
|
+
|
|
679
|
+
raw_email = (
|
|
680
|
+
b"From: Sender <sender@example.com>\r\n"
|
|
681
|
+
b"To: =?unknown-8bit?b?w4FscMOhbSBMb25nc29t?= <recipient@example.com>\r\n"
|
|
682
|
+
b"Subject: Test\r\n"
|
|
683
|
+
b"Date: Tue, 12 May 2026 18:00:00 +0000\r\n"
|
|
684
|
+
b"Content-Type: text/plain; charset=utf-8\r\n"
|
|
685
|
+
b"\r\n"
|
|
686
|
+
b"hello\r\n"
|
|
687
|
+
)
|
|
688
|
+
|
|
689
|
+
mail = MailParser.from_bytes(raw_email)
|
|
690
|
+
self.assertEqual(mail.to, [("Álpám Longsom", "recipient@example.com")])
|
|
691
|
+
|
|
590
692
|
def test_parse_received_envelope_from_with_angle_brackets(self):
|
|
591
693
|
"""Test utils.py:294-296 — envelope-from clause with angle-bracket match"""
|
|
592
694
|
# When envelope-from keyword is present AND its value has angle
|
|
@@ -631,3 +733,69 @@ class TestUtilsEdgeCases(unittest.TestCase):
|
|
|
631
733
|
self.assertIsInstance(result, dict)
|
|
632
734
|
# envelope_from must NOT be set
|
|
633
735
|
self.assertNotIn("envelope_from", result)
|
|
736
|
+
|
|
737
|
+
def test_get_addresses_non_str_non_header(self):
|
|
738
|
+
"""get_addresses coerces unexpected types via str() (utils.py:135)"""
|
|
739
|
+
result = get_addresses(42) # type: ignore[arg-type]
|
|
740
|
+
self.assertIsInstance(result, list)
|
|
741
|
+
|
|
742
|
+
def test_get_addresses_fallback_regex_bare_email(self):
|
|
743
|
+
"""fallback regex matches bare email (utils.py:149-150)"""
|
|
744
|
+
with patch(
|
|
745
|
+
"mailparser.utils.email.utils.getaddresses", return_value=[("", "")]
|
|
746
|
+
):
|
|
747
|
+
result = get_addresses("user@example.com")
|
|
748
|
+
self.assertEqual(result, [("", "user@example.com")])
|
|
749
|
+
|
|
750
|
+
def test_get_addresses_fallback_regex_quoted_name(self):
|
|
751
|
+
"""fallback regex matches quoted-name format (utils.py:145-146)"""
|
|
752
|
+
with patch(
|
|
753
|
+
"mailparser.utils.email.utils.getaddresses", return_value=[("", "")]
|
|
754
|
+
):
|
|
755
|
+
result = get_addresses('"Quoted Name" <user@example.com>')
|
|
756
|
+
self.assertEqual(result, [("Quoted Name", "user@example.com")])
|
|
757
|
+
|
|
758
|
+
def test_get_addresses_fallback_regex_any_name(self):
|
|
759
|
+
"""fallback regex matches any-name format (utils.py:147-148)"""
|
|
760
|
+
with patch(
|
|
761
|
+
"mailparser.utils.email.utils.getaddresses", return_value=[("", "")]
|
|
762
|
+
):
|
|
763
|
+
result = get_addresses("Any Name <user@example.com>")
|
|
764
|
+
self.assertEqual(result, [("Any Name", "user@example.com")])
|
|
765
|
+
|
|
766
|
+
def test_get_addresses_fallback_regex_no_matches(self):
|
|
767
|
+
"""fallback regex finds no addresses, falls through to return parsed"""
|
|
768
|
+
with patch(
|
|
769
|
+
"mailparser.utils.email.utils.getaddresses", return_value=[("", "")]
|
|
770
|
+
):
|
|
771
|
+
result = get_addresses("not an email address at all")
|
|
772
|
+
self.assertEqual(result, [("", "")])
|
|
773
|
+
|
|
774
|
+
def test_parse_received_sendgrid_date(self):
|
|
775
|
+
"""parse_received extracts SendGrid non-standard date (utils.py:389-390)"""
|
|
776
|
+
received = (
|
|
777
|
+
"from sendgrid.net by mx.example.com with SMTP"
|
|
778
|
+
" 2024-01-15 10:30:45.123456789 +0000 UTC m=+1234.567890123"
|
|
779
|
+
)
|
|
780
|
+
result = parse_received(received)
|
|
781
|
+
self.assertIn("date", result)
|
|
782
|
+
self.assertIn("2024-01-15", result["date"])
|
|
783
|
+
|
|
784
|
+
def test_parse_received_for_clause(self):
|
|
785
|
+
"""parse_received captures 'for' clause (utils.py:411)"""
|
|
786
|
+
received = (
|
|
787
|
+
"from mail.example.com by mx.example.com"
|
|
788
|
+
" for <user@example.com>; Mon, 15 Jan 2024 10:30:45 +0000"
|
|
789
|
+
)
|
|
790
|
+
result = parse_received(received)
|
|
791
|
+
self.assertIn("for", result)
|
|
792
|
+
self.assertIn("user@example.com", result["for"])
|
|
793
|
+
|
|
794
|
+
def test_parse_received_envelope_from_embedded_in_clause(self):
|
|
795
|
+
"""parse_received extracts envelope-from embedded in clause value"""
|
|
796
|
+
received = (
|
|
797
|
+
"from mail.example.com (envelope-from <sender@example.com>)"
|
|
798
|
+
" by mx.example.com; Mon, 15 Jan 2024 10:30:45 +0000"
|
|
799
|
+
)
|
|
800
|
+
result = parse_received(received)
|
|
801
|
+
self.assertEqual(result.get("envelope_from"), "sender@example.com")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|