mail-parser 4.2.1__tar.gz → 4.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. mail_parser-4.3.0/CLAUDE.md +103 -0
  2. {mail_parser-4.2.1 → mail_parser-4.3.0}/PKG-INFO +1 -1
  3. {mail_parser-4.2.1 → mail_parser-4.3.0}/src/mailparser/core.py +17 -10
  4. {mail_parser-4.2.1 → mail_parser-4.3.0}/src/mailparser/utils.py +32 -5
  5. {mail_parser-4.2.1 → mail_parser-4.3.0}/src/mailparser/version.py +1 -1
  6. {mail_parser-4.2.1 → mail_parser-4.3.0}/tests/test_mail_parser.py +56 -0
  7. {mail_parser-4.2.1 → mail_parser-4.3.0}/tests/test_utils.py +173 -18
  8. {mail_parser-4.2.1 → mail_parser-4.3.0}/.github/FUNDING.yml +0 -0
  9. {mail_parser-4.2.1 → mail_parser-4.3.0}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  10. {mail_parser-4.2.1 → mail_parser-4.3.0}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  11. {mail_parser-4.2.1 → mail_parser-4.3.0}/.github/copilot-instructions.md +0 -0
  12. {mail_parser-4.2.1 → mail_parser-4.3.0}/.github/instructions/containerization-docker-best-practices.instructions.md +0 -0
  13. {mail_parser-4.2.1 → mail_parser-4.3.0}/.github/instructions/github-actions-ci-cd-best-practices.instructions.md +0 -0
  14. {mail_parser-4.2.1 → mail_parser-4.3.0}/.github/instructions/markdown.instructions.md +0 -0
  15. {mail_parser-4.2.1 → mail_parser-4.3.0}/.github/instructions/python.instructions.md +0 -0
  16. {mail_parser-4.2.1 → mail_parser-4.3.0}/.github/workflows/main.yml +0 -0
  17. {mail_parser-4.2.1 → mail_parser-4.3.0}/.gitignore +0 -0
  18. {mail_parser-4.2.1 → mail_parser-4.3.0}/.markdownlint.json +0 -0
  19. {mail_parser-4.2.1 → mail_parser-4.3.0}/.pre-commit-config.yaml +0 -0
  20. {mail_parser-4.2.1 → mail_parser-4.3.0}/Dockerfile +0 -0
  21. {mail_parser-4.2.1 → mail_parser-4.3.0}/LICENSE.txt +0 -0
  22. {mail_parser-4.2.1 → mail_parser-4.3.0}/Makefile +0 -0
  23. {mail_parser-4.2.1 → mail_parser-4.3.0}/NOTICE.txt +0 -0
  24. {mail_parser-4.2.1 → mail_parser-4.3.0}/README.md +0 -0
  25. {mail_parser-4.2.1 → mail_parser-4.3.0}/docker-compose.yml +0 -0
  26. {mail_parser-4.2.1 → mail_parser-4.3.0}/docs/images/Bitcoin SpamScope.jpg +0 -0
  27. {mail_parser-4.2.1 → mail_parser-4.3.0}/pyproject.toml +0 -0
  28. {mail_parser-4.2.1 → mail_parser-4.3.0}/src/mailparser/__init__.py +0 -0
  29. {mail_parser-4.2.1 → mail_parser-4.3.0}/src/mailparser/__main__.py +0 -0
  30. {mail_parser-4.2.1 → mail_parser-4.3.0}/src/mailparser/const.py +0 -0
  31. {mail_parser-4.2.1 → mail_parser-4.3.0}/src/mailparser/exceptions.py +0 -0
  32. {mail_parser-4.2.1 → mail_parser-4.3.0}/tests/mails/mail_malformed_1 +0 -0
  33. {mail_parser-4.2.1 → mail_parser-4.3.0}/tests/mails/mail_malformed_2 +0 -0
  34. {mail_parser-4.2.1 → mail_parser-4.3.0}/tests/mails/mail_malformed_3 +0 -0
  35. {mail_parser-4.2.1 → mail_parser-4.3.0}/tests/mails/mail_outlook_1 +0 -0
  36. {mail_parser-4.2.1 → mail_parser-4.3.0}/tests/mails/mail_test_1 +0 -0
  37. {mail_parser-4.2.1 → mail_parser-4.3.0}/tests/mails/mail_test_10 +0 -0
  38. {mail_parser-4.2.1 → mail_parser-4.3.0}/tests/mails/mail_test_11 +0 -0
  39. {mail_parser-4.2.1 → mail_parser-4.3.0}/tests/mails/mail_test_12 +0 -0
  40. {mail_parser-4.2.1 → mail_parser-4.3.0}/tests/mails/mail_test_13 +0 -0
  41. {mail_parser-4.2.1 → mail_parser-4.3.0}/tests/mails/mail_test_14 +0 -0
  42. {mail_parser-4.2.1 → mail_parser-4.3.0}/tests/mails/mail_test_15 +0 -0
  43. {mail_parser-4.2.1 → mail_parser-4.3.0}/tests/mails/mail_test_16 +0 -0
  44. {mail_parser-4.2.1 → mail_parser-4.3.0}/tests/mails/mail_test_17 +0 -0
  45. {mail_parser-4.2.1 → mail_parser-4.3.0}/tests/mails/mail_test_18 +0 -0
  46. {mail_parser-4.2.1 → mail_parser-4.3.0}/tests/mails/mail_test_19 +0 -0
  47. {mail_parser-4.2.1 → mail_parser-4.3.0}/tests/mails/mail_test_2 +0 -0
  48. {mail_parser-4.2.1 → mail_parser-4.3.0}/tests/mails/mail_test_3 +0 -0
  49. {mail_parser-4.2.1 → mail_parser-4.3.0}/tests/mails/mail_test_4 +0 -0
  50. {mail_parser-4.2.1 → mail_parser-4.3.0}/tests/mails/mail_test_5 +0 -0
  51. {mail_parser-4.2.1 → mail_parser-4.3.0}/tests/mails/mail_test_6 +0 -0
  52. {mail_parser-4.2.1 → mail_parser-4.3.0}/tests/mails/mail_test_7 +0 -0
  53. {mail_parser-4.2.1 → mail_parser-4.3.0}/tests/mails/mail_test_8 +0 -0
  54. {mail_parser-4.2.1 → mail_parser-4.3.0}/tests/mails/mail_test_9 +0 -0
  55. {mail_parser-4.2.1 → mail_parser-4.3.0}/tests/test_improved_received_patterns.py +0 -0
  56. {mail_parser-4.2.1 → mail_parser-4.3.0}/tests/test_main.py +0 -0
  57. {mail_parser-4.2.1 → mail_parser-4.3.0}/tests/test_received_corpus.py +0 -0
  58. {mail_parser-4.2.1 → mail_parser-4.3.0}/uv.lock +0 -0
@@ -0,0 +1,103 @@
1
+ # CLAUDE.md
2
+
3
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4
+
5
+ ## Engineering Principles
6
+
7
+ - **DRY** — extract repeated logic; no copy-paste code.
8
+ - **KISS** — simplest solution that works; no clever tricks.
9
+ - **YAGNI** — no code for hypothetical future needs.
10
+ - **SRP** — one function/class, one responsibility.
11
+ - **Fail fast** — validate at boundaries; trust internal code.
12
+ - No dead code, commented-out blocks, or unused imports.
13
+ - Every third-party import must have an explicit entry in `pyproject.toml`. Use extras when the
14
+ runtime feature requires them (e.g. `elasticsearch[async]`).
15
+
16
+ ## Python Style
17
+
18
+ - Follow PEP 8. Use 4 spaces, never tabs.
19
+ - Lines: 79 chars for code, 72 for comments/docstrings.
20
+ - `snake_case` for functions/variables, `CapWords` for classes, `UPPER_CASE` for constants.
21
+ - Imports order: stdlib → third-party → local.
22
+ - Add docstrings to all public functions, methods, classes, and modules (params, return values,
23
+ exceptions).
24
+
25
+ ## Commands
26
+
27
+ ```bash
28
+ uv sync # install all dependencies
29
+ uv run pytest # run all tests
30
+ uv run pytest tests/test_mail_parser.py::TestMailParser::test_name # single test
31
+ uv run ruff check . # lint
32
+ uv run ruff format . # format
33
+ make check # lint + tests
34
+ make build # clean + build wheel/sdist
35
+ ```
36
+
37
+ ## Architecture
38
+
39
+ **Package layout**: `src/mailparser/` (src layout, no runtime deps — stdlib only).
40
+
41
+ ### Module responsibilities
42
+
43
+ | Module | Role |
44
+ | --------------- | --------------------------------------------------------------------- |
45
+ | `core.py` | `MailParser` class + `parse_from_*` factory functions |
46
+ | `utils.py` | Stateless parsing helpers (address, received headers, date, encoding) |
47
+ | `const.py` | Compiled regexes, header sets, clause splitter |
48
+ | `exceptions.py` | Custom exception hierarchy |
49
+ | `__main__.py` | CLI entry point (`mail-parser` command) |
50
+ | `__init__.py` | Re-exports public API |
51
+
52
+ ### Key design patterns
53
+
54
+ **Dynamic attribute access via `__getattr__`**: `MailParser` exposes every header dynamically.
55
+ Three access modes are supported for any attribute `X`:
56
+
57
+ - `parser.X` → Python object (str or list)
58
+ - `parser.X_json` → JSON string
59
+ - `parser.X_raw` → raw JSON via `message.get_all()`
60
+
61
+ Address headers listed in `const.ADDRESSES_HEADERS` (`from`, `to`, `cc`, `bcc`, `reply-to`,
62
+ `delivered-to`) return `list[tuple[str, str]]` (display_name, email) instead of plain strings.
63
+
64
+ **RFC non-compliance fallback in `get_addresses()`**: Python's
65
+ `email.utils.getaddresses(strict=True)` (hardened against CVE-2023-27043) rejects headers where
66
+ the display name contains `@`. Since this is a forensics tool, `utils.py` applies
67
+ `_ADDR_FALLBACK_RE` whenever strict parsing returns empty results — always surfacing what is
68
+ actually in the header.
69
+
70
+ **Received header parsing**: `utils.receiveds_parsing()` tokenizes on RFC 5321 clause keywords
71
+ (`from`, `by`, `via`, `with`, `id`, `for`, `envelope-from`) using `const._CLAUSE_SPLITTER`.
72
+ Output list is ordered first-hop first. Unparseable headers fall back to `{"raw": ...}`.
73
+
74
+ **Defect detection**: During `parse()`, every MIME part is walked and `_append_defects()` records
75
+ RFC violations. `EPILOGUE_DEFECTS` triggers special epilogue extraction to recover hidden payloads
76
+ in malformed boundaries.
77
+
78
+ **Outlook support**: `parse_from_file_msg()` shells out to the system `msgconvert` Perl tool
79
+ (`libemail-outlook-message-perl`) to convert `.msg` → `.eml`, then parses normally.
80
+
81
+ **Partial vs full mail**: `_make_mail(complete=True)` includes all headers found in the message;
82
+ `complete=False` restricts to `const.ADDRESSES_HEADERS | const.OTHERS_PARTS` (the "main" headers).
83
+ Accessible as `parser.mail` / `parser.mail_partial`.
84
+
85
+ ### Adding new headers
86
+
87
+ To make a header always appear in partial output, add its lowercase name to `OTHERS_PARTS` in
88
+ `const.py`. Address-type headers (returning parsed name/email tuples) go in `ADDRESSES_HEADERS`.
89
+
90
+ ## Workflow
91
+
92
+ After every change:
93
+
94
+ 1. Add/update unit tests to cover the change.
95
+ 1. Update README.md if the change affects usage, API, or setup.
96
+ 1. Stage changes and run pre-commit; fix all reported issues before proceeding.
97
+ 1. Run full test suite; fix all failures before reporting done.
98
+
99
+ ### Test fixtures
100
+
101
+ Raw email files in `tests/mails/` are the fixtures. `mail_malformed_*` files exercise defect
102
+ detection; `mail_outlook_*` require `msgconvert` installed. Tests that need the Outlook tool should
103
+ be marked `@pytest.mark.integration`.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mail-parser
3
- Version: 4.2.1
3
+ Version: 4.3.0
4
4
  Summary: A tool that parses emails by enhancing the Python standard library, extracting all details into a comprehensive object.
5
5
  Author-email: Fedele Mantuano <mantuano.fedele@gmail.com>
6
6
  Maintainer-email: Fedele Mantuano <mantuano.fedele@gmail.com>
@@ -429,22 +429,29 @@ class MailParser:
429
429
  else:
430
430
  log.debug(f"Email part {i!r} is not an attachment")
431
431
 
432
- # Get the payload using get_payload method with decode=True
433
- # As Python truly decodes only 'base64',
434
- # 'quoted-printable', 'x-uuencode',
435
- # 'uuencode', 'uue', 'x-uue'
436
- # And for other encodings it breaks the characters so
437
- # we need to decode them with encoding python is appying
438
- # To maintain the characters
439
432
  payload = p.get_payload(decode=True)
440
433
  cte = p.get("Content-Transfer-Encoding")
441
434
  if cte:
442
435
  cte = cte.lower()
443
436
 
444
437
  if not cte or cte in ["7bit", "8bit"]:
445
- try:
446
- payload = payload.decode("raw-unicode-escape")
447
- except UnicodeDecodeError:
438
+ # message_from_bytes stores non-ASCII body bytes via
439
+ # ascii+surrogateescape, producing surrogates in the
440
+ # payload string. message_from_string stores a proper
441
+ # Unicode str (no surrogates). Detect which case we
442
+ # have via get_payload(decode=False) and decode
443
+ # accordingly so the declared charset is honoured.
444
+ raw_str = p.get_payload(decode=False)
445
+ if isinstance(raw_str, str):
446
+ try:
447
+ # Raises if surrogates present (from_bytes path)
448
+ raw_str.encode("utf-8")
449
+ payload = raw_str
450
+ except UnicodeEncodeError:
451
+ # Recover original bytes then decode with charset
452
+ orig_bytes = raw_str.encode("ascii", "surrogateescape")
453
+ payload = ported_string(orig_bytes, encoding=charset)
454
+ else:
448
455
  payload = ported_string(payload, encoding=charset)
449
456
  else:
450
457
  payload = ported_string(payload, encoding=charset)
@@ -16,6 +16,8 @@ See the License for the specific language governing permissions and
16
16
  limitations under the License.
17
17
  """
18
18
 
19
+ from __future__ import annotations
20
+
19
21
  import base64
20
22
  import datetime
21
23
  import email
@@ -75,7 +77,9 @@ _ADDR_FALLBACK_RE = re.compile(
75
77
  )
76
78
 
77
79
 
78
- def get_addresses(raw_header):
80
+ def get_addresses(
81
+ raw_header: str | email.header.Header | None,
82
+ ) -> list[tuple[str, str]]:
79
83
  """
80
84
  Parse email addresses from a raw address header with a fallback for
81
85
  RFC-non-compliant but real-world-common formats.
@@ -100,13 +104,36 @@ def get_addresses(raw_header):
100
104
  that was actually present in the header.
101
105
 
102
106
  Args:
103
- raw_header (str): raw value of an address header
104
- (e.g. ``From``, ``To``, ``CC`` …)
107
+ raw_header (str | email.header.Header | None): raw value of an
108
+ address header (e.g. ``From``, ``To``, ``CC`` …). Accepts a
109
+ plain ``str``, an ``email.header.Header`` instance (returned
110
+ by ``email.message.Message.get`` for headers containing
111
+ RFC 2047 encoded-words such as non-ASCII display names), or
112
+ ``None``.
105
113
 
106
114
  Returns:
107
115
  list[tuple[str, str]]: list of ``(display_name, email_addr)`` tuples.
108
116
  ``display_name`` is an empty string when absent.
109
117
  """
118
+ # ``Message.get(name)`` returns an ``email.header.Header`` for any header
119
+ # whose value contains RFC 2047 encoded-words (typical for non-ASCII
120
+ # display names like ``=?utf-8?q?=C3=81rp=C3=A1d?=``). ``Header`` does
121
+ # not implement string methods such as ``.strip()`` and is not a valid
122
+ # input to ``email.utils.getaddresses``.
123
+ #
124
+ # Important: decode ``Header`` values into a plain parseable string first.
125
+ # In practice, strict address parsing can treat raw encoded-word tokens like
126
+ # ``=?unknown-8bit?...?=`` as the *address* itself, producing output such as
127
+ # ``To: =?unknown-8bit?...?=``. Decoding first gives
128
+ # ``Álpám Longsom <recipient@example.com>`` so getaddresses() can split
129
+ # name/address correctly.
130
+ if raw_header is None:
131
+ return []
132
+ if isinstance(raw_header, email.header.Header):
133
+ raw_header = decode_header_part(raw_header.encode())
134
+ elif not isinstance(raw_header, str):
135
+ raw_header = str(raw_header)
136
+
110
137
  parsed = email.utils.getaddresses([raw_header], strict=True)
111
138
 
112
139
  # If every result from the strict parser has an empty address — while the
@@ -119,7 +146,7 @@ def get_addresses(raw_header):
119
146
  results.append((m.group(1).strip(), m.group(2).strip()))
120
147
  elif m.group(4): # Any Name <email> (incl. email-as-display-name)
121
148
  results.append((m.group(3).strip(), m.group(4).strip()))
122
- elif m.group(5): # bare email
149
+ elif m.group(5): # bare email # pragma: no branch
123
150
  results.append(("", m.group(5).strip()))
124
151
  if results:
125
152
  log.debug(
@@ -447,7 +474,7 @@ def receiveds_parsing(receiveds):
447
474
 
448
475
  log.debug("len(receiveds) %s, len(parsed) %s" % (len(receiveds), len(parsed)))
449
476
 
450
- if len(receiveds) != len(parsed):
477
+ if len(receiveds) != len(parsed): # pragma: no cover
451
478
  # something really bad happened,
452
479
  # so just return raw receiveds with hop indices
453
480
  log.error(
@@ -16,4 +16,4 @@ See the License for the specific language governing permissions and
16
16
  limitations under the License.
17
17
  """
18
18
 
19
- __version__ = "4.2.1"
19
+ __version__ = "4.3.0"
@@ -1087,6 +1087,62 @@ This is plain text with 8bit encoding."""
1087
1087
  # Should have parsed successfully and body contains the text
1088
1088
  self.assertIn("hello", mail.body)
1089
1089
 
1090
+ def _make_utf8_raw(self, cte_header=""):
1091
+ """Return a minimal raw email bytes with UTF-8 body (em-dash U+2014)."""
1092
+ cte_line = f"Content-Transfer-Encoding: {cte_header}\n" if cte_header else ""
1093
+ raw = (
1094
+ "From: a@example.com\n"
1095
+ "To: b@example.com\n"
1096
+ "Subject: probe\n"
1097
+ f"Content-Type: text/plain; charset=utf-8\n"
1098
+ "MIME-Version: 1.0\n"
1099
+ f"{cte_line}"
1100
+ "\n"
1101
+ "Hello — world\n"
1102
+ )
1103
+ return raw.encode("utf-8")
1104
+
1105
+ def test_utf8_body_from_bytes_no_cte(self):
1106
+ """parse_from_bytes must not mojibake UTF-8 body with no CTE (issue #152)."""
1107
+ raw = self._make_utf8_raw()
1108
+ mail = mailparser.parse_from_bytes(raw)
1109
+ self.assertEqual(mail.text_plain[0], "Hello — world\n")
1110
+
1111
+ def test_utf8_body_from_bytes_cte_8bit(self):
1112
+ """parse_from_bytes must not mojibake UTF-8 body with CTE: 8bit (issue #152)."""
1113
+ raw = self._make_utf8_raw("8bit")
1114
+ mail = mailparser.parse_from_bytes(raw)
1115
+ self.assertEqual(mail.text_plain[0], "Hello — world\n")
1116
+
1117
+ def test_utf8_body_from_bytes_cte_7bit(self):
1118
+ """parse_from_bytes must not mojibake ASCII body with CTE: 7bit (issue #152)."""
1119
+ raw = (
1120
+ b"From: a@example.com\n"
1121
+ b"To: b@example.com\n"
1122
+ b"Content-Type: text/plain; charset=utf-8\n"
1123
+ b"Content-Transfer-Encoding: 7bit\n"
1124
+ b"\n"
1125
+ b"Hello world\n"
1126
+ )
1127
+ mail = mailparser.parse_from_bytes(raw)
1128
+ self.assertEqual(mail.text_plain[0], "Hello world\n")
1129
+
1130
+ def test_utf8_body_from_bytes_matches_from_string(self):
1131
+ """parse_from_bytes and parse_from_string: identical text_plain (issue #152)."""
1132
+ raw_bytes = self._make_utf8_raw()
1133
+ raw_str = raw_bytes.decode("utf-8")
1134
+ mail_b = mailparser.parse_from_bytes(raw_bytes)
1135
+ mail_s = mailparser.parse_from_string(raw_str)
1136
+ self.assertEqual(mail_b.text_plain[0], mail_s.text_plain[0])
1137
+
1138
+ def test_utf8_body_from_bytes_8bit_matches_from_string(self):
1139
+ """parse_from_bytes (8bit CTE) matches parse_from_string (issue #152)."""
1140
+ raw_bytes = self._make_utf8_raw("8bit")
1141
+ raw_str = raw_bytes.decode("utf-8")
1142
+ mail_b = mailparser.parse_from_bytes(raw_bytes)
1143
+ mail_s = mailparser.parse_from_string(raw_str)
1144
+ self.assertEqual(mail_b.text_plain[0], mail_s.text_plain[0])
1145
+
1090
1146
 
1091
1147
  class TestEmailAsDisplayName(unittest.TestCase):
1092
1148
  """
@@ -26,6 +26,7 @@ from mailparser.exceptions import MailParserOSError, MailParserReceivedParsingEr
26
26
  from mailparser.utils import (
27
27
  decode_header_part,
28
28
  find_between,
29
+ get_addresses,
29
30
  msgconvert,
30
31
  parse_received,
31
32
  ported_open,
@@ -240,23 +241,14 @@ class TestUtilsEdgeCases(unittest.TestCase):
240
241
  self.assertEqual(len(result.sha256), 64)
241
242
  self.assertEqual(len(result.sha512), 128)
242
243
 
243
- def test_parse_received_with_multiple_matches_error(self):
244
- """Test parse_received raises error on multiple matches for same pattern"""
245
- # This tests the error branch when multiple matches are found
246
- # We need a received header that triggers duplicate matches
244
+ def test_parse_received_with_multiple_from_clauses(self):
245
+ """parse_received keeps only first 'from' clause (RFC 5321 one-from rule)"""
247
246
  received = (
248
247
  "from server.example.com from server2.example.com by mail.example.com"
249
248
  )
250
-
251
- # Depending on the patterns, this might raise an error or succeed
252
- # We test that it handles the scenario correctly
253
- try:
254
- result = parse_received(received)
255
- # If it succeeds, it should return a dict
256
- self.assertIsInstance(result, dict)
257
- except MailParserReceivedParsingError as e:
258
- # This is the expected path for multiple matches
259
- self.assertIn("More than one match", str(e))
249
+ result = parse_received(received)
250
+ self.assertIsInstance(result, dict)
251
+ self.assertEqual(result["from"], "server.example.com")
260
252
 
261
253
  def test_get_to_domains_with_keyerror(self):
262
254
  """Test get_to_domains handles KeyError gracefully"""
@@ -442,10 +434,8 @@ class TestUtilsEdgeCases(unittest.TestCase):
442
434
 
443
435
  result = receiveds_format(parsed)
444
436
  self.assertIsInstance(result, list)
445
- # Should have date_utc and delay calculated
446
- if result[0].get("date_utc"):
447
- self.assertIn("date_utc", result[0])
448
- self.assertIn("delay", result[1])
437
+ self.assertIn("date_utc", result[0])
438
+ self.assertIn("delay", result[1])
449
439
 
450
440
  def test_receiveds_format_with_invalid_date(self):
451
441
  """Test receiveds_format handles invalid dates"""
@@ -600,6 +590,105 @@ class TestUtilsEdgeCases(unittest.TestCase):
600
590
  self.assertIsInstance(result, str)
601
591
  self.assertEqual(result, raw_val)
602
592
 
593
+ def test_get_addresses_handles_header_object(self):
594
+ """
595
+ Test that get_addresses accepts an email.header.Header instance
596
+ without raising AttributeError on `.strip()`.
597
+
598
+ Regression for the case where Message.get(name) returns a Header
599
+ for address headers containing RFC 2047 encoded-words (e.g.
600
+ non-ASCII display names like ``=?utf-8?q?=C3=81lp=C3=A1m_Longsom?=``).
601
+ Before the fix, this raised:
602
+
603
+ AttributeError: 'Header' object has no attribute 'strip'
604
+ """
605
+ from email.header import Header
606
+
607
+ header_obj = Header("Álpám Longsom", charset="utf-8")
608
+ header_obj.append(" <recipient@example.com>", charset="us-ascii")
609
+ result = get_addresses(header_obj)
610
+ self.assertIsInstance(result, list)
611
+ self.assertEqual(len(result), 1)
612
+ display_name, addr = result[0]
613
+ self.assertEqual(addr, "recipient@example.com")
614
+ # get_addresses decodes Header input to a plain string before parsing,
615
+ # so the display name is returned as Unicode, not as an encoded-word.
616
+ self.assertEqual(display_name, "Álpám Longsom")
617
+
618
+ def test_get_addresses_handles_unknown_8bit_header_object(self):
619
+ """
620
+ Regression for real-world unknown-8bit encoded-word headers.
621
+ Address parsing must not treat the encoded-word token itself as
622
+ the address.
623
+ """
624
+ from email.header import Header
625
+
626
+ encoded_name = "=?unknown-8bit?b?w4FscMOhbSBMb25nc29t?="
627
+ header_obj = Header()
628
+ header_obj.append(encoded_name, charset="us-ascii")
629
+ header_obj.append(" <recipient@example.com>", charset="us-ascii")
630
+
631
+ result = get_addresses(header_obj)
632
+ self.assertEqual(result, [("Álpám Longsom", "recipient@example.com")])
633
+
634
+ def test_get_addresses_handles_none(self):
635
+ """
636
+ Test that get_addresses returns an empty list when given None,
637
+ rather than crashing on attribute access.
638
+ """
639
+ self.assertEqual(get_addresses(None), [])
640
+
641
+ def test_get_addresses_plain_string_unchanged(self):
642
+ """
643
+ Test that the existing plain-string path still works. This guards
644
+ against accidentally regressing the common case while adding
645
+ Header / None handling.
646
+ """
647
+ result = get_addresses("Plain Name <plain@example.com>")
648
+ self.assertEqual(result, [("Plain Name", "plain@example.com")])
649
+
650
+ def test_mailparser_from_bytes_preserves_unicode_display_name(self):
651
+ """
652
+ Regression: Header objects from Message.get(name) must round-trip
653
+ through get_addresses() without introducing replacement characters.
654
+
655
+ The parser should expose the decoded Unicode display name on
656
+ MailParser.to.
657
+ """
658
+ from mailparser.core import MailParser
659
+
660
+ raw_email = (
661
+ b"From: Sender <sender@example.com>\r\n"
662
+ b"To: =?utf-8?b?w4FscMOhbSBMb25nc29t?= <recipient@example.com>\r\n"
663
+ b"Subject: Test\r\n"
664
+ b"Date: Tue, 12 May 2026 18:00:00 +0000\r\n"
665
+ b"Content-Type: text/plain; charset=utf-8\r\n"
666
+ b"\r\n"
667
+ b"hello\r\n"
668
+ )
669
+
670
+ mail = MailParser.from_bytes(raw_email)
671
+ self.assertEqual(mail.to, [("Álpám Longsom", "recipient@example.com")])
672
+
673
+ def test_mailparser_from_bytes_unknown_8bit_display_name(self):
674
+ """
675
+ End-to-end regression for unknown-8bit encoded-word in To header.
676
+ """
677
+ from mailparser.core import MailParser
678
+
679
+ raw_email = (
680
+ b"From: Sender <sender@example.com>\r\n"
681
+ b"To: =?unknown-8bit?b?w4FscMOhbSBMb25nc29t?= <recipient@example.com>\r\n"
682
+ b"Subject: Test\r\n"
683
+ b"Date: Tue, 12 May 2026 18:00:00 +0000\r\n"
684
+ b"Content-Type: text/plain; charset=utf-8\r\n"
685
+ b"\r\n"
686
+ b"hello\r\n"
687
+ )
688
+
689
+ mail = MailParser.from_bytes(raw_email)
690
+ self.assertEqual(mail.to, [("Álpám Longsom", "recipient@example.com")])
691
+
603
692
  def test_parse_received_envelope_from_with_angle_brackets(self):
604
693
  """Test utils.py:294-296 — envelope-from clause with angle-bracket match"""
605
694
  # When envelope-from keyword is present AND its value has angle
@@ -644,3 +733,69 @@ class TestUtilsEdgeCases(unittest.TestCase):
644
733
  self.assertIsInstance(result, dict)
645
734
  # envelope_from must NOT be set
646
735
  self.assertNotIn("envelope_from", result)
736
+
737
+ def test_get_addresses_non_str_non_header(self):
738
+ """get_addresses coerces unexpected types via str() (utils.py:135)"""
739
+ result = get_addresses(42) # type: ignore[arg-type]
740
+ self.assertIsInstance(result, list)
741
+
742
+ def test_get_addresses_fallback_regex_bare_email(self):
743
+ """fallback regex matches bare email (utils.py:149-150)"""
744
+ with patch(
745
+ "mailparser.utils.email.utils.getaddresses", return_value=[("", "")]
746
+ ):
747
+ result = get_addresses("user@example.com")
748
+ self.assertEqual(result, [("", "user@example.com")])
749
+
750
+ def test_get_addresses_fallback_regex_quoted_name(self):
751
+ """fallback regex matches quoted-name format (utils.py:145-146)"""
752
+ with patch(
753
+ "mailparser.utils.email.utils.getaddresses", return_value=[("", "")]
754
+ ):
755
+ result = get_addresses('"Quoted Name" <user@example.com>')
756
+ self.assertEqual(result, [("Quoted Name", "user@example.com")])
757
+
758
+ def test_get_addresses_fallback_regex_any_name(self):
759
+ """fallback regex matches any-name format (utils.py:147-148)"""
760
+ with patch(
761
+ "mailparser.utils.email.utils.getaddresses", return_value=[("", "")]
762
+ ):
763
+ result = get_addresses("Any Name <user@example.com>")
764
+ self.assertEqual(result, [("Any Name", "user@example.com")])
765
+
766
+ def test_get_addresses_fallback_regex_no_matches(self):
767
+ """fallback regex finds no addresses, falls through to return parsed"""
768
+ with patch(
769
+ "mailparser.utils.email.utils.getaddresses", return_value=[("", "")]
770
+ ):
771
+ result = get_addresses("not an email address at all")
772
+ self.assertEqual(result, [("", "")])
773
+
774
+ def test_parse_received_sendgrid_date(self):
775
+ """parse_received extracts SendGrid non-standard date (utils.py:389-390)"""
776
+ received = (
777
+ "from sendgrid.net by mx.example.com with SMTP"
778
+ " 2024-01-15 10:30:45.123456789 +0000 UTC m=+1234.567890123"
779
+ )
780
+ result = parse_received(received)
781
+ self.assertIn("date", result)
782
+ self.assertIn("2024-01-15", result["date"])
783
+
784
+ def test_parse_received_for_clause(self):
785
+ """parse_received captures 'for' clause (utils.py:411)"""
786
+ received = (
787
+ "from mail.example.com by mx.example.com"
788
+ " for <user@example.com>; Mon, 15 Jan 2024 10:30:45 +0000"
789
+ )
790
+ result = parse_received(received)
791
+ self.assertIn("for", result)
792
+ self.assertIn("user@example.com", result["for"])
793
+
794
+ def test_parse_received_envelope_from_embedded_in_clause(self):
795
+ """parse_received extracts envelope-from embedded in clause value"""
796
+ received = (
797
+ "from mail.example.com (envelope-from <sender@example.com>)"
798
+ " by mx.example.com; Mon, 15 Jan 2024 10:30:45 +0000"
799
+ )
800
+ result = parse_received(received)
801
+ self.assertEqual(result.get("envelope_from"), "sender@example.com")
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes