mail-parser 4.2.1__tar.gz → 4.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. {mail_parser-4.2.1 → mail_parser-4.4.0}/.github/workflows/main.yml +2 -1
  2. mail_parser-4.4.0/CLAUDE.md +97 -0
  3. mail_parser-4.2.1/README.md → mail_parser-4.4.0/PKG-INFO +61 -10
  4. mail_parser-4.2.1/PKG-INFO → mail_parser-4.4.0/README.md +34 -35
  5. {mail_parser-4.2.1 → mail_parser-4.4.0}/pyproject.toml +3 -0
  6. {mail_parser-4.2.1 → mail_parser-4.4.0}/src/mailparser/core.py +38 -11
  7. {mail_parser-4.2.1 → mail_parser-4.4.0}/src/mailparser/utils.py +103 -10
  8. {mail_parser-4.2.1 → mail_parser-4.4.0}/src/mailparser/version.py +1 -1
  9. {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/test_mail_parser.py +148 -1
  10. {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/test_main.py +11 -4
  11. {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/test_utils.py +173 -18
  12. mail_parser-4.4.0/uv.lock +2529 -0
  13. mail_parser-4.2.1/uv.lock +0 -1322
  14. {mail_parser-4.2.1 → mail_parser-4.4.0}/.github/FUNDING.yml +0 -0
  15. {mail_parser-4.2.1 → mail_parser-4.4.0}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  16. {mail_parser-4.2.1 → mail_parser-4.4.0}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  17. {mail_parser-4.2.1 → mail_parser-4.4.0}/.github/copilot-instructions.md +0 -0
  18. {mail_parser-4.2.1 → mail_parser-4.4.0}/.github/instructions/containerization-docker-best-practices.instructions.md +0 -0
  19. {mail_parser-4.2.1 → mail_parser-4.4.0}/.github/instructions/github-actions-ci-cd-best-practices.instructions.md +0 -0
  20. {mail_parser-4.2.1 → mail_parser-4.4.0}/.github/instructions/markdown.instructions.md +0 -0
  21. {mail_parser-4.2.1 → mail_parser-4.4.0}/.github/instructions/python.instructions.md +0 -0
  22. {mail_parser-4.2.1 → mail_parser-4.4.0}/.gitignore +0 -0
  23. {mail_parser-4.2.1 → mail_parser-4.4.0}/.markdownlint.json +0 -0
  24. {mail_parser-4.2.1 → mail_parser-4.4.0}/.pre-commit-config.yaml +0 -0
  25. {mail_parser-4.2.1 → mail_parser-4.4.0}/Dockerfile +0 -0
  26. {mail_parser-4.2.1 → mail_parser-4.4.0}/LICENSE.txt +0 -0
  27. {mail_parser-4.2.1 → mail_parser-4.4.0}/Makefile +0 -0
  28. {mail_parser-4.2.1 → mail_parser-4.4.0}/NOTICE.txt +0 -0
  29. {mail_parser-4.2.1 → mail_parser-4.4.0}/docker-compose.yml +0 -0
  30. {mail_parser-4.2.1 → mail_parser-4.4.0}/docs/images/Bitcoin SpamScope.jpg +0 -0
  31. {mail_parser-4.2.1 → mail_parser-4.4.0}/src/mailparser/__init__.py +0 -0
  32. {mail_parser-4.2.1 → mail_parser-4.4.0}/src/mailparser/__main__.py +0 -0
  33. {mail_parser-4.2.1 → mail_parser-4.4.0}/src/mailparser/const.py +0 -0
  34. {mail_parser-4.2.1 → mail_parser-4.4.0}/src/mailparser/exceptions.py +0 -0
  35. {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_malformed_1 +0 -0
  36. {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_malformed_2 +0 -0
  37. {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_malformed_3 +0 -0
  38. {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_outlook_1 +0 -0
  39. {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_test_1 +0 -0
  40. {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_test_10 +0 -0
  41. {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_test_11 +0 -0
  42. {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_test_12 +0 -0
  43. {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_test_13 +0 -0
  44. {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_test_14 +0 -0
  45. {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_test_15 +0 -0
  46. {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_test_16 +0 -0
  47. {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_test_17 +0 -0
  48. {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_test_18 +0 -0
  49. {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_test_19 +0 -0
  50. {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_test_2 +0 -0
  51. {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_test_3 +0 -0
  52. {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_test_4 +0 -0
  53. {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_test_5 +0 -0
  54. {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_test_6 +0 -0
  55. {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_test_7 +0 -0
  56. {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_test_8 +0 -0
  57. {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/mails/mail_test_9 +0 -0
  58. {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/test_improved_received_patterns.py +0 -0
  59. {mail_parser-4.2.1 → mail_parser-4.4.0}/tests/test_received_corpus.py +0 -0
@@ -29,7 +29,7 @@ jobs:
29
29
  curl -LsSf https://astral.sh/uv/install.sh | sh
30
30
  sudo apt-get -qq update
31
31
  sudo apt-get install -y libemail-outlook-message-perl
32
- uv sync
32
+ uv sync --all-extras
33
33
  export PERL_MM_USE_DEFAULT=1
34
34
  sudo cpan -f -i Email::Outlook::Message
35
35
 
@@ -41,6 +41,7 @@ jobs:
41
41
  uv run mail-parser -v
42
42
  uv run mail-parser -h
43
43
  uv run mail-parser -f tests/mails/mail_malformed_3 -j
44
+ uv run mail-parser -f tests/mails/mail_outlook_1 -o -j
44
45
  cat tests/mails/mail_malformed_3 | uv run mail-parser -k -j
45
46
 
46
47
  - name: Run pre-commit
@@ -0,0 +1,97 @@
1
+ # CLAUDE.md
2
+
3
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4
+
5
+ ## Engineering Principles
6
+
7
+ - **DRY** — extract repeated logic; no copy-paste code.
8
+ - **KISS** — simplest solution that works; no clever tricks.
9
+ - **YAGNI** — no code for hypothetical future needs.
10
+ - **SRP** — one function/class, one responsibility.
11
+ - **Fail fast** — validate at boundaries; trust internal code.
12
+ - No dead code, commented-out blocks, or unused imports.
13
+ - Every third-party import must have an explicit entry in `pyproject.toml`. Use extras when the
14
+ runtime feature requires them (e.g. `elasticsearch[async]`).
15
+
16
+ ## Python Style
17
+
18
+ - Follow PEP 8. Use 4 spaces, never tabs.
19
+ - Lines: 79 chars for code, 72 for comments/docstrings.
20
+ - `snake_case` for functions/variables, `CapWords` for classes, `UPPER_CASE` for constants.
21
+ - Imports order: stdlib → third-party → local.
22
+ - Add docstrings to all public functions, methods, classes, and modules (params, return values,
23
+ exceptions).
24
+
25
+ ## Commands
26
+
27
+ ```bash
28
+ uv sync --all-extras # install all dependencies, including optional extras
29
+ uv run pytest # run all tests
30
+ uv run pytest tests/test_mail_parser.py::TestMailParser::test_name # single test
31
+ uv run ruff check . # lint
32
+ uv run ruff format . # format
33
+ make check # lint + tests
34
+ make build # clean + build wheel/sdist
35
+ ```
36
+
37
+ ## Architecture
38
+
39
+ **Package layout**: `src/mailparser/` (src layout, no runtime deps — stdlib only).
40
+
41
+ ### Module responsibilities
42
+
43
+ | Module | Role |
44
+ | --------------- | --------------------------------------------------------------------- |
45
+ | `core.py` | `MailParser` class + `parse_from_*` factory functions |
46
+ | `utils.py` | Stateless parsing helpers (address, received headers, date, encoding) |
47
+ | `const.py` | Compiled regexes, header sets, clause splitter |
48
+ | `exceptions.py` | Custom exception hierarchy |
49
+ | `__main__.py` | CLI entry point (`mail-parser` command) |
50
+ | `__init__.py` | Re-exports public API |
51
+
52
+ ### Key design patterns
53
+
54
+ **Dynamic attribute access via `__getattr__`**: `MailParser` exposes every header dynamically.
55
+ Three access modes are supported for any attribute `X`:
56
+
57
+ - `parser.X` → Python object (str or list)
58
+ - `parser.X_json` → JSON string
59
+ - `parser.X_raw` → raw JSON via `message.get_all()`
60
+
61
+ Address headers listed in `const.ADDRESSES_HEADERS` (`from`, `to`, `cc`, `bcc`, `reply-to`,
62
+ `delivered-to`) return `list[tuple[str, str]]` (display_name, email) instead of plain strings.
63
+
64
+ **RFC non-compliance fallback in `get_addresses()`**: Python's
65
+ `email.utils.getaddresses(strict=True)` (hardened against CVE-2023-27043) rejects headers where
66
+ the display name contains `@`. Since this is a forensics tool, `utils.py` applies
67
+ `_ADDR_FALLBACK_RE` whenever strict parsing returns empty results — always surfacing what is
68
+ actually in the header.
69
+
70
+ **Received header parsing**: `utils.receiveds_parsing()` tokenizes on RFC 5321 clause keywords
71
+ (`from`, `by`, `via`, `with`, `id`, `for`, `envelope-from`) using `const._CLAUSE_SPLITTER`.
72
+ Output list is ordered first-hop first. Unparseable headers fall back to `{"raw": ...}`.
73
+
74
+ **Defect detection**: During `parse()`, every MIME part is walked and `_append_defects()` records
75
+ RFC violations. `EPILOGUE_DEFECTS` triggers special epilogue extraction to recover hidden payloads
76
+ in malformed boundaries.
77
+
78
+ **Outlook support**: `parse_from_file_msg()` converts `.msg` → `.eml` with the pure-Python `extract-msg` backend when it is installed, otherwise falls back to the deprecated `msgconvert` Perl tool
79
+ (`libemail-outlook-message-perl`), then parses the result normally.
80
+
81
+ **Partial vs full mail**: `_make_mail(complete=True)` includes all headers found in the message;
82
+ `complete=False` restricts to `const.ADDRESSES_HEADERS | const.OTHERS_PARTS` (the "main" headers).
83
+ Accessible as `parser.mail` / `parser.mail_partial`.
84
+
85
+ ### Adding new headers
86
+
87
+ To make a header always appear in partial output, add its lowercase name to `OTHERS_PARTS` in
88
+ `const.py`. Address-type headers (returning parsed name/email tuples) go in `ADDRESSES_HEADERS`.
89
+
90
+ ## Workflow
91
+
92
+ After every change:
93
+
94
+ 1. Add/update unittests to cover the change.
95
+ 1. Update README.md if the change affects usage, API, or setup.
96
+ 1. Stage changes and run pre-commit; fix all reported issues before proceeding.
97
+ 1. Run full test suite; fix all failures before reporting done.
@@ -1,3 +1,30 @@
1
+ Metadata-Version: 2.4
2
+ Name: mail-parser
3
+ Version: 4.4.0
4
+ Summary: A tool that parses emails by enhancing the Python standard library, extracting all details into a comprehensive object.
5
+ Author-email: Fedele Mantuano <mantuano.fedele@gmail.com>
6
+ Maintainer-email: Fedele Mantuano <mantuano.fedele@gmail.com>
7
+ License-Expression: Apache-2.0
8
+ License-File: LICENSE.txt
9
+ License-File: NOTICE.txt
10
+ Keywords: email,forensics,mail,malware,parser,phishing,security,spam,threat detection
11
+ Classifier: Natural Language :: English
12
+ Classifier: Operating System :: MacOS
13
+ Classifier: Operating System :: Microsoft :: Windows
14
+ Classifier: Operating System :: Unix
15
+ Classifier: Programming Language :: Python
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.9
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Programming Language :: Python :: 3.13
22
+ Classifier: Programming Language :: Python :: 3.14
23
+ Requires-Python: <3.15,>=3.9
24
+ Provides-Extra: outlook
25
+ Requires-Dist: extract-msg>=0.54; extra == 'outlook'
26
+ Description-Content-Type: text/markdown
27
+
1
28
  [![PyPI - Version](https://img.shields.io/pypi/v/mail-parser)](https://pypi.org/project/mail-parser/)
2
29
  [![Coverage Status](https://coveralls.io/repos/github/SpamScope/mail-parser/badge.svg?branch=develop)](https://coveralls.io/github/SpamScope/mail-parser?branch=develop)
3
30
  [![PyPI - Downloads](https://img.shields.io/pypi/dm/mail-parser?color=blue)](https://pypistats.org/packages/mail-parser)
@@ -36,20 +63,44 @@ formats, making it versatile for diverse email ecosystems.
36
63
  **⚡ Production-Ready**: Trusted by security professionals and developers worldwide, with extensive
37
64
  test coverage and proven reliability in high-stakes environments.
38
65
 
39
- Additionally, mail-parser provides full support for parsing Outlook email formats (.msg). To enable
40
- this functionality on Debian-based systems, simply install the required system package:
66
+ mail-parser is fully compatible with Python 3, ensuring modern performance and reliability.
41
67
 
42
- ```bash
43
- apt-get install libemail-outlook-message-perl
44
- ```
68
+ ## Parsing Outlook `.msg` files
45
69
 
46
- For further details about the package, you can run:
70
+ mail-parser converts Outlook `.msg` files to standard `.eml` before parsing.
71
+ Two conversion backends are supported:
47
72
 
48
- ```bash
49
- apt-cache show libemail-outlook-message-perl
50
- ```
73
+ 1. **`extract-msg` (recommended, pure Python).** No external tools required.
74
+ Install the optional extra:
51
75
 
52
- mail-parser is fully compatible with Python 3, ensuring modern performance and reliability.
76
+ ```bash
77
+ pip install mail-parser[outlook]
78
+ ```
79
+
80
+ 1. **`msgconvert` (deprecated, external Perl tool).** Requires the
81
+ `libemail-outlook-message-perl` system package:
82
+
83
+ ```bash
84
+ apt-get install libemail-outlook-message-perl # Debian-based systems
85
+ apt-cache show libemail-outlook-message-perl # package details
86
+ ```
87
+
88
+ **Backend precedence:** when `extract-msg` is installed it is used first.
89
+ Only when it is *not* available does mail-parser fall back to the `msgconvert`
90
+ external tool, logging a deprecation warning. If neither backend is available,
91
+ `parse_from_file_msg()` raises `MailParserOSError` telling you to install
92
+ either path.
93
+
94
+ > **⚠️ Deprecated:** the `msgconvert` external-tool backend is deprecated and
95
+ > will be removed in a future release. Migrate to the pure-Python backend with
96
+ > `pip install mail-parser[outlook]`.
97
+
98
+ **💥 BREAKING CHANGE:** the default `.msg` conversion backend changed.
99
+ When `extract-msg` is installed it is now preferred over `msgconvert`. The two
100
+ converters produce different intermediate `.eml` output, so some parsed fields
101
+ (header ordering, encoding edge cases, attachment naming) can differ from the
102
+ previous `msgconvert`-only behavior. Downstream code asserting on exact
103
+ `.msg`-derived output may need updating.
53
104
 
54
105
  # Apache 2 Open Source License
55
106
 
@@ -1,28 +1,3 @@
1
- Metadata-Version: 2.4
2
- Name: mail-parser
3
- Version: 4.2.1
4
- Summary: A tool that parses emails by enhancing the Python standard library, extracting all details into a comprehensive object.
5
- Author-email: Fedele Mantuano <mantuano.fedele@gmail.com>
6
- Maintainer-email: Fedele Mantuano <mantuano.fedele@gmail.com>
7
- License-Expression: Apache-2.0
8
- License-File: LICENSE.txt
9
- License-File: NOTICE.txt
10
- Keywords: email,forensics,mail,malware,parser,phishing,security,spam,threat detection
11
- Classifier: Natural Language :: English
12
- Classifier: Operating System :: MacOS
13
- Classifier: Operating System :: Microsoft :: Windows
14
- Classifier: Operating System :: Unix
15
- Classifier: Programming Language :: Python
16
- Classifier: Programming Language :: Python :: 3
17
- Classifier: Programming Language :: Python :: 3.9
18
- Classifier: Programming Language :: Python :: 3.10
19
- Classifier: Programming Language :: Python :: 3.11
20
- Classifier: Programming Language :: Python :: 3.12
21
- Classifier: Programming Language :: Python :: 3.13
22
- Classifier: Programming Language :: Python :: 3.14
23
- Requires-Python: <3.15,>=3.9
24
- Description-Content-Type: text/markdown
25
-
26
1
  [![PyPI - Version](https://img.shields.io/pypi/v/mail-parser)](https://pypi.org/project/mail-parser/)
27
2
  [![Coverage Status](https://coveralls.io/repos/github/SpamScope/mail-parser/badge.svg?branch=develop)](https://coveralls.io/github/SpamScope/mail-parser?branch=develop)
28
3
  [![PyPI - Downloads](https://img.shields.io/pypi/dm/mail-parser?color=blue)](https://pypistats.org/packages/mail-parser)
@@ -61,20 +36,44 @@ formats, making it versatile for diverse email ecosystems.
61
36
  **⚡ Production-Ready**: Trusted by security professionals and developers worldwide, with extensive
62
37
  test coverage and proven reliability in high-stakes environments.
63
38
 
64
- Additionally, mail-parser provides full support for parsing Outlook email formats (.msg). To enable
65
- this functionality on Debian-based systems, simply install the required system package:
39
+ mail-parser is fully compatible with Python 3, ensuring modern performance and reliability.
66
40
 
67
- ```bash
68
- apt-get install libemail-outlook-message-perl
69
- ```
41
+ ## Parsing Outlook `.msg` files
70
42
 
71
- For further details about the package, you can run:
43
+ mail-parser converts Outlook `.msg` files to standard `.eml` before parsing.
44
+ Two conversion backends are supported:
72
45
 
73
- ```bash
74
- apt-cache show libemail-outlook-message-perl
75
- ```
46
+ 1. **`extract-msg` (recommended, pure Python).** No external tools required.
47
+ Install the optional extra:
76
48
 
77
- mail-parser is fully compatible with Python 3, ensuring modern performance and reliability.
49
+ ```bash
50
+ pip install mail-parser[outlook]
51
+ ```
52
+
53
+ 1. **`msgconvert` (deprecated, external Perl tool).** Requires the
54
+ `libemail-outlook-message-perl` system package:
55
+
56
+ ```bash
57
+ apt-get install libemail-outlook-message-perl # Debian-based systems
58
+ apt-cache show libemail-outlook-message-perl # package details
59
+ ```
60
+
61
+ **Backend precedence:** when `extract-msg` is installed it is used first.
62
+ Only when it is *not* available does mail-parser fall back to the `msgconvert`
63
+ external tool, logging a deprecation warning. If neither backend is available,
64
+ `parse_from_file_msg()` raises `MailParserOSError` telling you to install
65
+ either path.
66
+
67
+ > **⚠️ Deprecated:** the `msgconvert` external-tool backend is deprecated and
68
+ > will be removed in a future release. Migrate to the pure-Python backend with
69
+ > `pip install mail-parser[outlook]`.
70
+
71
+ **💥 BREAKING CHANGE:** the default `.msg` conversion backend changed.
72
+ When `extract-msg` is installed it is now preferred over `msgconvert`. The two
73
+ converters produce different intermediate `.eml` output, so some parsed fields
74
+ (header ordering, encoding edge cases, attachment naming) can differ from the
75
+ previous `msgconvert`-only behavior. Downstream code asserting on exact
76
+ `.msg`-derived output may need updating.
78
77
 
79
78
  # Apache 2 Open Source License
80
79
 
@@ -28,6 +28,9 @@ maintainers = [
28
28
  ]
29
29
  dependencies = []
30
30
 
31
+ [project.optional-dependencies]
32
+ outlook = ["extract-msg>=0.54"]
33
+
31
34
  [dependency-groups]
32
35
  dev = [
33
36
  "build>=1.2.2.post1",
@@ -18,6 +18,7 @@ limitations under the License.
18
18
 
19
19
  import base64
20
20
  import email
21
+ import importlib.util
21
22
  import ipaddress
22
23
  import json
23
24
  import logging
@@ -27,6 +28,7 @@ from mailparser.const import ADDRESSES_HEADERS, EPILOGUE_DEFECTS, REGXIP, REGXIP
27
28
  from mailparser.utils import (
28
29
  convert_mail_date,
29
30
  decode_header_part,
31
+ extract_msg_convert,
30
32
  find_between,
31
33
  get_addresses,
32
34
  get_header,
@@ -186,14 +188,32 @@ class MailParser:
186
188
  Init a new object from an Outlook message file,
187
189
  mime type: application/vnd.ms-outlook
188
190
 
191
+ Conversion backend precedence:
192
+ 1. ``extract-msg`` (pure Python, optional ``outlook`` extra).
193
+ 2. ``msgconvert`` external Perl tool — **deprecated** fallback,
194
+ used only when ``extract-msg`` is not installed.
195
+
189
196
  Args:
190
197
  fp (string): file path of raw Outlook email
191
198
 
192
199
  Returns:
193
200
  Instance of MailParser
201
+
202
+ Raises:
203
+ MailParserOSError: if no conversion backend is available
194
204
  """
195
205
  log.debug("Parsing email from file Outlook")
196
- f, _ = msgconvert(fp)
206
+
207
+ if importlib.util.find_spec("extract_msg") is not None:
208
+ f, _ = extract_msg_convert(fp)
209
+ else:
210
+ log.warning(
211
+ "msgconvert backend is deprecated and will be removed "
212
+ "in a future release. Install the pure-Python Outlook "
213
+ "support with 'pip install mail-parser[outlook]'."
214
+ )
215
+ f, _ = msgconvert(fp)
216
+
197
217
  return cls.from_file(f, True)
198
218
 
199
219
  @classmethod
@@ -429,22 +449,29 @@ class MailParser:
429
449
  else:
430
450
  log.debug(f"Email part {i!r} is not an attachment")
431
451
 
432
- # Get the payload using get_payload method with decode=True
433
- # As Python truly decodes only 'base64',
434
- # 'quoted-printable', 'x-uuencode',
435
- # 'uuencode', 'uue', 'x-uue'
436
- # And for other encodings it breaks the characters so
437
- # we need to decode them with encoding python is appying
438
- # To maintain the characters
439
452
  payload = p.get_payload(decode=True)
440
453
  cte = p.get("Content-Transfer-Encoding")
441
454
  if cte:
442
455
  cte = cte.lower()
443
456
 
444
457
  if not cte or cte in ["7bit", "8bit"]:
445
- try:
446
- payload = payload.decode("raw-unicode-escape")
447
- except UnicodeDecodeError:
458
+ # message_from_bytes stores non-ASCII body bytes via
459
+ # ascii+surrogateescape, producing surrogates in the
460
+ # payload string. message_from_string stores a proper
461
+ # Unicode str (no surrogates). Detect which case we
462
+ # have via get_payload(decode=False) and decode
463
+ # accordingly so the declared charset is honoured.
464
+ raw_str = p.get_payload(decode=False)
465
+ if isinstance(raw_str, str):
466
+ try:
467
+ # Raises if surrogates present (from_bytes path)
468
+ raw_str.encode("utf-8")
469
+ payload = raw_str
470
+ except UnicodeEncodeError:
471
+ # Recover original bytes then decode with charset
472
+ orig_bytes = raw_str.encode("ascii", "surrogateescape")
473
+ payload = ported_string(orig_bytes, encoding=charset)
474
+ else:
448
475
  payload = ported_string(payload, encoding=charset)
449
476
  else:
450
477
  payload = ported_string(payload, encoding=charset)
@@ -16,6 +16,8 @@ See the License for the specific language governing permissions and
16
16
  limitations under the License.
17
17
  """
18
18
 
19
+ from __future__ import annotations
20
+
19
21
  import base64
20
22
  import datetime
21
23
  import email
@@ -75,7 +77,9 @@ _ADDR_FALLBACK_RE = re.compile(
75
77
  )
76
78
 
77
79
 
78
- def get_addresses(raw_header):
80
+ def get_addresses(
81
+ raw_header: str | email.header.Header | None,
82
+ ) -> list[tuple[str, str]]:
79
83
  """
80
84
  Parse email addresses from a raw address header with a fallback for
81
85
  RFC-non-compliant but real-world-common formats.
@@ -100,13 +104,36 @@ def get_addresses(raw_header):
100
104
  that was actually present in the header.
101
105
 
102
106
  Args:
103
- raw_header (str): raw value of an address header
104
- (e.g. ``From``, ``To``, ``CC`` …)
107
+ raw_header (str | email.header.Header | None): raw value of an
108
+ address header (e.g. ``From``, ``To``, ``CC`` …). Accepts a
109
+ plain ``str``, an ``email.header.Header`` instance (returned
110
+ by ``email.message.Message.get`` for headers containing
111
+ RFC 2047 encoded-words such as non-ASCII display names), or
112
+ ``None``.
105
113
 
106
114
  Returns:
107
115
  list[tuple[str, str]]: list of ``(display_name, email_addr)`` tuples.
108
116
  ``display_name`` is an empty string when absent.
109
117
  """
118
+ # ``Message.get(name)`` returns an ``email.header.Header`` for any header
119
+ # whose value contains RFC 2047 encoded-words (typical for non-ASCII
120
+ # display names like ``=?utf-8?q?=C3=81rp=C3=A1d?=``). ``Header`` does
121
+ # not implement string methods such as ``.strip()`` and is not a valid
122
+ # input to ``email.utils.getaddresses``.
123
+ #
124
+ # Important: decode ``Header`` values into a plain parseable string first.
125
+ # In practice, strict address parsing can treat raw encoded-word tokens like
126
+ # ``=?unknown-8bit?...?=`` as the *address* itself, producing output such as
127
+ # ``To: =?unknown-8bit?...?=``. Decoding first gives
128
+ # ``Álpám Longsom <recipient@example.com>`` so getaddresses() can split
129
+ # name/address correctly.
130
+ if raw_header is None:
131
+ return []
132
+ if isinstance(raw_header, email.header.Header):
133
+ raw_header = decode_header_part(raw_header.encode())
134
+ elif not isinstance(raw_header, str):
135
+ raw_header = str(raw_header)
136
+
110
137
  parsed = email.utils.getaddresses([raw_header], strict=True)
111
138
 
112
139
  # If every result from the strict parser has an empty address — while the
@@ -119,7 +146,7 @@ def get_addresses(raw_header):
119
146
  results.append((m.group(1).strip(), m.group(2).strip()))
120
147
  elif m.group(4): # Any Name <email> (incl. email-as-display-name)
121
148
  results.append((m.group(3).strip(), m.group(4).strip()))
122
- elif m.group(5): # bare email
149
+ elif m.group(5): # bare email # pragma: no branch
123
150
  results.append(("", m.group(5).strip()))
124
151
  if results:
125
152
  log.debug(
@@ -291,6 +318,66 @@ def fingerprints(data):
291
318
  return hashes(md5, sha1, sha256, sha512)
292
319
 
293
320
 
321
+ def _new_outlook_tempfile():
322
+ """
323
+ Create an empty temporary file to hold a converted Outlook email.
324
+
325
+ The OS-level file handle is closed immediately; callers write to the
326
+ returned path with their own handle (a subprocess ``--outfile`` for
327
+ ``msgconvert`` or a plain ``open`` for the pure-Python backend).
328
+
329
+ Returns:
330
+ str: path of the new temporary file (``outlook_`` prefix, no suffix) that will hold the converted ``.eml`` content
331
+ """
332
+ handle, path = tempfile.mkstemp(prefix="outlook_")
333
+ os.close(handle)
334
+ return path
335
+
336
+
337
+ def extract_msg_convert(fp):
338
+ """
339
+ Convert an Outlook ``.msg`` file to ``.eml`` using the pure-Python
340
+ ``extract-msg`` library (no external Perl tool required).
341
+
342
+ The ``extract_msg`` import is performed lazily inside this function so
343
+ that the package keeps importing with zero runtime dependencies when
344
+ the optional ``outlook`` extra is not installed.
345
+
346
+ Args:
347
+ fp (string): file path of the Outlook ``.msg`` mail
348
+
349
+ Returns:
350
+ tuple: ``(eml_path, info)`` where ``eml_path`` is the path of the
351
+ converted ``.eml`` file and ``info`` is a short descriptive string
352
+
353
+ Raises:
354
+ ImportError: if the ``extract-msg`` library is not installed
355
+ MailParserOSError: if the ``.msg`` is not a convertible email
356
+ message (e.g. a contact or calendar item)
357
+ """
358
+ import extract_msg # lazy: keep package import stdlib-only
359
+
360
+ log.debug("Started converting Outlook email with extract-msg")
361
+ msg = extract_msg.openMsg(fp)
362
+ try:
363
+ # openMsg() may return a non-email MSGFile (contact, calendar,
364
+ # task...) which cannot be rendered as an email message.
365
+ as_email_message = getattr(msg, "asEmailMessage", None)
366
+ if as_email_message is None:
367
+ raise MailParserOSError(
368
+ f"Outlook file {fp!r} is not a convertible email "
369
+ f"message (type {type(msg).__name__})"
370
+ )
371
+ eml = as_email_message()
372
+ info = f"{eml.get('From', '')} | {eml.get('Subject', '')}".strip()
373
+ temp = _new_outlook_tempfile()
374
+ with open(temp, "wb") as f:
375
+ f.write(eml.as_bytes())
376
+ return temp, info
377
+ finally:
378
+ msg.close()
379
+
380
+
294
381
  def msgconvert(email):
295
382
  """
296
383
  Exec msgconvert tool, to convert msg Outlook
@@ -302,9 +389,12 @@ def msgconvert(email):
302
389
  Returns:
303
390
  tuple with file path of mail converted and
304
391
  standard output data (str)
392
+
393
+ Raises:
394
+ MailParserOSError: if the ``msgconvert`` tool is not installed
305
395
  """
306
396
  log.debug("Started converting Outlook email")
307
- temph, temp = tempfile.mkstemp(prefix="outlook_")
397
+ temp = _new_outlook_tempfile()
308
398
  command = ["msgconvert", "--outfile", temp, email]
309
399
 
310
400
  try:
@@ -316,7 +406,13 @@ def msgconvert(email):
316
406
  )
317
407
 
318
408
  except OSError as e:
319
- message = f"Check if 'msgconvert' tool is installed / {e!r}"
409
+ message = (
410
+ "Cannot convert Outlook .msg: no conversion backend "
411
+ "available. Install pure-Python support with "
412
+ "'pip install mail-parser[outlook]', or install the "
413
+ "'msgconvert' Perl tool "
414
+ f"(libemail-outlook-message-perl). {e!r}"
415
+ )
320
416
  log.exception(message)
321
417
  raise MailParserOSError(message)
322
418
 
@@ -324,9 +420,6 @@ def msgconvert(email):
324
420
  stdoutdata, _ = out.communicate()
325
421
  return temp, stdoutdata.decode("utf-8").strip()
326
422
 
327
- finally:
328
- os.close(temph)
329
-
330
423
 
331
424
  def parse_received(received):
332
425
  """
@@ -447,7 +540,7 @@ def receiveds_parsing(receiveds):
447
540
 
448
541
  log.debug("len(receiveds) %s, len(parsed) %s" % (len(receiveds), len(parsed)))
449
542
 
450
- if len(receiveds) != len(parsed):
543
+ if len(receiveds) != len(parsed): # pragma: no cover
451
544
  # something really bad happened,
452
545
  # so just return raw receiveds with hop indices
453
546
  log.error(
@@ -16,4 +16,4 @@ See the License for the specific language governing permissions and
16
16
  limitations under the License.
17
17
  """
18
18
 
19
- __version__ = "4.2.1"
19
+ __version__ = "4.4.0"