unmhtml 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,55 @@
1
+ name: publish
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ jobs:
8
+ build:
9
+ name: Build package
10
+ runs-on: ubuntu-latest
11
+
12
+ steps:
13
+ - uses: actions/checkout@v4
14
+ - name: Set up Python
15
+ uses: actions/setup-python@v5
16
+ with:
17
+ python-version: "3.x"
18
+ - name: Install uv
19
+ uses: astral-sh/setup-uv@v4
20
+ - name: Install dependencies
21
+ run: uv sync
22
+ - name: Verify version matches tag
23
+ run: |
24
+ TAG_VERSION=${GITHUB_REF#refs/tags/v}
25
+ PYPROJECT_VERSION=$(python -c "import tomllib; print(tomllib.load(open('pyproject.toml', 'rb'))['project']['version'])")
26
+ if [ "$TAG_VERSION" != "$PYPROJECT_VERSION" ]; then
27
+ echo "Tag version ($TAG_VERSION) doesn't match pyproject.toml version ($PYPROJECT_VERSION)"
28
+ exit 1
29
+ fi
30
+ - name: Build a binary wheel and a source tarball
31
+ run: uv build
32
+ - name: Store the distribution packages
33
+ uses: actions/upload-artifact@v4
34
+ with:
35
+ name: python-package-distributions
36
+ path: dist/
37
+
38
+ publish-to-pypi:
39
+ name: Publish distribution package to PyPI
40
+ needs:
41
+ - build
42
+ runs-on: ubuntu-latest
43
+ environment:
44
+ name: pypi
45
+ url: https://pypi.org/p/unmhtml
46
+ permissions:
47
+ id-token: write
48
+ steps:
49
+ - name: Download all the dists
50
+ uses: actions/download-artifact@v4
51
+ with:
52
+ name: python-package-distributions
53
+ path: dist/
54
+ - name: Publish distribution package
55
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,10 @@
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+
9
+ # Virtual environments
10
+ .venv
@@ -0,0 +1 @@
1
+ 3.12
@@ -0,0 +1,153 @@
1
+ # **unmhtml - MHTML to HTML Converter Library Specification**
2
+
3
+ This document outlines the requirements and design for a Python library that converts MHTML (MIME HTML) files to standalone HTML files with embedded CSS, using only Python standard library modules.
4
+
5
+ ## **1. Project Goals and Scope**
6
+
7
+ This library is responsible for:
8
+
9
+ - Converting MHTML files to standalone HTML files with embedded CSS
10
+ - Preserving the original rendered content structure for accurate display
11
+ - Providing a pure Python implementation using only standard library modules
12
+ - Supporting integration with web applications that need to display archived content
13
+
14
+ ## **2. Technology Stack**
15
+
16
+ - **Programming Language:** Python 3.8+
17
+ - **Package Manager:** uv toolchain
18
+ - **Core Libraries:** Built-in Python standard library only
19
+ - `email` module for MIME parsing
20
+ - `base64` for resource decoding
21
+ - `mimetypes` for content type detection
22
+ - `urllib.parse` for URL handling
23
+ - `html` for HTML escaping
24
+ - `re` for text processing
25
+ - **No External Dependencies:** Pure stdlib implementation for maximum portability
26
+
27
+ ## **3. MHTML Format Understanding**
28
+
29
+ MHTML files are structured as MIME multipart documents:
30
+
31
+ - **Content-Type:** `multipart/related` or `message/rfc822`
32
+ - **Main HTML Document:** First part containing the primary HTML content
33
+ - **Embedded Resources:** Subsequent parts containing CSS, images, fonts, etc.
34
+ - **Resource References:** Content-Location headers link resources to HTML references
35
+ - **Encoding:** Binary resources are typically base64-encoded; text parts such as HTML and CSS commonly use quoted-printable
36
+
37
+ ## **4. Core Functionality**
38
+
39
+ ### **4.1. MHTML Parser**
40
+
41
+ **Class: `MHTMLParser`**
42
+
43
+ ```python
44
+ import email
45
+ import base64
46
+ from typing import Dict, Tuple
47
+
48
+ class MHTMLParser:
49
+ def __init__(self, mhtml_content: str):
50
+ self.mhtml_content = mhtml_content
51
+
52
+ def parse(self) -> Tuple[str, Dict[str, bytes]]:
53
+ """Parse MHTML and return main HTML + resource map"""
54
+ message = email.message_from_string(self.mhtml_content)
55
+ # Extract main HTML and resources
56
+ return main_html, resources
57
+ ```
58
+
59
+ ### **4.2. HTML Processor**
60
+
61
+ **Class: `HTMLProcessor`**
62
+
63
+ ```python
64
+ class HTMLProcessor:
65
+ def __init__(self, html_content: str, resources: Dict[str, bytes]):
66
+ self.html_content = html_content
67
+ self.resources = resources
68
+
69
+ def embed_css(self) -> str:
70
+ """Convert <link> tags to <style> tags with embedded CSS"""
71
+ # Replace external CSS references with inline styles
72
+ return modified_html
73
+
74
+ def convert_to_data_uris(self) -> str:
75
+ """Convert resource references to data URIs"""
76
+ # Replace src/href attributes with data: URLs
77
+ return modified_html
78
+ ```
79
+
80
+ ### **4.3. Main Converter**
81
+
82
+ **Class: `MHTMLConverter`**
83
+
84
+ ```python
85
+ class MHTMLConverter:
86
+ def convert_file(self, mhtml_path: str) -> str:
87
+ """Convert MHTML file to HTML string"""
88
+ with open(mhtml_path, 'r', encoding='utf-8') as f:
89
+ mhtml_content = f.read()
90
+ return self.convert(mhtml_content)
91
+
92
+ def convert(self, mhtml_content: str) -> str:
93
+ """Convert MHTML content to HTML string"""
94
+ parser = MHTMLParser(mhtml_content)
95
+ main_html, resources = parser.parse()
96
+
97
+ processor = HTMLProcessor(main_html, resources)
98
+ html_with_css = processor.embed_css()
99
+ final_html = processor.convert_to_data_uris()
100
+
101
+ return final_html
102
+ ```
103
+
104
+ ## **5. Package Structure**
105
+
106
+ ```
107
+ unmhtml/
108
+ ├── __init__.py
109
+ ├── parser.py # MHTMLParser class
110
+ ├── processor.py # HTMLProcessor class
111
+ ├── converter.py # MHTMLConverter main class
112
+ ├── utils.py # Utility functions
113
+ └── py.typed # Type hints marker
114
+ ```
115
+
116
+ ## **6. API Design**
117
+
118
+ ### **6.1. Simple API**
119
+
120
+ ```python
121
+ from unmhtml import MHTMLConverter
122
+
123
+ # Basic conversion
124
+ converter = MHTMLConverter()
125
+ html_content = converter.convert_file('page.mhtml')
126
+
127
+ # Direct content conversion
128
+ html_content = converter.convert(mhtml_string)
129
+ ```
130
+
131
+ ## **7. Key Features**
132
+
133
+ - **CSS Embedding:** Convert `<link rel="stylesheet">` to `<style>` tags
134
+ - **Resource Embedding:** Convert images/fonts to data URIs
135
+ - **URL Resolution:** Handle relative and absolute resource references
136
+ - **Error Handling:** Graceful degradation for malformed MHTML
137
+ - **Memory Efficient:** Process large files without excessive memory usage
138
+
139
+ ## **8. Testing Strategy**
140
+
141
+ - **Basic Functionality:** Test MHTML to HTML conversion works
142
+ - **Error Handling:** Test graceful handling of malformed input
143
+ - **Resource Embedding:** Verify CSS and resources are properly embedded
144
+
145
+ ## **9. Success Criteria**
146
+
147
+ - **Functionality:** Successfully convert MHTML files to standalone HTML
148
+ - **Performance:** Process typical web pages (1-5MB MHTML) efficiently
149
+ - **Reliability:** Handle malformed MHTML gracefully
150
+ - **Simplicity:** Clean, minimal API with good documentation
151
+ - **Portability:** Zero external dependencies, pure Python stdlib
152
+
153
+ This specification provides a focused foundation for building a lightweight MHTML to HTML converter library called unmhtml using the uv toolchain.
unmhtml-0.1.0/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Resolve Services
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
22
+
unmhtml-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,64 @@
1
+ Metadata-Version: 2.4
2
+ Name: unmhtml
3
+ Version: 0.1.0
4
+ Summary: MHTML to HTML converter library using Python stdlib
5
+ Author: Johan Schuijt
6
+ License: MIT
7
+ License-File: LICENSE
8
+ Keywords: archive,converter,html,mhtml,mime,web
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.8
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Internet :: WWW/HTTP
19
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
+ Classifier: Topic :: Text Processing :: Markup :: HTML
21
+ Requires-Python: >=3.8
22
+ Provides-Extra: test
23
+ Requires-Dist: pytest-cov; extra == 'test'
24
+ Requires-Dist: pytest>=6.0; extra == 'test'
25
+ Description-Content-Type: text/markdown
26
+
27
+ # unmhtml
28
+
29
+ Convert MHTML files to standalone HTML with embedded CSS and resources.
30
+
31
+ ## Installation
32
+
33
+ ```bash
34
+ pip install unmhtml
35
+ ```
36
+
37
+ ## Usage
38
+
39
+ ```python
40
+ from unmhtml import MHTMLConverter
41
+
42
+ # Convert MHTML file to HTML
43
+ converter = MHTMLConverter()
44
+ html_content = converter.convert_file('saved_page.mhtml')
45
+
46
+ # Save as standalone HTML
47
+ with open('output.html', 'w') as f:
48
+ f.write(html_content)
49
+ ```
50
+
51
+ ## Features
52
+
53
+ - **Pure Python** - No external dependencies, uses only standard library
54
+ - **Standalone HTML** - Embeds CSS and converts resources to data URIs
55
+ - **Graceful degradation** - Handles malformed MHTML files
56
+ - **Memory efficient** - Processes large files without excessive memory usage
57
+
58
+ ## Requirements
59
+
60
+ - Python 3.8+
61
+
62
+ ## License
63
+
64
+ MIT
@@ -0,0 +1,38 @@
1
+ # unmhtml
2
+
3
+ Convert MHTML files to standalone HTML with embedded CSS and resources.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install unmhtml
9
+ ```
10
+
11
+ ## Usage
12
+
13
+ ```python
14
+ from unmhtml import MHTMLConverter
15
+
16
+ # Convert MHTML file to HTML
17
+ converter = MHTMLConverter()
18
+ html_content = converter.convert_file('saved_page.mhtml')
19
+
20
+ # Save as standalone HTML
21
+ with open('output.html', 'w') as f:
22
+ f.write(html_content)
23
+ ```
24
+
25
+ ## Features
26
+
27
+ - **Pure Python** - No external dependencies, uses only standard library
28
+ - **Standalone HTML** - Embeds CSS and converts resources to data URIs
29
+ - **Graceful degradation** - Handles malformed MHTML files
30
+ - **Memory efficient** - Processes large files without excessive memory usage
31
+
32
+ ## Requirements
33
+
34
+ - Python 3.8+
35
+
36
+ ## License
37
+
38
+ MIT
@@ -0,0 +1,53 @@
1
+ [project]
2
+ name = "unmhtml"
3
+ version = "0.1.0"
4
+ description = "MHTML to HTML converter library using Python stdlib"
5
+ readme = "README.md"
6
+ requires-python = ">=3.8"
7
+ dependencies = []
8
+ authors = [
9
+ {name = "Johan Schuijt"}
10
+ ]
11
+ license = {text = "MIT"}
12
+ urls.Homepage = "https://github.com/resolve-works/unmhtml"  # project links belong in the project.urls table; bare homepage/repository keys are not part of the [project] metadata
13
+ urls.Repository = "https://github.com/resolve-works/unmhtml"
14
+ keywords = ["mhtml", "html", "converter", "mime", "web", "archive"]
15
+ classifiers = [
16
+ "Development Status :: 3 - Alpha",
17
+ "Intended Audience :: Developers",
18
+ "License :: OSI Approved :: MIT License",
19
+ "Programming Language :: Python :: 3",
20
+ "Programming Language :: Python :: 3.8",
21
+ "Programming Language :: Python :: 3.9",
22
+ "Programming Language :: Python :: 3.10",
23
+ "Programming Language :: Python :: 3.11",
24
+ "Programming Language :: Python :: 3.12",
25
+ "Topic :: Internet :: WWW/HTTP",
26
+ "Topic :: Software Development :: Libraries :: Python Modules",
27
+ "Topic :: Text Processing :: Markup :: HTML",
28
+ ]
29
+
30
+ [project.optional-dependencies]
31
+ test = ["pytest>=6.0", "pytest-cov"]
32
+
33
+ [build-system]
34
+ requires = ["hatchling"]
35
+ build-backend = "hatchling.build"
36
+
37
+ [tool.hatch.build.targets.wheel]
38
+ packages = ["unmhtml"]
39
+
40
+ [tool.pytest.ini_options]
41
+ testpaths = ["tests"]
42
+ python_files = ["test_*.py"]
43
+ python_classes = ["Test*"]
44
+ python_functions = ["test_*"]
45
+ addopts = [
46
+ "--strict-markers",
47
+ "--strict-config",
48
+ "--verbose",
49
+ ]
50
+ markers = [
51
+ "slow: marks tests as slow (deselect with '-m \"not slow\"')",
52
+ "integration: marks tests as integration tests",
53
+ ]
File without changes
@@ -0,0 +1,147 @@
1
+ """Pytest configuration and fixtures"""
2
+
3
+ import pytest
4
+ import tempfile
5
+ import os
6
+
7
+
8
+ @pytest.fixture
9
+ def simple_mhtml():
10
+ """Simple MHTML content for testing"""
11
+ return """From: <Saved by Blink>
12
+ Snapshot-Content-Location: https://example.com/page.html
13
+ Subject: Test Page
14
+ Date: Mon, 1 Jan 2024 12:00:00 GMT
15
+ MIME-Version: 1.0
16
+ Content-Type: multipart/related;
17
+ type="text/html";
18
+ boundary="----MultipartBoundary--test123"
19
+
20
+ ------MultipartBoundary--test123
21
+ Content-Type: text/html
22
+ Content-ID: <frame-test@example.com>
23
+ Content-Transfer-Encoding: quoted-printable
24
+ Content-Location: https://example.com/page.html
25
+
26
+ <!DOCTYPE html>
27
+ <html>
28
+ <head>
29
+ <title>Test Page</title>
30
+ <link rel="stylesheet" href="style.css">
31
+ </head>
32
+ <body>
33
+ <h1>Hello World</h1>
34
+ <img src="image.png" alt="Test Image">
35
+ </body>
36
+ </html>
37
+
38
+ ------MultipartBoundary--test123
39
+ Content-Type: text/css
40
+ Content-Transfer-Encoding: quoted-printable
41
+ Content-Location: https://example.com/style.css
42
+
43
+ body {
44
+ font-family: Arial, sans-serif;
45
+ margin: 0;
46
+ padding: 20px;
47
+ }
48
+
49
+ h1 {
50
+ color: #333;
51
+ }
52
+
53
+ ------MultipartBoundary--test123
54
+ Content-Type: image/png
55
+ Content-Transfer-Encoding: base64
56
+ Content-Location: https://example.com/image.png
57
+
58
+ iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg==
59
+
60
+ ------MultipartBoundary--test123--
61
+ """
62
+
63
+
64
+ @pytest.fixture
65
+ def malformed_mhtml():
66
+ """Return plain text with no MIME headers or multipart boundaries, for testing error handling"""
67
+ return """This is not a valid MHTML file
68
+ Just some random text
69
+ Without proper MIME structure
70
+ """
71
+
72
+
73
+ @pytest.fixture
74
+ def empty_mhtml():
75
+ """Return a multipart/related MHTML envelope whose body holds only the closing boundary (no parts)"""
76
+ return """From: <Saved by Blink>
77
+ MIME-Version: 1.0
78
+ Content-Type: multipart/related;
79
+ boundary="----MultipartBoundary--empty"
80
+
81
+ ------MultipartBoundary--empty--
82
+ """
83
+
84
+
85
+ @pytest.fixture
86
+ def html_with_css():
87
+ """Return an HTML page with one relative and one absolute stylesheet link and one relative and one absolute image"""
88
+ return """<!DOCTYPE html>
89
+ <html>
90
+ <head>
91
+ <title>Test Page</title>
92
+ <link rel="stylesheet" href="style.css">
93
+ <link rel="stylesheet" href="https://example.com/external.css">
94
+ </head>
95
+ <body>
96
+ <h1>Hello World</h1>
97
+ <p>This is a test page.</p>
98
+ <img src="image.png" alt="Test Image">
99
+ <img src="https://example.com/logo.png" alt="Logo">
100
+ </body>
101
+ </html>"""
102
+
103
+
104
+ @pytest.fixture
105
+ def sample_css():
106
+ """Return a CSS snippet containing two url() references, one single-quoted and one double-quoted"""
107
+ return """body {
108
+ font-family: Arial, sans-serif;
109
+ margin: 0;
110
+ padding: 20px;
111
+ background: url('background.jpg');
112
+ }
113
+
114
+ h1 {
115
+ color: #333;
116
+ }
117
+
118
+ .container {
119
+ background-image: url("pattern.png");
120
+ }"""
121
+
122
+
123
+ @pytest.fixture
124
+ def sample_resources():
125
+ """Sample resource map for testing"""
126
+ return {
127
+ 'style.css': b'body { font-family: Arial; }',
128
+ 'https://example.com/external.css': b'h1 { color: red; }',
129
+ 'image.png': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x06\x00\x00\x00\x1f\x15\xc4\x89\x00\x00\x00\rIDATx\x9cc\x98\x81\xf1\x7f\x0f\x00\x02\x87\x01\x80\xebG\xba\x92\x00\x00\x00\x00IEND\xaeB`\x82',
130
+ 'https://example.com/logo.png': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x06\x00\x00\x00\x1f\x15\xc4\x89\x00\x00\x00\rIDATx\x9cc\x98\x81\xf1\x7f\x0f\x00\x02\x87\x01\x80\xebG\xba\x92\x00\x00\x00\x00IEND\xaeB`\x82',
131
+ 'background.jpg': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00H\x00H\x00\x00\xff\xdb\x00C\x00\x08\x06\x06\x07\x06\x05\x08\x07\x07\x07\t\t\x08\n\x0c\x14\r\x0c\x0b\x0b\x0c\x19\x12\x13\x0f\x14\x1d\x1a\x1f\x1e\x1d\x1a\x1c\x1c $.\' ",#\x1c\x1c(7),01444\x1f\'9=82<.342\xff\xc0\x00\x11\x08\x00\x01\x00\x01\x01\x01\x11\x00\x02\x11\x01\x03\x11\x01\xff\xc4\x00\x1f\x00\x00\x01\x05\x01\x01\x01\x01\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\xff\xc4\x00\xb5\x10\x00\x02\x01\x03\x03\x02\x04\x03\x05\x05\x04\x04\x00\x00\x01}\x01\x02\x03\x00\x04\x11\x05\x12!1A\x06\x13Qa\x07"q\x142\x81\x91\xa1\x08#B\xb1\xc1\x15R\xd1\xf0$3br\x82\t\n\x16\x17\x18\x19\x1a%&\'()*456789:CDEFGHIJSTUVWXYZcdefghijstuvwxyz\x83\x84\x85\x86\x87\x88\x89\x8a\x92\x93\x94\x95\x96\x97\x98\x99\x9a\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xff\xda\x00\x08\x01\x01\x00\x00?\x00\xf5\xff\xd9',
132
+ 'pattern.png': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x06\x00\x00\x00\x1f\x15\xc4\x89\x00\x00\x00\rIDATx\x9cc\x98\x81\xf1\x7f\x0f\x00\x02\x87\x01\x80\xebG\xba\x92\x00\x00\x00\x00IEND\xaeB`\x82'
133
+ }
134
+
135
+
136
+ @pytest.fixture
137
+ def temp_mhtml_file(simple_mhtml):
138
+ """Create a temporary MHTML file for testing"""
139
+ with tempfile.NamedTemporaryFile(mode='w', suffix='.mhtml', delete=False) as f:
140
+ f.write(simple_mhtml)
141
+ temp_path = f.name
142
+
143
+ yield temp_path
144
+
145
+ # Cleanup
146
+ if os.path.exists(temp_path):
147
+ os.unlink(temp_path)