PyPI - unmhtml - Versions diffs - 0.1.0__tar.gz - Mend

unmhtml 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

unmhtml-0.1.0/.github/workflows/publish.yml +55 -0
unmhtml-0.1.0/.gitignore +10 -0
unmhtml-0.1.0/.python-version +1 -0
unmhtml-0.1.0/CLAUDE.md +153 -0
unmhtml-0.1.0/LICENSE +22 -0
unmhtml-0.1.0/PKG-INFO +64 -0
unmhtml-0.1.0/README.md +38 -0
unmhtml-0.1.0/pyproject.toml +53 -0
unmhtml-0.1.0/tests/__init__.py +0 -0
unmhtml-0.1.0/tests/conftest.py +147 -0
unmhtml-0.1.0/tests/test_converter.py +282 -0
unmhtml-0.1.0/tests/test_parser.py +148 -0
unmhtml-0.1.0/tests/test_processor.py +221 -0
unmhtml-0.1.0/tests/test_utils.py +266 -0
unmhtml-0.1.0/unmhtml/__init__.py +42 -0
unmhtml-0.1.0/unmhtml/converter.py +86 -0
unmhtml-0.1.0/unmhtml/parser.py +147 -0
unmhtml-0.1.0/unmhtml/processor.py +262 -0
unmhtml-0.1.0/unmhtml/py.typed +0 -0
unmhtml-0.1.0/unmhtml/utils.py +172 -0
unmhtml-0.1.0/uv.lock +406 -0

unmhtml-0.1.0/.github/workflows/publish.yml ADDED Viewed

@@ -0,0 +1,55 @@
+name: publish
+on:
+  release:
+    types: [published]
+jobs:
+  build:
+    name: Build package
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.x"
+      - name: Install uv
+        uses: astral-sh/setup-uv@v4
+      - name: Install dependencies
+        run: uv sync
+      - name: Verify version matches tag
+        run: |
+          TAG_VERSION=${GITHUB_REF#refs/tags/v}
+          PYPROJECT_VERSION=$(python -c "import tomllib; print(tomllib.load(open('pyproject.toml', 'rb'))['project']['version'])")
+          if [ "$TAG_VERSION" != "$PYPROJECT_VERSION" ]; then
+            echo "Tag version ($TAG_VERSION) doesn't match pyproject.toml version ($PYPROJECT_VERSION)"
+            exit 1
+          fi
+      - name: Build a binary wheel and a source tarball
+        run: uv build
+      - name: Store the distribution packages
+        uses: actions/upload-artifact@v4
+        with:
+          name: python-package-distributions
+          path: dist/
+  publish-to-pypi:
+    name: Publish distribution package to PyPI
+    needs:
+      - build
+    runs-on: ubuntu-latest
+    environment:
+      name: pypi
+      url: https://pypi.org/p/unmhtml
+    permissions:
+      id-token: write
+    steps:
+      - name: Download all the dists
+        uses: actions/download-artifact@v4
+        with:
+          name: python-package-distributions
+          path: dist/
+      - name: Publish distribution package
+        uses: pypa/gh-action-pypi-publish@release/v1

unmhtml-0.1.0/.gitignore ADDED Viewed

@@ -0,0 +1,10 @@
+# Python-generated files
+__pycache__/
+*.py[oc]
+build/
+dist/
+wheels/
+*.egg-info
+# Virtual environments
+.venv

unmhtml-0.1.0/.python-version ADDED Viewed

@@ -0,0 +1 @@
+3.12

unmhtml-0.1.0/CLAUDE.md ADDED Viewed

@@ -0,0 +1,153 @@
+# **unmhtml - MHTML to HTML Converter Library Specification**
+This document outlines the requirements and design for a Python library that converts MHTML (MIME HTML) files to standalone HTML files with embedded CSS, using only Python standard library modules.
+## **1. Project Goals and Scope**
+This library is responsible for:
+- Converting MHTML files to standalone HTML files with embedded CSS
+- Preserving the original rendered content structure for accurate display
+- Providing a pure Python implementation using only standard library modules
+- Supporting integration with web applications that need to display archived content
+## **2. Technology Stack**
+- **Programming Language:** Python 3.8+
+- **Package Manager:** uv toolchain
+- **Core Libraries:** Built-in Python standard library only
+  - `email` module for MIME parsing
+  - `base64` for resource decoding
+  - `mimetypes` for content type detection
+  - `urllib.parse` for URL handling
+  - `html` for HTML escaping
+  - `re` for text processing
+- **No External Dependencies:** Pure stdlib implementation for maximum portability
+## **3. MHTML Format Understanding**
+MHTML files are structured as MIME multipart documents:
+- **Content-Type:** `multipart/related` or `message/rfc822`
+- **Main HTML Document:** First part containing the primary HTML content
+- **Embedded Resources:** Subsequent parts containing CSS, images, fonts, etc.
+- **Resource References:** Content-Location headers link resources to HTML references
+- **Encoding:** Resources typically base64-encoded for binary content
+## **4. Core Functionality**
+### **4.1. MHTML Parser**
+**Class: `MHTMLParser`**
+```python
+import email
+import base64
+from typing import Dict, Tuple
+class MHTMLParser:
+    def __init__(self, mhtml_content: str):
+        self.mhtml_content = mhtml_content
+    def parse(self) -> Tuple[str, Dict[str, bytes]]:
+        """Parse MHTML and return main HTML + resource map"""
+        message = email.message_from_string(self.mhtml_content)
+        # Extract main HTML and resources
+        return main_html, resources
+```
+### **4.2. HTML Processor**
+**Class: `HTMLProcessor`**
+```python
+class HTMLProcessor:
+    def __init__(self, html_content: str, resources: Dict[str, bytes]):
+        self.html_content = html_content
+        self.resources = resources
+    def embed_css(self) -> str:
+        """Convert <link> tags to <style> tags with embedded CSS"""
+        # Replace external CSS references with inline styles
+        return modified_html
+    def convert_to_data_uris(self) -> str:
+        """Convert resource references to data URIs"""
+        # Replace src/href attributes with data: URLs
+        return modified_html
+```
+### **4.3. Main Converter**
+**Class: `MHTMLConverter`**
+```python
+class MHTMLConverter:
+    def convert_file(self, mhtml_path: str) -> str:
+        """Convert MHTML file to HTML string"""
+        with open(mhtml_path, 'r', encoding='utf-8') as f:
+            mhtml_content = f.read()
+        return self.convert(mhtml_content)
+    def convert(self, mhtml_content: str) -> str:
+        """Convert MHTML content to HTML string"""
+        parser = MHTMLParser(mhtml_content)
+        main_html, resources = parser.parse()
+        processor = HTMLProcessor(main_html, resources)
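+        html_with_css = processor.embed_css()
+        # Feed the CSS-embedded HTML into the second pass so its result is not discarded
+        final_html = HTMLProcessor(html_with_css, resources).convert_to_data_uris()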
+        return final_html
+```
+## **5. Package Structure**
+```
+unmhtml/
+├── __init__.py
+├── parser.py          # MHTMLParser class
+├── processor.py       # HTMLProcessor class
+├── converter.py       # MHTMLConverter main class
+├── utils.py           # Utility functions
+└── py.typed           # Type hints marker
+```
+## **6. API Design**
+### **6.1. Simple API**
+```python
+from unmhtml import MHTMLConverter
+# Basic conversion
+converter = MHTMLConverter()
+html_content = converter.convert_file('page.mhtml')
+# Direct content conversion
+html_content = converter.convert(mhtml_string)
+```
+## **7. Key Features**
+- **CSS Embedding:** Convert `<link rel="stylesheet">` to `<style>` tags
+- **Resource Embedding:** Convert images/fonts to data URIs
+- **URL Resolution:** Handle relative and absolute resource references
+- **Error Handling:** Graceful degradation for malformed MHTML
+- **Memory Efficient:** Process large files without excessive memory usage
+## **8. Testing Strategy**
+- **Basic Functionality:** Test MHTML to HTML conversion works
+- **Error Handling:** Test graceful handling of malformed input
+- **Resource Embedding:** Verify CSS and resources are properly embedded
+## **9. Success Criteria**
+- **Functionality:** Successfully convert MHTML files to standalone HTML
+- **Performance:** Process typical web pages (1-5MB MHTML) efficiently
+- **Reliability:** Handle malformed MHTML gracefully
+- **Simplicity:** Clean, minimal API with good documentation
+- **Portability:** Zero external dependencies, pure Python stdlib
+This specification provides a focused foundation for building a lightweight MHTML to HTML converter library called unmhtml using the uv toolchain.

unmhtml-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,22 @@
+MIT License
+Copyright (c) 2025 Resolve Services
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

unmhtml-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,64 @@
+Metadata-Version: 2.4
+Name: unmhtml
+Version: 0.1.0
+Summary: MHTML to HTML converter library using Python stdlib
+Author: Johan Schuijt
+License: MIT
+License-File: LICENSE
+Keywords: archive,converter,html,mhtml,mime,web
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Internet :: WWW/HTTP
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Topic :: Text Processing :: Markup :: HTML
+Requires-Python: >=3.8
+Provides-Extra: test
+Requires-Dist: pytest-cov; extra == 'test'
+Requires-Dist: pytest>=6.0; extra == 'test'
+Description-Content-Type: text/markdown
+# unmhtml
+Convert MHTML files to standalone HTML with embedded CSS and resources.
+## Installation
+```bash
+pip install unmhtml
+```
+## Usage
+```python
+from unmhtml import MHTMLConverter
+# Convert MHTML file to HTML
+converter = MHTMLConverter()
+html_content = converter.convert_file('saved_page.mhtml')
+# Save as standalone HTML
+with open('output.html', 'w', encoding='utf-8') as f:
+    f.write(html_content)
+```
+## Features
+- **Pure Python** - No external dependencies, uses only standard library
+- **Standalone HTML** - Embeds CSS and converts resources to data URIs
+- **Graceful degradation** - Handles malformed MHTML files
+- **Memory efficient** - Processes large files without excessive memory usage
+## Requirements
+- Python 3.8+
+## License
+MIT

unmhtml-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,38 @@
+# unmhtml
+Convert MHTML files to standalone HTML with embedded CSS and resources.
+## Installation
+```bash
+pip install unmhtml
+```
+## Usage
+```python
+from unmhtml import MHTMLConverter
+# Convert MHTML file to HTML
+converter = MHTMLConverter()
+html_content = converter.convert_file('saved_page.mhtml')
+# Save as standalone HTML
+with open('output.html', 'w', encoding='utf-8') as f:
+    f.write(html_content)
+```
+## Features
+- **Pure Python** - No external dependencies, uses only standard library
+- **Standalone HTML** - Embeds CSS and converts resources to data URIs
+- **Graceful degradation** - Handles malformed MHTML files
+- **Memory efficient** - Processes large files without excessive memory usage
+## Requirements
+- Python 3.8+
+## License
+MIT

unmhtml-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,53 @@
+[project]
+name = "unmhtml"
+version = "0.1.0"
+description = "MHTML to HTML converter library using Python stdlib"
+readme = "README.md"
+requires-python = ">=3.8"
+dependencies = []
+authors = [
+    {name = "Johan Schuijt"}
+]
+license = {text = "MIT"}
+urls.Homepage = "https://github.com/resolve-works/unmhtml"
+urls.Repository = "https://github.com/resolve-works/unmhtml"
+keywords = ["mhtml", "html", "converter", "mime", "web", "archive"]
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.8",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: Internet :: WWW/HTTP",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+    "Topic :: Text Processing :: Markup :: HTML",
+]
+[project.optional-dependencies]
+test = ["pytest>=6.0", "pytest-cov"]
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+[tool.hatch.build.targets.wheel]
+packages = ["unmhtml"]
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+python_files = ["test_*.py"]
+python_classes = ["Test*"]
+python_functions = ["test_*"]
+addopts = [
+    "--strict-markers",
+    "--strict-config",
+    "--verbose",
+]
+markers = [
+    "slow: marks tests as slow (deselect with '-m \"not slow\"')",
+    "integration: marks tests as integration tests",
+]

unmhtml-0.1.0/tests/__init__.py ADDED Viewed

File without changes

unmhtml-0.1.0/tests/conftest.py ADDED Viewed

@@ -0,0 +1,147 @@
+"""Pytest configuration and fixtures"""
+import pytest
+import tempfile
+import os
+@pytest.fixture
+def simple_mhtml():
+    """Simple MHTML content for testing"""
+    return """From: <Saved by Blink>
+Snapshot-Content-Location: https://example.com/page.html
+Subject: Test Page
+Date: Mon, 1 Jan 2024 12:00:00 GMT
+MIME-Version: 1.0
+Content-Type: multipart/related;
+	type="text/html";
+	boundary="----MultipartBoundary--test123"
+------MultipartBoundary--test123
+Content-Type: text/html
+Content-ID: <frame-test@example.com>
+Content-Transfer-Encoding: quoted-printable
+Content-Location: https://example.com/page.html
+<!DOCTYPE html>
+<html>
+<head>
+    <title>Test Page</title>
+    <link rel="stylesheet" href="style.css">
+</head>
+<body>
+    <h1>Hello World</h1>
+    <img src="image.png" alt="Test Image">
+</body>
+</html>
+------MultipartBoundary--test123
+Content-Type: text/css
+Content-Transfer-Encoding: quoted-printable
+Content-Location: https://example.com/style.css
+body {
+    font-family: Arial, sans-serif;
+    margin: 0;
+    padding: 20px;
+}
+h1 {
+    color: #333;
+}
+------MultipartBoundary--test123
+Content-Type: image/png
+Content-Transfer-Encoding: base64
+Content-Location: https://example.com/image.png
+iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg==
+------MultipartBoundary--test123--
+"""
+@pytest.fixture
+def malformed_mhtml():
+    """Malformed MHTML content for testing error handling"""
+    return """This is not a valid MHTML file
+Just some random text
+Without proper MIME structure
+"""
+@pytest.fixture
+def empty_mhtml():
+    """Empty MHTML content for testing"""
+    return """From: <Saved by Blink>
+MIME-Version: 1.0
+Content-Type: multipart/related;
+	boundary="----MultipartBoundary--empty"
+------MultipartBoundary--empty--
+"""
+@pytest.fixture
+def html_with_css():
+    """HTML content with CSS links for testing"""
+    return """<!DOCTYPE html>
+<html>
+<head>
+    <title>Test Page</title>
+    <link rel="stylesheet" href="style.css">
+    <link rel="stylesheet" href="https://example.com/external.css">
+</head>
+<body>
+    <h1>Hello World</h1>
+    <p>This is a test page.</p>
+    <img src="image.png" alt="Test Image">
+    <img src="https://example.com/logo.png" alt="Logo">
+</body>
+</html>"""
+@pytest.fixture
+def sample_css():
+    """Sample CSS content for testing"""
+    return """body {
+    font-family: Arial, sans-serif;
+    margin: 0;
+    padding: 20px;
+    background: url('background.jpg');
+}
+h1 {
+    color: #333;
+}
+.container {
+    background-image: url("pattern.png");
+}"""
+@pytest.fixture
+def sample_resources():
+    """Sample resource map for testing"""
+    return {
+        'style.css': b'body { font-family: Arial; }',
+        'https://example.com/external.css': b'h1 { color: red; }',
+        'image.png': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x06\x00\x00\x00\x1f\x15\xc4\x89\x00\x00\x00\rIDATx\x9cc\x98\x81\xf1\x7f\x0f\x00\x02\x87\x01\x80\xebG\xba\x92\x00\x00\x00\x00IEND\xaeB`\x82',
+        'https://example.com/logo.png': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x06\x00\x00\x00\x1f\x15\xc4\x89\x00\x00\x00\rIDATx\x9cc\x98\x81\xf1\x7f\x0f\x00\x02\x87\x01\x80\xebG\xba\x92\x00\x00\x00\x00IEND\xaeB`\x82',
+        'background.jpg': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x01\x00H\x00H\x00\x00\xff\xdb\x00C\x00\x08\x06\x06\x07\x06\x05\x08\x07\x07\x07\t\t\x08\n\x0c\x14\r\x0c\x0b\x0b\x0c\x19\x12\x13\x0f\x14\x1d\x1a\x1f\x1e\x1d\x1a\x1c\x1c $.\' ",#\x1c\x1c(7),01444\x1f\'9=82<.342\xff\xc0\x00\x11\x08\x00\x01\x00\x01\x01\x01\x11\x00\x02\x11\x01\x03\x11\x01\xff\xc4\x00\x1f\x00\x00\x01\x05\x01\x01\x01\x01\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\xff\xc4\x00\xb5\x10\x00\x02\x01\x03\x03\x02\x04\x03\x05\x05\x04\x04\x00\x00\x01}\x01\x02\x03\x00\x04\x11\x05\x12!1A\x06\x13Qa\x07"q\x142\x81\x91\xa1\x08#B\xb1\xc1\x15R\xd1\xf0$3br\x82\t\n\x16\x17\x18\x19\x1a%&\'()*456789:CDEFGHIJSTUVWXYZcdefghijstuvwxyz\x83\x84\x85\x86\x87\x88\x89\x8a\x92\x93\x94\x95\x96\x97\x98\x99\x9a\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xff\xda\x00\x08\x01\x01\x00\x00?\x00\xf5\xff\xd9',
+        'pattern.png': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x06\x00\x00\x00\x1f\x15\xc4\x89\x00\x00\x00\rIDATx\x9cc\x98\x81\xf1\x7f\x0f\x00\x02\x87\x01\x80\xebG\xba\x92\x00\x00\x00\x00IEND\xaeB`\x82'
+    }
+@pytest.fixture
+def temp_mhtml_file(simple_mhtml):
+    """Create a temporary MHTML file for testing"""
+    with tempfile.NamedTemporaryFile(mode='w', suffix='.mhtml', delete=False, encoding='utf-8') as f:
+        f.write(simple_mhtml)
+        temp_path = f.name
+    yield temp_path
+    # Cleanup
+    if os.path.exists(temp_path):
+        os.unlink(temp_path)