PyPI - contextpack-md - Versions diffs - 0.1.1__tar.gz - Mend

contextpack-md 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

contextpack_md-0.1.1/LICENSE.md +21 -0
contextpack_md-0.1.1/PKG-INFO +93 -0
contextpack_md-0.1.1/README.md +50 -0
contextpack_md-0.1.1/pyproject.toml +51 -0
contextpack_md-0.1.1/src/contextpack_md/__init__.py +4 -0
contextpack_md-0.1.1/src/contextpack_md/api.py +54 -0
contextpack_md-0.1.1/src/contextpack_md/cli.py +115 -0
contextpack_md-0.1.1/src/contextpack_md/scraper.py +23 -0

contextpack_md-0.1.1/LICENSE.md ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 Theo Guegan
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

contextpack_md-0.1.1/PKG-INFO ADDED Viewed

@@ -0,0 +1,93 @@
+Metadata-Version: 2.3
+Name: contextpack-md
+Version: 0.1.1
+Summary: A dead-simple tool to extract high-quality Markdown from any URL or PDF for LLMs
+Keywords: markdown,scraper,llm,pdf,converter
+Author: Theo
+Author-email: Theo <130441797+theguega@users.noreply.github.com>
+License: MIT License
+         Copyright (c) 2026 Theo Guegan
+         Permission is hereby granted, free of charge, to any person obtaining a copy
+         of this software and associated documentation files (the "Software"), to deal
+         in the Software without restriction, including without limitation the rights
+         to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+         copies of the Software, and to permit persons to whom the Software is
+         furnished to do so, subject to the following conditions:
+         The above copyright notice and this permission notice shall be included in all
+         copies or substantial portions of the Software.
+         THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+         IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+         FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+         AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+         LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+         OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+         SOFTWARE.
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Requires-Dist: typer>=0.24.0,<0.25
+Requires-Dist: trafilatura>=2.0.0,<3
+Requires-Dist: httpx>=0.28.1,<0.29
+Requires-Dist: marker-pdf>=1.10.2 ; extra == 'pdf'
+Requires-Python: >=3.11
+Project-URL: Documentation, https://theguega.github.io/contextpack-md/
+Provides-Extra: pdf
+Description-Content-Type: text/markdown
+# 📦 contextpack-md
+<p align="center">
+  <a href="https://github.com/theguega/contextpack-md/actions/workflows/release.yml"><img src="https://github.com/theguega/contextpack-md/actions/workflows/release.yml/badge.svg" alt="Release Status"></a>
+  <a href="https://theguega.github.io/contextpack-md/"><img src="https://img.shields.io/badge/docs-GitHub%20Pages-blue.svg" alt="Documentation"></a>
+  <a href="https://pypi.org/project/contextpack-md/"><img src="https://img.shields.io/pypi/v/contextpack-md.svg" alt="PyPI version"></a>
+  <a href="https://github.com/theguega/contextpack-md/blob/main/LICENSE"><img src="https://img.shields.io/github/license/theguega/contextpack-md.svg" alt="License"></a>
+</p>
+A dead-simple tool to extract high-quality Markdown from any URL or PDF, optimized for LLMs.
+## 🚀 Quick Start
+Extract clean, LLM-ready Markdown from any URL in seconds.
+```bash
+# Using uv (recommended)
+uvx contextpack-md web https://docs.python.org/3/
+# Download and convert PDF
+uvx contextpack-md pdf https://arxiv.org/pdf/1706.03762.pdf
+```
+## ✨ Key Features
+- **🎯 LLM-Ready Output**: Clean, readable Markdown with links, but no junk.
+- **📄 PDF Support**: High-fidelity PDF-to-Markdown conversion (via `marker-pdf`).
+- **📂 Local Caching**: Optional timestamped local storage in `~/.contextpack-md/`.
+- **⚡ Fast & Lean**: Built on top of `trafilatura` for superior extraction speed and quality.
+## 🛠️ Installation
+```bash
+pip install contextpack-md
+# For PDF support (requires PyTorch)
+pip install "contextpack-md[pdf]"
+```
+## 📖 Documentation
+Full documentation is available at [https://theguega.github.io/contextpack-md/](https://theguega.github.io/contextpack-md/).
+## 🤝 Contributing
+Contributions are welcome! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for more details.
+## ⚖️ License
+MIT License. See [LICENSE.md](LICENSE.md) for more information.

contextpack_md-0.1.1/README.md ADDED Viewed

@@ -0,0 +1,50 @@
+# 📦 contextpack-md
+<p align="center">
+  <a href="https://github.com/theguega/contextpack-md/actions/workflows/release.yml"><img src="https://github.com/theguega/contextpack-md/actions/workflows/release.yml/badge.svg" alt="Release Status"></a>
+  <a href="https://theguega.github.io/contextpack-md/"><img src="https://img.shields.io/badge/docs-GitHub%20Pages-blue.svg" alt="Documentation"></a>
+  <a href="https://pypi.org/project/contextpack-md/"><img src="https://img.shields.io/pypi/v/contextpack-md.svg" alt="PyPI version"></a>
+  <a href="https://github.com/theguega/contextpack-md/blob/main/LICENSE"><img src="https://img.shields.io/github/license/theguega/contextpack-md.svg" alt="License"></a>
+</p>
+A dead-simple tool to extract high-quality Markdown from any URL or PDF, optimized for LLMs.
+## 🚀 Quick Start
+Extract clean, LLM-ready Markdown from any URL in seconds.
+```bash
+# Using uv (recommended)
+uvx contextpack-md web https://docs.python.org/3/
+# Download and convert PDF
+uvx contextpack-md pdf https://arxiv.org/pdf/1706.03762.pdf
+```
+## ✨ Key Features
+- **🎯 LLM-Ready Output**: Clean, readable Markdown with links, but no junk.
+- **📄 PDF Support**: High-fidelity PDF-to-Markdown conversion (via `marker-pdf`).
+- **📂 Local Caching**: Optional timestamped local storage in `~/.contextpack-md/`.
+- **⚡ Fast & Lean**: Built on top of `trafilatura` for superior extraction speed and quality.
+## 🛠️ Installation
+```bash
+pip install contextpack-md
+# For PDF support (requires PyTorch)
+pip install "contextpack-md[pdf]"
+```
+## 📖 Documentation
+Full documentation is available at [https://theguega.github.io/contextpack-md/](https://theguega.github.io/contextpack-md/).
+## 🤝 Contributing
+Contributions are welcome! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for more details.
+## ⚖️ License
+MIT License. See [LICENSE.md](LICENSE.md) for more information.

contextpack_md-0.1.1/pyproject.toml ADDED Viewed

@@ -0,0 +1,51 @@
+[project]
+name = "contextpack-md"
+version = "0.1.1"
+description = "A dead-simple tool to extract high-quality Markdown from any URL or PDF for LLMs"
+readme = "README.md"
+requires-python = ">=3.11"
+license = { file = "LICENSE.md" }
+authors = [{ name = "Theo", email = "130441797+theguega@users.noreply.github.com" }]
+keywords = ["markdown", "scraper", "llm", "pdf", "converter"]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+]
+dependencies = [
+  "typer>=0.24.0,<0.25",
+  "trafilatura>=2.0.0,<3",
+  "httpx>=0.28.1,<0.29",
+]
+[project.scripts]
+# The target must be the importable module path (underscore), not the
+# hyphenated distribution name, otherwise the console script cannot be resolved.
+contextpack-md = "contextpack_md.cli:app"
+[project.optional-dependencies]
+pdf = [
+    "marker-pdf>=1.10.2",
+]
+[build-system]
+requires = ["uv_build>=0.11.3,<0.12"]
+build-backend = "uv_build"
+[tool.pyapp]
+project = "contextpack-md"
+# Import path of the CLI module; the package directory is contextpack_md.
+module = "contextpack_md.cli"
+command = "app"
+python-version = "3.12"
+features = ["pip"]
+[dependency-groups]
+dev = [
+    "pytest>=9.0.2",
+    "ruff>=0.15.9",
+]
+[project.urls]
+Documentation = "https://theguega.github.io/contextpack-md/"

contextpack_md-0.1.1/src/contextpack_md/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+from .api import get_url_context
+from .cli import app as cli_app
+__all__ = ["get_url_context", "cli_app"]

contextpack_md-0.1.1/src/contextpack_md/api.py ADDED Viewed

@@ -0,0 +1,54 @@
+from pathlib import Path
+from typing import Optional
+import httpx
+from .scraper import fetch_and_scrape
+def get_url_context(url: str) -> Optional[str]:
+    """
+    Fetches and scrapes a single URL to return clean Markdown.
+    """
+    return fetch_and_scrape(url)
+def download_pdf(url: str, save_path: Path) -> bool:
+    """
+    Downloads a PDF file from a URL.
+    """
+    try:
+        with httpx.Client(follow_redirects=True) as client:
+            response = client.get(url)
+            response.raise_for_status()
+            with save_path.open("wb") as f:
+                f.write(response.content)
+            return True
+    except Exception as e:
+        print(f"Error downloading PDF: {e}")
+        return False
+def convert_pdf_to_markdown(pdf_path: Path) -> Optional[str]:
+    """
+    Turn the PDF at ``pdf_path`` into Markdown text with marker-pdf.
+    The marker modules are imported inside the function, so importing this
+    module does not require marker to be installed. Returns the converted
+    text, or ``None`` after printing an error when marker cannot be imported
+    or the conversion raises.
+    """
+    try:
+        from marker.converters.pdf import PdfConverter
+        from marker.models import create_model_dict
+        from marker.output import text_from_rendered
+        models = create_model_dict()
+        pdf_converter = PdfConverter(artifact_dict=models)
+        document = pdf_converter(str(pdf_path))
+        # Only the first of the three returned items is used.
+        markdown, _, _ = text_from_rendered(document)
+    except ImportError:
+        print(
+            "Error: marker-pdf is not installed. Please install it with 'pip install \"contextpack-md[pdf]\"'"
+        )
+        return None
+    except Exception as e:
+        print(f"Error converting PDF: {e}")
+        return None
+    return markdown

contextpack_md-0.1.1/src/contextpack_md/cli.py ADDED Viewed

@@ -0,0 +1,115 @@
+from datetime import datetime
+from pathlib import Path
+from typing import Optional
+from urllib.parse import urlparse
+import typer
+from .api import convert_pdf_to_markdown, download_pdf, get_url_context
+app = typer.Typer(add_completion=False)
+# Default output/cache directory, used whenever no output directory is given.
+# The name must be a valid identifier: a hyphen here is a SyntaxError that
+# makes the whole package unimportable.
+CONTEXTPACK_MD_DIR = Path.home() / ".contextpack-md"
+def clean_domain(url: str) -> str:
+    """
+    Derive a filename-friendly label from a URL.
+    Uses the network location when the URL has one, otherwise the last
+    segment of the path, and replaces ".", "/" and ":" with underscores.
+    """
+    parsed = urlparse(url)
+    domain = parsed.netloc or parsed.path.split("/")[-1]
+    # ":" shows up in "host:port" locations and is not allowed in Windows filenames.
+    return domain.replace(".", "_").replace("/", "_").replace(":", "_")
+def _timestamped_stem(url: str) -> str:
+    # Build the shared "<domain>_<YYYYmmdd_HHMMSS>" stem for output file names.
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    return f"{clean_domain(url)}_{timestamp}"
+@app.command()
+def web(
+    url: str = typer.Argument(help="URL to extract context from"),
+    write: bool = typer.Option(
+        False, "--write", "-w", help="Write result to .contextpack-md folder"
+    ),
+    output_dir: Optional[Path] = typer.Argument(None, help="Output directory path"),
+):
+    """
+    Extract context from a web URL and optionally write it to a file.
+    """
+    # The docstring above is used as the command's help text, so further notes
+    # are kept in comments. Exits with code 1 when the URL is empty or nothing
+    # could be extracted.
+    if not url:
+        typer.echo("Error: Please provide a URL.")
+        raise typer.Exit(code=1)
+    typer.echo(f"🔍 Fetching context from: {url}...")
+    content = get_url_context(url)
+    if not content:
+        typer.echo("❌ Error: Could not extract content from URL.")
+        raise typer.Exit(code=1)
+    if not write:
+        typer.echo("\n--- Content Start ---\n")
+        typer.echo(content)
+        typer.echo("\n--- Content End ---\n")
+        return
+    target_dir = output_dir or CONTEXTPACK_MD_DIR
+    # The directory may not exist yet (first run or a fresh custom path).
+    target_dir.mkdir(parents=True, exist_ok=True)
+    filepath = target_dir / f"{_timestamped_stem(url)}.md"
+    filepath.write_text(content, encoding="utf-8")
+    typer.echo(f"💾 Content written to: {filepath}")
+@app.command()
+def clear():
+    """
+    Clear the contextpack-md cache directory.
+    """
+    if not CONTEXTPACK_MD_DIR.exists():
+        typer.echo(f"❌ Cache directory {CONTEXTPACK_MD_DIR} does not exist.")
+        return
+    # Only regular files are deleted, so only those are counted and reported;
+    # sub-directories are left in place.
+    files = [entry for entry in CONTEXTPACK_MD_DIR.glob("*") if entry.is_file()]
+    typer.echo(f"🗑️  Clearing {len(files)} file(s) from {CONTEXTPACK_MD_DIR}")
+    for file in files:
+        file.unlink()
+@app.command()
+def pdf(
+    url: str = typer.Argument(..., help="URL to the PDF file"),
+    output_dir: Optional[Path] = typer.Argument(None, help="Output directory path"),
+):
+    """
+    Download a PDF and convert it to Markdown using marker-pdf.
+    """
+    # The PDF and the resulting Markdown share one timestamped stem inside the
+    # target directory. Exits with code 1 if the download or conversion fails.
+    target_dir = output_dir or CONTEXTPACK_MD_DIR
+    # parents=True so a nested output directory can be created in one call.
+    target_dir.mkdir(parents=True, exist_ok=True)
+    stem = _timestamped_stem(url)
+    pdf_path = target_dir / f"{stem}.pdf"
+    typer.echo(f"📥 Downloading PDF from: {url}...")
+    if not download_pdf(url, pdf_path):
+        typer.echo("❌ Error: Failed to download PDF.")
+        raise typer.Exit(code=1)
+    typer.echo(
+        "⚙️  Converting PDF to Markdown... (This process uses ML and can take ~0.2s/page)"
+    )
+    markdown_content = convert_pdf_to_markdown(pdf_path)
+    if not markdown_content:
+        typer.echo("❌ Error: Failed to convert PDF to Markdown.")
+        raise typer.Exit(code=1)
+    md_path = target_dir / f"{stem}.md"
+    md_path.write_text(markdown_content, encoding="utf-8")
+    typer.echo(f"✅ Successfully converted PDF. Result saved to: {md_path}")
+if __name__ == "__main__":
+    app()

contextpack_md-0.1.1/src/contextpack_md/scraper.py ADDED Viewed

@@ -0,0 +1,23 @@
+import trafilatura
+from typing import Optional
+def extract_content(html: str, url: str) -> Optional[str]:
+    """
+    Convert an HTML document to Markdown with trafilatura.
+    Links are kept and images are left out; ``url`` is forwarded to
+    trafilatura alongside the HTML. The return value is whatever
+    ``trafilatura.extract`` returns.
+    """
+    options = {
+        "url": url,
+        "output_format": "markdown",
+        "include_links": True,
+        "include_images": False,
+    }
+    return trafilatura.extract(html, **options)
+def fetch_and_scrape(url: str, timeout: int = 10) -> Optional[str]:
+    """
+    Downloads and extracts content from a URL.
+    Args:
+        url: Address of the page to fetch.
+        timeout: Download timeout in seconds, handed to trafilatura through
+            its DOWNLOAD_TIMEOUT setting.
+    Returns:
+        The result of ``extract_content`` for the downloaded page, or ``None``
+        when nothing could be downloaded.
+    """
+    from copy import deepcopy
+    from trafilatura.settings import DEFAULT_CONFIG
+    # Work on a copy so trafilatura's shared default configuration is untouched.
+    config = deepcopy(DEFAULT_CONFIG)
+    # ConfigParser values must be strings.
+    config["DEFAULT"]["DOWNLOAD_TIMEOUT"] = str(timeout)
+    downloaded = trafilatura.fetch_url(url, config=config)
+    if not downloaded:
+        return None
+    return extract_content(downloaded, url)