contextpack-md 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Theo Guegan
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,93 @@
1
+ Metadata-Version: 2.3
2
+ Name: contextpack-md
3
+ Version: 0.1.1
4
+ Summary: A dead-simple tool to extract high-quality Markdown from any URL or PDF for LLMs
5
+ Keywords: markdown,scraper,llm,pdf,converter
6
+ Author: Theo
7
+ Author-email: Theo <130441797+theguega@users.noreply.github.com>
8
+ License: MIT License
9
+
10
+ Copyright (c) 2026 Theo Guegan
11
+
12
+ Permission is hereby granted, free of charge, to any person obtaining a copy
13
+ of this software and associated documentation files (the "Software"), to deal
14
+ in the Software without restriction, including without limitation the rights
15
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16
+ copies of the Software, and to permit persons to whom the Software is
17
+ furnished to do so, subject to the following conditions:
18
+
19
+ The above copyright notice and this permission notice shall be included in all
20
+ copies or substantial portions of the Software.
21
+
22
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28
+ SOFTWARE.
29
+ Classifier: Development Status :: 4 - Beta
30
+ Classifier: Intended Audience :: Developers
31
+ Classifier: License :: OSI Approved :: MIT License
32
+ Classifier: Programming Language :: Python :: 3
33
+ Classifier: Programming Language :: Python :: 3.11
34
+ Classifier: Programming Language :: Python :: 3.12
35
+ Requires-Dist: typer>=0.24.0,<0.25
36
+ Requires-Dist: trafilatura>=2.0.0,<3
37
+ Requires-Dist: httpx>=0.28.1,<0.29
38
+ Requires-Dist: marker-pdf>=1.10.2 ; extra == 'pdf'
39
+ Requires-Python: >=3.11
40
+ Project-URL: Documentation, https://theguega.github.io/contextpack-md/
41
+ Provides-Extra: pdf
42
+ Description-Content-Type: text/markdown
43
+
44
+ # 📦 contextpack-md
45
+
46
+ <p align="center">
47
+ <a href="https://github.com/theguega/contextpack-md/actions/workflows/release.yml"><img src="https://github.com/theguega/contextpack-md/actions/workflows/release.yml/badge.svg" alt="Release Status"></a>
48
+ <a href="https://theguega.github.io/contextpack-md/"><img src="https://img.shields.io/badge/docs-GitHub%20Pages-blue.svg" alt="Documentation"></a>
49
+ <a href="https://pypi.org/project/contextpack-md/"><img src="https://img.shields.io/pypi/v/contextpack-md.svg" alt="PyPI version"></a>
50
+ <a href="https://github.com/theguega/contextpack-md/blob/main/LICENSE"><img src="https://img.shields.io/github/license/theguega/contextpack-md.svg" alt="License"></a>
51
+ </p>
52
+
53
+ A dead-simple tool to extract high-quality Markdown from any URL or PDF, optimized for LLMs.
54
+
55
+ ## 🚀 Quick Start
56
+
57
+ Extract clean, LLM-ready Markdown from any URL in seconds.
58
+
59
+ ```bash
60
+ # Using uv (recommended)
61
+ uvx contextpack-md web https://docs.python.org/3/
62
+
63
+ # Download and convert PDF
64
+ uvx --from "contextpack-md[pdf]" contextpack-md pdf https://arxiv.org/pdf/1706.03762.pdf
65
+ ```
66
+
67
+ ## ✨ Key Features
68
+
69
+ - **🎯 LLM-Ready Output**: Clean, readable Markdown with links, but no junk.
70
+ - **📄 PDF Support**: High-fidelity PDF-to-Markdown conversion (via `marker-pdf`).
71
+ - **📂 Local Caching**: Optional timestamped local storage in `~/.contextpack-md/`.
72
+ - **⚡ Fast & Lean**: Built on top of `trafilatura` for superior extraction speed and quality.
73
+
74
+ ## 🛠️ Installation
75
+
76
+ ```bash
77
+ pip install contextpack-md
78
+
79
+ # For PDF support (requires PyTorch)
80
+ pip install "contextpack-md[pdf]"
81
+ ```
82
+
83
+ ## 📖 Documentation
84
+
85
+ Full documentation is available at [https://theguega.github.io/contextpack-md/](https://theguega.github.io/contextpack-md/).
86
+
87
+ ## 🤝 Contributing
88
+
89
+ Contributions are welcome! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for more details.
90
+
91
+ ## ⚖️ License
92
+
93
+ MIT License. See [LICENSE](LICENSE) for more information.
@@ -0,0 +1,50 @@
1
+ # 📦 contextpack-md
2
+
3
+ <p align="center">
4
+ <a href="https://github.com/theguega/contextpack-md/actions/workflows/release.yml"><img src="https://github.com/theguega/contextpack-md/actions/workflows/release.yml/badge.svg" alt="Release Status"></a>
5
+ <a href="https://theguega.github.io/contextpack-md/"><img src="https://img.shields.io/badge/docs-GitHub%20Pages-blue.svg" alt="Documentation"></a>
6
+ <a href="https://pypi.org/project/contextpack-md/"><img src="https://img.shields.io/pypi/v/contextpack-md.svg" alt="PyPI version"></a>
7
+ <a href="https://github.com/theguega/contextpack-md/blob/main/LICENSE"><img src="https://img.shields.io/github/license/theguega/contextpack-md.svg" alt="License"></a>
8
+ </p>
9
+
10
+ A dead-simple tool to extract high-quality Markdown from any URL or PDF, optimized for LLMs.
11
+
12
+ ## 🚀 Quick Start
13
+
14
+ Extract clean, LLM-ready Markdown from any URL in seconds.
15
+
16
+ ```bash
17
+ # Using uv (recommended)
18
+ uvx contextpack-md web https://docs.python.org/3/
19
+
20
+ # Download and convert PDF
21
+ uvx --from "contextpack-md[pdf]" contextpack-md pdf https://arxiv.org/pdf/1706.03762.pdf
22
+ ```
23
+
24
+ ## ✨ Key Features
25
+
26
+ - **🎯 LLM-Ready Output**: Clean, readable Markdown with links, but no junk.
27
+ - **📄 PDF Support**: High-fidelity PDF-to-Markdown conversion (via `marker-pdf`).
28
+ - **📂 Local Caching**: Optional timestamped local storage in `~/.contextpack-md/`.
29
+ - **⚡ Fast & Lean**: Built on top of `trafilatura` for superior extraction speed and quality.
30
+
31
+ ## 🛠️ Installation
32
+
33
+ ```bash
34
+ pip install contextpack-md
35
+
36
+ # For PDF support (requires PyTorch)
37
+ pip install "contextpack-md[pdf]"
38
+ ```
39
+
40
+ ## 📖 Documentation
41
+
42
+ Full documentation is available at [https://theguega.github.io/contextpack-md/](https://theguega.github.io/contextpack-md/).
43
+
44
+ ## 🤝 Contributing
45
+
46
+ Contributions are welcome! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for more details.
47
+
48
+ ## ⚖️ License
49
+
50
+ MIT License. See [LICENSE](LICENSE) for more information.
@@ -0,0 +1,51 @@
1
+ [project]
2
+ name = "contextpack-md"
3
+ version = "0.1.1"
4
+ description = "A dead-simple tool to extract high-quality Markdown from any URL or PDF for LLMs"
5
+ readme = "README.md"
6
+ requires-python = ">=3.11"
7
+ license = { file = "LICENSE.md" }
8
+ authors = [{ name = "Theo", email = "130441797+theguega@users.noreply.github.com" }]
9
+ keywords = ["markdown", "scraper", "llm", "pdf", "converter"]
10
+ classifiers = [
11
+ "Development Status :: 4 - Beta",
12
+ "Intended Audience :: Developers",
13
+ "License :: OSI Approved :: MIT License",
14
+ "Programming Language :: Python :: 3",
15
+ "Programming Language :: Python :: 3.11",
16
+ "Programming Language :: Python :: 3.12",
17
+ ]
18
+
19
+ dependencies = [
20
+ "typer>=0.24.0,<0.25",
21
+ "trafilatura>=2.0.0,<3",
22
+ "httpx>=0.28.1,<0.29",
23
+ ]
24
+
25
+ [project.scripts]
26
+ contextpack-md = "contextpack_md.cli:app"
27
+
28
+ [project.optional-dependencies]
29
+ pdf = [
30
+ "marker-pdf>=1.10.2",
31
+ ]
32
+
33
+ [build-system]
34
+ requires = ["uv_build>=0.11.3,<0.12"]
35
+ build-backend = "uv_build"
36
+
37
+ [tool.pyapp]
38
+ project = "contextpack-md"
39
+ module = "contextpack_md.cli"
40
+ command = "app"
41
+ python-version = "3.12"
42
+ features = ["pip"]
43
+
44
+ [dependency-groups]
45
+ dev = [
46
+ "pytest>=9.0.2",
47
+ "ruff>=0.15.9",
48
+ ]
49
+
50
+ [project.urls]
51
+ Documentation = "https://theguega.github.io/contextpack-md/"
@@ -0,0 +1,4 @@
1
+ from .api import get_url_context
2
+ from .cli import app as cli_app
3
+
4
+ __all__ = ["get_url_context", "cli_app"]
@@ -0,0 +1,54 @@
1
+ from pathlib import Path
2
+ from typing import Optional
3
+
4
+ import httpx
5
+
6
+ from .scraper import fetch_and_scrape
7
+
8
+
9
def get_url_context(url: str) -> Optional[str]:
    """
    Return clean Markdown for a single web page.

    This is a thin public wrapper: the URL is handed to ``fetch_and_scrape``
    and whatever that call returns is passed back unchanged.

    Args:
        url: Address of the page to fetch and scrape.

    Returns:
        The value produced by ``fetch_and_scrape`` for ``url``.
    """
    markdown = fetch_and_scrape(url)
    return markdown
14
+
15
+
16
def download_pdf(url: str, save_path: Path) -> bool:
    """
    Downloads a PDF file from a URL and stores it at ``save_path``.

    The response body is streamed to disk chunk by chunk instead of being
    buffered in memory first, so large documents do not have to fit in RAM.
    Redirects are followed. The parent directory of ``save_path`` is created
    when it does not exist yet.

    Args:
        url: Location of the PDF to download.
        save_path: Destination file; it is overwritten if it already exists.

    Returns:
        True when the whole body was written, False when the request or the
        write failed (the error is printed and no partial file is left behind).
    """
    # Tracks whether this call has touched save_path, so that a failure
    # before the first write never deletes a file that existed beforehand.
    started_writing = False
    try:
        save_path.parent.mkdir(parents=True, exist_ok=True)
        with httpx.Client(follow_redirects=True) as client:
            with client.stream("GET", url) as response:
                # Reject 4xx/5xx before creating the destination file.
                response.raise_for_status()
                with save_path.open("wb") as f:
                    started_writing = True
                    for chunk in response.iter_bytes():
                        f.write(chunk)
        return True
    except (httpx.HTTPError, httpx.InvalidURL, OSError) as e:
        print(f"Error downloading PDF: {e}")
        if started_writing:
            # Best-effort cleanup of the truncated download.
            try:
                save_path.unlink(missing_ok=True)
            except OSError:
                pass
        return False
30
+
31
+
32
def convert_pdf_to_markdown(pdf_path: Path) -> Optional[str]:
    """
    Converts a PDF file to Markdown using marker-pdf.

    marker-pdf is an optional dependency (the ``pdf`` extra), so it is
    imported lazily here rather than at module import time.

    Args:
        pdf_path: Path of the PDF file to convert.

    Returns:
        The Markdown text, or None when marker-pdf is not installed or the
        conversion fails (a message is printed in both cases).
    """
    # Only the imports are guarded by ImportError: an ImportError raised
    # later, during conversion, must not be reported as "not installed".
    try:
        from marker.converters.pdf import PdfConverter
        from marker.models import create_model_dict
        from marker.output import text_from_rendered
    except ImportError:
        print(
            "Error: marker-pdf is not installed. Please install it with 'pip install \"contextpack-md[pdf]\"'"
        )
        return None

    try:
        converter = PdfConverter(
            artifact_dict=create_model_dict(),
        )
        rendered = converter(str(pdf_path))
        # text_from_rendered returns a 3-tuple; only the text is needed here.
        text, _, _ = text_from_rendered(rendered)
        return text
    except Exception as e:
        # Deliberate catch-all: any conversion failure is reported to the
        # caller as None instead of propagating.
        print(f"Error converting PDF: {e}")
        return None
@@ -0,0 +1,115 @@
1
+ from datetime import datetime
2
+ from pathlib import Path
3
+ from typing import Optional
4
+ from urllib.parse import urlparse
5
+
6
+ import typer
7
+
8
+ from .api import convert_pdf_to_markdown, download_pdf, get_url_context
9
+
10
app = typer.Typer(add_completion=False)

# Default directory for written output; also the directory that the `clear`
# command empties. The name must be a valid Python identifier: a hyphenated
# spelling parses as a subtraction and makes the assignment a SyntaxError.
CONTEXTPACK_MD_DIR = Path.home() / ".contextpack-md"


def clean_domain(url: str) -> str:
    """
    Turn a URL into a label that is safe to use inside a file name.

    The network location of the URL is used; when the URL has none (for
    example a scheme-less string), the last path segment is used instead.
    Every character other than letters, digits, "-" and "_" is replaced by
    "_", so dots, slashes, port separators and similar never reach the
    file system.

    Args:
        url: The URL the label is derived from.

    Returns:
        The sanitized label, e.g. "docs_python_org" for
        "https://docs.python.org/3/".
    """
    parsed = urlparse(url)
    domain = parsed.netloc or parsed.path.split("/")[-1]
    return "".join(ch if ch.isalnum() or ch in "-_" else "_" for ch in domain)


def _prepare_output_dir(output_dir: Optional[Path]) -> Path:
    """Return the directory to write into, creating it when it is missing."""
    target_dir = output_dir or CONTEXTPACK_MD_DIR
    target_dir.mkdir(parents=True, exist_ok=True)
    return target_dir


# NOTE: Typer shows command docstrings as CLI help text, so they are kept
# short; implementation notes live in comments instead.
@app.command()
def web(
    url: str = typer.Argument(help="URL to extract context from"),
    write: bool = typer.Option(
        False, "--write", "-w", help="Write result to .contextpack-md folder"
    ),
    output_dir: Optional[Path] = typer.Argument(None, help="Output directory path"),
):
    """
    Extract context from a web URL and optionally write it to a file.
    """
    # Guards against an empty string being passed as the URL.
    if not url:
        typer.echo("Error: Please provide a URL.")
        raise typer.Exit(code=1)

    typer.echo(f"🔍 Fetching context from: {url}...")
    content = get_url_context(url)

    if not content:
        typer.echo("❌ Error: Could not extract content from URL.")
        raise typer.Exit(code=1)

    if not write:
        typer.echo("\n--- Content Start ---\n")
        typer.echo(content)
        typer.echo("\n--- Content End ---\n")
        return

    # The target directory may not exist yet (e.g. first run), so it is
    # created before the file is opened.
    target_dir = _prepare_output_dir(output_dir)
    domain = clean_domain(url)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filepath = target_dir / f"{domain}_{timestamp}.md"

    with filepath.open("w", encoding="utf-8") as f:
        f.write(content)

    typer.echo(f"💾 Content written to: {filepath}")


@app.command()
def clear():
    """
    Clear the contextpack-md cache directory.
    """
    if not CONTEXTPACK_MD_DIR.exists():
        typer.echo(f"❌ Cache directory {CONTEXTPACK_MD_DIR} does not exist.")
        return

    # Only regular files are removed, so only those are counted in the
    # message; subdirectories are left untouched.
    files = [entry for entry in CONTEXTPACK_MD_DIR.glob("*") if entry.is_file()]
    typer.echo(f"🗑️ Clearing {len(files)} item(s) from {CONTEXTPACK_MD_DIR}")
    for file in files:
        file.unlink()


@app.command()
def pdf(
    url: str = typer.Argument(..., help="URL to the PDF file"),
    output_dir: Optional[Path] = typer.Argument(None, help="Output directory path"),
):
    """
    Download a PDF and convert it to Markdown using marker-pdf.
    """
    target_dir = _prepare_output_dir(output_dir)

    # The PDF and the resulting Markdown share one "<domain>_<timestamp>" stem.
    domain = clean_domain(url)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    pdf_path = target_dir / f"{domain}_{timestamp}.pdf"

    typer.echo(f"📥 Downloading PDF from: {url}...")
    if not download_pdf(url, pdf_path):
        typer.echo("❌ Error: Failed to download PDF.")
        raise typer.Exit(code=1)

    typer.echo(
        "⚙️ Converting PDF to Markdown... (This process uses ML and can take ~0.2s/page)"
    )
    markdown_content = convert_pdf_to_markdown(pdf_path)

    if not markdown_content:
        typer.echo("❌ Error: Failed to convert PDF to Markdown.")
        raise typer.Exit(code=1)

    md_path = target_dir / f"{domain}_{timestamp}.md"

    with md_path.open("w", encoding="utf-8") as f:
        f.write(markdown_content)

    typer.echo(f"✅ Successfully converted PDF. Result saved to: {md_path}")


if __name__ == "__main__":
    app()
@@ -0,0 +1,23 @@
1
+ import trafilatura
2
+ from typing import Optional
3
+
4
def extract_content(html: str, url: str) -> Optional[str]:
    """
    Convert an HTML document into readable Markdown.

    Delegates to ``trafilatura.extract``, asking for Markdown output with
    links kept and images left out. ``url`` is forwarded to the extractor
    together with the HTML.

    Args:
        html: The HTML source to extract from.
        url: The address the HTML was obtained from.

    Returns:
        Whatever ``trafilatura.extract`` returns for these inputs.
    """
    extraction_options = {
        "url": url,
        "output_format": "markdown",
        "include_links": True,
        "include_images": False,
    }
    return trafilatura.extract(html, **extraction_options)
15
+
16
def fetch_and_scrape(url: str, timeout: int = 10) -> Optional[str]:
    """
    Downloads and extracts content from a URL.

    Args:
        url: Address of the page to download.
        timeout: Download timeout in seconds, handed to trafilatura through
            its ``DOWNLOAD_TIMEOUT`` setting.

    Returns:
        The extracted Markdown, or None when the download yields nothing.
    """
    from copy import deepcopy

    from trafilatura.settings import DEFAULT_CONFIG

    # Work on a copy so the library-wide default configuration is not
    # mutated for other callers. ConfigParser values must be strings.
    config = deepcopy(DEFAULT_CONFIG)
    config["DEFAULT"]["DOWNLOAD_TIMEOUT"] = str(timeout)
    # NOTE(review): trafilatura may cache its connection pool between calls,
    # in which case the first timeout used would win — confirm if this
    # function is called repeatedly with different timeouts.

    downloaded = trafilatura.fetch_url(url, config=config)
    if not downloaded:
        return None
    return extract_content(downloaded, url)