docpull 2.0.0__tar.gz → 2.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docpull-2.3.0/PKG-INFO +331 -0
- docpull-2.3.0/README.md +249 -0
- {docpull-2.0.0 → docpull-2.3.0}/pyproject.toml +20 -21
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull/__init__.py +10 -4
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull/cache/manager.py +132 -18
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull/cache/streaming_dedup.py +6 -5
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull/cli.py +224 -19
- docpull-2.3.0/src/docpull/concurrency/__init__.py +7 -0
- docpull-2.3.0/src/docpull/conversion/__init__.py +37 -0
- docpull-2.3.0/src/docpull/conversion/chunking.py +224 -0
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull/conversion/extractor.py +10 -13
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull/conversion/markdown.py +22 -4
- docpull-2.3.0/src/docpull/conversion/special_cases.py +459 -0
- docpull-2.3.0/src/docpull/conversion/trafilatura_extractor.py +69 -0
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull/core/fetcher.py +285 -54
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull/discovery/__init__.py +5 -0
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull/discovery/crawler.py +25 -12
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull/discovery/filters.py +7 -6
- docpull-2.3.0/src/docpull/discovery/link_extractors/__init__.py +11 -0
- docpull-2.3.0/src/docpull/discovery/link_extractors/enhanced.py +315 -0
- docpull-2.3.0/src/docpull/discovery/link_extractors/protocols.py +33 -0
- docpull-2.3.0/src/docpull/discovery/link_extractors/static.py +160 -0
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull/discovery/sitemap.py +58 -11
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull/doctor.py +13 -9
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull/http/__init__.py +2 -1
- docpull-2.3.0/src/docpull/http/client.py +532 -0
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull/http/protocols.py +4 -2
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull/http/rate_limiter.py +115 -4
- docpull-2.3.0/src/docpull/mcp/__init__.py +9 -0
- docpull-2.3.0/src/docpull/mcp/server.py +182 -0
- docpull-2.3.0/src/docpull/mcp/sources.py +124 -0
- docpull-2.3.0/src/docpull/mcp/tools.py +238 -0
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull/metadata_extractor.py +16 -14
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull/models/__init__.py +6 -3
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull/models/config.py +155 -42
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull/models/events.py +17 -2
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull/models/profiles.py +23 -0
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull/pipeline/base.py +21 -13
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull/pipeline/steps/__init__.py +8 -0
- docpull-2.3.0/src/docpull/pipeline/steps/chunk.py +54 -0
- docpull-2.3.0/src/docpull/pipeline/steps/convert.py +189 -0
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull/pipeline/steps/fetch.py +8 -1
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull/pipeline/steps/save.py +24 -8
- docpull-2.3.0/src/docpull/pipeline/steps/save_json.py +192 -0
- docpull-2.3.0/src/docpull/pipeline/steps/save_ndjson.py +135 -0
- docpull-2.3.0/src/docpull/pipeline/steps/save_sqlite.py +169 -0
- docpull-2.3.0/src/docpull/security/robots.py +380 -0
- docpull-2.3.0/src/docpull/security/url_validator.py +266 -0
- docpull-2.3.0/src/docpull.egg-info/PKG-INFO +331 -0
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull.egg-info/SOURCES.txt +24 -2
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull.egg-info/requires.txt +14 -5
- docpull-2.3.0/tests/test_chunking.py +59 -0
- docpull-2.3.0/tests/test_cli.py +13 -0
- docpull-2.3.0/tests/test_convert_step_new.py +80 -0
- docpull-2.3.0/tests/test_fixes_v2_3_0.py +121 -0
- docpull-2.3.0/tests/test_link_extractors.py +270 -0
- docpull-2.3.0/tests/test_mcp_tools.py +101 -0
- docpull-2.3.0/tests/test_save_ndjson.py +79 -0
- docpull-2.3.0/tests/test_security_hardening.py +357 -0
- docpull-2.3.0/tests/test_special_cases.py +150 -0
- {docpull-2.0.0 → docpull-2.3.0}/tests/test_v2_conversion.py +0 -1
- {docpull-2.0.0 → docpull-2.3.0}/tests/test_v2_discovery.py +22 -0
- {docpull-2.0.0 → docpull-2.3.0}/tests/test_v2_integration.py +25 -0
- {docpull-2.0.0 → docpull-2.3.0}/tests/test_v2_pipeline.py +1 -0
- docpull-2.0.0/PKG-INFO +0 -207
- docpull-2.0.0/README.md +0 -130
- docpull-2.0.0/src/docpull/concurrency/__init__.py +0 -15
- docpull-2.0.0/src/docpull/concurrency/browser_pool.py +0 -337
- docpull-2.0.0/src/docpull/conversion/__init__.py +0 -15
- docpull-2.0.0/src/docpull/http/client.py +0 -321
- docpull-2.0.0/src/docpull/pipeline/steps/browser_fetch.py +0 -141
- docpull-2.0.0/src/docpull/pipeline/steps/convert.py +0 -134
- docpull-2.0.0/src/docpull/security/robots.py +0 -192
- docpull-2.0.0/src/docpull/security/url_validator.py +0 -174
- docpull-2.0.0/src/docpull.egg-info/PKG-INFO +0 -207
- {docpull-2.0.0 → docpull-2.3.0}/LICENSE +0 -0
- {docpull-2.0.0 → docpull-2.3.0}/setup.cfg +0 -0
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull/__main__.py +0 -0
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull/cache/__init__.py +0 -0
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull/concurrency/manager.py +0 -0
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull/conversion/protocols.py +0 -0
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull/core/__init__.py +0 -0
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull/discovery/composite.py +0 -0
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull/discovery/protocols.py +0 -0
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull/logging_config.py +0 -0
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull/pipeline/__init__.py +0 -0
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull/pipeline/steps/dedup.py +0 -0
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull/pipeline/steps/metadata.py +0 -0
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull/pipeline/steps/validate.py +0 -0
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull/py.typed +0 -0
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull/security/__init__.py +0 -0
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull.egg-info/dependency_links.txt +0 -0
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull.egg-info/entry_points.txt +0 -0
- {docpull-2.0.0 → docpull-2.3.0}/src/docpull.egg-info/top_level.txt +0 -0
docpull-2.3.0/PKG-INFO
ADDED
|
@@ -0,0 +1,331 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: docpull
|
|
3
|
+
Version: 2.3.0
|
|
4
|
+
Summary: Pull documentation from the web and convert to clean markdown
|
|
5
|
+
Author-email: Zachary Roth <support@raintree.technology>
|
|
6
|
+
Maintainer-email: Raintree Technology <support@raintree.technology>
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
Project-URL: Homepage, https://github.com/raintree-technology/docpull
|
|
9
|
+
Project-URL: Documentation, https://github.com/raintree-technology/docpull#readme
|
|
10
|
+
Project-URL: Repository, https://github.com/raintree-technology/docpull
|
|
11
|
+
Project-URL: Source Code, https://github.com/raintree-technology/docpull
|
|
12
|
+
Project-URL: Bug Tracker, https://github.com/raintree-technology/docpull/issues
|
|
13
|
+
Project-URL: Releases, https://github.com/raintree-technology/docpull/releases
|
|
14
|
+
Keywords: python,markdown,documentation,web-scraping,developer-tools,claude,ai-training-data
|
|
15
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
16
|
+
Classifier: Intended Audience :: Developers
|
|
17
|
+
Classifier: Intended Audience :: Information Technology
|
|
18
|
+
Classifier: Intended Audience :: Science/Research
|
|
19
|
+
Classifier: Intended Audience :: Education
|
|
20
|
+
Classifier: Environment :: Console
|
|
21
|
+
Classifier: Topic :: Documentation
|
|
22
|
+
Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
|
|
23
|
+
Classifier: Topic :: Software Development :: Documentation
|
|
24
|
+
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
25
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
26
|
+
Classifier: Topic :: Utilities
|
|
27
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
28
|
+
Classifier: Natural Language :: English
|
|
29
|
+
Classifier: Operating System :: OS Independent
|
|
30
|
+
Classifier: Programming Language :: Python :: 3
|
|
31
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
32
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
33
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
34
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
35
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
36
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
37
|
+
Classifier: Typing :: Typed
|
|
38
|
+
Requires-Python: >=3.10
|
|
39
|
+
Description-Content-Type: text/markdown
|
|
40
|
+
License-File: LICENSE
|
|
41
|
+
Requires-Dist: beautifulsoup4>=4.12.0
|
|
42
|
+
Requires-Dist: html2text>=2020.1.16
|
|
43
|
+
Requires-Dist: defusedxml>=0.7.1
|
|
44
|
+
Requires-Dist: extruct>=0.15.0
|
|
45
|
+
Requires-Dist: aiohttp>=3.9.0
|
|
46
|
+
Requires-Dist: rich>=13.0.0
|
|
47
|
+
Requires-Dist: pyyaml>=6.0
|
|
48
|
+
Requires-Dist: pydantic>=2.0
|
|
49
|
+
Provides-Extra: proxy
|
|
50
|
+
Requires-Dist: aiohttp-socks>=0.8.0; extra == "proxy"
|
|
51
|
+
Provides-Extra: normalize
|
|
52
|
+
Requires-Dist: url-normalize>=1.4.0; extra == "normalize"
|
|
53
|
+
Provides-Extra: trafilatura
|
|
54
|
+
Requires-Dist: trafilatura>=1.12.0; extra == "trafilatura"
|
|
55
|
+
Provides-Extra: tokens
|
|
56
|
+
Requires-Dist: tiktoken>=0.7.0; extra == "tokens"
|
|
57
|
+
Provides-Extra: mcp
|
|
58
|
+
Requires-Dist: mcp>=1.0.0; extra == "mcp"
|
|
59
|
+
Provides-Extra: llm
|
|
60
|
+
Requires-Dist: tiktoken>=0.7.0; extra == "llm"
|
|
61
|
+
Provides-Extra: all
|
|
62
|
+
Requires-Dist: aiohttp-socks>=0.8.0; extra == "all"
|
|
63
|
+
Requires-Dist: url-normalize>=1.4.0; extra == "all"
|
|
64
|
+
Requires-Dist: trafilatura>=1.12.0; extra == "all"
|
|
65
|
+
Requires-Dist: tiktoken>=0.7.0; extra == "all"
|
|
66
|
+
Requires-Dist: mcp>=1.0.0; extra == "all"
|
|
67
|
+
Provides-Extra: dev
|
|
68
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
69
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
70
|
+
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
|
|
71
|
+
Requires-Dist: black>=23.0.0; extra == "dev"
|
|
72
|
+
Requires-Dist: mypy>=1.0.0; extra == "dev"
|
|
73
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
74
|
+
Requires-Dist: bandit>=1.7.0; extra == "dev"
|
|
75
|
+
Requires-Dist: pip-audit>=2.0.0; extra == "dev"
|
|
76
|
+
Requires-Dist: pre-commit>=3.0.0; extra == "dev"
|
|
77
|
+
Requires-Dist: types-requests>=2.31.0; extra == "dev"
|
|
78
|
+
Requires-Dist: types-beautifulsoup4>=4.12.0; extra == "dev"
|
|
79
|
+
Requires-Dist: types-defusedxml>=0.7.0; extra == "dev"
|
|
80
|
+
Requires-Dist: types-pyyaml>=6.0.0; extra == "dev"
|
|
81
|
+
Dynamic: license-file
|
|
82
|
+
|
|
83
|
+
# docpull
|
|
84
|
+
|
|
85
|
+
**Security-hardened, browser-free crawler that turns static documentation sites into clean, AI-ready Markdown — fast.**
|
|
86
|
+
|
|
87
|
+
[Python 3.10+](https://www.python.org/downloads/)
|
|
88
|
+
[PyPI version](https://badge.fury.io/py/docpull)
|
|
89
|
+
[Downloads](https://pepy.tech/project/docpull)
|
|
90
|
+
[License: MIT](https://github.com/raintree-technology/docpull/blob/main/LICENSE)
|
|
91
|
+
|
|
92
|
+
<p align="center">
|
|
93
|
+
<a href="https://docpull.raintree.technology">
|
|
94
|
+
<img src="https://pub-e85a1abca36f4fd8b4300a6ec2d6f45f.r2.dev/marketing/docpull/1768954147343-iaiziy-docpull-terminal-hero.gif" alt="docpull demo" width="600">
|
|
95
|
+
</a>
|
|
96
|
+
</p>
|
|
97
|
+
|
|
98
|
+
docpull uses async HTTP (not Playwright) to fetch server-rendered pages,
|
|
99
|
+
extracts main content, and writes clean Markdown with source-URL frontmatter —
|
|
100
|
+
in seconds, with a small install footprint. It won't render JavaScript, but for
|
|
101
|
+
the large class of docs that don't need it (API references, Python/Go stdlib,
|
|
102
|
+
most dev-tool docs, OpenAPI specs, Next.js and Docusaurus builds), it is a
|
|
103
|
+
fast, auditable, sandbox-friendly way to pipe documentation into an LLM context,
|
|
104
|
+
a RAG index, or an offline archive. SSRF, XXE, DNS-rebinding, and
|
|
105
|
+
CRLF-injection protections are on by default — a necessity when an AI agent
|
|
106
|
+
is choosing the URLs.
|
|
107
|
+
|
|
108
|
+
## Install
|
|
109
|
+
|
|
110
|
+
```bash
|
|
111
|
+
pip install docpull
|
|
112
|
+
|
|
113
|
+
# Optional extras
|
|
114
|
+
pip install 'docpull[llm]' # tiktoken for token-accurate chunking
|
|
115
|
+
pip install 'docpull[trafilatura]' # alternative extractor for noisy pages
|
|
116
|
+
pip install 'docpull[mcp]' # run as an MCP server for AI agents
|
|
117
|
+
pip install 'docpull[all]' # everything above
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
## Quick start
|
|
121
|
+
|
|
122
|
+
```bash
|
|
123
|
+
# Crawl and save Markdown
|
|
124
|
+
docpull https://docs.example.com
|
|
125
|
+
|
|
126
|
+
# One page, no crawl — the fast path for agents
|
|
127
|
+
docpull https://docs.example.com/guide --single
|
|
128
|
+
|
|
129
|
+
# LLM-ready NDJSON with 4k-token chunks streamed to stdout
|
|
130
|
+
docpull https://docs.example.com --profile llm --stream | jq .
|
|
131
|
+
|
|
132
|
+
# Mirror a site for offline use
|
|
133
|
+
docpull https://docs.example.com --profile mirror --cache
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
## Framework-aware extraction
|
|
137
|
+
|
|
138
|
+
docpull inspects each page before running the generic extractor and can pull
|
|
139
|
+
content directly from framework data feeds:
|
|
140
|
+
|
|
141
|
+
| Framework | Strategy |
|
|
142
|
+
|-----------|----------|
|
|
143
|
+
| Next.js | Parses `__NEXT_DATA__` JSON |
|
|
144
|
+
| Mintlify | `__NEXT_DATA__` with Mintlify tagging |
|
|
145
|
+
| OpenAPI | Renders `openapi.json` / `swagger.json` into Markdown |
|
|
146
|
+
| Docusaurus| Detected and tagged; generic extractor produces Markdown |
|
|
147
|
+
| Sphinx | Detected and tagged; generic extractor produces Markdown |
|
|
148
|
+
|
|
149
|
+
JS-only SPAs with no server-rendered content are detected and skipped with a
|
|
150
|
+
clear reason (or, with `--strict-js-required`, reported as an error so agents
|
|
151
|
+
can route elsewhere).
|
|
152
|
+
|
|
153
|
+
## Agent-friendly features
|
|
154
|
+
|
|
155
|
+
- **`--single`** — fetch a single URL without discovery. Designed for tool loops.
|
|
156
|
+
- **`--stream`** — NDJSON one-record-per-line, flushed on every page, pipeable.
|
|
157
|
+
- **`--max-tokens-per-file N`** — split each page into token-bounded chunks on
|
|
158
|
+
heading boundaries (exact counts with tiktoken, estimate without).
|
|
159
|
+
- **`--emit-chunks`** — write one file or record per chunk instead of per page.
|
|
160
|
+
- **`--strict-js-required`** — hard-fail on JS-only pages instead of silently
|
|
161
|
+
skipping.
|
|
162
|
+
- **`--extractor trafilatura`** — swap in [trafilatura](https://trafilatura.readthedocs.io/)
|
|
163
|
+
for sites where the default heuristics struggle.
|
|
164
|
+
|
|
165
|
+
## Python API
|
|
166
|
+
|
|
167
|
+
```python
|
|
168
|
+
from docpull import fetch_one
|
|
169
|
+
|
|
170
|
+
ctx = fetch_one("https://docs.python.org/3/library/asyncio.html")
|
|
171
|
+
print(ctx.title, ctx.source_type)
|
|
172
|
+
print(ctx.markdown[:500])
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
Async streaming:
|
|
176
|
+
|
|
177
|
+
```python
|
|
178
|
+
import asyncio
|
|
179
|
+
from docpull import Fetcher, DocpullConfig, ProfileName, EventType
|
|
180
|
+
|
|
181
|
+
async def main():
|
|
182
|
+
cfg = DocpullConfig(
|
|
183
|
+
url="https://docs.example.com",
|
|
184
|
+
profile=ProfileName.LLM, # chunked NDJSON output
|
|
185
|
+
)
|
|
186
|
+
async with Fetcher(cfg) as fetcher:
|
|
187
|
+
async for event in fetcher.run():
|
|
188
|
+
if event.type == EventType.FETCH_PROGRESS:
|
|
189
|
+
print(f"{event.current}/{event.total}: {event.url}")
|
|
190
|
+
print(f"Done: {fetcher.stats.pages_fetched} pages")
|
|
191
|
+
|
|
192
|
+
asyncio.run(main())
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
Single-page from an agent tool:
|
|
196
|
+
|
|
197
|
+
```python
|
|
198
|
+
from docpull import Fetcher, DocpullConfig
|
|
199
|
+
|
|
200
|
+
async def tool_call(url: str) -> str:
|
|
201
|
+
async with Fetcher(DocpullConfig(url=url)) as f:
|
|
202
|
+
ctx = await f.fetch_one(url, save=False)
|
|
203
|
+
return ctx.markdown or ctx.error or ""
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
## Profiles
|
|
207
|
+
|
|
208
|
+
```bash
|
|
209
|
+
docpull https://site.com --profile rag # Default. Dedup, rich metadata.
|
|
210
|
+
docpull https://site.com --profile llm # NDJSON + chunks + metadata.
|
|
211
|
+
docpull https://site.com --profile mirror # Full archive, polite, cached.
|
|
212
|
+
docpull https://site.com --profile quick # Sampling: 50 pages, depth 2.
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
## MCP server
|
|
216
|
+
|
|
217
|
+
docpull ships an MCP (Model Context Protocol) server so AI agents can call it
|
|
218
|
+
directly over stdio:
|
|
219
|
+
|
|
220
|
+
```bash
|
|
221
|
+
pip install 'docpull[mcp]'
|
|
222
|
+
docpull mcp # starts the stdio server
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
Add to Claude Desktop or Claude Code:
|
|
226
|
+
|
|
227
|
+
```json
|
|
228
|
+
{
|
|
229
|
+
"mcpServers": {
|
|
230
|
+
"docpull": {
|
|
231
|
+
"command": "docpull",
|
|
232
|
+
"args": ["mcp"]
|
|
233
|
+
}
|
|
234
|
+
}
|
|
235
|
+
}
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
Tools exposed:
|
|
239
|
+
|
|
240
|
+
- `fetch_url(url, max_tokens?)` — one-shot fetch, no crawl
|
|
241
|
+
- `ensure_docs(source, force?)` — fetch a named library (cached 7 days)
|
|
242
|
+
- `list_sources(category?)` — show available aliases (react, nextjs, fastapi, …)
|
|
243
|
+
- `list_indexed()` — what has been fetched locally
|
|
244
|
+
- `grep_docs(pattern, library?)` — regex search across fetched Markdown
|
|
245
|
+
|
|
246
|
+
User-defined sources live in `~/.config/docpull-mcp/sources.yaml`:
|
|
247
|
+
|
|
248
|
+
```yaml
|
|
249
|
+
sources:
|
|
250
|
+
mydocs:
|
|
251
|
+
url: https://docs.example.com
|
|
252
|
+
description: My internal docs
|
|
253
|
+
category: internal
|
|
254
|
+
maxPages: 200
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
## Output
|
|
258
|
+
|
|
259
|
+
Markdown files with YAML frontmatter:
|
|
260
|
+
|
|
261
|
+
```markdown
|
|
262
|
+
---
|
|
263
|
+
title: "Getting Started"
|
|
264
|
+
source: https://docs.example.com/guide
|
|
265
|
+
source_type: "nextjs"
|
|
266
|
+
---
|
|
267
|
+
|
|
268
|
+
# Getting Started
|
|
269
|
+
…
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
NDJSON (one record per page or chunk):
|
|
273
|
+
|
|
274
|
+
```json
|
|
275
|
+
{"url": "...", "title": "...", "content": "...", "hash": "...", "token_count": 842, "chunk_index": 0}
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
## Security
|
|
279
|
+
|
|
280
|
+
- HTTPS-only, mandatory robots.txt compliance
|
|
281
|
+
- SSRF protection: blocks private/internal network IPs, DNS rebinding
|
|
282
|
+
- XXE protection via `defusedxml` on sitemaps
|
|
283
|
+
- Path traversal and CRLF header injection guards
|
|
284
|
+
- Auth headers stripped on cross-origin redirects
|
|
285
|
+
|
|
286
|
+
## Options
|
|
287
|
+
|
|
288
|
+
Run `docpull --help` for the full list. Highlights:
|
|
289
|
+
|
|
290
|
+
```
|
|
291
|
+
Core:
|
|
292
|
+
--profile {rag,mirror,quick,llm,custom}
|
|
293
|
+
--single Fetch one URL (no crawl)
|
|
294
|
+
--format {markdown,json,ndjson,sqlite}
|
|
295
|
+
--stream Stream NDJSON to stdout
|
|
296
|
+
|
|
297
|
+
LLM / chunking:
|
|
298
|
+
--max-tokens-per-file N
|
|
299
|
+
--tokenizer NAME tiktoken encoding (default cl100k_base)
|
|
300
|
+
--emit-chunks One file/record per chunk
|
|
301
|
+
|
|
302
|
+
Content extraction:
|
|
303
|
+
--extractor {default,trafilatura}
|
|
304
|
+
--no-special-cases Disable framework extractors
|
|
305
|
+
--strict-js-required Error on JS-only pages
|
|
306
|
+
|
|
307
|
+
Cache:
|
|
308
|
+
--cache Enable incremental updates
|
|
309
|
+
--cache-dir DIR
|
|
310
|
+
--cache-ttl DAYS
|
|
311
|
+
```
|
|
312
|
+
|
|
313
|
+
## Troubleshooting
|
|
314
|
+
|
|
315
|
+
```bash
|
|
316
|
+
docpull --doctor # Check installation
|
|
317
|
+
docpull URL --verbose # Verbose output
|
|
318
|
+
docpull URL --dry-run # Test without downloading
|
|
319
|
+
docpull URL --preview-urls # List URLs without fetching
|
|
320
|
+
```
|
|
321
|
+
|
|
322
|
+
## Links
|
|
323
|
+
|
|
324
|
+
- [Website](https://docpull.raintree.technology)
|
|
325
|
+
- [PyPI](https://pypi.org/project/docpull/)
|
|
326
|
+
- [GitHub](https://github.com/raintree-technology/docpull)
|
|
327
|
+
- [Changelog](https://github.com/raintree-technology/docpull/blob/main/docs/CHANGELOG.md)
|
|
328
|
+
|
|
329
|
+
## License
|
|
330
|
+
|
|
331
|
+
MIT
|
docpull-2.3.0/README.md
ADDED
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
# docpull
|
|
2
|
+
|
|
3
|
+
**Security-hardened, browser-free crawler that turns static documentation sites into clean, AI-ready Markdown — fast.**
|
|
4
|
+
|
|
5
|
+
[Python 3.10+](https://www.python.org/downloads/)
|
|
6
|
+
[PyPI version](https://badge.fury.io/py/docpull)
|
|
7
|
+
[Downloads](https://pepy.tech/project/docpull)
|
|
8
|
+
[License: MIT](https://github.com/raintree-technology/docpull/blob/main/LICENSE)
|
|
9
|
+
|
|
10
|
+
<p align="center">
|
|
11
|
+
<a href="https://docpull.raintree.technology">
|
|
12
|
+
<img src="https://pub-e85a1abca36f4fd8b4300a6ec2d6f45f.r2.dev/marketing/docpull/1768954147343-iaiziy-docpull-terminal-hero.gif" alt="docpull demo" width="600">
|
|
13
|
+
</a>
|
|
14
|
+
</p>
|
|
15
|
+
|
|
16
|
+
docpull uses async HTTP (not Playwright) to fetch server-rendered pages,
|
|
17
|
+
extracts main content, and writes clean Markdown with source-URL frontmatter —
|
|
18
|
+
in seconds, with a small install footprint. It won't render JavaScript, but for
|
|
19
|
+
the large class of docs that don't need it (API references, Python/Go stdlib,
|
|
20
|
+
most dev-tool docs, OpenAPI specs, Next.js and Docusaurus builds), it is a
|
|
21
|
+
fast, auditable, sandbox-friendly way to pipe documentation into an LLM context,
|
|
22
|
+
a RAG index, or an offline archive. SSRF, XXE, DNS-rebinding, and
|
|
23
|
+
CRLF-injection protections are on by default — a necessity when an AI agent
|
|
24
|
+
is choosing the URLs.
|
|
25
|
+
|
|
26
|
+
## Install
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install docpull
|
|
30
|
+
|
|
31
|
+
# Optional extras
|
|
32
|
+
pip install 'docpull[llm]' # tiktoken for token-accurate chunking
|
|
33
|
+
pip install 'docpull[trafilatura]' # alternative extractor for noisy pages
|
|
34
|
+
pip install 'docpull[mcp]' # run as an MCP server for AI agents
|
|
35
|
+
pip install 'docpull[all]' # everything above
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Quick start
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
# Crawl and save Markdown
|
|
42
|
+
docpull https://docs.example.com
|
|
43
|
+
|
|
44
|
+
# One page, no crawl — the fast path for agents
|
|
45
|
+
docpull https://docs.example.com/guide --single
|
|
46
|
+
|
|
47
|
+
# LLM-ready NDJSON with 4k-token chunks streamed to stdout
|
|
48
|
+
docpull https://docs.example.com --profile llm --stream | jq .
|
|
49
|
+
|
|
50
|
+
# Mirror a site for offline use
|
|
51
|
+
docpull https://docs.example.com --profile mirror --cache
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Framework-aware extraction
|
|
55
|
+
|
|
56
|
+
docpull inspects each page before running the generic extractor and can pull
|
|
57
|
+
content directly from framework data feeds:
|
|
58
|
+
|
|
59
|
+
| Framework | Strategy |
|
|
60
|
+
|-----------|----------|
|
|
61
|
+
| Next.js | Parses `__NEXT_DATA__` JSON |
|
|
62
|
+
| Mintlify | `__NEXT_DATA__` with Mintlify tagging |
|
|
63
|
+
| OpenAPI | Renders `openapi.json` / `swagger.json` into Markdown |
|
|
64
|
+
| Docusaurus| Detected and tagged; generic extractor produces Markdown |
|
|
65
|
+
| Sphinx | Detected and tagged; generic extractor produces Markdown |
|
|
66
|
+
|
|
67
|
+
JS-only SPAs with no server-rendered content are detected and skipped with a
|
|
68
|
+
clear reason (or, with `--strict-js-required`, reported as an error so agents
|
|
69
|
+
can route elsewhere).
|
|
70
|
+
|
|
71
|
+
## Agent-friendly features
|
|
72
|
+
|
|
73
|
+
- **`--single`** — fetch a single URL without discovery. Designed for tool loops.
|
|
74
|
+
- **`--stream`** — NDJSON one-record-per-line, flushed on every page, pipeable.
|
|
75
|
+
- **`--max-tokens-per-file N`** — split each page into token-bounded chunks on
|
|
76
|
+
heading boundaries (exact counts with tiktoken, estimate without).
|
|
77
|
+
- **`--emit-chunks`** — write one file or record per chunk instead of per page.
|
|
78
|
+
- **`--strict-js-required`** — hard-fail on JS-only pages instead of silently
|
|
79
|
+
skipping.
|
|
80
|
+
- **`--extractor trafilatura`** — swap in [trafilatura](https://trafilatura.readthedocs.io/)
|
|
81
|
+
for sites where the default heuristics struggle.
|
|
82
|
+
|
|
83
|
+
## Python API
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
from docpull import fetch_one
|
|
87
|
+
|
|
88
|
+
ctx = fetch_one("https://docs.python.org/3/library/asyncio.html")
|
|
89
|
+
print(ctx.title, ctx.source_type)
|
|
90
|
+
print(ctx.markdown[:500])
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
Async streaming:
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
import asyncio
|
|
97
|
+
from docpull import Fetcher, DocpullConfig, ProfileName, EventType
|
|
98
|
+
|
|
99
|
+
async def main():
|
|
100
|
+
cfg = DocpullConfig(
|
|
101
|
+
url="https://docs.example.com",
|
|
102
|
+
profile=ProfileName.LLM, # chunked NDJSON output
|
|
103
|
+
)
|
|
104
|
+
async with Fetcher(cfg) as fetcher:
|
|
105
|
+
async for event in fetcher.run():
|
|
106
|
+
if event.type == EventType.FETCH_PROGRESS:
|
|
107
|
+
print(f"{event.current}/{event.total}: {event.url}")
|
|
108
|
+
print(f"Done: {fetcher.stats.pages_fetched} pages")
|
|
109
|
+
|
|
110
|
+
asyncio.run(main())
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
Single-page from an agent tool:
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
from docpull import Fetcher, DocpullConfig
|
|
117
|
+
|
|
118
|
+
async def tool_call(url: str) -> str:
|
|
119
|
+
async with Fetcher(DocpullConfig(url=url)) as f:
|
|
120
|
+
ctx = await f.fetch_one(url, save=False)
|
|
121
|
+
return ctx.markdown or ctx.error or ""
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
## Profiles
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
docpull https://site.com --profile rag # Default. Dedup, rich metadata.
|
|
128
|
+
docpull https://site.com --profile llm # NDJSON + chunks + metadata.
|
|
129
|
+
docpull https://site.com --profile mirror # Full archive, polite, cached.
|
|
130
|
+
docpull https://site.com --profile quick # Sampling: 50 pages, depth 2.
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
## MCP server
|
|
134
|
+
|
|
135
|
+
docpull ships an MCP (Model Context Protocol) server so AI agents can call it
|
|
136
|
+
directly over stdio:
|
|
137
|
+
|
|
138
|
+
```bash
|
|
139
|
+
pip install 'docpull[mcp]'
|
|
140
|
+
docpull mcp # starts the stdio server
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
Add to Claude Desktop or Claude Code:
|
|
144
|
+
|
|
145
|
+
```json
|
|
146
|
+
{
|
|
147
|
+
"mcpServers": {
|
|
148
|
+
"docpull": {
|
|
149
|
+
"command": "docpull",
|
|
150
|
+
"args": ["mcp"]
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
}
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
Tools exposed:
|
|
157
|
+
|
|
158
|
+
- `fetch_url(url, max_tokens?)` — one-shot fetch, no crawl
|
|
159
|
+
- `ensure_docs(source, force?)` — fetch a named library (cached 7 days)
|
|
160
|
+
- `list_sources(category?)` — show available aliases (react, nextjs, fastapi, …)
|
|
161
|
+
- `list_indexed()` — what has been fetched locally
|
|
162
|
+
- `grep_docs(pattern, library?)` — regex search across fetched Markdown
|
|
163
|
+
|
|
164
|
+
User-defined sources live in `~/.config/docpull-mcp/sources.yaml`:
|
|
165
|
+
|
|
166
|
+
```yaml
|
|
167
|
+
sources:
|
|
168
|
+
mydocs:
|
|
169
|
+
url: https://docs.example.com
|
|
170
|
+
description: My internal docs
|
|
171
|
+
category: internal
|
|
172
|
+
maxPages: 200
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
## Output
|
|
176
|
+
|
|
177
|
+
Markdown files with YAML frontmatter:
|
|
178
|
+
|
|
179
|
+
```markdown
|
|
180
|
+
---
|
|
181
|
+
title: "Getting Started"
|
|
182
|
+
source: https://docs.example.com/guide
|
|
183
|
+
source_type: "nextjs"
|
|
184
|
+
---
|
|
185
|
+
|
|
186
|
+
# Getting Started
|
|
187
|
+
…
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
NDJSON (one record per page or chunk):
|
|
191
|
+
|
|
192
|
+
```json
|
|
193
|
+
{"url": "...", "title": "...", "content": "...", "hash": "...", "token_count": 842, "chunk_index": 0}
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
## Security
|
|
197
|
+
|
|
198
|
+
- HTTPS-only, mandatory robots.txt compliance
|
|
199
|
+
- SSRF protection: blocks private/internal network IPs, DNS rebinding
|
|
200
|
+
- XXE protection via `defusedxml` on sitemaps
|
|
201
|
+
- Path traversal and CRLF header injection guards
|
|
202
|
+
- Auth headers stripped on cross-origin redirects
|
|
203
|
+
|
|
204
|
+
## Options
|
|
205
|
+
|
|
206
|
+
Run `docpull --help` for the full list. Highlights:
|
|
207
|
+
|
|
208
|
+
```
|
|
209
|
+
Core:
|
|
210
|
+
--profile {rag,mirror,quick,llm,custom}
|
|
211
|
+
--single Fetch one URL (no crawl)
|
|
212
|
+
--format {markdown,json,ndjson,sqlite}
|
|
213
|
+
--stream Stream NDJSON to stdout
|
|
214
|
+
|
|
215
|
+
LLM / chunking:
|
|
216
|
+
--max-tokens-per-file N
|
|
217
|
+
--tokenizer NAME tiktoken encoding (default cl100k_base)
|
|
218
|
+
--emit-chunks One file/record per chunk
|
|
219
|
+
|
|
220
|
+
Content extraction:
|
|
221
|
+
--extractor {default,trafilatura}
|
|
222
|
+
--no-special-cases Disable framework extractors
|
|
223
|
+
--strict-js-required Error on JS-only pages
|
|
224
|
+
|
|
225
|
+
Cache:
|
|
226
|
+
--cache Enable incremental updates
|
|
227
|
+
--cache-dir DIR
|
|
228
|
+
--cache-ttl DAYS
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
## Troubleshooting
|
|
232
|
+
|
|
233
|
+
```bash
|
|
234
|
+
docpull --doctor # Check installation
|
|
235
|
+
docpull URL --verbose # Verbose output
|
|
236
|
+
docpull URL --dry-run # Test without downloading
|
|
237
|
+
docpull URL --preview-urls # List URLs without fetching
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
## Links
|
|
241
|
+
|
|
242
|
+
- [Website](https://docpull.raintree.technology)
|
|
243
|
+
- [PyPI](https://pypi.org/project/docpull/)
|
|
244
|
+
- [GitHub](https://github.com/raintree-technology/docpull)
|
|
245
|
+
- [Changelog](https://github.com/raintree-technology/docpull/blob/main/docs/CHANGELOG.md)
|
|
246
|
+
|
|
247
|
+
## License
|
|
248
|
+
|
|
249
|
+
MIT
|
|
@@ -4,11 +4,11 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "docpull"
|
|
7
|
-
version = "2.0.0"
|
|
7
|
+
version = "2.3.0"
|
|
8
8
|
dynamic = []
|
|
9
9
|
description = "Pull documentation from the web and convert to clean markdown"
|
|
10
10
|
readme = {file = "README.md", content-type = "text/markdown"}
|
|
11
|
-
requires-python = ">=3.9"
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
12
|
license = "MIT"
|
|
13
13
|
license-files = ["LICENSE"]
|
|
14
14
|
authors = [
|
|
@@ -51,7 +51,6 @@ classifiers = [
|
|
|
51
51
|
|
|
52
52
|
# Programming Language
|
|
53
53
|
"Programming Language :: Python :: 3",
|
|
54
|
-
"Programming Language :: Python :: 3.9",
|
|
55
54
|
"Programming Language :: Python :: 3.10",
|
|
56
55
|
"Programming Language :: Python :: 3.11",
|
|
57
56
|
"Programming Language :: Python :: 3.12",
|
|
@@ -63,7 +62,6 @@ classifiers = [
|
|
|
63
62
|
"Typing :: Typed",
|
|
64
63
|
]
|
|
65
64
|
dependencies = [
|
|
66
|
-
"requests>=2.31.0",
|
|
67
65
|
"beautifulsoup4>=4.12.0",
|
|
68
66
|
"html2text>=2020.1.16",
|
|
69
67
|
"defusedxml>=0.7.1",
|
|
@@ -71,24 +69,34 @@ dependencies = [
|
|
|
71
69
|
"aiohttp>=3.9.0",
|
|
72
70
|
"rich>=13.0.0",
|
|
73
71
|
"pyyaml>=6.0",
|
|
74
|
-
"gitpython>=3.1.40",
|
|
75
72
|
"pydantic>=2.0",
|
|
76
73
|
]
|
|
77
74
|
|
|
78
75
|
[project.optional-dependencies]
|
|
79
|
-
js = [
|
|
80
|
-
"playwright>=1.40.0",
|
|
81
|
-
]
|
|
82
76
|
proxy = [
|
|
83
77
|
"aiohttp-socks>=0.8.0",
|
|
84
78
|
]
|
|
85
79
|
normalize = [
|
|
86
80
|
"url-normalize>=1.4.0",
|
|
87
81
|
]
|
|
82
|
+
trafilatura = [
|
|
83
|
+
"trafilatura>=1.12.0",
|
|
84
|
+
]
|
|
85
|
+
tokens = [
|
|
86
|
+
"tiktoken>=0.7.0",
|
|
87
|
+
]
|
|
88
|
+
mcp = [
|
|
89
|
+
"mcp>=1.0.0",
|
|
90
|
+
]
|
|
91
|
+
llm = [
|
|
92
|
+
"tiktoken>=0.7.0",
|
|
93
|
+
]
|
|
88
94
|
all = [
|
|
89
|
-
"playwright>=1.40.0",
|
|
90
95
|
"aiohttp-socks>=0.8.0",
|
|
91
96
|
"url-normalize>=1.4.0",
|
|
97
|
+
"trafilatura>=1.12.0",
|
|
98
|
+
"tiktoken>=0.7.0",
|
|
99
|
+
"mcp>=1.0.0",
|
|
92
100
|
]
|
|
93
101
|
dev = [
|
|
94
102
|
"pytest>=7.0.0",
|
|
@@ -126,18 +134,18 @@ docpull = ["py.typed"]
|
|
|
126
134
|
|
|
127
135
|
[tool.black]
|
|
128
136
|
line-length = 110
|
|
129
|
-
target-version = ["py39", "py310", "py311", "py312", "py313", "py314"]
|
|
137
|
+
target-version = ["py310", "py311", "py312", "py313", "py314"]
|
|
130
138
|
|
|
131
139
|
[tool.ruff]
|
|
132
140
|
line-length = 110
|
|
133
|
-
target-version = "py39"
|
|
141
|
+
target-version = "py310"
|
|
134
142
|
|
|
135
143
|
[tool.ruff.lint]
|
|
136
144
|
select = ["E", "F", "W", "I", "N", "UP", "B", "A", "C4", "SIM"]
|
|
137
145
|
ignore = ["A003"] # Allow 'type' and 'format' as field names in data models
|
|
138
146
|
|
|
139
147
|
[tool.mypy]
|
|
140
|
-
python_version = "3.9"
|
|
148
|
+
python_version = "3.10"
|
|
141
149
|
warn_return_any = true
|
|
142
150
|
warn_unused_configs = true
|
|
143
151
|
disallow_untyped_defs = true
|
|
@@ -154,10 +162,6 @@ init_forbid_extra = true
|
|
|
154
162
|
init_typed = true
|
|
155
163
|
warn_required_dynamic_aliases = true
|
|
156
164
|
|
|
157
|
-
[[tool.mypy.overrides]]
|
|
158
|
-
module = "playwright.*"
|
|
159
|
-
ignore_missing_imports = true
|
|
160
|
-
|
|
161
165
|
[[tool.mypy.overrides]]
|
|
162
166
|
module = "extruct.*"
|
|
163
167
|
ignore_missing_imports = true
|
|
@@ -171,11 +175,6 @@ module = "docpull.models.*"
|
|
|
171
175
|
disallow_any_unimported = false
|
|
172
176
|
warn_return_any = false
|
|
173
177
|
|
|
174
|
-
[[tool.mypy.overrides]]
|
|
175
|
-
module = "docpull.concurrency.browser_pool"
|
|
176
|
-
disallow_any_unimported = false
|
|
177
|
-
warn_return_any = false
|
|
178
|
-
|
|
179
178
|
[[tool.mypy.overrides]]
|
|
180
179
|
module = "tests.*"
|
|
181
180
|
disallow_untyped_defs = false
|