docpull 2.2.0__tar.gz → 2.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100) hide show
  1. docpull-2.4.0/PKG-INFO +356 -0
  2. docpull-2.4.0/README.md +274 -0
  3. {docpull-2.2.0 → docpull-2.4.0}/pyproject.toml +18 -19
  4. {docpull-2.2.0 → docpull-2.4.0}/src/docpull/__init__.py +10 -4
  5. {docpull-2.2.0 → docpull-2.4.0}/src/docpull/cli.py +168 -24
  6. docpull-2.4.0/src/docpull/concurrency/__init__.py +7 -0
  7. {docpull-2.2.0 → docpull-2.4.0}/src/docpull/concurrency/manager.py +16 -4
  8. docpull-2.4.0/src/docpull/conversion/__init__.py +37 -0
  9. docpull-2.4.0/src/docpull/conversion/chunking.py +224 -0
  10. {docpull-2.2.0 → docpull-2.4.0}/src/docpull/conversion/extractor.py +119 -10
  11. {docpull-2.2.0 → docpull-2.4.0}/src/docpull/conversion/markdown.py +65 -3
  12. docpull-2.4.0/src/docpull/conversion/special_cases.py +669 -0
  13. docpull-2.4.0/src/docpull/conversion/trafilatura_extractor.py +69 -0
  14. docpull-2.4.0/src/docpull/core/fetcher.py +1005 -0
  15. {docpull-2.2.0 → docpull-2.4.0}/src/docpull/discovery/__init__.py +0 -12
  16. {docpull-2.2.0 → docpull-2.4.0}/src/docpull/discovery/composite.py +2 -3
  17. {docpull-2.2.0 → docpull-2.4.0}/src/docpull/discovery/crawler.py +1 -1
  18. {docpull-2.2.0 → docpull-2.4.0}/src/docpull/discovery/filters.py +2 -2
  19. docpull-2.4.0/src/docpull/discovery/link_extractors/__init__.py +11 -0
  20. {docpull-2.2.0 → docpull-2.4.0}/src/docpull/discovery/protocols.py +2 -2
  21. {docpull-2.2.0 → docpull-2.4.0}/src/docpull/discovery/sitemap.py +19 -1
  22. {docpull-2.2.0 → docpull-2.4.0}/src/docpull/doctor.py +8 -5
  23. docpull-2.4.0/src/docpull/http/client.py +544 -0
  24. {docpull-2.2.0 → docpull-2.4.0}/src/docpull/logging_config.py +2 -3
  25. docpull-2.4.0/src/docpull/mcp/__init__.py +9 -0
  26. docpull-2.4.0/src/docpull/mcp/server.py +200 -0
  27. docpull-2.4.0/src/docpull/mcp/sources.py +124 -0
  28. docpull-2.4.0/src/docpull/mcp/tools.py +360 -0
  29. {docpull-2.2.0 → docpull-2.4.0}/src/docpull/models/__init__.py +0 -2
  30. {docpull-2.2.0 → docpull-2.4.0}/src/docpull/models/config.py +166 -38
  31. {docpull-2.2.0 → docpull-2.4.0}/src/docpull/models/events.py +12 -14
  32. docpull-2.4.0/src/docpull/models/profiles.py +145 -0
  33. {docpull-2.2.0 → docpull-2.4.0}/src/docpull/pipeline/base.py +8 -1
  34. {docpull-2.2.0 → docpull-2.4.0}/src/docpull/pipeline/steps/__init__.py +4 -0
  35. docpull-2.4.0/src/docpull/pipeline/steps/chunk.py +53 -0
  36. docpull-2.4.0/src/docpull/pipeline/steps/convert.py +272 -0
  37. {docpull-2.2.0 → docpull-2.4.0}/src/docpull/pipeline/steps/dedup.py +11 -6
  38. {docpull-2.2.0 → docpull-2.4.0}/src/docpull/pipeline/steps/fetch.py +95 -8
  39. {docpull-2.2.0 → docpull-2.4.0}/src/docpull/pipeline/steps/metadata.py +3 -4
  40. docpull-2.4.0/src/docpull/pipeline/steps/save.py +247 -0
  41. {docpull-2.2.0 → docpull-2.4.0}/src/docpull/pipeline/steps/save_json.py +2 -1
  42. docpull-2.4.0/src/docpull/pipeline/steps/save_ndjson.py +135 -0
  43. {docpull-2.2.0 → docpull-2.4.0}/src/docpull/pipeline/steps/save_sqlite.py +2 -4
  44. {docpull-2.2.0 → docpull-2.4.0}/src/docpull/pipeline/steps/validate.py +18 -17
  45. docpull-2.4.0/src/docpull/security/robots.py +380 -0
  46. {docpull-2.2.0 → docpull-2.4.0}/src/docpull/security/url_validator.py +111 -20
  47. docpull-2.4.0/src/docpull.egg-info/PKG-INFO +356 -0
  48. {docpull-2.2.0 → docpull-2.4.0}/src/docpull.egg-info/SOURCES.txt +19 -3
  49. {docpull-2.2.0 → docpull-2.4.0}/src/docpull.egg-info/requires.txt +14 -5
  50. docpull-2.4.0/tests/test_cache_conditional_get.py +187 -0
  51. docpull-2.4.0/tests/test_chunking.py +59 -0
  52. docpull-2.4.0/tests/test_cli.py +13 -0
  53. docpull-2.4.0/tests/test_convert_step_new.py +80 -0
  54. docpull-2.4.0/tests/test_fixes_v2_3_0.py +121 -0
  55. docpull-2.4.0/tests/test_mcp_tools.py +189 -0
  56. docpull-2.4.0/tests/test_naming.py +120 -0
  57. docpull-2.4.0/tests/test_save_ndjson.py +79 -0
  58. docpull-2.4.0/tests/test_security_hardening.py +393 -0
  59. docpull-2.4.0/tests/test_special_cases.py +311 -0
  60. {docpull-2.2.0 → docpull-2.4.0}/tests/test_v2_conversion.py +237 -0
  61. {docpull-2.2.0 → docpull-2.4.0}/tests/test_v2_discovery.py +21 -0
  62. {docpull-2.2.0 → docpull-2.4.0}/tests/test_v2_integration.py +59 -0
  63. {docpull-2.2.0 → docpull-2.4.0}/tests/test_v2_pipeline.py +131 -0
  64. docpull-2.2.0/PKG-INFO +0 -208
  65. docpull-2.2.0/README.md +0 -131
  66. docpull-2.2.0/src/docpull/concurrency/__init__.py +0 -15
  67. docpull-2.2.0/src/docpull/concurrency/browser_pool.py +0 -336
  68. docpull-2.2.0/src/docpull/conversion/__init__.py +0 -15
  69. docpull-2.2.0/src/docpull/core/fetcher.py +0 -648
  70. docpull-2.2.0/src/docpull/discovery/link_extractors/__init__.py +0 -22
  71. docpull-2.2.0/src/docpull/discovery/link_extractors/browser.py +0 -294
  72. docpull-2.2.0/src/docpull/http/client.py +0 -353
  73. docpull-2.2.0/src/docpull/models/profiles.py +0 -103
  74. docpull-2.2.0/src/docpull/pipeline/steps/browser_fetch.py +0 -141
  75. docpull-2.2.0/src/docpull/pipeline/steps/convert.py +0 -134
  76. docpull-2.2.0/src/docpull/pipeline/steps/save.py +0 -167
  77. docpull-2.2.0/src/docpull/security/robots.py +0 -193
  78. docpull-2.2.0/src/docpull.egg-info/PKG-INFO +0 -208
  79. {docpull-2.2.0 → docpull-2.4.0}/LICENSE +0 -0
  80. {docpull-2.2.0 → docpull-2.4.0}/setup.cfg +0 -0
  81. {docpull-2.2.0 → docpull-2.4.0}/src/docpull/__main__.py +0 -0
  82. {docpull-2.2.0 → docpull-2.4.0}/src/docpull/cache/__init__.py +0 -0
  83. {docpull-2.2.0 → docpull-2.4.0}/src/docpull/cache/manager.py +0 -0
  84. {docpull-2.2.0 → docpull-2.4.0}/src/docpull/cache/streaming_dedup.py +0 -0
  85. {docpull-2.2.0 → docpull-2.4.0}/src/docpull/conversion/protocols.py +0 -0
  86. {docpull-2.2.0 → docpull-2.4.0}/src/docpull/core/__init__.py +0 -0
  87. {docpull-2.2.0 → docpull-2.4.0}/src/docpull/discovery/link_extractors/enhanced.py +0 -0
  88. {docpull-2.2.0 → docpull-2.4.0}/src/docpull/discovery/link_extractors/protocols.py +0 -0
  89. {docpull-2.2.0 → docpull-2.4.0}/src/docpull/discovery/link_extractors/static.py +0 -0
  90. {docpull-2.2.0 → docpull-2.4.0}/src/docpull/http/__init__.py +0 -0
  91. {docpull-2.2.0 → docpull-2.4.0}/src/docpull/http/protocols.py +0 -0
  92. {docpull-2.2.0 → docpull-2.4.0}/src/docpull/http/rate_limiter.py +0 -0
  93. {docpull-2.2.0 → docpull-2.4.0}/src/docpull/metadata_extractor.py +0 -0
  94. {docpull-2.2.0 → docpull-2.4.0}/src/docpull/pipeline/__init__.py +0 -0
  95. {docpull-2.2.0 → docpull-2.4.0}/src/docpull/py.typed +0 -0
  96. {docpull-2.2.0 → docpull-2.4.0}/src/docpull/security/__init__.py +0 -0
  97. {docpull-2.2.0 → docpull-2.4.0}/src/docpull.egg-info/dependency_links.txt +0 -0
  98. {docpull-2.2.0 → docpull-2.4.0}/src/docpull.egg-info/entry_points.txt +0 -0
  99. {docpull-2.2.0 → docpull-2.4.0}/src/docpull.egg-info/top_level.txt +0 -0
  100. {docpull-2.2.0 → docpull-2.4.0}/tests/test_link_extractors.py +0 -0
docpull-2.4.0/PKG-INFO ADDED
@@ -0,0 +1,356 @@
1
+ Metadata-Version: 2.4
2
+ Name: docpull
3
+ Version: 2.4.0
4
+ Summary: Pull documentation from the web and convert to clean markdown
5
+ Author-email: Zachary Roth <support@raintree.technology>
6
+ Maintainer-email: Raintree Technology <support@raintree.technology>
7
+ License-Expression: MIT
8
+ Project-URL: Homepage, https://github.com/raintree-technology/docpull
9
+ Project-URL: Documentation, https://github.com/raintree-technology/docpull#readme
10
+ Project-URL: Repository, https://github.com/raintree-technology/docpull
11
+ Project-URL: Source Code, https://github.com/raintree-technology/docpull
12
+ Project-URL: Bug Tracker, https://github.com/raintree-technology/docpull/issues
13
+ Project-URL: Releases, https://github.com/raintree-technology/docpull/releases
14
+ Keywords: python,markdown,documentation,web-scraping,developer-tools,claude,ai-training-data
15
+ Classifier: Development Status :: 5 - Production/Stable
16
+ Classifier: Intended Audience :: Developers
17
+ Classifier: Intended Audience :: Information Technology
18
+ Classifier: Intended Audience :: Science/Research
19
+ Classifier: Intended Audience :: Education
20
+ Classifier: Environment :: Console
21
+ Classifier: Topic :: Documentation
22
+ Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
23
+ Classifier: Topic :: Software Development :: Documentation
24
+ Classifier: Topic :: Text Processing :: Markup :: HTML
25
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
26
+ Classifier: Topic :: Utilities
27
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
28
+ Classifier: Natural Language :: English
29
+ Classifier: Operating System :: OS Independent
30
+ Classifier: Programming Language :: Python :: 3
31
+ Classifier: Programming Language :: Python :: 3.10
32
+ Classifier: Programming Language :: Python :: 3.11
33
+ Classifier: Programming Language :: Python :: 3.12
34
+ Classifier: Programming Language :: Python :: 3.13
35
+ Classifier: Programming Language :: Python :: 3.14
36
+ Classifier: Programming Language :: Python :: 3 :: Only
37
+ Classifier: Typing :: Typed
38
+ Requires-Python: >=3.10
39
+ Description-Content-Type: text/markdown
40
+ License-File: LICENSE
41
+ Requires-Dist: beautifulsoup4>=4.12.0
42
+ Requires-Dist: html2text>=2020.1.16
43
+ Requires-Dist: defusedxml>=0.7.1
44
+ Requires-Dist: extruct>=0.15.0
45
+ Requires-Dist: aiohttp>=3.9.0
46
+ Requires-Dist: rich>=13.0.0
47
+ Requires-Dist: pyyaml>=6.0
48
+ Requires-Dist: pydantic>=2.0
49
+ Provides-Extra: proxy
50
+ Requires-Dist: aiohttp-socks>=0.8.0; extra == "proxy"
51
+ Provides-Extra: normalize
52
+ Requires-Dist: url-normalize>=1.4.0; extra == "normalize"
53
+ Provides-Extra: trafilatura
54
+ Requires-Dist: trafilatura>=1.12.0; extra == "trafilatura"
55
+ Provides-Extra: tokens
56
+ Requires-Dist: tiktoken>=0.7.0; extra == "tokens"
57
+ Provides-Extra: mcp
58
+ Requires-Dist: mcp>=1.0.0; extra == "mcp"
59
+ Provides-Extra: llm
60
+ Requires-Dist: tiktoken>=0.7.0; extra == "llm"
61
+ Provides-Extra: all
62
+ Requires-Dist: aiohttp-socks>=0.8.0; extra == "all"
63
+ Requires-Dist: url-normalize>=1.4.0; extra == "all"
64
+ Requires-Dist: trafilatura>=1.12.0; extra == "all"
65
+ Requires-Dist: tiktoken>=0.7.0; extra == "all"
66
+ Requires-Dist: mcp>=1.0.0; extra == "all"
67
+ Provides-Extra: dev
68
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
69
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
70
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
71
+ Requires-Dist: black>=23.0.0; extra == "dev"
72
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
73
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
74
+ Requires-Dist: bandit>=1.7.0; extra == "dev"
75
+ Requires-Dist: pip-audit>=2.0.0; extra == "dev"
76
+ Requires-Dist: pre-commit>=3.0.0; extra == "dev"
77
+ Requires-Dist: types-requests>=2.31.0; extra == "dev"
78
+ Requires-Dist: types-beautifulsoup4>=4.12.0; extra == "dev"
79
+ Requires-Dist: types-defusedxml>=0.7.0; extra == "dev"
80
+ Requires-Dist: types-pyyaml>=6.0.0; extra == "dev"
81
+ Dynamic: license-file
82
+
83
+ # docpull
84
+
85
+ **Security-hardened, browser-free crawler that turns static documentation sites into clean, AI-ready Markdown — fast.**
86
+
87
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
88
+ [![PyPI version](https://badge.fury.io/py/docpull.svg)](https://badge.fury.io/py/docpull)
89
+ [![Downloads](https://pepy.tech/badge/docpull)](https://pepy.tech/project/docpull)
90
+ [![License: MIT](https://img.shields.io/github/license/raintree-technology/docpull)](https://github.com/raintree-technology/docpull/blob/main/LICENSE)
91
+
92
+ <p align="center">
93
+ <a href="https://docpull.raintree.technology">
94
+ <img src="https://pub-e85a1abca36f4fd8b4300a6ec2d6f45f.r2.dev/marketing/docpull/1768954147343-iaiziy-docpull-terminal-hero.gif" alt="docpull demo" width="600">
95
+ </a>
96
+ </p>
97
+
98
+ docpull uses async HTTP (not Playwright) to fetch server-rendered pages,
99
+ extracts main content, and writes clean Markdown with source-URL frontmatter —
100
+ in seconds, with a small install footprint. It won't render JavaScript, but for
101
+ the large class of docs that don't need it (API references, Python/Go stdlib,
102
+ most dev-tool docs, OpenAPI specs, Next.js and Docusaurus builds), it is a
103
+ fast, auditable, sandbox-friendly way to pipe documentation into an LLM context,
104
+ a RAG index, or an offline archive. SSRF, XXE, DNS-rebinding, and
105
+ CRLF-injection protections are on by default — a necessity when an AI agent
106
+ is choosing the URLs.
107
+
108
+ ## Install
109
+
110
+ ```bash
111
+ pip install docpull
112
+
113
+ # Optional extras
114
+ pip install 'docpull[llm]' # tiktoken for token-accurate chunking
115
+ pip install 'docpull[trafilatura]' # alternative extractor for noisy pages
116
+ pip install 'docpull[mcp]' # run as an MCP server for AI agents
117
+ pip install 'docpull[all]' # everything above
118
+ ```
119
+
120
+ ## Quick start
121
+
122
+ ```bash
123
+ # Crawl and save Markdown
124
+ docpull https://docs.example.com
125
+
126
+ # One page, no crawl — the fast path for agents
127
+ docpull https://docs.example.com/guide --single
128
+
129
+ # LLM-ready NDJSON with 4k-token chunks streamed to stdout
130
+ docpull https://docs.example.com --profile llm --stream | jq .
131
+
132
+ # Mirror a site for offline use
133
+ docpull https://docs.example.com --profile mirror --cache
134
+ ```
135
+
136
+ ## Framework-aware extraction
137
+
138
+ docpull inspects each page before running the generic extractor and can pull
139
+ content directly from framework data feeds:
140
+
141
+ | Framework | Strategy |
142
+ |-----------|----------|
143
+ | Next.js | Parses `__NEXT_DATA__` JSON |
144
+ | Mintlify | `__NEXT_DATA__` with Mintlify tagging |
145
+ | OpenAPI | Renders `openapi.json` / `swagger.json` into Markdown |
146
+ | Docusaurus | Detected and tagged; generic extractor produces Markdown |
147
+ | Sphinx | Detected and tagged; generic extractor produces Markdown |
148
+
149
+ JS-only SPAs with no server-rendered content are detected and skipped with a
150
+ clear reason (or, with `--strict-js-required`, reported as an error so agents
151
+ can route elsewhere).
152
+
153
+ ## Agent-friendly features
154
+
155
+ - **`--single`** — fetch a single URL without discovery. Designed for tool loops.
156
+ - **`--stream`** — NDJSON one-record-per-line, flushed on every page, pipeable.
157
+ - **`--max-tokens-per-file N`** — split each page into token-bounded chunks on
158
+ heading boundaries (exact counts with tiktoken, estimate without).
159
+ - **`--emit-chunks`** — write one file or record per chunk instead of per page.
160
+ - **`--strict-js-required`** — hard-fail on JS-only pages instead of silently
161
+ skipping.
162
+ - **`--extractor trafilatura`** — swap in [trafilatura](https://trafilatura.readthedocs.io/)
163
+ for sites where the default heuristics struggle.
164
+
165
+ ## Python API
166
+
167
+ ```python
168
+ from docpull import fetch_one
169
+
170
+ ctx = fetch_one("https://docs.python.org/3/library/asyncio.html")
171
+ print(ctx.title, ctx.source_type)
172
+ print(ctx.markdown[:500])
173
+ ```
174
+
175
+ Async streaming:
176
+
177
+ ```python
178
+ import asyncio
179
+ from docpull import Fetcher, DocpullConfig, ProfileName, EventType
180
+
181
+ async def main():
182
+ cfg = DocpullConfig(
183
+ url="https://docs.example.com",
184
+ profile=ProfileName.LLM, # chunked NDJSON output
185
+ )
186
+ async with Fetcher(cfg) as fetcher:
187
+ async for event in fetcher.run():
188
+ if event.type == EventType.FETCH_PROGRESS:
189
+ print(f"{event.current}/{event.total}: {event.url}")
190
+ print(f"Done: {fetcher.stats.pages_fetched} pages")
191
+
192
+ asyncio.run(main())
193
+ ```
194
+
195
+ Single-page from an agent tool:
196
+
197
+ ```python
198
+ from docpull import Fetcher, DocpullConfig
199
+
200
+ async def tool_call(url: str) -> str:
201
+ async with Fetcher(DocpullConfig(url=url)) as f:
202
+ ctx = await f.fetch_one(url, save=False)
203
+ return ctx.markdown or ctx.error or ""
204
+ ```
205
+
206
+ ## Profiles
207
+
208
+ ```bash
209
+ docpull https://site.com --profile rag # Default. Dedup, rich metadata.
210
+ docpull https://site.com --profile llm # NDJSON + chunks + metadata.
211
+ docpull https://site.com --profile mirror # Full archive, polite, cached.
212
+ docpull https://site.com --profile quick # Sampling: 50 pages, depth 2.
213
+ ```
214
+
215
+ ## MCP server
216
+
217
+ docpull ships an MCP (Model Context Protocol) server so AI agents can call it
218
+ directly over stdio:
219
+
220
+ ```bash
221
+ pip install 'docpull[mcp]'
222
+ docpull mcp # starts the stdio server
223
+ ```
224
+
225
+ Add to Claude Desktop or Claude Code:
226
+
227
+ ```json
228
+ {
229
+ "mcpServers": {
230
+ "docpull": {
231
+ "command": "docpull",
232
+ "args": ["mcp"]
233
+ }
234
+ }
235
+ }
236
+ ```
237
+
238
+ Tools exposed:
239
+
240
+ - `fetch_url(url, max_tokens?)` — one-shot fetch, no crawl
241
+ - `ensure_docs(source, force?)` — fetch a named library (cached 7 days)
242
+ - `list_sources(category?)` — show available aliases (react, nextjs, fastapi, …)
243
+ - `list_indexed()` — what has been fetched locally
244
+ - `grep_docs(pattern, library?)` — regex search across fetched Markdown
245
+
246
+ User-defined sources live in `~/.config/docpull-mcp/sources.yaml`:
247
+
248
+ ```yaml
249
+ sources:
250
+ mydocs:
251
+ url: https://docs.example.com
252
+ description: My internal docs
253
+ category: internal
254
+ maxPages: 200
255
+ ```
256
+
257
+ ## Output
258
+
259
+ Markdown files with YAML frontmatter:
260
+
261
+ ```markdown
262
+ ---
263
+ title: "Getting Started"
264
+ source: https://docs.example.com/guide
265
+ source_type: "nextjs"
266
+ ---
267
+
268
+ # Getting Started
269
+
270
+ ```
271
+
272
+ NDJSON (one record per page or chunk):
273
+
274
+ ```json
275
+ {"url": "...", "title": "...", "content": "...", "hash": "...", "token_count": 842, "chunk_index": 0}
276
+ ```
277
+
278
+ ## Security
279
+
280
+ - HTTPS-only, mandatory robots.txt compliance
281
+ - SSRF protection: blocks private/internal network IPs and guards against
282
+ DNS rebinding via connect-time address pinning
283
+ - XXE protection via `defusedxml` on sitemaps
284
+ - Path traversal and CRLF header injection guards
285
+ - Auth headers stripped on cross-origin redirects
286
+
287
+ When running with `--proxy`, DNS pinning is delegated to the proxy. Pass
288
+ `--require-pinned-dns` to refuse this configuration and keep the
289
+ connector-level SSRF guarantees in effect.
290
+
291
+ ## Options
292
+
293
+ Run `docpull --help` for the full list. Highlights:
294
+
295
+ ```
296
+ Core:
297
+ --profile {rag,mirror,quick,llm,custom}
298
+ --single Fetch one URL (no crawl)
299
+ --format {markdown,json,ndjson,sqlite}
300
+ --stream Stream NDJSON to stdout
301
+
302
+ LLM / chunking:
303
+ --max-tokens-per-file N
304
+ --tokenizer NAME tiktoken encoding (default cl100k_base)
305
+ --emit-chunks One file/record per chunk
306
+
307
+ Content extraction:
308
+ --extractor {default,trafilatura}
309
+ --no-special-cases Disable framework extractors
310
+ --strict-js-required Error on JS-only pages
311
+
312
+ Cache:
313
+ --cache Enable incremental updates
314
+ --cache-dir DIR
315
+ --cache-ttl DAYS
316
+ ```
317
+
318
+ ## Performance
319
+
320
+ End-to-end numbers from `tests/benchmarks/test_10k_pages.py` against a
321
+ synthetic 10,000-page localhost site (RAG profile, `max_concurrent=50`,
322
+ HTTP keep-alive, 5% injected duplicate content):
323
+
324
+ | Metric | Value |
325
+ |---|---|
326
+ | Total wall time | ~27 s |
327
+ | Discovery (sitemap parse) | ~80 ms |
328
+ | Fetch + convert + save | ~27 s |
329
+ | Per-page latency p50 / p95 / p99 | ~2.6 / 4.6 / 5.3 ms |
330
+ | Peak RSS delta from baseline | ~28 MB |
331
+ | Cache manifest size on disk | ~3.4 MB |
332
+ | Duplicates detected (5% injected) | 499 / 500 |
333
+
334
+ Reproduce with `make benchmark` (requires `aiohttp`; runs the gated
335
+ benchmark in `tests/benchmarks/` and prints a JSON line you can pipe
336
+ into trend tooling).
337
+
338
+ ## Troubleshooting
339
+
340
+ ```bash
341
+ docpull --doctor # Check installation
342
+ docpull URL --verbose # Verbose output
343
+ docpull URL --dry-run # Test without downloading
344
+ docpull URL --preview-urls # List URLs without fetching
345
+ ```
346
+
347
+ ## Links
348
+
349
+ - [Website](https://docpull.raintree.technology)
350
+ - [PyPI](https://pypi.org/project/docpull/)
351
+ - [GitHub](https://github.com/raintree-technology/docpull)
352
+ - [Changelog](https://github.com/raintree-technology/docpull/blob/main/docs/CHANGELOG.md)
353
+
354
+ ## License
355
+
356
+ MIT
docpull-2.4.0/README.md ADDED
@@ -0,0 +1,274 @@
1
+ # docpull
2
+
3
+ **Security-hardened, browser-free crawler that turns static documentation sites into clean, AI-ready Markdown — fast.**
4
+
5
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
6
+ [![PyPI version](https://badge.fury.io/py/docpull.svg)](https://badge.fury.io/py/docpull)
7
+ [![Downloads](https://pepy.tech/badge/docpull)](https://pepy.tech/project/docpull)
8
+ [![License: MIT](https://img.shields.io/github/license/raintree-technology/docpull)](https://github.com/raintree-technology/docpull/blob/main/LICENSE)
9
+
10
+ <p align="center">
11
+ <a href="https://docpull.raintree.technology">
12
+ <img src="https://pub-e85a1abca36f4fd8b4300a6ec2d6f45f.r2.dev/marketing/docpull/1768954147343-iaiziy-docpull-terminal-hero.gif" alt="docpull demo" width="600">
13
+ </a>
14
+ </p>
15
+
16
+ docpull uses async HTTP (not Playwright) to fetch server-rendered pages,
17
+ extracts main content, and writes clean Markdown with source-URL frontmatter —
18
+ in seconds, with a small install footprint. It won't render JavaScript, but for
19
+ the large class of docs that don't need it (API references, Python/Go stdlib,
20
+ most dev-tool docs, OpenAPI specs, Next.js and Docusaurus builds), it is a
21
+ fast, auditable, sandbox-friendly way to pipe documentation into an LLM context,
22
+ a RAG index, or an offline archive. SSRF, XXE, DNS-rebinding, and
23
+ CRLF-injection protections are on by default — a necessity when an AI agent
24
+ is choosing the URLs.
25
+
26
+ ## Install
27
+
28
+ ```bash
29
+ pip install docpull
30
+
31
+ # Optional extras
32
+ pip install 'docpull[llm]' # tiktoken for token-accurate chunking
33
+ pip install 'docpull[trafilatura]' # alternative extractor for noisy pages
34
+ pip install 'docpull[mcp]' # run as an MCP server for AI agents
35
+ pip install 'docpull[all]' # everything above
36
+ ```
37
+
38
+ ## Quick start
39
+
40
+ ```bash
41
+ # Crawl and save Markdown
42
+ docpull https://docs.example.com
43
+
44
+ # One page, no crawl — the fast path for agents
45
+ docpull https://docs.example.com/guide --single
46
+
47
+ # LLM-ready NDJSON with 4k-token chunks streamed to stdout
48
+ docpull https://docs.example.com --profile llm --stream | jq .
49
+
50
+ # Mirror a site for offline use
51
+ docpull https://docs.example.com --profile mirror --cache
52
+ ```
53
+
54
+ ## Framework-aware extraction
55
+
56
+ docpull inspects each page before running the generic extractor and can pull
57
+ content directly from framework data feeds:
58
+
59
+ | Framework | Strategy |
60
+ |-----------|----------|
61
+ | Next.js | Parses `__NEXT_DATA__` JSON |
62
+ | Mintlify | `__NEXT_DATA__` with Mintlify tagging |
63
+ | OpenAPI | Renders `openapi.json` / `swagger.json` into Markdown |
64
+ | Docusaurus | Detected and tagged; generic extractor produces Markdown |
65
+ | Sphinx | Detected and tagged; generic extractor produces Markdown |
66
+
67
+ JS-only SPAs with no server-rendered content are detected and skipped with a
68
+ clear reason (or, with `--strict-js-required`, reported as an error so agents
69
+ can route elsewhere).
70
+
71
+ ## Agent-friendly features
72
+
73
+ - **`--single`** — fetch a single URL without discovery. Designed for tool loops.
74
+ - **`--stream`** — NDJSON one-record-per-line, flushed on every page, pipeable.
75
+ - **`--max-tokens-per-file N`** — split each page into token-bounded chunks on
76
+ heading boundaries (exact counts with tiktoken, estimate without).
77
+ - **`--emit-chunks`** — write one file or record per chunk instead of per page.
78
+ - **`--strict-js-required`** — hard-fail on JS-only pages instead of silently
79
+ skipping.
80
+ - **`--extractor trafilatura`** — swap in [trafilatura](https://trafilatura.readthedocs.io/)
81
+ for sites where the default heuristics struggle.
82
+
83
+ ## Python API
84
+
85
+ ```python
86
+ from docpull import fetch_one
87
+
88
+ ctx = fetch_one("https://docs.python.org/3/library/asyncio.html")
89
+ print(ctx.title, ctx.source_type)
90
+ print(ctx.markdown[:500])
91
+ ```
92
+
93
+ Async streaming:
94
+
95
+ ```python
96
+ import asyncio
97
+ from docpull import Fetcher, DocpullConfig, ProfileName, EventType
98
+
99
+ async def main():
100
+ cfg = DocpullConfig(
101
+ url="https://docs.example.com",
102
+ profile=ProfileName.LLM, # chunked NDJSON output
103
+ )
104
+ async with Fetcher(cfg) as fetcher:
105
+ async for event in fetcher.run():
106
+ if event.type == EventType.FETCH_PROGRESS:
107
+ print(f"{event.current}/{event.total}: {event.url}")
108
+ print(f"Done: {fetcher.stats.pages_fetched} pages")
109
+
110
+ asyncio.run(main())
111
+ ```
112
+
113
+ Single-page from an agent tool:
114
+
115
+ ```python
116
+ from docpull import Fetcher, DocpullConfig
117
+
118
+ async def tool_call(url: str) -> str:
119
+ async with Fetcher(DocpullConfig(url=url)) as f:
120
+ ctx = await f.fetch_one(url, save=False)
121
+ return ctx.markdown or ctx.error or ""
122
+ ```
123
+
124
+ ## Profiles
125
+
126
+ ```bash
127
+ docpull https://site.com --profile rag # Default. Dedup, rich metadata.
128
+ docpull https://site.com --profile llm # NDJSON + chunks + metadata.
129
+ docpull https://site.com --profile mirror # Full archive, polite, cached.
130
+ docpull https://site.com --profile quick # Sampling: 50 pages, depth 2.
131
+ ```
132
+
133
+ ## MCP server
134
+
135
+ docpull ships an MCP (Model Context Protocol) server so AI agents can call it
136
+ directly over stdio:
137
+
138
+ ```bash
139
+ pip install 'docpull[mcp]'
140
+ docpull mcp # starts the stdio server
141
+ ```
142
+
143
+ Add to Claude Desktop or Claude Code:
144
+
145
+ ```json
146
+ {
147
+ "mcpServers": {
148
+ "docpull": {
149
+ "command": "docpull",
150
+ "args": ["mcp"]
151
+ }
152
+ }
153
+ }
154
+ ```
155
+
156
+ Tools exposed:
157
+
158
+ - `fetch_url(url, max_tokens?)` — one-shot fetch, no crawl
159
+ - `ensure_docs(source, force?)` — fetch a named library (cached 7 days)
160
+ - `list_sources(category?)` — show available aliases (react, nextjs, fastapi, …)
161
+ - `list_indexed()` — what has been fetched locally
162
+ - `grep_docs(pattern, library?)` — regex search across fetched Markdown
163
+
164
+ User-defined sources live in `~/.config/docpull-mcp/sources.yaml`:
165
+
166
+ ```yaml
167
+ sources:
168
+ mydocs:
169
+ url: https://docs.example.com
170
+ description: My internal docs
171
+ category: internal
172
+ maxPages: 200
173
+ ```
174
+
175
+ ## Output
176
+
177
+ Markdown files with YAML frontmatter:
178
+
179
+ ```markdown
180
+ ---
181
+ title: "Getting Started"
182
+ source: https://docs.example.com/guide
183
+ source_type: "nextjs"
184
+ ---
185
+
186
+ # Getting Started
187
+
188
+ ```
189
+
190
+ NDJSON (one record per page or chunk):
191
+
192
+ ```json
193
+ {"url": "...", "title": "...", "content": "...", "hash": "...", "token_count": 842, "chunk_index": 0}
194
+ ```
195
+
196
+ ## Security
197
+
198
+ - HTTPS-only, mandatory robots.txt compliance
199
+ - SSRF protection: blocks private/internal network IPs and guards against
200
+ DNS rebinding via connect-time address pinning
201
+ - XXE protection via `defusedxml` on sitemaps
202
+ - Path traversal and CRLF header injection guards
203
+ - Auth headers stripped on cross-origin redirects
204
+
205
+ When running with `--proxy`, DNS pinning is delegated to the proxy. Pass
206
+ `--require-pinned-dns` to refuse this configuration and keep the
207
+ connector-level SSRF guarantees in effect.
208
+
209
+ ## Options
210
+
211
+ Run `docpull --help` for the full list. Highlights:
212
+
213
+ ```
214
+ Core:
215
+ --profile {rag,mirror,quick,llm,custom}
216
+ --single Fetch one URL (no crawl)
217
+ --format {markdown,json,ndjson,sqlite}
218
+ --stream Stream NDJSON to stdout
219
+
220
+ LLM / chunking:
221
+ --max-tokens-per-file N
222
+ --tokenizer NAME tiktoken encoding (default cl100k_base)
223
+ --emit-chunks One file/record per chunk
224
+
225
+ Content extraction:
226
+ --extractor {default,trafilatura}
227
+ --no-special-cases Disable framework extractors
228
+ --strict-js-required Error on JS-only pages
229
+
230
+ Cache:
231
+ --cache Enable incremental updates
232
+ --cache-dir DIR
233
+ --cache-ttl DAYS
234
+ ```
235
+
236
+ ## Performance
237
+
238
+ End-to-end numbers from `tests/benchmarks/test_10k_pages.py` against a
239
+ synthetic 10,000-page localhost site (RAG profile, `max_concurrent=50`,
240
+ HTTP keep-alive, 5% injected duplicate content):
241
+
242
+ | Metric | Value |
243
+ |---|---|
244
+ | Total wall time | ~27 s |
245
+ | Discovery (sitemap parse) | ~80 ms |
246
+ | Fetch + convert + save | ~27 s |
247
+ | Per-page latency p50 / p95 / p99 | ~2.6 / 4.6 / 5.3 ms |
248
+ | Peak RSS delta from baseline | ~28 MB |
249
+ | Cache manifest size on disk | ~3.4 MB |
250
+ | Duplicates detected (5% injected) | 499 / 500 |
251
+
252
+ Reproduce with `make benchmark` (requires `aiohttp`; runs the gated
253
+ benchmark in `tests/benchmarks/` and prints a JSON line you can pipe
254
+ into trend tooling).
255
+
256
+ ## Troubleshooting
257
+
258
+ ```bash
259
+ docpull --doctor # Check installation
260
+ docpull URL --verbose # Verbose output
261
+ docpull URL --dry-run # Test without downloading
262
+ docpull URL --preview-urls # List URLs without fetching
263
+ ```
264
+
265
+ ## Links
266
+
267
+ - [Website](https://docpull.raintree.technology)
268
+ - [PyPI](https://pypi.org/project/docpull/)
269
+ - [GitHub](https://github.com/raintree-technology/docpull)
270
+ - [Changelog](https://github.com/raintree-technology/docpull/blob/main/docs/CHANGELOG.md)
271
+
272
+ ## License
273
+
274
+ MIT