docpull 2.2.0__tar.gz → 2.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. docpull-2.3.0/PKG-INFO +331 -0
  2. docpull-2.3.0/README.md +249 -0
  3. {docpull-2.2.0 → docpull-2.3.0}/pyproject.toml +18 -19
  4. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/__init__.py +10 -4
  5. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/cli.py +110 -19
  6. docpull-2.3.0/src/docpull/concurrency/__init__.py +7 -0
  7. docpull-2.3.0/src/docpull/conversion/__init__.py +37 -0
  8. docpull-2.3.0/src/docpull/conversion/chunking.py +224 -0
  9. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/conversion/extractor.py +5 -9
  10. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/conversion/markdown.py +16 -0
  11. docpull-2.3.0/src/docpull/conversion/special_cases.py +459 -0
  12. docpull-2.3.0/src/docpull/conversion/trafilatura_extractor.py +69 -0
  13. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/core/fetcher.py +138 -54
  14. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/discovery/__init__.py +0 -12
  15. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/discovery/crawler.py +1 -1
  16. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/discovery/filters.py +2 -2
  17. docpull-2.3.0/src/docpull/discovery/link_extractors/__init__.py +11 -0
  18. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/discovery/sitemap.py +19 -1
  19. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/doctor.py +8 -5
  20. docpull-2.3.0/src/docpull/http/client.py +532 -0
  21. docpull-2.3.0/src/docpull/mcp/__init__.py +9 -0
  22. docpull-2.3.0/src/docpull/mcp/server.py +182 -0
  23. docpull-2.3.0/src/docpull/mcp/sources.py +124 -0
  24. docpull-2.3.0/src/docpull/mcp/tools.py +238 -0
  25. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/models/__init__.py +0 -2
  26. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/models/config.py +68 -30
  27. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/models/events.py +0 -2
  28. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/models/profiles.py +21 -0
  29. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/pipeline/base.py +6 -0
  30. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/pipeline/steps/__init__.py +4 -0
  31. docpull-2.3.0/src/docpull/pipeline/steps/chunk.py +54 -0
  32. docpull-2.3.0/src/docpull/pipeline/steps/convert.py +189 -0
  33. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/pipeline/steps/fetch.py +8 -1
  34. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/pipeline/steps/save.py +24 -8
  35. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/pipeline/steps/save_json.py +2 -1
  36. docpull-2.3.0/src/docpull/pipeline/steps/save_ndjson.py +135 -0
  37. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/pipeline/steps/save_sqlite.py +2 -4
  38. docpull-2.3.0/src/docpull/security/robots.py +380 -0
  39. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/security/url_validator.py +111 -20
  40. docpull-2.3.0/src/docpull.egg-info/PKG-INFO +331 -0
  41. {docpull-2.2.0 → docpull-2.3.0}/src/docpull.egg-info/SOURCES.txt +17 -3
  42. {docpull-2.2.0 → docpull-2.3.0}/src/docpull.egg-info/requires.txt +14 -5
  43. docpull-2.3.0/tests/test_chunking.py +59 -0
  44. docpull-2.3.0/tests/test_cli.py +13 -0
  45. docpull-2.3.0/tests/test_convert_step_new.py +80 -0
  46. docpull-2.3.0/tests/test_fixes_v2_3_0.py +121 -0
  47. docpull-2.3.0/tests/test_mcp_tools.py +101 -0
  48. docpull-2.3.0/tests/test_save_ndjson.py +79 -0
  49. docpull-2.3.0/tests/test_security_hardening.py +357 -0
  50. docpull-2.3.0/tests/test_special_cases.py +150 -0
  51. {docpull-2.2.0 → docpull-2.3.0}/tests/test_v2_discovery.py +21 -0
  52. {docpull-2.2.0 → docpull-2.3.0}/tests/test_v2_integration.py +24 -0
  53. docpull-2.2.0/PKG-INFO +0 -208
  54. docpull-2.2.0/README.md +0 -131
  55. docpull-2.2.0/src/docpull/concurrency/__init__.py +0 -15
  56. docpull-2.2.0/src/docpull/concurrency/browser_pool.py +0 -336
  57. docpull-2.2.0/src/docpull/conversion/__init__.py +0 -15
  58. docpull-2.2.0/src/docpull/discovery/link_extractors/__init__.py +0 -22
  59. docpull-2.2.0/src/docpull/discovery/link_extractors/browser.py +0 -294
  60. docpull-2.2.0/src/docpull/http/client.py +0 -353
  61. docpull-2.2.0/src/docpull/pipeline/steps/browser_fetch.py +0 -141
  62. docpull-2.2.0/src/docpull/pipeline/steps/convert.py +0 -134
  63. docpull-2.2.0/src/docpull/security/robots.py +0 -193
  64. docpull-2.2.0/src/docpull.egg-info/PKG-INFO +0 -208
  65. {docpull-2.2.0 → docpull-2.3.0}/LICENSE +0 -0
  66. {docpull-2.2.0 → docpull-2.3.0}/setup.cfg +0 -0
  67. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/__main__.py +0 -0
  68. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/cache/__init__.py +0 -0
  69. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/cache/manager.py +0 -0
  70. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/cache/streaming_dedup.py +0 -0
  71. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/concurrency/manager.py +0 -0
  72. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/conversion/protocols.py +0 -0
  73. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/core/__init__.py +0 -0
  74. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/discovery/composite.py +0 -0
  75. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/discovery/link_extractors/enhanced.py +0 -0
  76. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/discovery/link_extractors/protocols.py +0 -0
  77. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/discovery/link_extractors/static.py +0 -0
  78. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/discovery/protocols.py +0 -0
  79. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/http/__init__.py +0 -0
  80. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/http/protocols.py +0 -0
  81. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/http/rate_limiter.py +0 -0
  82. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/logging_config.py +0 -0
  83. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/metadata_extractor.py +0 -0
  84. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/pipeline/__init__.py +0 -0
  85. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/pipeline/steps/dedup.py +0 -0
  86. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/pipeline/steps/metadata.py +0 -0
  87. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/pipeline/steps/validate.py +0 -0
  88. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/py.typed +0 -0
  89. {docpull-2.2.0 → docpull-2.3.0}/src/docpull/security/__init__.py +0 -0
  90. {docpull-2.2.0 → docpull-2.3.0}/src/docpull.egg-info/dependency_links.txt +0 -0
  91. {docpull-2.2.0 → docpull-2.3.0}/src/docpull.egg-info/entry_points.txt +0 -0
  92. {docpull-2.2.0 → docpull-2.3.0}/src/docpull.egg-info/top_level.txt +0 -0
  93. {docpull-2.2.0 → docpull-2.3.0}/tests/test_link_extractors.py +0 -0
  94. {docpull-2.2.0 → docpull-2.3.0}/tests/test_v2_conversion.py +0 -0
  95. {docpull-2.2.0 → docpull-2.3.0}/tests/test_v2_pipeline.py +0 -0
docpull-2.3.0/PKG-INFO ADDED
@@ -0,0 +1,331 @@
1
+ Metadata-Version: 2.4
2
+ Name: docpull
3
+ Version: 2.3.0
4
+ Summary: Pull documentation from the web and convert to clean markdown
5
+ Author-email: Zachary Roth <support@raintree.technology>
6
+ Maintainer-email: Raintree Technology <support@raintree.technology>
7
+ License-Expression: MIT
8
+ Project-URL: Homepage, https://github.com/raintree-technology/docpull
9
+ Project-URL: Documentation, https://github.com/raintree-technology/docpull#readme
10
+ Project-URL: Repository, https://github.com/raintree-technology/docpull
11
+ Project-URL: Source Code, https://github.com/raintree-technology/docpull
12
+ Project-URL: Bug Tracker, https://github.com/raintree-technology/docpull/issues
13
+ Project-URL: Releases, https://github.com/raintree-technology/docpull/releases
14
+ Keywords: python,markdown,documentation,web-scraping,developer-tools,claude,ai-training-data
15
+ Classifier: Development Status :: 5 - Production/Stable
16
+ Classifier: Intended Audience :: Developers
17
+ Classifier: Intended Audience :: Information Technology
18
+ Classifier: Intended Audience :: Science/Research
19
+ Classifier: Intended Audience :: Education
20
+ Classifier: Environment :: Console
21
+ Classifier: Topic :: Documentation
22
+ Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
23
+ Classifier: Topic :: Software Development :: Documentation
24
+ Classifier: Topic :: Text Processing :: Markup :: HTML
25
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
26
+ Classifier: Topic :: Utilities
27
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
28
+ Classifier: Natural Language :: English
29
+ Classifier: Operating System :: OS Independent
30
+ Classifier: Programming Language :: Python :: 3
31
+ Classifier: Programming Language :: Python :: 3.10
32
+ Classifier: Programming Language :: Python :: 3.11
33
+ Classifier: Programming Language :: Python :: 3.12
34
+ Classifier: Programming Language :: Python :: 3.13
35
+ Classifier: Programming Language :: Python :: 3.14
36
+ Classifier: Programming Language :: Python :: 3 :: Only
37
+ Classifier: Typing :: Typed
38
+ Requires-Python: >=3.10
39
+ Description-Content-Type: text/markdown
40
+ License-File: LICENSE
41
+ Requires-Dist: beautifulsoup4>=4.12.0
42
+ Requires-Dist: html2text>=2020.1.16
43
+ Requires-Dist: defusedxml>=0.7.1
44
+ Requires-Dist: extruct>=0.15.0
45
+ Requires-Dist: aiohttp>=3.9.0
46
+ Requires-Dist: rich>=13.0.0
47
+ Requires-Dist: pyyaml>=6.0
48
+ Requires-Dist: pydantic>=2.0
49
+ Provides-Extra: proxy
50
+ Requires-Dist: aiohttp-socks>=0.8.0; extra == "proxy"
51
+ Provides-Extra: normalize
52
+ Requires-Dist: url-normalize>=1.4.0; extra == "normalize"
53
+ Provides-Extra: trafilatura
54
+ Requires-Dist: trafilatura>=1.12.0; extra == "trafilatura"
55
+ Provides-Extra: tokens
56
+ Requires-Dist: tiktoken>=0.7.0; extra == "tokens"
57
+ Provides-Extra: mcp
58
+ Requires-Dist: mcp>=1.0.0; extra == "mcp"
59
+ Provides-Extra: llm
60
+ Requires-Dist: tiktoken>=0.7.0; extra == "llm"
61
+ Provides-Extra: all
62
+ Requires-Dist: aiohttp-socks>=0.8.0; extra == "all"
63
+ Requires-Dist: url-normalize>=1.4.0; extra == "all"
64
+ Requires-Dist: trafilatura>=1.12.0; extra == "all"
65
+ Requires-Dist: tiktoken>=0.7.0; extra == "all"
66
+ Requires-Dist: mcp>=1.0.0; extra == "all"
67
+ Provides-Extra: dev
68
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
69
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
70
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
71
+ Requires-Dist: black>=23.0.0; extra == "dev"
72
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
73
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
74
+ Requires-Dist: bandit>=1.7.0; extra == "dev"
75
+ Requires-Dist: pip-audit>=2.0.0; extra == "dev"
76
+ Requires-Dist: pre-commit>=3.0.0; extra == "dev"
77
+ Requires-Dist: types-requests>=2.31.0; extra == "dev"
78
+ Requires-Dist: types-beautifulsoup4>=4.12.0; extra == "dev"
79
+ Requires-Dist: types-defusedxml>=0.7.0; extra == "dev"
80
+ Requires-Dist: types-pyyaml>=6.0.0; extra == "dev"
81
+ Dynamic: license-file
82
+
83
+ # docpull
84
+
85
+ **Security-hardened, browser-free crawler that turns static documentation sites into clean, AI-ready Markdown — fast.**
86
+
87
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
88
+ [![PyPI version](https://badge.fury.io/py/docpull.svg)](https://badge.fury.io/py/docpull)
89
+ [![Downloads](https://pepy.tech/badge/docpull)](https://pepy.tech/project/docpull)
90
+ [![License: MIT](https://img.shields.io/github/license/raintree-technology/docpull)](https://github.com/raintree-technology/docpull/blob/main/LICENSE)
91
+
92
+ <p align="center">
93
+ <a href="https://docpull.raintree.technology">
94
+ <img src="https://pub-e85a1abca36f4fd8b4300a6ec2d6f45f.r2.dev/marketing/docpull/1768954147343-iaiziy-docpull-terminal-hero.gif" alt="docpull demo" width="600">
95
+ </a>
96
+ </p>
97
+
98
+ docpull uses async HTTP (not Playwright) to fetch server-rendered pages,
99
+ extracts main content, and writes clean Markdown with source-URL frontmatter —
100
+ in seconds, with a small install footprint. It won't render JavaScript, but for
101
+ the large class of docs that don't need it (API references, Python/Go stdlib,
102
+ most dev-tool docs, OpenAPI specs, Next.js and Docusaurus builds), it is a
103
+ fast, auditable, sandbox-friendly way to pipe documentation into an LLM context,
104
+ a RAG index, or an offline archive. SSRF, XXE, DNS-rebinding, and
105
+ CRLF-injection protections are on by default — a necessity when an AI agent
106
+ is choosing the URLs.
107
+
108
+ ## Install
109
+
110
+ ```bash
111
+ pip install docpull
112
+
113
+ # Optional extras
114
+ pip install 'docpull[llm]' # tiktoken for token-accurate chunking
115
+ pip install 'docpull[trafilatura]' # alternative extractor for noisy pages
116
+ pip install 'docpull[mcp]' # run as an MCP server for AI agents
117
+ pip install 'docpull[all]' # everything above
118
+ ```
119
+
120
+ ## Quick start
121
+
122
+ ```bash
123
+ # Crawl and save Markdown
124
+ docpull https://docs.example.com
125
+
126
+ # One page, no crawl — the fast path for agents
127
+ docpull https://docs.example.com/guide --single
128
+
129
+ # LLM-ready NDJSON with 4k-token chunks streamed to stdout
130
+ docpull https://docs.example.com --profile llm --stream | jq .
131
+
132
+ # Mirror a site for offline use
133
+ docpull https://docs.example.com --profile mirror --cache
134
+ ```
135
+
136
+ ## Framework-aware extraction
137
+
138
+ docpull inspects each page before running the generic extractor and can pull
139
+ content directly from framework data feeds:
140
+
141
+ | Framework | Strategy |
142
+ |-----------|----------|
143
+ | Next.js | Parses `__NEXT_DATA__` JSON |
144
+ | Mintlify | `__NEXT_DATA__` with Mintlify tagging |
145
+ | OpenAPI | Renders `openapi.json` / `swagger.json` into Markdown |
146
+ | Docusaurus| Detected and tagged; generic extractor produces Markdown |
147
+ | Sphinx | Detected and tagged; generic extractor produces Markdown |
148
+
149
+ JS-only SPAs with no server-rendered content are detected and skipped with a
150
+ clear reason (or, with `--strict-js-required`, reported as an error so agents
151
+ can route elsewhere).
152
+
153
+ ## Agent-friendly features
154
+
155
+ - **`--single`** — fetch a single URL without discovery. Designed for tool loops.
156
+ - **`--stream`** — NDJSON one-record-per-line, flushed on every page, pipeable.
157
+ - **`--max-tokens-per-file N`** — split each page into token-bounded chunks on
158
+ heading boundaries (exact counts with tiktoken, estimate without).
159
+ - **`--emit-chunks`** — write one file or record per chunk instead of per page.
160
+ - **`--strict-js-required`** — hard-fail on JS-only pages instead of silently
161
+ skipping.
162
+ - **`--extractor trafilatura`** — swap in [trafilatura](https://trafilatura.readthedocs.io/)
163
+ for sites where the default heuristics struggle.
164
+
165
+ ## Python API
166
+
167
+ ```python
168
+ from docpull import fetch_one
169
+
170
+ ctx = fetch_one("https://docs.python.org/3/library/asyncio.html")
171
+ print(ctx.title, ctx.source_type)
172
+ print(ctx.markdown[:500])
173
+ ```
174
+
175
+ Async streaming:
176
+
177
+ ```python
178
+ import asyncio
179
+ from docpull import Fetcher, DocpullConfig, ProfileName, EventType
180
+
181
+ async def main():
182
+     cfg = DocpullConfig(
183
+         url="https://docs.example.com",
184
+         profile=ProfileName.LLM,  # chunked NDJSON output
185
+     )
186
+     async with Fetcher(cfg) as fetcher:
187
+         async for event in fetcher.run():
188
+             if event.type == EventType.FETCH_PROGRESS:
189
+                 print(f"{event.current}/{event.total}: {event.url}")
190
+         print(f"Done: {fetcher.stats.pages_fetched} pages")
191
+
192
+ asyncio.run(main())
193
+ ```
194
+
195
+ Single-page from an agent tool:
196
+
197
+ ```python
198
+ from docpull import Fetcher, DocpullConfig
199
+
200
+ async def tool_call(url: str) -> str:
201
+     async with Fetcher(DocpullConfig(url=url)) as f:
202
+         ctx = await f.fetch_one(url, save=False)
203
+         return ctx.markdown or ctx.error or ""
204
+ ```
205
+
206
+ ## Profiles
207
+
208
+ ```bash
209
+ docpull https://site.com --profile rag # Default. Dedup, rich metadata.
210
+ docpull https://site.com --profile llm # NDJSON + chunks + metadata.
211
+ docpull https://site.com --profile mirror # Full archive, polite, cached.
212
+ docpull https://site.com --profile quick # Sampling: 50 pages, depth 2.
213
+ ```
214
+
215
+ ## MCP server
216
+
217
+ docpull ships an MCP (Model Context Protocol) server so AI agents can call it
218
+ directly over stdio:
219
+
220
+ ```bash
221
+ pip install 'docpull[mcp]'
222
+ docpull mcp # starts the stdio server
223
+ ```
224
+
225
+ Add to Claude Desktop or Claude Code:
226
+
227
+ ```json
228
+ {
229
+ "mcpServers": {
230
+ "docpull": {
231
+ "command": "docpull",
232
+ "args": ["mcp"]
233
+ }
234
+ }
235
+ }
236
+ ```
237
+
238
+ Tools exposed:
239
+
240
+ - `fetch_url(url, max_tokens?)` — one-shot fetch, no crawl
241
+ - `ensure_docs(source, force?)` — fetch a named library (cached 7 days)
242
+ - `list_sources(category?)` — show available aliases (react, nextjs, fastapi, …)
243
+ - `list_indexed()` — what has been fetched locally
244
+ - `grep_docs(pattern, library?)` — regex search across fetched Markdown
245
+
246
+ User-defined sources live in `~/.config/docpull-mcp/sources.yaml`:
247
+
248
+ ```yaml
249
+ sources:
250
+   mydocs:
251
+     url: https://docs.example.com
252
+     description: My internal docs
253
+     category: internal
254
+     maxPages: 200
255
+ ```
256
+
257
+ ## Output
258
+
259
+ Markdown files with YAML frontmatter:
260
+
261
+ ```markdown
262
+ ---
263
+ title: "Getting Started"
264
+ source: https://docs.example.com/guide
265
+ source_type: "nextjs"
266
+ ---
267
+
268
+ # Getting Started
269
+
270
+ ```
271
+
272
+ NDJSON (one record per page or chunk):
273
+
274
+ ```json
275
+ {"url": "...", "title": "...", "content": "...", "hash": "...", "token_count": 842, "chunk_index": 0}
276
+ ```
277
+
278
+ ## Security
279
+
280
+ - HTTPS-only, mandatory robots.txt compliance
281
+ - SSRF protection: blocks private/internal network IPs and guards against DNS rebinding
282
+ - XXE protection via `defusedxml` on sitemaps
283
+ - Path traversal and CRLF header injection guards
284
+ - Auth headers stripped on cross-origin redirects
285
+
286
+ ## Options
287
+
288
+ Run `docpull --help` for the full list. Highlights:
289
+
290
+ ```
291
+ Core:
292
+ --profile {rag,mirror,quick,llm,custom}
293
+ --single Fetch one URL (no crawl)
294
+ --format {markdown,json,ndjson,sqlite}
295
+ --stream Stream NDJSON to stdout
296
+
297
+ LLM / chunking:
298
+ --max-tokens-per-file N
299
+ --tokenizer NAME tiktoken encoding (default cl100k_base)
300
+ --emit-chunks One file/record per chunk
301
+
302
+ Content extraction:
303
+ --extractor {default,trafilatura}
304
+ --no-special-cases Disable framework extractors
305
+ --strict-js-required Error on JS-only pages
306
+
307
+ Cache:
308
+ --cache Enable incremental updates
309
+ --cache-dir DIR
310
+ --cache-ttl DAYS
311
+ ```
312
+
313
+ ## Troubleshooting
314
+
315
+ ```bash
316
+ docpull --doctor # Check installation
317
+ docpull URL --verbose # Verbose output
318
+ docpull URL --dry-run # Test without downloading
319
+ docpull URL --preview-urls # List URLs without fetching
320
+ ```
321
+
322
+ ## Links
323
+
324
+ - [Website](https://docpull.raintree.technology)
325
+ - [PyPI](https://pypi.org/project/docpull/)
326
+ - [GitHub](https://github.com/raintree-technology/docpull)
327
+ - [Changelog](https://github.com/raintree-technology/docpull/blob/main/docs/CHANGELOG.md)
328
+
329
+ ## License
330
+
331
+ MIT
docpull-2.3.0/README.md ADDED
@@ -0,0 +1,249 @@
1
+ # docpull
2
+
3
+ **Security-hardened, browser-free crawler that turns static documentation sites into clean, AI-ready Markdown — fast.**
4
+
5
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
6
+ [![PyPI version](https://badge.fury.io/py/docpull.svg)](https://badge.fury.io/py/docpull)
7
+ [![Downloads](https://pepy.tech/badge/docpull)](https://pepy.tech/project/docpull)
8
+ [![License: MIT](https://img.shields.io/github/license/raintree-technology/docpull)](https://github.com/raintree-technology/docpull/blob/main/LICENSE)
9
+
10
+ <p align="center">
11
+ <a href="https://docpull.raintree.technology">
12
+ <img src="https://pub-e85a1abca36f4fd8b4300a6ec2d6f45f.r2.dev/marketing/docpull/1768954147343-iaiziy-docpull-terminal-hero.gif" alt="docpull demo" width="600">
13
+ </a>
14
+ </p>
15
+
16
+ docpull uses async HTTP (not Playwright) to fetch server-rendered pages,
17
+ extracts main content, and writes clean Markdown with source-URL frontmatter —
18
+ in seconds, with a small install footprint. It won't render JavaScript, but for
19
+ the large class of docs that don't need it (API references, Python/Go stdlib,
20
+ most dev-tool docs, OpenAPI specs, Next.js and Docusaurus builds), it is a
21
+ fast, auditable, sandbox-friendly way to pipe documentation into an LLM context,
22
+ a RAG index, or an offline archive. SSRF, XXE, DNS-rebinding, and
23
+ CRLF-injection protections are on by default — a necessity when an AI agent
24
+ is choosing the URLs.
25
+
26
+ ## Install
27
+
28
+ ```bash
29
+ pip install docpull
30
+
31
+ # Optional extras
32
+ pip install 'docpull[llm]' # tiktoken for token-accurate chunking
33
+ pip install 'docpull[trafilatura]' # alternative extractor for noisy pages
34
+ pip install 'docpull[mcp]' # run as an MCP server for AI agents
35
+ pip install 'docpull[all]' # everything above
36
+ ```
37
+
38
+ ## Quick start
39
+
40
+ ```bash
41
+ # Crawl and save Markdown
42
+ docpull https://docs.example.com
43
+
44
+ # One page, no crawl — the fast path for agents
45
+ docpull https://docs.example.com/guide --single
46
+
47
+ # LLM-ready NDJSON with 4k-token chunks streamed to stdout
48
+ docpull https://docs.example.com --profile llm --stream | jq .
49
+
50
+ # Mirror a site for offline use
51
+ docpull https://docs.example.com --profile mirror --cache
52
+ ```
53
+
54
+ ## Framework-aware extraction
55
+
56
+ docpull inspects each page before running the generic extractor and can pull
57
+ content directly from framework data feeds:
58
+
59
+ | Framework | Strategy |
60
+ |-----------|----------|
61
+ | Next.js | Parses `__NEXT_DATA__` JSON |
62
+ | Mintlify | `__NEXT_DATA__` with Mintlify tagging |
63
+ | OpenAPI | Renders `openapi.json` / `swagger.json` into Markdown |
64
+ | Docusaurus| Detected and tagged; generic extractor produces Markdown |
65
+ | Sphinx | Detected and tagged; generic extractor produces Markdown |
66
+
67
+ JS-only SPAs with no server-rendered content are detected and skipped with a
68
+ clear reason (or, with `--strict-js-required`, reported as an error so agents
69
+ can route elsewhere).
70
+
71
+ ## Agent-friendly features
72
+
73
+ - **`--single`** — fetch a single URL without discovery. Designed for tool loops.
74
+ - **`--stream`** — NDJSON one-record-per-line, flushed on every page, pipeable.
75
+ - **`--max-tokens-per-file N`** — split each page into token-bounded chunks on
76
+ heading boundaries (exact counts with tiktoken, estimate without).
77
+ - **`--emit-chunks`** — write one file or record per chunk instead of per page.
78
+ - **`--strict-js-required`** — hard-fail on JS-only pages instead of silently
79
+ skipping.
80
+ - **`--extractor trafilatura`** — swap in [trafilatura](https://trafilatura.readthedocs.io/)
81
+ for sites where the default heuristics struggle.
82
+
83
+ ## Python API
84
+
85
+ ```python
86
+ from docpull import fetch_one
87
+
88
+ ctx = fetch_one("https://docs.python.org/3/library/asyncio.html")
89
+ print(ctx.title, ctx.source_type)
90
+ print(ctx.markdown[:500])
91
+ ```
92
+
93
+ Async streaming:
94
+
95
+ ```python
96
+ import asyncio
97
+ from docpull import Fetcher, DocpullConfig, ProfileName, EventType
98
+
99
+ async def main():
100
+     cfg = DocpullConfig(
101
+         url="https://docs.example.com",
102
+         profile=ProfileName.LLM,  # chunked NDJSON output
103
+     )
104
+     async with Fetcher(cfg) as fetcher:
105
+         async for event in fetcher.run():
106
+             if event.type == EventType.FETCH_PROGRESS:
107
+                 print(f"{event.current}/{event.total}: {event.url}")
108
+         print(f"Done: {fetcher.stats.pages_fetched} pages")
109
+
110
+ asyncio.run(main())
111
+ ```
112
+
113
+ Single-page from an agent tool:
114
+
115
+ ```python
116
+ from docpull import Fetcher, DocpullConfig
117
+
118
+ async def tool_call(url: str) -> str:
119
+     async with Fetcher(DocpullConfig(url=url)) as f:
120
+         ctx = await f.fetch_one(url, save=False)
121
+         return ctx.markdown or ctx.error or ""
122
+ ```
123
+
124
+ ## Profiles
125
+
126
+ ```bash
127
+ docpull https://site.com --profile rag # Default. Dedup, rich metadata.
128
+ docpull https://site.com --profile llm # NDJSON + chunks + metadata.
129
+ docpull https://site.com --profile mirror # Full archive, polite, cached.
130
+ docpull https://site.com --profile quick # Sampling: 50 pages, depth 2.
131
+ ```
132
+
133
+ ## MCP server
134
+
135
+ docpull ships an MCP (Model Context Protocol) server so AI agents can call it
136
+ directly over stdio:
137
+
138
+ ```bash
139
+ pip install 'docpull[mcp]'
140
+ docpull mcp # starts the stdio server
141
+ ```
142
+
143
+ Add to Claude Desktop or Claude Code:
144
+
145
+ ```json
146
+ {
147
+ "mcpServers": {
148
+ "docpull": {
149
+ "command": "docpull",
150
+ "args": ["mcp"]
151
+ }
152
+ }
153
+ }
154
+ ```
155
+
156
+ Tools exposed:
157
+
158
+ - `fetch_url(url, max_tokens?)` — one-shot fetch, no crawl
159
+ - `ensure_docs(source, force?)` — fetch a named library (cached 7 days)
160
+ - `list_sources(category?)` — show available aliases (react, nextjs, fastapi, …)
161
+ - `list_indexed()` — what has been fetched locally
162
+ - `grep_docs(pattern, library?)` — regex search across fetched Markdown
163
+
164
+ User-defined sources live in `~/.config/docpull-mcp/sources.yaml`:
165
+
166
+ ```yaml
167
+ sources:
168
+   mydocs:
169
+     url: https://docs.example.com
170
+     description: My internal docs
171
+     category: internal
172
+     maxPages: 200
173
+ ```
174
+
175
+ ## Output
176
+
177
+ Markdown files with YAML frontmatter:
178
+
179
+ ```markdown
180
+ ---
181
+ title: "Getting Started"
182
+ source: https://docs.example.com/guide
183
+ source_type: "nextjs"
184
+ ---
185
+
186
+ # Getting Started
187
+
188
+ ```
189
+
190
+ NDJSON (one record per page or chunk):
191
+
192
+ ```json
193
+ {"url": "...", "title": "...", "content": "...", "hash": "...", "token_count": 842, "chunk_index": 0}
194
+ ```
195
+
196
+ ## Security
197
+
198
+ - HTTPS-only, mandatory robots.txt compliance
199
+ - SSRF protection: blocks private/internal network IPs and guards against DNS rebinding
200
+ - XXE protection via `defusedxml` on sitemaps
201
+ - Path traversal and CRLF header injection guards
202
+ - Auth headers stripped on cross-origin redirects
203
+
204
+ ## Options
205
+
206
+ Run `docpull --help` for the full list. Highlights:
207
+
208
+ ```
209
+ Core:
210
+ --profile {rag,mirror,quick,llm,custom}
211
+ --single Fetch one URL (no crawl)
212
+ --format {markdown,json,ndjson,sqlite}
213
+ --stream Stream NDJSON to stdout
214
+
215
+ LLM / chunking:
216
+ --max-tokens-per-file N
217
+ --tokenizer NAME tiktoken encoding (default cl100k_base)
218
+ --emit-chunks One file/record per chunk
219
+
220
+ Content extraction:
221
+ --extractor {default,trafilatura}
222
+ --no-special-cases Disable framework extractors
223
+ --strict-js-required Error on JS-only pages
224
+
225
+ Cache:
226
+ --cache Enable incremental updates
227
+ --cache-dir DIR
228
+ --cache-ttl DAYS
229
+ ```
230
+
231
+ ## Troubleshooting
232
+
233
+ ```bash
234
+ docpull --doctor # Check installation
235
+ docpull URL --verbose # Verbose output
236
+ docpull URL --dry-run # Test without downloading
237
+ docpull URL --preview-urls # List URLs without fetching
238
+ ```
239
+
240
+ ## Links
241
+
242
+ - [Website](https://docpull.raintree.technology)
243
+ - [PyPI](https://pypi.org/project/docpull/)
244
+ - [GitHub](https://github.com/raintree-technology/docpull)
245
+ - [Changelog](https://github.com/raintree-technology/docpull/blob/main/docs/CHANGELOG.md)
246
+
247
+ ## License
248
+
249
+ MIT
{docpull-2.2.0 → docpull-2.3.0}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "docpull"
7
- version = "2.2.0"
7
+ version = "2.3.0"
8
8
  dynamic = []
9
9
  description = "Pull documentation from the web and convert to clean markdown"
10
10
  readme = {file = "README.md", content-type = "text/markdown"}
@@ -51,7 +51,6 @@ classifiers = [
51
51
 
52
52
  # Programming Language
53
53
  "Programming Language :: Python :: 3",
54
- "Programming Language :: Python :: 3.9",
55
54
  "Programming Language :: Python :: 3.10",
56
55
  "Programming Language :: Python :: 3.11",
57
56
  "Programming Language :: Python :: 3.12",
@@ -63,7 +62,6 @@ classifiers = [
63
62
  "Typing :: Typed",
64
63
  ]
65
64
  dependencies = [
66
- "requests>=2.31.0",
67
65
  "beautifulsoup4>=4.12.0",
68
66
  "html2text>=2020.1.16",
69
67
  "defusedxml>=0.7.1",
@@ -71,24 +69,34 @@ dependencies = [
71
69
  "aiohttp>=3.9.0",
72
70
  "rich>=13.0.0",
73
71
  "pyyaml>=6.0",
74
- "gitpython>=3.1.40",
75
72
  "pydantic>=2.0",
76
73
  ]
77
74
 
78
75
  [project.optional-dependencies]
79
- js = [
80
- "playwright>=1.40.0",
81
- ]
82
76
  proxy = [
83
77
  "aiohttp-socks>=0.8.0",
84
78
  ]
85
79
  normalize = [
86
80
  "url-normalize>=1.4.0",
87
81
  ]
82
+ trafilatura = [
83
+ "trafilatura>=1.12.0",
84
+ ]
85
+ tokens = [
86
+ "tiktoken>=0.7.0",
87
+ ]
88
+ mcp = [
89
+ "mcp>=1.0.0",
90
+ ]
91
+ llm = [
92
+ "tiktoken>=0.7.0",
93
+ ]
88
94
  all = [
89
- "playwright>=1.40.0",
90
95
  "aiohttp-socks>=0.8.0",
91
96
  "url-normalize>=1.4.0",
97
+ "trafilatura>=1.12.0",
98
+ "tiktoken>=0.7.0",
99
+ "mcp>=1.0.0",
92
100
  ]
93
101
  dev = [
94
102
  "pytest>=7.0.0",
@@ -126,11 +134,11 @@ docpull = ["py.typed"]
126
134
 
127
135
  [tool.black]
128
136
  line-length = 110
129
- target-version = ["py39", "py310", "py311", "py312", "py313", "py314"]
137
+ target-version = ["py310", "py311", "py312", "py313", "py314"]
130
138
 
131
139
  [tool.ruff]
132
140
  line-length = 110
133
- target-version = "py39"
141
+ target-version = "py310"
134
142
 
135
143
  [tool.ruff.lint]
136
144
  select = ["E", "F", "W", "I", "N", "UP", "B", "A", "C4", "SIM"]
@@ -154,10 +162,6 @@ init_forbid_extra = true
154
162
  init_typed = true
155
163
  warn_required_dynamic_aliases = true
156
164
 
157
- [[tool.mypy.overrides]]
158
- module = "playwright.*"
159
- ignore_missing_imports = true
160
-
161
165
  [[tool.mypy.overrides]]
162
166
  module = "extruct.*"
163
167
  ignore_missing_imports = true
@@ -171,11 +175,6 @@ module = "docpull.models.*"
171
175
  disallow_any_unimported = false
172
176
  warn_return_any = false
173
177
 
174
- [[tool.mypy.overrides]]
175
- module = "docpull.concurrency.browser_pool"
176
- disallow_any_unimported = false
177
- warn_return_any = false
178
-
179
178
  [[tool.mypy.overrides]]
180
179
  module = "tests.*"
181
180
  disallow_untyped_defs = false