docpull 1.2.1__tar.gz → 1.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docpull-1.2.1 → docpull-1.5.0}/PKG-INFO +137 -54
- {docpull-1.2.1 → docpull-1.5.0}/README.md +128 -52
- docpull-1.5.0/docpull/__init__.py +13 -0
- {docpull-1.2.1 → docpull-1.5.0}/docpull/cli.py +78 -140
- {docpull-1.2.1 → docpull-1.5.0}/docpull/config.py +32 -11
- docpull-1.5.0/docpull/fetchers/__init__.py +11 -0
- {docpull-1.2.1 → docpull-1.5.0}/docpull/fetchers/async_fetcher.py +172 -31
- {docpull-1.2.1 → docpull-1.5.0}/docpull/fetchers/base.py +246 -9
- {docpull-1.2.1 → docpull-1.5.0}/docpull/fetchers/generic.py +25 -65
- {docpull-1.2.1 → docpull-1.5.0}/docpull/fetchers/generic_async.py +105 -63
- docpull-1.5.0/docpull/metadata_extractor.py +283 -0
- {docpull-1.2.1 → docpull-1.5.0}/docpull/sources_config.py +1 -0
- docpull-1.5.0/docpull.egg-info/PKG-INFO +478 -0
- docpull-1.5.0/docpull.egg-info/SOURCES.txt +49 -0
- docpull-1.5.0/docpull.egg-info/dependency_links.txt +1 -0
- docpull-1.5.0/docpull.egg-info/entry_points.txt +2 -0
- docpull-1.5.0/docpull.egg-info/requires.txt +38 -0
- docpull-1.5.0/docpull.egg-info/top_level.txt +1 -0
- {docpull-1.2.1 → docpull-1.5.0}/pyproject.toml +15 -2
- {docpull-1.2.1 → docpull-1.5.0}/tests/test_config.py +1 -5
- docpull-1.5.0/tests/test_metadata_extractor.py +233 -0
- docpull-1.2.1/.editorconfig +0 -30
- docpull-1.2.1/.pre-commit-config.yaml +0 -30
- docpull-1.2.1/CHANGELOG.md +0 -328
- docpull-1.2.1/CONTRIBUTING.md +0 -189
- docpull-1.2.1/MANIFEST.in +0 -49
- docpull-1.2.1/Makefile +0 -44
- docpull-1.2.1/SECURITY.md +0 -206
- docpull-1.2.1/TROUBLESHOOTING.md +0 -348
- docpull-1.2.1/docpull/__init__.py +0 -29
- docpull-1.2.1/docpull/fetchers/__init__.py +0 -23
- docpull-1.2.1/docpull/fetchers/bun.py +0 -59
- docpull-1.2.1/docpull/fetchers/d3.py +0 -211
- docpull-1.2.1/docpull/fetchers/nextjs.py +0 -59
- docpull-1.2.1/docpull/fetchers/plaid.py +0 -89
- docpull-1.2.1/docpull/fetchers/react.py +0 -59
- docpull-1.2.1/docpull/fetchers/stripe.py +0 -49
- docpull-1.2.1/docpull/fetchers/tailwind.py +0 -59
- docpull-1.2.1/docpull/fetchers/turborepo.py +0 -57
- docpull-1.2.1/docpull/profiles/__init__.py +0 -70
- docpull-1.2.1/docpull/profiles/base.py +0 -64
- docpull-1.2.1/docpull/profiles/bun.py +0 -14
- docpull-1.2.1/docpull/profiles/d3.py +0 -17
- docpull-1.2.1/docpull/profiles/nextjs.py +0 -15
- docpull-1.2.1/docpull/profiles/plaid.py +0 -16
- docpull-1.2.1/docpull/profiles/react.py +0 -14
- docpull-1.2.1/docpull/profiles/stripe.py +0 -14
- docpull-1.2.1/docpull/profiles/tailwind.py +0 -14
- docpull-1.2.1/docpull/profiles/turborepo.py +0 -14
- docpull-1.2.1/docpull/utils/__init__.py +0 -6
- docpull-1.2.1/docpull.egg-info/SOURCES.txt +0 -76
- docpull-1.2.1/examples/README.md +0 -280
- docpull-1.2.1/examples/deduplication-strategies.yaml +0 -29
- docpull-1.2.1/examples/format-conversion.yaml +0 -25
- docpull-1.2.1/examples/incremental-updates.yaml +0 -26
- docpull-1.2.1/examples/multi-source-optimized.yaml +0 -45
- docpull-1.2.1/examples/selective-crawling.yaml +0 -26
- docpull-1.2.1/examples/simple-optimization.yaml +0 -14
- docpull-1.2.1/requirements.txt +0 -34
- {docpull-1.2.1 → docpull-1.5.0}/LICENSE +0 -0
- {docpull-1.2.1 → docpull-1.5.0}/docpull/__main__.py +0 -0
- {docpull-1.2.1 → docpull-1.5.0}/docpull/archive.py +0 -0
- {docpull-1.2.1 → docpull-1.5.0}/docpull/cache.py +0 -0
- {docpull-1.2.1 → docpull-1.5.0}/docpull/doctor.py +0 -0
- {docpull-1.2.1 → docpull-1.5.0}/docpull/fetchers/parallel_base.py +0 -0
- {docpull-1.2.1/docpull/utils → docpull-1.5.0/docpull}/file_utils.py +0 -0
- {docpull-1.2.1 → docpull-1.5.0}/docpull/formatters/__init__.py +0 -0
- {docpull-1.2.1 → docpull-1.5.0}/docpull/formatters/base.py +0 -0
- {docpull-1.2.1 → docpull-1.5.0}/docpull/formatters/json.py +0 -0
- {docpull-1.2.1 → docpull-1.5.0}/docpull/formatters/markdown.py +0 -0
- {docpull-1.2.1 → docpull-1.5.0}/docpull/formatters/sqlite.py +0 -0
- {docpull-1.2.1 → docpull-1.5.0}/docpull/formatters/toon.py +0 -0
- {docpull-1.2.1 → docpull-1.5.0}/docpull/hooks.py +0 -0
- {docpull-1.2.1 → docpull-1.5.0}/docpull/indexer.py +0 -0
- {docpull-1.2.1/docpull/utils → docpull-1.5.0/docpull}/logging_config.py +0 -0
- {docpull-1.2.1 → docpull-1.5.0}/docpull/metadata.py +0 -0
- {docpull-1.2.1 → docpull-1.5.0}/docpull/naming.py +0 -0
- {docpull-1.2.1 → docpull-1.5.0}/docpull/orchestrator.py +0 -0
- {docpull-1.2.1 → docpull-1.5.0}/docpull/processors/__init__.py +0 -0
- {docpull-1.2.1 → docpull-1.5.0}/docpull/processors/base.py +0 -0
- {docpull-1.2.1 → docpull-1.5.0}/docpull/processors/content_filter.py +0 -0
- {docpull-1.2.1 → docpull-1.5.0}/docpull/processors/deduplicator.py +0 -0
- {docpull-1.2.1 → docpull-1.5.0}/docpull/processors/language_filter.py +0 -0
- {docpull-1.2.1 → docpull-1.5.0}/docpull/processors/size_limiter.py +0 -0
- {docpull-1.2.1 → docpull-1.5.0}/docpull/py.typed +0 -0
- {docpull-1.2.1 → docpull-1.5.0}/docpull/vcs.py +0 -0
- {docpull-1.2.1 → docpull-1.5.0}/setup.cfg +0 -0
- {docpull-1.2.1 → docpull-1.5.0}/tests/test_orchestrator.py +0 -0
- {docpull-1.2.1 → docpull-1.5.0}/tests/test_sources_config.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docpull
|
|
3
|
-
Version: 1.2.1
|
|
3
|
+
Version: 1.5.0
|
|
4
4
|
Summary: Pull documentation from the web and convert to clean markdown
|
|
5
5
|
Author-email: Zachary Roth <support@raintree.technology>
|
|
6
6
|
Maintainer-email: Raintree Technology <support@raintree.technology>
|
|
@@ -10,7 +10,7 @@ Project-URL: Documentation, https://github.com/raintree-technology/docpull#readm
|
|
|
10
10
|
Project-URL: Repository, https://github.com/raintree-technology/docpull
|
|
11
11
|
Project-URL: Source Code, https://github.com/raintree-technology/docpull
|
|
12
12
|
Project-URL: Bug Tracker, https://github.com/raintree-technology/docpull/issues
|
|
13
|
-
Project-URL:
|
|
13
|
+
Project-URL: Releases, https://github.com/raintree-technology/docpull/releases
|
|
14
14
|
Keywords: python,markdown,documentation,web-scraping,developer-tools,claude,ai-training-data
|
|
15
15
|
Classifier: Development Status :: 5 - Production/Stable
|
|
16
16
|
Classifier: Intended Audience :: Developers
|
|
@@ -43,14 +43,21 @@ Requires-Dist: requests>=2.31.0
|
|
|
43
43
|
Requires-Dist: beautifulsoup4>=4.12.0
|
|
44
44
|
Requires-Dist: html2text>=2020.1.16
|
|
45
45
|
Requires-Dist: defusedxml>=0.7.1
|
|
46
|
+
Requires-Dist: extruct>=0.15.0
|
|
46
47
|
Requires-Dist: aiohttp>=3.9.0
|
|
47
48
|
Requires-Dist: rich>=13.0.0
|
|
48
49
|
Requires-Dist: pyyaml>=6.0
|
|
49
50
|
Requires-Dist: gitpython>=3.1.40
|
|
50
51
|
Provides-Extra: js
|
|
51
52
|
Requires-Dist: playwright>=1.40.0; extra == "js"
|
|
53
|
+
Provides-Extra: proxy
|
|
54
|
+
Requires-Dist: aiohttp-socks>=0.8.0; extra == "proxy"
|
|
55
|
+
Provides-Extra: normalize
|
|
56
|
+
Requires-Dist: url-normalize>=1.4.0; extra == "normalize"
|
|
52
57
|
Provides-Extra: all
|
|
53
58
|
Requires-Dist: playwright>=1.40.0; extra == "all"
|
|
59
|
+
Requires-Dist: aiohttp-socks>=0.8.0; extra == "all"
|
|
60
|
+
Requires-Dist: url-normalize>=1.4.0; extra == "all"
|
|
54
61
|
Provides-Extra: dev
|
|
55
62
|
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
56
63
|
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
@@ -72,7 +79,11 @@ Dynamic: license-file
|
|
|
72
79
|
**Pull documentation from any website and convert it into clean, AI-ready Markdown.**
|
|
73
80
|
Fast, type-safe, secure, and optimized for building knowledge bases or training datasets.
|
|
74
81
|
|
|
75
|
-
**NEW in v1.
|
|
82
|
+
**NEW in v1.5.0**: Proxy support, retry with exponential backoff, custom User-Agent, and mandatory robots.txt compliance for TOS-friendly scraping.
|
|
83
|
+
|
|
84
|
+
**v1.3.0**: Rich structured metadata extraction (Open Graph, JSON-LD) for enhanced AI/RAG integration.
|
|
85
|
+
|
|
86
|
+
**v1.2.0**: 15 major features including language filtering, deduplication, auto-indexing, multi-source configuration, and more. Real-world testing shows **58% size reduction** with automatic optimization.
|
|
76
87
|
|
|
77
88
|
[](https://www.python.org/downloads/)
|
|
78
89
|
[](https://badge.fury.io/py/docpull)
|
|
@@ -95,9 +106,22 @@ Unlike tools like wget or httrack, docpull extracts only the main content, remov
|
|
|
95
106
|
- Sitemap + link crawling
|
|
96
107
|
- Rate limiting, timeouts, content-type checks
|
|
97
108
|
- Saves docs in structured Markdown with YAML metadata
|
|
98
|
-
-
|
|
99
|
-
|
|
100
|
-
### NEW in v1.
|
|
109
|
+
- **Mandatory robots.txt compliance** for TOS-friendly scraping
|
|
110
|
+
|
|
111
|
+
### NEW in v1.5.0: Network & Reliability
|
|
112
|
+
- **Proxy Support**: HTTP, HTTPS, and SOCKS5 proxies via `--proxy` or env vars
|
|
113
|
+
- **Retry with Exponential Backoff**: Configurable retries for transient failures
|
|
114
|
+
- **Custom User-Agent**: Set custom User-Agent strings for requests
|
|
115
|
+
- **Crawl-delay Compliance**: Automatically respects robots.txt Crawl-delay directives
|
|
116
|
+
- **Better Encoding Detection**: Intelligent charset detection for international docs
|
|
117
|
+
|
|
118
|
+
### v1.3.0: Rich Metadata Extraction
|
|
119
|
+
- **Structured Metadata**: Extract Open Graph, JSON-LD, and microdata during fetch
|
|
120
|
+
- **Enhanced Frontmatter**: Adds author, description, keywords, images, publish dates, and more
|
|
121
|
+
- **AI/RAG Ready**: Richer context for embeddings and retrieval systems
|
|
122
|
+
- **Opt-in Feature**: Enabled with `--rich-metadata` flag
|
|
123
|
+
|
|
124
|
+
### v1.2.0: Advanced Optimization
|
|
101
125
|
- **Language Filtering**: Auto-detect and filter by language (skip 352+ translation files)
|
|
102
126
|
- **Deduplication**: Remove duplicates with SHA-256 hashing (save 10+ MB on duplicate content)
|
|
103
127
|
- **Auto-Index Generation**: Create navigable INDEX.md with tree/TOC/categories/stats
|
|
@@ -124,11 +148,14 @@ docpull --doctor # verify installation
|
|
|
124
148
|
|
|
125
149
|
# Basic usage
|
|
126
150
|
docpull https://aptos.dev
|
|
127
|
-
docpull
|
|
151
|
+
docpull https://docs.anthropic.com
|
|
128
152
|
|
|
129
153
|
# NEW: Simple optimization (v1.2.0)
|
|
130
154
|
docpull https://code.claude.com/docs --language en --create-index
|
|
131
155
|
|
|
156
|
+
# NEW: Rich metadata extraction (v1.3.0)
|
|
157
|
+
docpull https://docs.anthropic.com --rich-metadata --create-index
|
|
158
|
+
|
|
132
159
|
# NEW: Advanced optimization (v1.2.0)
|
|
133
160
|
docpull https://aptos.dev \
|
|
134
161
|
--deduplicate \
|
|
@@ -154,7 +181,7 @@ docpull https://site.com --js
|
|
|
154
181
|
from docpull import GenericAsyncFetcher
|
|
155
182
|
|
|
156
183
|
fetcher = GenericAsyncFetcher(
|
|
157
|
-
|
|
184
|
+
url="https://aptos.dev",
|
|
158
185
|
output_dir="./docs",
|
|
159
186
|
max_pages=100,
|
|
160
187
|
max_concurrent=20,
|
|
@@ -189,6 +216,7 @@ fetcher.fetch()
|
|
|
189
216
|
- `--naming-strategy {full,short,flat,hierarchical}` – file naming strategy
|
|
190
217
|
- `--create-index` – generate INDEX.md with navigation
|
|
191
218
|
- `--extract-metadata` – extract metadata to metadata.json
|
|
219
|
+
- `--rich-metadata` – extract rich structured metadata (Open Graph, JSON-LD) during fetch
|
|
192
220
|
- `--update-only-changed` – only download changed files
|
|
193
221
|
- `--incremental` – enable incremental mode with resume
|
|
194
222
|
- `--git-commit` – auto-commit changes
|
|
@@ -197,6 +225,12 @@ fetcher.fetch()
|
|
|
197
225
|
- `--archive-format {tar.gz,tar.bz2,tar.xz,zip}` – archive format
|
|
198
226
|
- `--sources-file PATH` – multi-source configuration file
|
|
199
227
|
|
|
228
|
+
### NEW in v1.5.0: Network Options
|
|
229
|
+
- `--proxy URL` – proxy URL (HTTP, HTTPS, SOCKS5)
|
|
230
|
+
- `--user-agent STRING` – custom User-Agent string
|
|
231
|
+
- `--max-retries N` – max retry attempts for failed requests (default: 3)
|
|
232
|
+
- `--retry-base-delay SECONDS` – base delay for exponential backoff (default: 1.0)
|
|
233
|
+
|
|
200
234
|
See `docpull --help` for complete list of options.
|
|
201
235
|
|
|
202
236
|
## Performance
|
|
@@ -215,33 +249,36 @@ Each downloaded page becomes a Markdown file:
|
|
|
215
249
|
|
|
216
250
|
```markdown
|
|
217
251
|
---
|
|
218
|
-
url: https://
|
|
219
|
-
fetched: 2025-11-
|
|
252
|
+
url: https://aptos.dev/build/guides/first-transaction
|
|
253
|
+
fetched: 2025-11-28
|
|
220
254
|
---
|
|
221
|
-
#
|
|
255
|
+
# Your First Transaction
|
|
222
256
|
...
|
|
223
257
|
```
|
|
224
258
|
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
## Configuration File
|
|
228
|
-
|
|
229
|
-
### Simple Configuration (v1.0+)
|
|
259
|
+
With `--rich-metadata`, the frontmatter includes Open Graph, JSON-LD, and other structured metadata:
|
|
230
260
|
|
|
231
|
-
```
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
261
|
+
```markdown
|
|
262
|
+
---
|
|
263
|
+
url: https://aptos.dev/build/guides/first-transaction
|
|
264
|
+
fetched: 2025-11-28
|
|
265
|
+
title: Your First Transaction
|
|
266
|
+
description: Learn how to submit your first transaction on Aptos
|
|
267
|
+
author: Aptos Foundation
|
|
268
|
+
keywords: [aptos, blockchain, transaction, guide]
|
|
269
|
+
image: https://aptos.dev/img/docs-preview.png
|
|
270
|
+
type: article
|
|
271
|
+
site_name: Aptos Documentation
|
|
272
|
+
---
|
|
273
|
+
# Your First Transaction
|
|
274
|
+
...
|
|
237
275
|
```
|
|
238
276
|
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
```
|
|
277
|
+
Directory layout mirrors the target site's structure.
|
|
278
|
+
|
|
279
|
+
## Configuration File
|
|
243
280
|
|
|
244
|
-
###
|
|
281
|
+
### Multi-Source Configuration
|
|
245
282
|
|
|
246
283
|
```yaml
|
|
247
284
|
sources:
|
|
@@ -250,6 +287,7 @@ sources:
|
|
|
250
287
|
language: en
|
|
251
288
|
max_file_size: 200kb
|
|
252
289
|
create_index: true
|
|
290
|
+
rich_metadata: true # Extract Open Graph, JSON-LD metadata
|
|
253
291
|
|
|
254
292
|
claude-code:
|
|
255
293
|
url: https://code.claude.com/docs
|
|
@@ -279,38 +317,27 @@ docpull --sources-file config.yaml
|
|
|
279
317
|
|
|
280
318
|
See `examples/` directory for more configuration examples.
|
|
281
319
|
|
|
282
|
-
## Custom Profiles
|
|
283
|
-
|
|
284
|
-
Easily define profiles for frequently scraped sites.
|
|
285
|
-
|
|
286
|
-
```python
|
|
287
|
-
from docpull.profiles.base import SiteProfile
|
|
288
|
-
|
|
289
|
-
MY_PROFILE = SiteProfile(
|
|
290
|
-
name="mysite",
|
|
291
|
-
domains={"docs.mysite.com"},
|
|
292
|
-
include_patterns=["/docs/", "/api/"],
|
|
293
|
-
)
|
|
294
|
-
```
|
|
295
|
-
|
|
296
320
|
## Security
|
|
297
321
|
|
|
298
|
-
- HTTPS-only
|
|
299
|
-
-
|
|
322
|
+
- HTTPS-only (HTTP rejected)
|
|
323
|
+
- **Mandatory robots.txt compliance** (cannot be disabled)
|
|
324
|
+
- Respects Crawl-delay directives
|
|
325
|
+
- Blocks private/internal network IPs
|
|
300
326
|
- 50MB page size limit
|
|
301
|
-
- Timeout controls
|
|
302
|
-
- Validates content-type
|
|
303
|
-
- Playwright sandboxing
|
|
327
|
+
- Timeout controls (30s connection, 5min download)
|
|
328
|
+
- Validates content-type headers
|
|
329
|
+
- Playwright sandboxing for JS rendering
|
|
330
|
+
- Path traversal protection
|
|
304
331
|
|
|
305
332
|
## Troubleshooting
|
|
306
333
|
|
|
307
334
|
- **Installation issues**: Run `docpull --doctor` to diagnose problems
|
|
308
|
-
- **Missing dependencies**:
|
|
309
|
-
- **Site requires JS**: install
|
|
310
|
-
- **Slow or rate limited**:
|
|
311
|
-
- **Large sites**:
|
|
312
|
-
|
|
313
|
-
|
|
335
|
+
- **Missing dependencies**: `pip install docpull[all]` for all optional dependencies
|
|
336
|
+
- **Site requires JS**: `pip install docpull[js]` then `python -m playwright install chromium`
|
|
337
|
+
- **Slow or rate limited**: Lower `--max-concurrent` or raise `--rate-limit`
|
|
338
|
+
- **Large sites**: Set `--max-pages` to limit crawl size
|
|
339
|
+
- **Proxy issues**: Use `--proxy URL` or set `DOCPULL_PROXY` / `HTTPS_PROXY` env var
|
|
340
|
+
- **Transient failures**: Increase `--max-retries` (default: 3)
|
|
314
341
|
|
|
315
342
|
## v1.2.0 Feature Examples
|
|
316
343
|
|
|
@@ -366,9 +393,65 @@ See `examples/` directory for comprehensive configuration examples.
|
|
|
366
393
|
- **After**: 1,250 files, 13 MB (58% reduction), full indexes generated
|
|
367
394
|
- **One command** instead of 4+ separate commands with manual optimization
|
|
368
395
|
|
|
396
|
+
## What's New in v1.5.0
|
|
397
|
+
|
|
398
|
+
This release focuses on network reliability, proxy support, and TOS compliance.
|
|
399
|
+
|
|
400
|
+
**New Features**:
|
|
401
|
+
- **Proxy Support**: HTTP, HTTPS, and SOCKS5 proxies
|
|
402
|
+
- Use `--proxy URL` or set `DOCPULL_PROXY` / `HTTPS_PROXY` environment variables
|
|
403
|
+
- Install SOCKS support: `pip install docpull[proxy]`
|
|
404
|
+
- **Retry with Exponential Backoff**: Automatic retries for transient failures
|
|
405
|
+
- `--max-retries N` (default: 3)
|
|
406
|
+
- `--retry-base-delay SECONDS` (default: 1.0)
|
|
407
|
+
- Handles 429, 500, 502, 503, 504 status codes
|
|
408
|
+
- **Custom User-Agent**: `--user-agent STRING` for custom identification
|
|
409
|
+
- **Better Encoding Detection**: Intelligent charset detection using charset-normalizer
|
|
410
|
+
- **Crawl-delay Compliance**: Automatically respects robots.txt Crawl-delay directives
|
|
411
|
+
|
|
412
|
+
**Security Enhancement**:
|
|
413
|
+
- **Mandatory robots.txt Compliance**: robots.txt is now always respected (cannot be disabled)
|
|
414
|
+
- Ensures TOS-friendly scraping behavior
|
|
415
|
+
- Automatically adjusts rate limiting based on Crawl-delay
|
|
416
|
+
|
|
417
|
+
**Codebase Simplification**:
|
|
418
|
+
- Removed built-in profiles (Stripe, etc.) - use URLs directly
|
|
419
|
+
- Consolidated utility modules
|
|
420
|
+
- Moved CONTRIBUTING.md, SECURITY.md to `.github/` directory
|
|
421
|
+
|
|
422
|
+
**Backward Compatible**: All existing workflows continue to work unchanged.
|
|
423
|
+
|
|
424
|
+
## What's New in v1.3.0
|
|
425
|
+
|
|
426
|
+
This release adds rich structured metadata extraction for better AI/RAG integration.
|
|
427
|
+
|
|
428
|
+
**New Feature**:
|
|
429
|
+
- **Rich Metadata Extraction**: Extract Open Graph, JSON-LD, microdata, and other structured metadata during fetch
|
|
430
|
+
- Adds author, description, keywords, images, publish dates, and more to frontmatter
|
|
431
|
+
- Enhances AI/RAG systems with richer context
|
|
432
|
+
- Enabled with `--rich-metadata` flag or `rich_metadata: true` in config
|
|
433
|
+
- Powered by the extruct library
|
|
434
|
+
|
|
435
|
+
**Example enhanced frontmatter**:
|
|
436
|
+
```yaml
|
|
437
|
+
---
|
|
438
|
+
url: https://docs.example.com/guide
|
|
439
|
+
fetched: 2025-11-20
|
|
440
|
+
title: Getting Started Guide
|
|
441
|
+
description: Learn the basics of our platform
|
|
442
|
+
author: John Doe
|
|
443
|
+
keywords: [tutorial, guide, api]
|
|
444
|
+
image: https://docs.example.com/og-image.png
|
|
445
|
+
type: article
|
|
446
|
+
published_time: 2024-01-15T10:00:00Z
|
|
447
|
+
---
|
|
448
|
+
```
|
|
449
|
+
|
|
450
|
+
**Backward Compatible**: All existing workflows continue to work unchanged. Rich metadata is opt-in.
|
|
451
|
+
|
|
369
452
|
## What's New in v1.2.0
|
|
370
453
|
|
|
371
|
-
This release adds 15 major features across 4 phases.
|
|
454
|
+
This release adds 15 major features across 4 phases.
|
|
372
455
|
|
|
373
456
|
**Highlights**:
|
|
374
457
|
- Multi-source YAML configuration
|
|
@@ -387,7 +470,7 @@ This release adds 15 major features across 4 phases. See [CHANGELOG.md](CHANGELO
|
|
|
387
470
|
- [PyPI](https://pypi.org/project/docpull/)
|
|
388
471
|
- [GitHub](https://github.com/raintree-technology/docpull)
|
|
389
472
|
- [Issues](https://github.com/raintree-technology/docpull/issues)
|
|
390
|
-
- [
|
|
473
|
+
- [Releases](https://github.com/raintree-technology/docpull/releases)
|
|
391
474
|
- [Examples](https://github.com/raintree-technology/docpull/tree/main/examples)
|
|
392
475
|
|
|
393
476
|
## License
|
|
@@ -3,7 +3,11 @@
|
|
|
3
3
|
**Pull documentation from any website and convert it into clean, AI-ready Markdown.**
|
|
4
4
|
Fast, type-safe, secure, and optimized for building knowledge bases or training datasets.
|
|
5
5
|
|
|
6
|
-
**NEW in v1.
|
|
6
|
+
**NEW in v1.5.0**: Proxy support, retry with exponential backoff, custom User-Agent, and mandatory robots.txt compliance for TOS-friendly scraping.
|
|
7
|
+
|
|
8
|
+
**v1.3.0**: Rich structured metadata extraction (Open Graph, JSON-LD) for enhanced AI/RAG integration.
|
|
9
|
+
|
|
10
|
+
**v1.2.0**: 15 major features including language filtering, deduplication, auto-indexing, multi-source configuration, and more. Real-world testing shows **58% size reduction** with automatic optimization.
|
|
7
11
|
|
|
8
12
|
[](https://www.python.org/downloads/)
|
|
9
13
|
[](https://badge.fury.io/py/docpull)
|
|
@@ -26,9 +30,22 @@ Unlike tools like wget or httrack, docpull extracts only the main content, remov
|
|
|
26
30
|
- Sitemap + link crawling
|
|
27
31
|
- Rate limiting, timeouts, content-type checks
|
|
28
32
|
- Saves docs in structured Markdown with YAML metadata
|
|
29
|
-
-
|
|
30
|
-
|
|
31
|
-
### NEW in v1.
|
|
33
|
+
- **Mandatory robots.txt compliance** for TOS-friendly scraping
|
|
34
|
+
|
|
35
|
+
### NEW in v1.5.0: Network & Reliability
|
|
36
|
+
- **Proxy Support**: HTTP, HTTPS, and SOCKS5 proxies via `--proxy` or env vars
|
|
37
|
+
- **Retry with Exponential Backoff**: Configurable retries for transient failures
|
|
38
|
+
- **Custom User-Agent**: Set custom User-Agent strings for requests
|
|
39
|
+
- **Crawl-delay Compliance**: Automatically respects robots.txt Crawl-delay directives
|
|
40
|
+
- **Better Encoding Detection**: Intelligent charset detection for international docs
|
|
41
|
+
|
|
42
|
+
### v1.3.0: Rich Metadata Extraction
|
|
43
|
+
- **Structured Metadata**: Extract Open Graph, JSON-LD, and microdata during fetch
|
|
44
|
+
- **Enhanced Frontmatter**: Adds author, description, keywords, images, publish dates, and more
|
|
45
|
+
- **AI/RAG Ready**: Richer context for embeddings and retrieval systems
|
|
46
|
+
- **Opt-in Feature**: Enabled with `--rich-metadata` flag
|
|
47
|
+
|
|
48
|
+
### v1.2.0: Advanced Optimization
|
|
32
49
|
- **Language Filtering**: Auto-detect and filter by language (skip 352+ translation files)
|
|
33
50
|
- **Deduplication**: Remove duplicates with SHA-256 hashing (save 10+ MB on duplicate content)
|
|
34
51
|
- **Auto-Index Generation**: Create navigable INDEX.md with tree/TOC/categories/stats
|
|
@@ -55,11 +72,14 @@ docpull --doctor # verify installation
|
|
|
55
72
|
|
|
56
73
|
# Basic usage
|
|
57
74
|
docpull https://aptos.dev
|
|
58
|
-
docpull
|
|
75
|
+
docpull https://docs.anthropic.com
|
|
59
76
|
|
|
60
77
|
# NEW: Simple optimization (v1.2.0)
|
|
61
78
|
docpull https://code.claude.com/docs --language en --create-index
|
|
62
79
|
|
|
80
|
+
# NEW: Rich metadata extraction (v1.3.0)
|
|
81
|
+
docpull https://docs.anthropic.com --rich-metadata --create-index
|
|
82
|
+
|
|
63
83
|
# NEW: Advanced optimization (v1.2.0)
|
|
64
84
|
docpull https://aptos.dev \
|
|
65
85
|
--deduplicate \
|
|
@@ -85,7 +105,7 @@ docpull https://site.com --js
|
|
|
85
105
|
from docpull import GenericAsyncFetcher
|
|
86
106
|
|
|
87
107
|
fetcher = GenericAsyncFetcher(
|
|
88
|
-
|
|
108
|
+
url="https://aptos.dev",
|
|
89
109
|
output_dir="./docs",
|
|
90
110
|
max_pages=100,
|
|
91
111
|
max_concurrent=20,
|
|
@@ -120,6 +140,7 @@ fetcher.fetch()
|
|
|
120
140
|
- `--naming-strategy {full,short,flat,hierarchical}` – file naming strategy
|
|
121
141
|
- `--create-index` – generate INDEX.md with navigation
|
|
122
142
|
- `--extract-metadata` – extract metadata to metadata.json
|
|
143
|
+
- `--rich-metadata` – extract rich structured metadata (Open Graph, JSON-LD) during fetch
|
|
123
144
|
- `--update-only-changed` – only download changed files
|
|
124
145
|
- `--incremental` – enable incremental mode with resume
|
|
125
146
|
- `--git-commit` – auto-commit changes
|
|
@@ -128,6 +149,12 @@ fetcher.fetch()
|
|
|
128
149
|
- `--archive-format {tar.gz,tar.bz2,tar.xz,zip}` – archive format
|
|
129
150
|
- `--sources-file PATH` – multi-source configuration file
|
|
130
151
|
|
|
152
|
+
### NEW in v1.5.0: Network Options
|
|
153
|
+
- `--proxy URL` – proxy URL (HTTP, HTTPS, SOCKS5)
|
|
154
|
+
- `--user-agent STRING` – custom User-Agent string
|
|
155
|
+
- `--max-retries N` – max retry attempts for failed requests (default: 3)
|
|
156
|
+
- `--retry-base-delay SECONDS` – base delay for exponential backoff (default: 1.0)
|
|
157
|
+
|
|
131
158
|
See `docpull --help` for complete list of options.
|
|
132
159
|
|
|
133
160
|
## Performance
|
|
@@ -146,33 +173,36 @@ Each downloaded page becomes a Markdown file:
|
|
|
146
173
|
|
|
147
174
|
```markdown
|
|
148
175
|
---
|
|
149
|
-
url: https://
|
|
150
|
-
fetched: 2025-11-
|
|
176
|
+
url: https://aptos.dev/build/guides/first-transaction
|
|
177
|
+
fetched: 2025-11-28
|
|
151
178
|
---
|
|
152
|
-
#
|
|
179
|
+
# Your First Transaction
|
|
153
180
|
...
|
|
154
181
|
```
|
|
155
182
|
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
## Configuration File
|
|
159
|
-
|
|
160
|
-
### Simple Configuration (v1.0+)
|
|
183
|
+
With `--rich-metadata`, the frontmatter includes Open Graph, JSON-LD, and other structured metadata:
|
|
161
184
|
|
|
162
|
-
```
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
185
|
+
```markdown
|
|
186
|
+
---
|
|
187
|
+
url: https://aptos.dev/build/guides/first-transaction
|
|
188
|
+
fetched: 2025-11-28
|
|
189
|
+
title: Your First Transaction
|
|
190
|
+
description: Learn how to submit your first transaction on Aptos
|
|
191
|
+
author: Aptos Foundation
|
|
192
|
+
keywords: [aptos, blockchain, transaction, guide]
|
|
193
|
+
image: https://aptos.dev/img/docs-preview.png
|
|
194
|
+
type: article
|
|
195
|
+
site_name: Aptos Documentation
|
|
196
|
+
---
|
|
197
|
+
# Your First Transaction
|
|
198
|
+
...
|
|
168
199
|
```
|
|
169
200
|
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
```
|
|
201
|
+
Directory layout mirrors the target site's structure.
|
|
202
|
+
|
|
203
|
+
## Configuration File
|
|
174
204
|
|
|
175
|
-
###
|
|
205
|
+
### Multi-Source Configuration
|
|
176
206
|
|
|
177
207
|
```yaml
|
|
178
208
|
sources:
|
|
@@ -181,6 +211,7 @@ sources:
|
|
|
181
211
|
language: en
|
|
182
212
|
max_file_size: 200kb
|
|
183
213
|
create_index: true
|
|
214
|
+
rich_metadata: true # Extract Open Graph, JSON-LD metadata
|
|
184
215
|
|
|
185
216
|
claude-code:
|
|
186
217
|
url: https://code.claude.com/docs
|
|
@@ -210,38 +241,27 @@ docpull --sources-file config.yaml
|
|
|
210
241
|
|
|
211
242
|
See `examples/` directory for more configuration examples.
|
|
212
243
|
|
|
213
|
-
## Custom Profiles
|
|
214
|
-
|
|
215
|
-
Easily define profiles for frequently scraped sites.
|
|
216
|
-
|
|
217
|
-
```python
|
|
218
|
-
from docpull.profiles.base import SiteProfile
|
|
219
|
-
|
|
220
|
-
MY_PROFILE = SiteProfile(
|
|
221
|
-
name="mysite",
|
|
222
|
-
domains={"docs.mysite.com"},
|
|
223
|
-
include_patterns=["/docs/", "/api/"],
|
|
224
|
-
)
|
|
225
|
-
```
|
|
226
|
-
|
|
227
244
|
## Security
|
|
228
245
|
|
|
229
|
-
- HTTPS-only
|
|
230
|
-
-
|
|
246
|
+
- HTTPS-only (HTTP rejected)
|
|
247
|
+
- **Mandatory robots.txt compliance** (cannot be disabled)
|
|
248
|
+
- Respects Crawl-delay directives
|
|
249
|
+
- Blocks private/internal network IPs
|
|
231
250
|
- 50MB page size limit
|
|
232
|
-
- Timeout controls
|
|
233
|
-
- Validates content-type
|
|
234
|
-
- Playwright sandboxing
|
|
251
|
+
- Timeout controls (30s connection, 5min download)
|
|
252
|
+
- Validates content-type headers
|
|
253
|
+
- Playwright sandboxing for JS rendering
|
|
254
|
+
- Path traversal protection
|
|
235
255
|
|
|
236
256
|
## Troubleshooting
|
|
237
257
|
|
|
238
258
|
- **Installation issues**: Run `docpull --doctor` to diagnose problems
|
|
239
|
-
- **Missing dependencies**:
|
|
240
|
-
- **Site requires JS**: install
|
|
241
|
-
- **Slow or rate limited**:
|
|
242
|
-
- **Large sites**:
|
|
243
|
-
|
|
244
|
-
|
|
259
|
+
- **Missing dependencies**: `pip install docpull[all]` for all optional dependencies
|
|
260
|
+
- **Site requires JS**: `pip install docpull[js]` then `python -m playwright install chromium`
|
|
261
|
+
- **Slow or rate limited**: Lower `--max-concurrent` or raise `--rate-limit`
|
|
262
|
+
- **Large sites**: Set `--max-pages` to limit crawl size
|
|
263
|
+
- **Proxy issues**: Use `--proxy URL` or set `DOCPULL_PROXY` / `HTTPS_PROXY` env var
|
|
264
|
+
- **Transient failures**: Increase `--max-retries` (default: 3)
|
|
245
265
|
|
|
246
266
|
## v1.2.0 Feature Examples
|
|
247
267
|
|
|
@@ -297,9 +317,65 @@ See `examples/` directory for comprehensive configuration examples.
|
|
|
297
317
|
- **After**: 1,250 files, 13 MB (58% reduction), full indexes generated
|
|
298
318
|
- **One command** instead of 4+ separate commands with manual optimization
|
|
299
319
|
|
|
320
|
+
## What's New in v1.5.0
|
|
321
|
+
|
|
322
|
+
This release focuses on network reliability, proxy support, and TOS compliance.
|
|
323
|
+
|
|
324
|
+
**New Features**:
|
|
325
|
+
- **Proxy Support**: HTTP, HTTPS, and SOCKS5 proxies
|
|
326
|
+
- Use `--proxy URL` or set `DOCPULL_PROXY` / `HTTPS_PROXY` environment variables
|
|
327
|
+
- Install SOCKS support: `pip install docpull[proxy]`
|
|
328
|
+
- **Retry with Exponential Backoff**: Automatic retries for transient failures
|
|
329
|
+
- `--max-retries N` (default: 3)
|
|
330
|
+
- `--retry-base-delay SECONDS` (default: 1.0)
|
|
331
|
+
- Handles 429, 500, 502, 503, 504 status codes
|
|
332
|
+
- **Custom User-Agent**: `--user-agent STRING` for custom identification
|
|
333
|
+
- **Better Encoding Detection**: Intelligent charset detection using charset-normalizer
|
|
334
|
+
- **Crawl-delay Compliance**: Automatically respects robots.txt Crawl-delay directives
|
|
335
|
+
|
|
336
|
+
**Security Enhancement**:
|
|
337
|
+
- **Mandatory robots.txt Compliance**: robots.txt is now always respected (cannot be disabled)
|
|
338
|
+
- Ensures TOS-friendly scraping behavior
|
|
339
|
+
- Automatically adjusts rate limiting based on Crawl-delay
|
|
340
|
+
|
|
341
|
+
**Codebase Simplification**:
|
|
342
|
+
- Removed built-in profiles (Stripe, etc.) - use URLs directly
|
|
343
|
+
- Consolidated utility modules
|
|
344
|
+
- Moved CONTRIBUTING.md, SECURITY.md to `.github/` directory
|
|
345
|
+
|
|
346
|
+
**Backward Compatible**: All existing workflows continue to work unchanged.
|
|
347
|
+
|
|
348
|
+
## What's New in v1.3.0
|
|
349
|
+
|
|
350
|
+
This release adds rich structured metadata extraction for better AI/RAG integration.
|
|
351
|
+
|
|
352
|
+
**New Feature**:
|
|
353
|
+
- **Rich Metadata Extraction**: Extract Open Graph, JSON-LD, microdata, and other structured metadata during fetch
|
|
354
|
+
- Adds author, description, keywords, images, publish dates, and more to frontmatter
|
|
355
|
+
- Enhances AI/RAG systems with richer context
|
|
356
|
+
- Enabled with `--rich-metadata` flag or `rich_metadata: true` in config
|
|
357
|
+
- Powered by the extruct library
|
|
358
|
+
|
|
359
|
+
**Example enhanced frontmatter**:
|
|
360
|
+
```yaml
|
|
361
|
+
---
|
|
362
|
+
url: https://docs.example.com/guide
|
|
363
|
+
fetched: 2025-11-20
|
|
364
|
+
title: Getting Started Guide
|
|
365
|
+
description: Learn the basics of our platform
|
|
366
|
+
author: John Doe
|
|
367
|
+
keywords: [tutorial, guide, api]
|
|
368
|
+
image: https://docs.example.com/og-image.png
|
|
369
|
+
type: article
|
|
370
|
+
published_time: 2024-01-15T10:00:00Z
|
|
371
|
+
---
|
|
372
|
+
```
|
|
373
|
+
|
|
374
|
+
**Backward Compatible**: All existing workflows continue to work unchanged. Rich metadata is opt-in.
|
|
375
|
+
|
|
300
376
|
## What's New in v1.2.0
|
|
301
377
|
|
|
302
|
-
This release adds 15 major features across 4 phases.
|
|
378
|
+
This release adds 15 major features across 4 phases.
|
|
303
379
|
|
|
304
380
|
**Highlights**:
|
|
305
381
|
- Multi-source YAML configuration
|
|
@@ -318,7 +394,7 @@ This release adds 15 major features across 4 phases. See [CHANGELOG.md](CHANGELO
|
|
|
318
394
|
- [PyPI](https://pypi.org/project/docpull/)
|
|
319
395
|
- [GitHub](https://github.com/raintree-technology/docpull)
|
|
320
396
|
- [Issues](https://github.com/raintree-technology/docpull/issues)
|
|
321
|
-
- [
|
|
397
|
+
- [Releases](https://github.com/raintree-technology/docpull/releases)
|
|
322
398
|
- [Examples](https://github.com/raintree-technology/docpull/tree/main/examples)
|
|
323
399
|
|
|
324
400
|
## License
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
__version__ = "1.5.0"
|
|
2
|
+
|
|
3
|
+
from .fetchers.base import BaseFetcher
|
|
4
|
+
from .fetchers.generic import GenericFetcher
|
|
5
|
+
from .fetchers.generic_async import GenericAsyncFetcher
|
|
6
|
+
from .fetchers.parallel_base import ParallelFetcher
|
|
7
|
+
|
|
8
|
+
__all__ = [
|
|
9
|
+
"BaseFetcher",
|
|
10
|
+
"GenericFetcher",
|
|
11
|
+
"GenericAsyncFetcher",
|
|
12
|
+
"ParallelFetcher",
|
|
13
|
+
]
|