docpull 1.3.0__tar.gz → 1.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docpull-1.3.0 → docpull-1.5.0}/PKG-INFO +85 -66
- {docpull-1.3.0 → docpull-1.5.0}/README.md +77 -64
- {docpull-1.3.0 → docpull-1.5.0}/docpull/__init__.py +1 -3
- {docpull-1.3.0 → docpull-1.5.0}/docpull/cli.py +70 -139
- {docpull-1.3.0 → docpull-1.5.0}/docpull/config.py +29 -13
- docpull-1.5.0/docpull/fetchers/__init__.py +11 -0
- {docpull-1.3.0 → docpull-1.5.0}/docpull/fetchers/async_fetcher.py +172 -31
- {docpull-1.3.0 → docpull-1.5.0}/docpull/fetchers/base.py +192 -8
- {docpull-1.3.0 → docpull-1.5.0}/docpull/fetchers/generic.py +25 -65
- {docpull-1.3.0 → docpull-1.5.0}/docpull/fetchers/generic_async.py +95 -61
- {docpull-1.3.0 → docpull-1.5.0}/docpull/metadata_extractor.py +3 -3
- docpull-1.5.0/docpull.egg-info/PKG-INFO +478 -0
- {docpull-1.3.0 → docpull-1.5.0}/docpull.egg-info/SOURCES.txt +8 -23
- docpull-1.5.0/docpull.egg-info/dependency_links.txt +1 -0
- docpull-1.5.0/docpull.egg-info/entry_points.txt +2 -0
- docpull-1.5.0/docpull.egg-info/requires.txt +38 -0
- docpull-1.5.0/docpull.egg-info/top_level.txt +1 -0
- {docpull-1.3.0 → docpull-1.5.0}/pyproject.toml +10 -2
- {docpull-1.3.0 → docpull-1.5.0}/tests/test_config.py +1 -5
- docpull-1.3.0/.editorconfig +0 -30
- docpull-1.3.0/.pre-commit-config.yaml +0 -30
- docpull-1.3.0/CHANGELOG.md +0 -403
- docpull-1.3.0/CONTRIBUTING.md +0 -189
- docpull-1.3.0/MANIFEST.in +0 -49
- docpull-1.3.0/Makefile +0 -44
- docpull-1.3.0/SECURITY.md +0 -206
- docpull-1.3.0/TROUBLESHOOTING.md +0 -348
- docpull-1.3.0/docpull/fetchers/__init__.py +0 -9
- docpull-1.3.0/docpull/fetchers/stripe.py +0 -49
- docpull-1.3.0/docpull/profiles/__init__.py +0 -53
- docpull-1.3.0/docpull/profiles/base.py +0 -64
- docpull-1.3.0/docpull/profiles/stripe.py +0 -14
- docpull-1.3.0/docpull/utils/__init__.py +0 -6
- docpull-1.3.0/examples/README.md +0 -280
- docpull-1.3.0/examples/deduplication-strategies.yaml +0 -29
- docpull-1.3.0/examples/format-conversion.yaml +0 -25
- docpull-1.3.0/examples/incremental-updates.yaml +0 -26
- docpull-1.3.0/examples/multi-source-optimized.yaml +0 -45
- docpull-1.3.0/examples/selective-crawling.yaml +0 -26
- docpull-1.3.0/examples/simple-optimization.yaml +0 -14
- docpull-1.3.0/requirements.txt +0 -34
- {docpull-1.3.0 → docpull-1.5.0}/LICENSE +0 -0
- {docpull-1.3.0 → docpull-1.5.0}/docpull/__main__.py +0 -0
- {docpull-1.3.0 → docpull-1.5.0}/docpull/archive.py +0 -0
- {docpull-1.3.0 → docpull-1.5.0}/docpull/cache.py +0 -0
- {docpull-1.3.0 → docpull-1.5.0}/docpull/doctor.py +0 -0
- {docpull-1.3.0 → docpull-1.5.0}/docpull/fetchers/parallel_base.py +0 -0
- {docpull-1.3.0/docpull/utils → docpull-1.5.0/docpull}/file_utils.py +0 -0
- {docpull-1.3.0 → docpull-1.5.0}/docpull/formatters/__init__.py +0 -0
- {docpull-1.3.0 → docpull-1.5.0}/docpull/formatters/base.py +0 -0
- {docpull-1.3.0 → docpull-1.5.0}/docpull/formatters/json.py +0 -0
- {docpull-1.3.0 → docpull-1.5.0}/docpull/formatters/markdown.py +0 -0
- {docpull-1.3.0 → docpull-1.5.0}/docpull/formatters/sqlite.py +0 -0
- {docpull-1.3.0 → docpull-1.5.0}/docpull/formatters/toon.py +0 -0
- {docpull-1.3.0 → docpull-1.5.0}/docpull/hooks.py +0 -0
- {docpull-1.3.0 → docpull-1.5.0}/docpull/indexer.py +0 -0
- {docpull-1.3.0/docpull/utils → docpull-1.5.0/docpull}/logging_config.py +0 -0
- {docpull-1.3.0 → docpull-1.5.0}/docpull/metadata.py +0 -0
- {docpull-1.3.0 → docpull-1.5.0}/docpull/naming.py +0 -0
- {docpull-1.3.0 → docpull-1.5.0}/docpull/orchestrator.py +0 -0
- {docpull-1.3.0 → docpull-1.5.0}/docpull/processors/__init__.py +0 -0
- {docpull-1.3.0 → docpull-1.5.0}/docpull/processors/base.py +0 -0
- {docpull-1.3.0 → docpull-1.5.0}/docpull/processors/content_filter.py +0 -0
- {docpull-1.3.0 → docpull-1.5.0}/docpull/processors/deduplicator.py +0 -0
- {docpull-1.3.0 → docpull-1.5.0}/docpull/processors/language_filter.py +0 -0
- {docpull-1.3.0 → docpull-1.5.0}/docpull/processors/size_limiter.py +0 -0
- {docpull-1.3.0 → docpull-1.5.0}/docpull/py.typed +0 -0
- {docpull-1.3.0 → docpull-1.5.0}/docpull/sources_config.py +0 -0
- {docpull-1.3.0 → docpull-1.5.0}/docpull/vcs.py +0 -0
- {docpull-1.3.0 → docpull-1.5.0}/setup.cfg +0 -0
- {docpull-1.3.0 → docpull-1.5.0}/tests/test_metadata_extractor.py +0 -0
- {docpull-1.3.0 → docpull-1.5.0}/tests/test_orchestrator.py +0 -0
- {docpull-1.3.0 → docpull-1.5.0}/tests/test_sources_config.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docpull
|
|
3
|
-
Version: 1.3.0
|
|
3
|
+
Version: 1.5.0
|
|
4
4
|
Summary: Pull documentation from the web and convert to clean markdown
|
|
5
5
|
Author-email: Zachary Roth <support@raintree.technology>
|
|
6
6
|
Maintainer-email: Raintree Technology <support@raintree.technology>
|
|
@@ -10,7 +10,7 @@ Project-URL: Documentation, https://github.com/raintree-technology/docpull#readm
|
|
|
10
10
|
Project-URL: Repository, https://github.com/raintree-technology/docpull
|
|
11
11
|
Project-URL: Source Code, https://github.com/raintree-technology/docpull
|
|
12
12
|
Project-URL: Bug Tracker, https://github.com/raintree-technology/docpull/issues
|
|
13
|
-
Project-URL:
|
|
13
|
+
Project-URL: Releases, https://github.com/raintree-technology/docpull/releases
|
|
14
14
|
Keywords: python,markdown,documentation,web-scraping,developer-tools,claude,ai-training-data
|
|
15
15
|
Classifier: Development Status :: 5 - Production/Stable
|
|
16
16
|
Classifier: Intended Audience :: Developers
|
|
@@ -50,8 +50,14 @@ Requires-Dist: pyyaml>=6.0
|
|
|
50
50
|
Requires-Dist: gitpython>=3.1.40
|
|
51
51
|
Provides-Extra: js
|
|
52
52
|
Requires-Dist: playwright>=1.40.0; extra == "js"
|
|
53
|
+
Provides-Extra: proxy
|
|
54
|
+
Requires-Dist: aiohttp-socks>=0.8.0; extra == "proxy"
|
|
55
|
+
Provides-Extra: normalize
|
|
56
|
+
Requires-Dist: url-normalize>=1.4.0; extra == "normalize"
|
|
53
57
|
Provides-Extra: all
|
|
54
58
|
Requires-Dist: playwright>=1.40.0; extra == "all"
|
|
59
|
+
Requires-Dist: aiohttp-socks>=0.8.0; extra == "all"
|
|
60
|
+
Requires-Dist: url-normalize>=1.4.0; extra == "all"
|
|
55
61
|
Provides-Extra: dev
|
|
56
62
|
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
57
63
|
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
@@ -73,7 +79,9 @@ Dynamic: license-file
|
|
|
73
79
|
**Pull documentation from any website and converts it into clean, AI-ready Markdown.**
|
|
74
80
|
Fast, type-safe, secure, and optimized for building knowledge bases or training datasets.
|
|
75
81
|
|
|
76
|
-
**NEW in v1.3.0**: Rich structured metadata extraction (Open Graph, JSON-LD) for enhanced AI/RAG integration.
|
|
82
|
+
**NEW in v1.5.0**: Proxy support, retry with exponential backoff, custom User-Agent, and mandatory robots.txt compliance for TOS-friendly scraping.
|
|
83
|
+
|
|
84
|
+
**v1.3.0**: Rich structured metadata extraction (Open Graph, JSON-LD) for enhanced AI/RAG integration.
|
|
77
85
|
|
|
78
86
|
**v1.2.0**: 15 major features including language filtering, deduplication, auto-indexing, multi-source configuration, and more. Real-world testing shows **58% size reduction** with automatic optimization.
|
|
79
87
|
|
|
@@ -98,9 +106,16 @@ Unlike tools like wget or httrack, docpull extracts only the main content, remov
|
|
|
98
106
|
- Sitemap + link crawling
|
|
99
107
|
- Rate limiting, timeouts, content-type checks
|
|
100
108
|
- Saves docs in structured Markdown with YAML metadata
|
|
101
|
-
-
|
|
109
|
+
- **Mandatory robots.txt compliance** for TOS-friendly scraping
|
|
110
|
+
|
|
111
|
+
### NEW in v1.5.0: Network & Reliability
|
|
112
|
+
- **Proxy Support**: HTTP, HTTPS, and SOCKS5 proxies via `--proxy` or env vars
|
|
113
|
+
- **Retry with Exponential Backoff**: Configurable retries for transient failures
|
|
114
|
+
- **Custom User-Agent**: Set custom User-Agent strings for requests
|
|
115
|
+
- **Crawl-delay Compliance**: Automatically respects robots.txt Crawl-delay directives
|
|
116
|
+
- **Better Encoding Detection**: Intelligent charset detection for international docs
|
|
102
117
|
|
|
103
|
-
### NEW in v1.3.0: Rich Metadata Extraction
|
|
118
|
+
### v1.3.0: Rich Metadata Extraction
|
|
104
119
|
- **Structured Metadata**: Extract Open Graph, JSON-LD, and microdata during fetch
|
|
105
120
|
- **Enhanced Frontmatter**: Adds author, description, keywords, images, publish dates, and more
|
|
106
121
|
- **AI/RAG Ready**: Richer context for embeddings and retrieval systems
|
|
@@ -133,7 +148,7 @@ docpull --doctor # verify installation
|
|
|
133
148
|
|
|
134
149
|
# Basic usage
|
|
135
150
|
docpull https://aptos.dev
|
|
136
|
-
docpull
|
|
151
|
+
docpull https://docs.anthropic.com
|
|
137
152
|
|
|
138
153
|
# NEW: Simple optimization (v1.2.0)
|
|
139
154
|
docpull https://code.claude.com/docs --language en --create-index
|
|
@@ -166,7 +181,7 @@ docpull https://site.com --js
|
|
|
166
181
|
from docpull import GenericAsyncFetcher
|
|
167
182
|
|
|
168
183
|
fetcher = GenericAsyncFetcher(
|
|
169
|
-
|
|
184
|
+
url="https://aptos.dev",
|
|
170
185
|
output_dir="./docs",
|
|
171
186
|
max_pages=100,
|
|
172
187
|
max_concurrent=20,
|
|
@@ -210,6 +225,12 @@ fetcher.fetch()
|
|
|
210
225
|
- `--archive-format {tar.gz,tar.bz2,tar.xz,zip}` – archive format
|
|
211
226
|
- `--sources-file PATH` – multi-source configuration file
|
|
212
227
|
|
|
228
|
+
### NEW in v1.5.0: Network Options
|
|
229
|
+
- `--proxy URL` – proxy URL (HTTP, HTTPS, SOCKS5)
|
|
230
|
+
- `--user-agent STRING` – custom User-Agent string
|
|
231
|
+
- `--max-retries N` – max retry attempts for failed requests (default: 3)
|
|
232
|
+
- `--retry-base-delay SECONDS` – base delay for exponential backoff (default: 1.0)
|
|
233
|
+
|
|
213
234
|
See `docpull --help` for complete list of options.
|
|
214
235
|
|
|
215
236
|
## Performance
|
|
@@ -228,10 +249,10 @@ Each downloaded page becomes a Markdown file:
|
|
|
228
249
|
|
|
229
250
|
```markdown
|
|
230
251
|
---
|
|
231
|
-
url: https://
|
|
232
|
-
fetched: 2025-11-
|
|
252
|
+
url: https://aptos.dev/build/guides/first-transaction
|
|
253
|
+
fetched: 2025-11-28
|
|
233
254
|
---
|
|
234
|
-
#
|
|
255
|
+
# Your First Transaction
|
|
235
256
|
...
|
|
236
257
|
```
|
|
237
258
|
|
|
@@ -239,17 +260,17 @@ With `--rich-metadata`, the frontmatter includes Open Graph, JSON-LD, and other
|
|
|
239
260
|
|
|
240
261
|
```markdown
|
|
241
262
|
---
|
|
242
|
-
url: https://
|
|
243
|
-
fetched: 2025-11-
|
|
244
|
-
title:
|
|
245
|
-
description: Learn how to
|
|
246
|
-
author:
|
|
247
|
-
keywords: [
|
|
248
|
-
image: https://
|
|
263
|
+
url: https://aptos.dev/build/guides/first-transaction
|
|
264
|
+
fetched: 2025-11-28
|
|
265
|
+
title: Your First Transaction
|
|
266
|
+
description: Learn how to submit your first transaction on Aptos
|
|
267
|
+
author: Aptos Foundation
|
|
268
|
+
keywords: [aptos, blockchain, transaction, guide]
|
|
269
|
+
image: https://aptos.dev/img/docs-preview.png
|
|
249
270
|
type: article
|
|
250
|
-
site_name:
|
|
271
|
+
site_name: Aptos Documentation
|
|
251
272
|
---
|
|
252
|
-
#
|
|
273
|
+
# Your First Transaction
|
|
253
274
|
...
|
|
254
275
|
```
|
|
255
276
|
|
|
@@ -257,22 +278,7 @@ Directory layout mirrors the target site's structure.
|
|
|
257
278
|
|
|
258
279
|
## Configuration File
|
|
259
280
|
|
|
260
|
-
###
|
|
261
|
-
|
|
262
|
-
```yaml
|
|
263
|
-
output_dir: ./docs
|
|
264
|
-
rate_limit: 0.5
|
|
265
|
-
sources:
|
|
266
|
-
- stripe # Built-in profile
|
|
267
|
-
- https://docs.example.com # Or any URL
|
|
268
|
-
```
|
|
269
|
-
|
|
270
|
-
Run with:
|
|
271
|
-
```bash
|
|
272
|
-
docpull --config config.yaml
|
|
273
|
-
```
|
|
274
|
-
|
|
275
|
-
### NEW: Multi-Source Configuration (v1.2.0)
|
|
281
|
+
### Multi-Source Configuration
|
|
276
282
|
|
|
277
283
|
```yaml
|
|
278
284
|
sources:
|
|
@@ -311,42 +317,27 @@ docpull --sources-file config.yaml
|
|
|
311
317
|
|
|
312
318
|
See `examples/` directory for more configuration examples.
|
|
313
319
|
|
|
314
|
-
## Custom Profiles
|
|
315
|
-
|
|
316
|
-
docpull includes a Stripe profile as reference. Create custom profiles for other sites:
|
|
317
|
-
|
|
318
|
-
```python
|
|
319
|
-
from docpull.profiles.base import SiteProfile
|
|
320
|
-
|
|
321
|
-
MY_PROFILE = SiteProfile(
|
|
322
|
-
name="mysite",
|
|
323
|
-
domains={"docs.mysite.com"},
|
|
324
|
-
include_patterns=["/docs/", "/api/"],
|
|
325
|
-
sitemap_url="https://docs.mysite.com/sitemap.xml",
|
|
326
|
-
rate_limit=0.5,
|
|
327
|
-
)
|
|
328
|
-
```
|
|
329
|
-
|
|
330
|
-
**Want to contribute profiles?** Submit a PR with your custom profile! Popular ones may be added to the core or a community profiles repository.
|
|
331
|
-
|
|
332
320
|
## Security
|
|
333
321
|
|
|
334
|
-
- HTTPS-only
|
|
335
|
-
-
|
|
322
|
+
- HTTPS-only (HTTP rejected)
|
|
323
|
+
- **Mandatory robots.txt compliance** (cannot be disabled)
|
|
324
|
+
- Respects Crawl-delay directives
|
|
325
|
+
- Blocks private/internal network IPs
|
|
336
326
|
- 50MB page size limit
|
|
337
|
-
- Timeout controls
|
|
338
|
-
- Validates content-type
|
|
339
|
-
- Playwright sandboxing
|
|
327
|
+
- Timeout controls (30s connection, 5min download)
|
|
328
|
+
- Validates content-type headers
|
|
329
|
+
- Playwright sandboxing for JS rendering
|
|
330
|
+
- Path traversal protection
|
|
340
331
|
|
|
341
332
|
## Troubleshooting
|
|
342
333
|
|
|
343
334
|
- **Installation issues**: Run `docpull --doctor` to diagnose problems
|
|
344
|
-
- **Missing dependencies**:
|
|
345
|
-
- **Site requires JS**: install
|
|
346
|
-
- **Slow or rate limited**:
|
|
347
|
-
- **Large sites**:
|
|
348
|
-
|
|
349
|
-
|
|
335
|
+
- **Missing dependencies**: `pip install docpull[all]` for all optional dependencies
|
|
336
|
+
- **Site requires JS**: `pip install docpull[js]` then `python -m playwright install chromium`
|
|
337
|
+
- **Slow or rate limited**: Lower `--max-concurrent` or raise `--rate-limit`
|
|
338
|
+
- **Large sites**: Set `--max-pages` to limit crawl size
|
|
339
|
+
- **Proxy issues**: Use `--proxy URL` or set `DOCPULL_PROXY` / `HTTPS_PROXY` env var
|
|
340
|
+
- **Transient failures**: Increase `--max-retries` (default: 3)
|
|
350
341
|
|
|
351
342
|
## v1.2.0 Feature Examples
|
|
352
343
|
|
|
@@ -402,6 +393,34 @@ See `examples/` directory for comprehensive configuration examples.
|
|
|
402
393
|
- **After**: 1,250 files, 13 MB (58% reduction), full indexes generated
|
|
403
394
|
- **One command** instead of 4+ separate commands with manual optimization
|
|
404
395
|
|
|
396
|
+
## What's New in v1.5.0
|
|
397
|
+
|
|
398
|
+
This release focuses on network reliability, proxy support, and TOS compliance.
|
|
399
|
+
|
|
400
|
+
**New Features**:
|
|
401
|
+
- **Proxy Support**: HTTP, HTTPS, and SOCKS5 proxies
|
|
402
|
+
- Use `--proxy URL` or set `DOCPULL_PROXY` / `HTTPS_PROXY` environment variables
|
|
403
|
+
- Install SOCKS support: `pip install docpull[proxy]`
|
|
404
|
+
- **Retry with Exponential Backoff**: Automatic retries for transient failures
|
|
405
|
+
- `--max-retries N` (default: 3)
|
|
406
|
+
- `--retry-base-delay SECONDS` (default: 1.0)
|
|
407
|
+
- Handles 429, 500, 502, 503, 504 status codes
|
|
408
|
+
- **Custom User-Agent**: `--user-agent STRING` for custom identification
|
|
409
|
+
- **Better Encoding Detection**: Intelligent charset detection using charset-normalizer
|
|
410
|
+
- **Crawl-delay Compliance**: Automatically respects robots.txt Crawl-delay directives
|
|
411
|
+
|
|
412
|
+
**Security Enhancement**:
|
|
413
|
+
- **Mandatory robots.txt Compliance**: robots.txt is now always respected (cannot be disabled)
|
|
414
|
+
- Ensures TOS-friendly scraping behavior
|
|
415
|
+
- Automatically adjusts rate limiting based on Crawl-delay
|
|
416
|
+
|
|
417
|
+
**Codebase Simplification**:
|
|
418
|
+
- Removed built-in profiles (Stripe, etc.) - use URLs directly
|
|
419
|
+
- Consolidated utility modules
|
|
420
|
+
- Moved CONTRIBUTING.md, SECURITY.md to `.github/` directory
|
|
421
|
+
|
|
422
|
+
**Backward Compatible**: All existing workflows continue to work unchanged.
|
|
423
|
+
|
|
405
424
|
## What's New in v1.3.0
|
|
406
425
|
|
|
407
426
|
This release adds rich structured metadata extraction for better AI/RAG integration.
|
|
@@ -432,7 +451,7 @@ published_time: 2024-01-15T10:00:00Z
|
|
|
432
451
|
|
|
433
452
|
## What's New in v1.2.0
|
|
434
453
|
|
|
435
|
-
This release adds 15 major features across 4 phases.
|
|
454
|
+
This release adds 15 major features across 4 phases.
|
|
436
455
|
|
|
437
456
|
**Highlights**:
|
|
438
457
|
- Multi-source YAML configuration
|
|
@@ -451,7 +470,7 @@ This release adds 15 major features across 4 phases. See [CHANGELOG.md](CHANGELO
|
|
|
451
470
|
- [PyPI](https://pypi.org/project/docpull/)
|
|
452
471
|
- [GitHub](https://github.com/raintree-technology/docpull)
|
|
453
472
|
- [Issues](https://github.com/raintree-technology/docpull/issues)
|
|
454
|
-
- [
|
|
473
|
+
- [Releases](https://github.com/raintree-technology/docpull/releases)
|
|
455
474
|
- [Examples](https://github.com/raintree-technology/docpull/tree/main/examples)
|
|
456
475
|
|
|
457
476
|
## License
|
|
@@ -3,7 +3,9 @@
|
|
|
3
3
|
**Pull documentation from any website and converts it into clean, AI-ready Markdown.**
|
|
4
4
|
Fast, type-safe, secure, and optimized for building knowledge bases or training datasets.
|
|
5
5
|
|
|
6
|
-
**NEW in v1.3.0**: Rich structured metadata extraction (Open Graph, JSON-LD) for enhanced AI/RAG integration.
|
|
6
|
+
**NEW in v1.5.0**: Proxy support, retry with exponential backoff, custom User-Agent, and mandatory robots.txt compliance for TOS-friendly scraping.
|
|
7
|
+
|
|
8
|
+
**v1.3.0**: Rich structured metadata extraction (Open Graph, JSON-LD) for enhanced AI/RAG integration.
|
|
7
9
|
|
|
8
10
|
**v1.2.0**: 15 major features including language filtering, deduplication, auto-indexing, multi-source configuration, and more. Real-world testing shows **58% size reduction** with automatic optimization.
|
|
9
11
|
|
|
@@ -28,9 +30,16 @@ Unlike tools like wget or httrack, docpull extracts only the main content, remov
|
|
|
28
30
|
- Sitemap + link crawling
|
|
29
31
|
- Rate limiting, timeouts, content-type checks
|
|
30
32
|
- Saves docs in structured Markdown with YAML metadata
|
|
31
|
-
-
|
|
33
|
+
- **Mandatory robots.txt compliance** for TOS-friendly scraping
|
|
34
|
+
|
|
35
|
+
### NEW in v1.5.0: Network & Reliability
|
|
36
|
+
- **Proxy Support**: HTTP, HTTPS, and SOCKS5 proxies via `--proxy` or env vars
|
|
37
|
+
- **Retry with Exponential Backoff**: Configurable retries for transient failures
|
|
38
|
+
- **Custom User-Agent**: Set custom User-Agent strings for requests
|
|
39
|
+
- **Crawl-delay Compliance**: Automatically respects robots.txt Crawl-delay directives
|
|
40
|
+
- **Better Encoding Detection**: Intelligent charset detection for international docs
|
|
32
41
|
|
|
33
|
-
### NEW in v1.3.0: Rich Metadata Extraction
|
|
42
|
+
### v1.3.0: Rich Metadata Extraction
|
|
34
43
|
- **Structured Metadata**: Extract Open Graph, JSON-LD, and microdata during fetch
|
|
35
44
|
- **Enhanced Frontmatter**: Adds author, description, keywords, images, publish dates, and more
|
|
36
45
|
- **AI/RAG Ready**: Richer context for embeddings and retrieval systems
|
|
@@ -63,7 +72,7 @@ docpull --doctor # verify installation
|
|
|
63
72
|
|
|
64
73
|
# Basic usage
|
|
65
74
|
docpull https://aptos.dev
|
|
66
|
-
docpull
|
|
75
|
+
docpull https://docs.anthropic.com
|
|
67
76
|
|
|
68
77
|
# NEW: Simple optimization (v1.2.0)
|
|
69
78
|
docpull https://code.claude.com/docs --language en --create-index
|
|
@@ -96,7 +105,7 @@ docpull https://site.com --js
|
|
|
96
105
|
from docpull import GenericAsyncFetcher
|
|
97
106
|
|
|
98
107
|
fetcher = GenericAsyncFetcher(
|
|
99
|
-
|
|
108
|
+
url="https://aptos.dev",
|
|
100
109
|
output_dir="./docs",
|
|
101
110
|
max_pages=100,
|
|
102
111
|
max_concurrent=20,
|
|
@@ -140,6 +149,12 @@ fetcher.fetch()
|
|
|
140
149
|
- `--archive-format {tar.gz,tar.bz2,tar.xz,zip}` – archive format
|
|
141
150
|
- `--sources-file PATH` – multi-source configuration file
|
|
142
151
|
|
|
152
|
+
### NEW in v1.5.0: Network Options
|
|
153
|
+
- `--proxy URL` – proxy URL (HTTP, HTTPS, SOCKS5)
|
|
154
|
+
- `--user-agent STRING` – custom User-Agent string
|
|
155
|
+
- `--max-retries N` – max retry attempts for failed requests (default: 3)
|
|
156
|
+
- `--retry-base-delay SECONDS` – base delay for exponential backoff (default: 1.0)
|
|
157
|
+
|
|
143
158
|
See `docpull --help` for complete list of options.
|
|
144
159
|
|
|
145
160
|
## Performance
|
|
@@ -158,10 +173,10 @@ Each downloaded page becomes a Markdown file:
|
|
|
158
173
|
|
|
159
174
|
```markdown
|
|
160
175
|
---
|
|
161
|
-
url: https://
|
|
162
|
-
fetched: 2025-11-
|
|
176
|
+
url: https://aptos.dev/build/guides/first-transaction
|
|
177
|
+
fetched: 2025-11-28
|
|
163
178
|
---
|
|
164
|
-
#
|
|
179
|
+
# Your First Transaction
|
|
165
180
|
...
|
|
166
181
|
```
|
|
167
182
|
|
|
@@ -169,17 +184,17 @@ With `--rich-metadata`, the frontmatter includes Open Graph, JSON-LD, and other
|
|
|
169
184
|
|
|
170
185
|
```markdown
|
|
171
186
|
---
|
|
172
|
-
url: https://
|
|
173
|
-
fetched: 2025-11-
|
|
174
|
-
title:
|
|
175
|
-
description: Learn how to
|
|
176
|
-
author:
|
|
177
|
-
keywords: [
|
|
178
|
-
image: https://
|
|
187
|
+
url: https://aptos.dev/build/guides/first-transaction
|
|
188
|
+
fetched: 2025-11-28
|
|
189
|
+
title: Your First Transaction
|
|
190
|
+
description: Learn how to submit your first transaction on Aptos
|
|
191
|
+
author: Aptos Foundation
|
|
192
|
+
keywords: [aptos, blockchain, transaction, guide]
|
|
193
|
+
image: https://aptos.dev/img/docs-preview.png
|
|
179
194
|
type: article
|
|
180
|
-
site_name:
|
|
195
|
+
site_name: Aptos Documentation
|
|
181
196
|
---
|
|
182
|
-
#
|
|
197
|
+
# Your First Transaction
|
|
183
198
|
...
|
|
184
199
|
```
|
|
185
200
|
|
|
@@ -187,22 +202,7 @@ Directory layout mirrors the target site's structure.
|
|
|
187
202
|
|
|
188
203
|
## Configuration File
|
|
189
204
|
|
|
190
|
-
###
|
|
191
|
-
|
|
192
|
-
```yaml
|
|
193
|
-
output_dir: ./docs
|
|
194
|
-
rate_limit: 0.5
|
|
195
|
-
sources:
|
|
196
|
-
- stripe # Built-in profile
|
|
197
|
-
- https://docs.example.com # Or any URL
|
|
198
|
-
```
|
|
199
|
-
|
|
200
|
-
Run with:
|
|
201
|
-
```bash
|
|
202
|
-
docpull --config config.yaml
|
|
203
|
-
```
|
|
204
|
-
|
|
205
|
-
### NEW: Multi-Source Configuration (v1.2.0)
|
|
205
|
+
### Multi-Source Configuration
|
|
206
206
|
|
|
207
207
|
```yaml
|
|
208
208
|
sources:
|
|
@@ -241,42 +241,27 @@ docpull --sources-file config.yaml
|
|
|
241
241
|
|
|
242
242
|
See `examples/` directory for more configuration examples.
|
|
243
243
|
|
|
244
|
-
## Custom Profiles
|
|
245
|
-
|
|
246
|
-
docpull includes a Stripe profile as reference. Create custom profiles for other sites:
|
|
247
|
-
|
|
248
|
-
```python
|
|
249
|
-
from docpull.profiles.base import SiteProfile
|
|
250
|
-
|
|
251
|
-
MY_PROFILE = SiteProfile(
|
|
252
|
-
name="mysite",
|
|
253
|
-
domains={"docs.mysite.com"},
|
|
254
|
-
include_patterns=["/docs/", "/api/"],
|
|
255
|
-
sitemap_url="https://docs.mysite.com/sitemap.xml",
|
|
256
|
-
rate_limit=0.5,
|
|
257
|
-
)
|
|
258
|
-
```
|
|
259
|
-
|
|
260
|
-
**Want to contribute profiles?** Submit a PR with your custom profile! Popular ones may be added to the core or a community profiles repository.
|
|
261
|
-
|
|
262
244
|
## Security
|
|
263
245
|
|
|
264
|
-
- HTTPS-only
|
|
265
|
-
-
|
|
246
|
+
- HTTPS-only (HTTP rejected)
|
|
247
|
+
- **Mandatory robots.txt compliance** (cannot be disabled)
|
|
248
|
+
- Respects Crawl-delay directives
|
|
249
|
+
- Blocks private/internal network IPs
|
|
266
250
|
- 50MB page size limit
|
|
267
|
-
- Timeout controls
|
|
268
|
-
- Validates content-type
|
|
269
|
-
- Playwright sandboxing
|
|
251
|
+
- Timeout controls (30s connection, 5min download)
|
|
252
|
+
- Validates content-type headers
|
|
253
|
+
- Playwright sandboxing for JS rendering
|
|
254
|
+
- Path traversal protection
|
|
270
255
|
|
|
271
256
|
## Troubleshooting
|
|
272
257
|
|
|
273
258
|
- **Installation issues**: Run `docpull --doctor` to diagnose problems
|
|
274
|
-
- **Missing dependencies**:
|
|
275
|
-
- **Site requires JS**: install
|
|
276
|
-
- **Slow or rate limited**:
|
|
277
|
-
- **Large sites**:
|
|
278
|
-
|
|
279
|
-
|
|
259
|
+
- **Missing dependencies**: `pip install docpull[all]` for all optional dependencies
|
|
260
|
+
- **Site requires JS**: `pip install docpull[js]` then `python -m playwright install chromium`
|
|
261
|
+
- **Slow or rate limited**: Lower `--max-concurrent` or raise `--rate-limit`
|
|
262
|
+
- **Large sites**: Set `--max-pages` to limit crawl size
|
|
263
|
+
- **Proxy issues**: Use `--proxy URL` or set `DOCPULL_PROXY` / `HTTPS_PROXY` env var
|
|
264
|
+
- **Transient failures**: Increase `--max-retries` (default: 3)
|
|
280
265
|
|
|
281
266
|
## v1.2.0 Feature Examples
|
|
282
267
|
|
|
@@ -332,6 +317,34 @@ See `examples/` directory for comprehensive configuration examples.
|
|
|
332
317
|
- **After**: 1,250 files, 13 MB (58% reduction), full indexes generated
|
|
333
318
|
- **One command** instead of 4+ separate commands with manual optimization
|
|
334
319
|
|
|
320
|
+
## What's New in v1.5.0
|
|
321
|
+
|
|
322
|
+
This release focuses on network reliability, proxy support, and TOS compliance.
|
|
323
|
+
|
|
324
|
+
**New Features**:
|
|
325
|
+
- **Proxy Support**: HTTP, HTTPS, and SOCKS5 proxies
|
|
326
|
+
- Use `--proxy URL` or set `DOCPULL_PROXY` / `HTTPS_PROXY` environment variables
|
|
327
|
+
- Install SOCKS support: `pip install docpull[proxy]`
|
|
328
|
+
- **Retry with Exponential Backoff**: Automatic retries for transient failures
|
|
329
|
+
- `--max-retries N` (default: 3)
|
|
330
|
+
- `--retry-base-delay SECONDS` (default: 1.0)
|
|
331
|
+
- Handles 429, 500, 502, 503, 504 status codes
|
|
332
|
+
- **Custom User-Agent**: `--user-agent STRING` for custom identification
|
|
333
|
+
- **Better Encoding Detection**: Intelligent charset detection using charset-normalizer
|
|
334
|
+
- **Crawl-delay Compliance**: Automatically respects robots.txt Crawl-delay directives
|
|
335
|
+
|
|
336
|
+
**Security Enhancement**:
|
|
337
|
+
- **Mandatory robots.txt Compliance**: robots.txt is now always respected (cannot be disabled)
|
|
338
|
+
- Ensures TOS-friendly scraping behavior
|
|
339
|
+
- Automatically adjusts rate limiting based on Crawl-delay
|
|
340
|
+
|
|
341
|
+
**Codebase Simplification**:
|
|
342
|
+
- Removed built-in profiles (Stripe, etc.) - use URLs directly
|
|
343
|
+
- Consolidated utility modules
|
|
344
|
+
- Moved CONTRIBUTING.md, SECURITY.md to `.github/` directory
|
|
345
|
+
|
|
346
|
+
**Backward Compatible**: All existing workflows continue to work unchanged.
|
|
347
|
+
|
|
335
348
|
## What's New in v1.3.0
|
|
336
349
|
|
|
337
350
|
This release adds rich structured metadata extraction for better AI/RAG integration.
|
|
@@ -362,7 +375,7 @@ published_time: 2024-01-15T10:00:00Z
|
|
|
362
375
|
|
|
363
376
|
## What's New in v1.2.0
|
|
364
377
|
|
|
365
|
-
This release adds 15 major features across 4 phases.
|
|
378
|
+
This release adds 15 major features across 4 phases.
|
|
366
379
|
|
|
367
380
|
**Highlights**:
|
|
368
381
|
- Multi-source YAML configuration
|
|
@@ -381,7 +394,7 @@ This release adds 15 major features across 4 phases. See [CHANGELOG.md](CHANGELO
|
|
|
381
394
|
- [PyPI](https://pypi.org/project/docpull/)
|
|
382
395
|
- [GitHub](https://github.com/raintree-technology/docpull)
|
|
383
396
|
- [Issues](https://github.com/raintree-technology/docpull/issues)
|
|
384
|
-
- [
|
|
397
|
+
- [Releases](https://github.com/raintree-technology/docpull/releases)
|
|
385
398
|
- [Examples](https://github.com/raintree-technology/docpull/tree/main/examples)
|
|
386
399
|
|
|
387
400
|
## License
|
|
@@ -1,15 +1,13 @@
|
|
|
1
|
-
__version__ = "1.3.0"
|
|
1
|
+
__version__ = "1.5.0"
|
|
2
2
|
|
|
3
3
|
from .fetchers.base import BaseFetcher
|
|
4
4
|
from .fetchers.generic import GenericFetcher
|
|
5
5
|
from .fetchers.generic_async import GenericAsyncFetcher
|
|
6
6
|
from .fetchers.parallel_base import ParallelFetcher
|
|
7
|
-
from .fetchers.stripe import StripeFetcher
|
|
8
7
|
|
|
9
8
|
__all__ = [
|
|
10
9
|
"BaseFetcher",
|
|
11
10
|
"GenericFetcher",
|
|
12
11
|
"GenericAsyncFetcher",
|
|
13
12
|
"ParallelFetcher",
|
|
14
|
-
"StripeFetcher",
|
|
15
13
|
]
|