docpull 1.2.1__tar.gz → 1.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. {docpull-1.2.1 → docpull-1.5.0}/PKG-INFO +137 -54
  2. {docpull-1.2.1 → docpull-1.5.0}/README.md +128 -52
  3. docpull-1.5.0/docpull/__init__.py +13 -0
  4. {docpull-1.2.1 → docpull-1.5.0}/docpull/cli.py +78 -140
  5. {docpull-1.2.1 → docpull-1.5.0}/docpull/config.py +32 -11
  6. docpull-1.5.0/docpull/fetchers/__init__.py +11 -0
  7. {docpull-1.2.1 → docpull-1.5.0}/docpull/fetchers/async_fetcher.py +172 -31
  8. {docpull-1.2.1 → docpull-1.5.0}/docpull/fetchers/base.py +246 -9
  9. {docpull-1.2.1 → docpull-1.5.0}/docpull/fetchers/generic.py +25 -65
  10. {docpull-1.2.1 → docpull-1.5.0}/docpull/fetchers/generic_async.py +105 -63
  11. docpull-1.5.0/docpull/metadata_extractor.py +283 -0
  12. {docpull-1.2.1 → docpull-1.5.0}/docpull/sources_config.py +1 -0
  13. docpull-1.5.0/docpull.egg-info/PKG-INFO +478 -0
  14. docpull-1.5.0/docpull.egg-info/SOURCES.txt +49 -0
  15. docpull-1.5.0/docpull.egg-info/dependency_links.txt +1 -0
  16. docpull-1.5.0/docpull.egg-info/entry_points.txt +2 -0
  17. docpull-1.5.0/docpull.egg-info/requires.txt +38 -0
  18. docpull-1.5.0/docpull.egg-info/top_level.txt +1 -0
  19. {docpull-1.2.1 → docpull-1.5.0}/pyproject.toml +15 -2
  20. {docpull-1.2.1 → docpull-1.5.0}/tests/test_config.py +1 -5
  21. docpull-1.5.0/tests/test_metadata_extractor.py +233 -0
  22. docpull-1.2.1/.editorconfig +0 -30
  23. docpull-1.2.1/.pre-commit-config.yaml +0 -30
  24. docpull-1.2.1/CHANGELOG.md +0 -328
  25. docpull-1.2.1/CONTRIBUTING.md +0 -189
  26. docpull-1.2.1/MANIFEST.in +0 -49
  27. docpull-1.2.1/Makefile +0 -44
  28. docpull-1.2.1/SECURITY.md +0 -206
  29. docpull-1.2.1/TROUBLESHOOTING.md +0 -348
  30. docpull-1.2.1/docpull/__init__.py +0 -29
  31. docpull-1.2.1/docpull/fetchers/__init__.py +0 -23
  32. docpull-1.2.1/docpull/fetchers/bun.py +0 -59
  33. docpull-1.2.1/docpull/fetchers/d3.py +0 -211
  34. docpull-1.2.1/docpull/fetchers/nextjs.py +0 -59
  35. docpull-1.2.1/docpull/fetchers/plaid.py +0 -89
  36. docpull-1.2.1/docpull/fetchers/react.py +0 -59
  37. docpull-1.2.1/docpull/fetchers/stripe.py +0 -49
  38. docpull-1.2.1/docpull/fetchers/tailwind.py +0 -59
  39. docpull-1.2.1/docpull/fetchers/turborepo.py +0 -57
  40. docpull-1.2.1/docpull/profiles/__init__.py +0 -70
  41. docpull-1.2.1/docpull/profiles/base.py +0 -64
  42. docpull-1.2.1/docpull/profiles/bun.py +0 -14
  43. docpull-1.2.1/docpull/profiles/d3.py +0 -17
  44. docpull-1.2.1/docpull/profiles/nextjs.py +0 -15
  45. docpull-1.2.1/docpull/profiles/plaid.py +0 -16
  46. docpull-1.2.1/docpull/profiles/react.py +0 -14
  47. docpull-1.2.1/docpull/profiles/stripe.py +0 -14
  48. docpull-1.2.1/docpull/profiles/tailwind.py +0 -14
  49. docpull-1.2.1/docpull/profiles/turborepo.py +0 -14
  50. docpull-1.2.1/docpull/utils/__init__.py +0 -6
  51. docpull-1.2.1/docpull.egg-info/SOURCES.txt +0 -76
  52. docpull-1.2.1/examples/README.md +0 -280
  53. docpull-1.2.1/examples/deduplication-strategies.yaml +0 -29
  54. docpull-1.2.1/examples/format-conversion.yaml +0 -25
  55. docpull-1.2.1/examples/incremental-updates.yaml +0 -26
  56. docpull-1.2.1/examples/multi-source-optimized.yaml +0 -45
  57. docpull-1.2.1/examples/selective-crawling.yaml +0 -26
  58. docpull-1.2.1/examples/simple-optimization.yaml +0 -14
  59. docpull-1.2.1/requirements.txt +0 -34
  60. {docpull-1.2.1 → docpull-1.5.0}/LICENSE +0 -0
  61. {docpull-1.2.1 → docpull-1.5.0}/docpull/__main__.py +0 -0
  62. {docpull-1.2.1 → docpull-1.5.0}/docpull/archive.py +0 -0
  63. {docpull-1.2.1 → docpull-1.5.0}/docpull/cache.py +0 -0
  64. {docpull-1.2.1 → docpull-1.5.0}/docpull/doctor.py +0 -0
  65. {docpull-1.2.1 → docpull-1.5.0}/docpull/fetchers/parallel_base.py +0 -0
  66. {docpull-1.2.1/docpull/utils → docpull-1.5.0/docpull}/file_utils.py +0 -0
  67. {docpull-1.2.1 → docpull-1.5.0}/docpull/formatters/__init__.py +0 -0
  68. {docpull-1.2.1 → docpull-1.5.0}/docpull/formatters/base.py +0 -0
  69. {docpull-1.2.1 → docpull-1.5.0}/docpull/formatters/json.py +0 -0
  70. {docpull-1.2.1 → docpull-1.5.0}/docpull/formatters/markdown.py +0 -0
  71. {docpull-1.2.1 → docpull-1.5.0}/docpull/formatters/sqlite.py +0 -0
  72. {docpull-1.2.1 → docpull-1.5.0}/docpull/formatters/toon.py +0 -0
  73. {docpull-1.2.1 → docpull-1.5.0}/docpull/hooks.py +0 -0
  74. {docpull-1.2.1 → docpull-1.5.0}/docpull/indexer.py +0 -0
  75. {docpull-1.2.1/docpull/utils → docpull-1.5.0/docpull}/logging_config.py +0 -0
  76. {docpull-1.2.1 → docpull-1.5.0}/docpull/metadata.py +0 -0
  77. {docpull-1.2.1 → docpull-1.5.0}/docpull/naming.py +0 -0
  78. {docpull-1.2.1 → docpull-1.5.0}/docpull/orchestrator.py +0 -0
  79. {docpull-1.2.1 → docpull-1.5.0}/docpull/processors/__init__.py +0 -0
  80. {docpull-1.2.1 → docpull-1.5.0}/docpull/processors/base.py +0 -0
  81. {docpull-1.2.1 → docpull-1.5.0}/docpull/processors/content_filter.py +0 -0
  82. {docpull-1.2.1 → docpull-1.5.0}/docpull/processors/deduplicator.py +0 -0
  83. {docpull-1.2.1 → docpull-1.5.0}/docpull/processors/language_filter.py +0 -0
  84. {docpull-1.2.1 → docpull-1.5.0}/docpull/processors/size_limiter.py +0 -0
  85. {docpull-1.2.1 → docpull-1.5.0}/docpull/py.typed +0 -0
  86. {docpull-1.2.1 → docpull-1.5.0}/docpull/vcs.py +0 -0
  87. {docpull-1.2.1 → docpull-1.5.0}/setup.cfg +0 -0
  88. {docpull-1.2.1 → docpull-1.5.0}/tests/test_orchestrator.py +0 -0
  89. {docpull-1.2.1 → docpull-1.5.0}/tests/test_sources_config.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docpull
3
- Version: 1.2.1
3
+ Version: 1.5.0
4
4
  Summary: Pull documentation from the web and convert to clean markdown
5
5
  Author-email: Zachary Roth <support@raintree.technology>
6
6
  Maintainer-email: Raintree Technology <support@raintree.technology>
@@ -10,7 +10,7 @@ Project-URL: Documentation, https://github.com/raintree-technology/docpull#readm
10
10
  Project-URL: Repository, https://github.com/raintree-technology/docpull
11
11
  Project-URL: Source Code, https://github.com/raintree-technology/docpull
12
12
  Project-URL: Bug Tracker, https://github.com/raintree-technology/docpull/issues
13
- Project-URL: Changelog, https://github.com/raintree-technology/docpull/blob/main/CHANGELOG.md
13
+ Project-URL: Releases, https://github.com/raintree-technology/docpull/releases
14
14
  Keywords: python,markdown,documentation,web-scraping,developer-tools,claude,ai-training-data
15
15
  Classifier: Development Status :: 5 - Production/Stable
16
16
  Classifier: Intended Audience :: Developers
@@ -43,14 +43,21 @@ Requires-Dist: requests>=2.31.0
43
43
  Requires-Dist: beautifulsoup4>=4.12.0
44
44
  Requires-Dist: html2text>=2020.1.16
45
45
  Requires-Dist: defusedxml>=0.7.1
46
+ Requires-Dist: extruct>=0.15.0
46
47
  Requires-Dist: aiohttp>=3.9.0
47
48
  Requires-Dist: rich>=13.0.0
48
49
  Requires-Dist: pyyaml>=6.0
49
50
  Requires-Dist: gitpython>=3.1.40
50
51
  Provides-Extra: js
51
52
  Requires-Dist: playwright>=1.40.0; extra == "js"
53
+ Provides-Extra: proxy
54
+ Requires-Dist: aiohttp-socks>=0.8.0; extra == "proxy"
55
+ Provides-Extra: normalize
56
+ Requires-Dist: url-normalize>=1.4.0; extra == "normalize"
52
57
  Provides-Extra: all
53
58
  Requires-Dist: playwright>=1.40.0; extra == "all"
59
+ Requires-Dist: aiohttp-socks>=0.8.0; extra == "all"
60
+ Requires-Dist: url-normalize>=1.4.0; extra == "all"
54
61
  Provides-Extra: dev
55
62
  Requires-Dist: pytest>=7.0.0; extra == "dev"
56
63
  Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
@@ -72,7 +79,11 @@ Dynamic: license-file
72
79
  **Pulls documentation from any website and converts it into clean, AI-ready Markdown.**
73
80
  Fast, type-safe, secure, and optimized for building knowledge bases or training datasets.
74
81
 
75
- **NEW in v1.2.0**: 15 major features including language filtering, deduplication, auto-indexing, multi-source configuration, and more. Real-world testing shows **58% size reduction** with automatic optimization.
82
+ **NEW in v1.5.0**: Proxy support, retry with exponential backoff, custom User-Agent, and mandatory robots.txt compliance for TOS-friendly scraping.
83
+
84
+ **v1.3.0**: Rich structured metadata extraction (Open Graph, JSON-LD) for enhanced AI/RAG integration.
85
+
86
+ **v1.2.0**: 15 major features including language filtering, deduplication, auto-indexing, multi-source configuration, and more. Real-world testing shows **58% size reduction** with automatic optimization.
76
87
 
77
88
  [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
78
89
  [![PyPI version](https://badge.fury.io/py/docpull.svg)](https://badge.fury.io/py/docpull)
@@ -95,9 +106,22 @@ Unlike tools like wget or httrack, docpull extracts only the main content, remov
95
106
  - Sitemap + link crawling
96
107
  - Rate limiting, timeouts, content-type checks
97
108
  - Saves docs in structured Markdown with YAML metadata
98
- - Optimized profiles for popular platforms (Stripe, Next.js, React, Plaid, Tailwind, etc.)
99
-
100
- ### NEW in v1.2.0: Advanced Optimization
109
+ - **Mandatory robots.txt compliance** for TOS-friendly scraping
110
+
111
+ ### NEW in v1.5.0: Network & Reliability
112
+ - **Proxy Support**: HTTP, HTTPS, and SOCKS5 proxies via `--proxy` or env vars
113
+ - **Retry with Exponential Backoff**: Configurable retries for transient failures
114
+ - **Custom User-Agent**: Set custom User-Agent strings for requests
115
+ - **Crawl-delay Compliance**: Automatically respects robots.txt Crawl-delay directives
116
+ - **Better Encoding Detection**: Intelligent charset detection for international docs
117
+
118
+ ### v1.3.0: Rich Metadata Extraction
119
+ - **Structured Metadata**: Extract Open Graph, JSON-LD, and microdata during fetch
120
+ - **Enhanced Frontmatter**: Adds author, description, keywords, images, publish dates, and more
121
+ - **AI/RAG Ready**: Richer context for embeddings and retrieval systems
122
+ - **Opt-in Feature**: Enabled with `--rich-metadata` flag
123
+
124
+ ### v1.2.0: Advanced Optimization
101
125
  - **Language Filtering**: Auto-detect and filter by language (skip 352+ translation files)
102
126
  - **Deduplication**: Remove duplicates with SHA-256 hashing (save 10+ MB on duplicate content)
103
127
  - **Auto-Index Generation**: Create navigable INDEX.md with tree/TOC/categories/stats
@@ -124,11 +148,14 @@ docpull --doctor # verify installation
124
148
 
125
149
  # Basic usage
126
150
  docpull https://aptos.dev
127
- docpull stripe # use a built-in profile
151
+ docpull https://docs.anthropic.com
128
152
 
129
153
  # NEW: Simple optimization (v1.2.0)
130
154
  docpull https://code.claude.com/docs --language en --create-index
131
155
 
156
+ # NEW: Rich metadata extraction (v1.3.0)
157
+ docpull https://docs.anthropic.com --rich-metadata --create-index
158
+
132
159
  # NEW: Advanced optimization (v1.2.0)
133
160
  docpull https://aptos.dev \
134
161
  --deduplicate \
@@ -154,7 +181,7 @@ docpull https://site.com --js
154
181
  from docpull import GenericAsyncFetcher
155
182
 
156
183
  fetcher = GenericAsyncFetcher(
157
- url_or_profile="https://aptos.dev",
184
+ url="https://aptos.dev",
158
185
  output_dir="./docs",
159
186
  max_pages=100,
160
187
  max_concurrent=20,
@@ -189,6 +216,7 @@ fetcher.fetch()
189
216
  - `--naming-strategy {full,short,flat,hierarchical}` – file naming strategy
190
217
  - `--create-index` – generate INDEX.md with navigation
191
218
  - `--extract-metadata` – extract metadata to metadata.json
219
+ - `--rich-metadata` – extract rich structured metadata (Open Graph, JSON-LD) during fetch
192
220
  - `--update-only-changed` – only download changed files
193
221
  - `--incremental` – enable incremental mode with resume
194
222
  - `--git-commit` – auto-commit changes
@@ -197,6 +225,12 @@ fetcher.fetch()
197
225
  - `--archive-format {tar.gz,tar.bz2,tar.xz,zip}` – archive format
198
226
  - `--sources-file PATH` – multi-source configuration file
199
227
 
228
+ ### NEW in v1.5.0: Network Options
229
+ - `--proxy URL` – proxy URL (HTTP, HTTPS, SOCKS5)
230
+ - `--user-agent STRING` – custom User-Agent string
231
+ - `--max-retries N` – max retry attempts for failed requests (default: 3)
232
+ - `--retry-base-delay SECONDS` – base delay for exponential backoff (default: 1.0)
233
+
200
234
  See `docpull --help` for the complete list of options.
201
235
 
202
236
  ## Performance
@@ -215,33 +249,36 @@ Each downloaded page becomes a Markdown file:
215
249
 
216
250
  ```markdown
217
251
  ---
218
- url: https://stripe.com/docs/payments
219
- fetched: 2025-11-13
252
+ url: https://aptos.dev/build/guides/first-transaction
253
+ fetched: 2025-11-28
220
254
  ---
221
- # Payment Intents
255
+ # Your First Transaction
222
256
  ...
223
257
  ```
224
258
 
225
- Directory layout mirrors the target site's structure.
226
-
227
- ## Configuration File
228
-
229
- ### Simple Configuration (v1.0+)
259
+ With `--rich-metadata`, the frontmatter includes Open Graph, JSON-LD, and other structured metadata:
230
260
 
231
- ```yaml
232
- output_dir: ./docs
233
- rate_limit: 0.5
234
- sources:
235
- - stripe
236
- - nextjs
261
+ ```markdown
262
+ ---
263
+ url: https://aptos.dev/build/guides/first-transaction
264
+ fetched: 2025-11-28
265
+ title: Your First Transaction
266
+ description: Learn how to submit your first transaction on Aptos
267
+ author: Aptos Foundation
268
+ keywords: [aptos, blockchain, transaction, guide]
269
+ image: https://aptos.dev/img/docs-preview.png
270
+ type: article
271
+ site_name: Aptos Documentation
272
+ ---
273
+ # Your First Transaction
274
+ ...
237
275
  ```
238
276
 
239
- Run with:
240
- ```bash
241
- docpull --config config.yaml
242
- ```
277
+ Directory layout mirrors the target site's structure.
278
+
279
+ ## Configuration File
243
280
 
244
- ### NEW: Multi-Source Configuration (v1.2.0)
281
+ ### Multi-Source Configuration
245
282
 
246
283
  ```yaml
247
284
  sources:
@@ -250,6 +287,7 @@ sources:
250
287
  language: en
251
288
  max_file_size: 200kb
252
289
  create_index: true
290
+ rich_metadata: true # Extract Open Graph, JSON-LD metadata
253
291
 
254
292
  claude-code:
255
293
  url: https://code.claude.com/docs
@@ -279,38 +317,27 @@ docpull --sources-file config.yaml
279
317
 
280
318
  See `examples/` directory for more configuration examples.
281
319
 
282
- ## Custom Profiles
283
-
284
- Easily define profiles for frequently scraped sites.
285
-
286
- ```python
287
- from docpull.profiles.base import SiteProfile
288
-
289
- MY_PROFILE = SiteProfile(
290
- name="mysite",
291
- domains={"docs.mysite.com"},
292
- include_patterns=["/docs/", "/api/"],
293
- )
294
- ```
295
-
296
320
  ## Security
297
321
 
298
- - HTTPS-only
299
- - Blocks private network IPs
322
+ - HTTPS-only (HTTP rejected)
323
+ - **Mandatory robots.txt compliance** (cannot be disabled)
324
+ - Respects Crawl-delay directives
325
+ - Blocks private/internal network IPs
300
326
  - 50MB page size limit
301
- - Timeout controls
302
- - Validates content-type
303
- - Playwright sandboxing
327
+ - Timeout controls (30s connection, 5min download)
328
+ - Validates content-type headers
329
+ - Playwright sandboxing for JS rendering
330
+ - Path traversal protection
304
331
 
305
332
  ## Troubleshooting
306
333
 
307
334
  - **Installation issues**: Run `docpull --doctor` to diagnose problems
308
- - **Missing dependencies**: See [TROUBLESHOOTING.md](TROUBLESHOOTING.md) for common fixes
309
- - **Site requires JS**: install Playwright + `--js`
310
- - **Slow or rate limited**: lower concurrency or raise `--rate-limit`
311
- - **Large sites**: set `--max-pages`
312
-
313
- For detailed troubleshooting, see [TROUBLESHOOTING.md](TROUBLESHOOTING.md).
335
+ - **Missing dependencies**: `pip install docpull[all]` for all optional dependencies
336
+ - **Site requires JS**: `pip install docpull[js]` then `python -m playwright install chromium`
337
+ - **Slow or rate limited**: Lower `--max-concurrent` or raise `--rate-limit`
338
+ - **Large sites**: Set `--max-pages` to limit crawl size
339
+ - **Proxy issues**: Use `--proxy URL` or set `DOCPULL_PROXY` / `HTTPS_PROXY` env var
340
+ - **Transient failures**: Increase `--max-retries` (default: 3)
314
341
 
315
342
  ## v1.2.0 Feature Examples
316
343
 
@@ -366,9 +393,65 @@ See `examples/` directory for comprehensive configuration examples.
366
393
  - **After**: 1,250 files, 13 MB (58% reduction), full indexes generated
367
394
  - **One command** instead of 4+ separate commands with manual optimization
368
395
 
396
+ ## What's New in v1.5.0
397
+
398
+ This release focuses on network reliability, proxy support, and TOS compliance.
399
+
400
+ **New Features**:
401
+ - **Proxy Support**: HTTP, HTTPS, and SOCKS5 proxies
402
+ - Use `--proxy URL` or set `DOCPULL_PROXY` / `HTTPS_PROXY` environment variables
403
+ - Install SOCKS support: `pip install docpull[proxy]`
404
+ - **Retry with Exponential Backoff**: Automatic retries for transient failures
405
+ - `--max-retries N` (default: 3)
406
+ - `--retry-base-delay SECONDS` (default: 1.0)
407
+ - Handles 429, 500, 502, 503, 504 status codes
408
+ - **Custom User-Agent**: `--user-agent STRING` for custom identification
409
+ - **Better Encoding Detection**: Intelligent charset detection using charset-normalizer
410
+ - **Crawl-delay Compliance**: Automatically respects robots.txt Crawl-delay directives
411
+
412
+ **Security Enhancement**:
413
+ - **Mandatory robots.txt Compliance**: robots.txt is now always respected (cannot be disabled)
414
+ - Ensures TOS-friendly scraping behavior
415
+ - Automatically adjusts rate limiting based on Crawl-delay
416
+
417
+ **Codebase Simplification**:
418
+ - Removed built-in profiles (Stripe, etc.) - use URLs directly
419
+ - Consolidated utility modules
420
+ - Moved CONTRIBUTING.md, SECURITY.md to `.github/` directory
421
+
422
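+ **Backward Compatible**: Existing URL-based workflows continue to work unchanged. Invocations that relied on a built-in profile name (e.g. `docpull stripe`) must now pass the documentation URL instead.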
423
+
424
+ ## What's New in v1.3.0
425
+
426
+ This release adds rich structured metadata extraction for better AI/RAG integration.
427
+
428
+ **New Feature**:
429
+ - **Rich Metadata Extraction**: Extract Open Graph, JSON-LD, microdata, and other structured metadata during fetch
430
+ - Adds author, description, keywords, images, publish dates, and more to frontmatter
431
+ - Enhances AI/RAG systems with richer context
432
+ - Enabled with `--rich-metadata` flag or `rich_metadata: true` in config
433
+ - Powered by the extruct library
434
+
435
+ **Example enhanced frontmatter**:
436
+ ```yaml
437
+ ---
438
+ url: https://docs.example.com/guide
439
+ fetched: 2025-11-20
440
+ title: Getting Started Guide
441
+ description: Learn the basics of our platform
442
+ author: John Doe
443
+ keywords: [tutorial, guide, api]
444
+ image: https://docs.example.com/og-image.png
445
+ type: article
446
+ published_time: 2024-01-15T10:00:00Z
447
+ ---
448
+ ```
449
+
450
+ **Backward Compatible**: All existing workflows continue to work unchanged. Rich metadata is opt-in.
451
+
369
452
  ## What's New in v1.2.0
370
453
 
371
- This release adds 15 major features across 4 phases. See [CHANGELOG.md](CHANGELOG.md) for complete release notes.
454
+ This release adds 15 major features across 4 phases.
372
455
 
373
456
  **Highlights**:
374
457
  - Multi-source YAML configuration
@@ -387,7 +470,7 @@ This release adds 15 major features across 4 phases. See [CHANGELOG.md](CHANGELO
387
470
  - [PyPI](https://pypi.org/project/docpull/)
388
471
  - [GitHub](https://github.com/raintree-technology/docpull)
389
472
  - [Issues](https://github.com/raintree-technology/docpull/issues)
390
- - [Changelog](https://github.com/raintree-technology/docpull/blob/main/CHANGELOG.md)
473
+ - [Releases](https://github.com/raintree-technology/docpull/releases)
391
474
  - [Examples](https://github.com/raintree-technology/docpull/tree/main/examples)
392
475
 
393
476
  ## License
@@ -3,7 +3,11 @@
3
3
  **Pulls documentation from any website and converts it into clean, AI-ready Markdown.**
4
4
  Fast, type-safe, secure, and optimized for building knowledge bases or training datasets.
5
5
 
6
- **NEW in v1.2.0**: 15 major features including language filtering, deduplication, auto-indexing, multi-source configuration, and more. Real-world testing shows **58% size reduction** with automatic optimization.
6
+ **NEW in v1.5.0**: Proxy support, retry with exponential backoff, custom User-Agent, and mandatory robots.txt compliance for TOS-friendly scraping.
7
+
8
+ **v1.3.0**: Rich structured metadata extraction (Open Graph, JSON-LD) for enhanced AI/RAG integration.
9
+
10
+ **v1.2.0**: 15 major features including language filtering, deduplication, auto-indexing, multi-source configuration, and more. Real-world testing shows **58% size reduction** with automatic optimization.
7
11
 
8
12
  [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
9
13
  [![PyPI version](https://badge.fury.io/py/docpull.svg)](https://badge.fury.io/py/docpull)
@@ -26,9 +30,22 @@ Unlike tools like wget or httrack, docpull extracts only the main content, remov
26
30
  - Sitemap + link crawling
27
31
  - Rate limiting, timeouts, content-type checks
28
32
  - Saves docs in structured Markdown with YAML metadata
29
- - Optimized profiles for popular platforms (Stripe, Next.js, React, Plaid, Tailwind, etc.)
30
-
31
- ### NEW in v1.2.0: Advanced Optimization
33
+ - **Mandatory robots.txt compliance** for TOS-friendly scraping
34
+
35
+ ### NEW in v1.5.0: Network & Reliability
36
+ - **Proxy Support**: HTTP, HTTPS, and SOCKS5 proxies via `--proxy` or env vars
37
+ - **Retry with Exponential Backoff**: Configurable retries for transient failures
38
+ - **Custom User-Agent**: Set custom User-Agent strings for requests
39
+ - **Crawl-delay Compliance**: Automatically respects robots.txt Crawl-delay directives
40
+ - **Better Encoding Detection**: Intelligent charset detection for international docs
41
+
42
+ ### v1.3.0: Rich Metadata Extraction
43
+ - **Structured Metadata**: Extract Open Graph, JSON-LD, and microdata during fetch
44
+ - **Enhanced Frontmatter**: Adds author, description, keywords, images, publish dates, and more
45
+ - **AI/RAG Ready**: Richer context for embeddings and retrieval systems
46
+ - **Opt-in Feature**: Enabled with `--rich-metadata` flag
47
+
48
+ ### v1.2.0: Advanced Optimization
32
49
  - **Language Filtering**: Auto-detect and filter by language (skip 352+ translation files)
33
50
  - **Deduplication**: Remove duplicates with SHA-256 hashing (save 10+ MB on duplicate content)
34
51
  - **Auto-Index Generation**: Create navigable INDEX.md with tree/TOC/categories/stats
@@ -55,11 +72,14 @@ docpull --doctor # verify installation
55
72
 
56
73
  # Basic usage
57
74
  docpull https://aptos.dev
58
- docpull stripe # use a built-in profile
75
+ docpull https://docs.anthropic.com
59
76
 
60
77
  # NEW: Simple optimization (v1.2.0)
61
78
  docpull https://code.claude.com/docs --language en --create-index
62
79
 
80
+ # NEW: Rich metadata extraction (v1.3.0)
81
+ docpull https://docs.anthropic.com --rich-metadata --create-index
82
+
63
83
  # NEW: Advanced optimization (v1.2.0)
64
84
  docpull https://aptos.dev \
65
85
  --deduplicate \
@@ -85,7 +105,7 @@ docpull https://site.com --js
85
105
  from docpull import GenericAsyncFetcher
86
106
 
87
107
  fetcher = GenericAsyncFetcher(
88
- url_or_profile="https://aptos.dev",
108
+ url="https://aptos.dev",
89
109
  output_dir="./docs",
90
110
  max_pages=100,
91
111
  max_concurrent=20,
@@ -120,6 +140,7 @@ fetcher.fetch()
120
140
  - `--naming-strategy {full,short,flat,hierarchical}` – file naming strategy
121
141
  - `--create-index` – generate INDEX.md with navigation
122
142
  - `--extract-metadata` – extract metadata to metadata.json
143
+ - `--rich-metadata` – extract rich structured metadata (Open Graph, JSON-LD) during fetch
123
144
  - `--update-only-changed` – only download changed files
124
145
  - `--incremental` – enable incremental mode with resume
125
146
  - `--git-commit` – auto-commit changes
@@ -128,6 +149,12 @@ fetcher.fetch()
128
149
  - `--archive-format {tar.gz,tar.bz2,tar.xz,zip}` – archive format
129
150
  - `--sources-file PATH` – multi-source configuration file
130
151
 
152
+ ### NEW in v1.5.0: Network Options
153
+ - `--proxy URL` – proxy URL (HTTP, HTTPS, SOCKS5)
154
+ - `--user-agent STRING` – custom User-Agent string
155
+ - `--max-retries N` – max retry attempts for failed requests (default: 3)
156
+ - `--retry-base-delay SECONDS` – base delay for exponential backoff (default: 1.0)
157
+
131
158
  See `docpull --help` for the complete list of options.
132
159
 
133
160
  ## Performance
@@ -146,33 +173,36 @@ Each downloaded page becomes a Markdown file:
146
173
 
147
174
  ```markdown
148
175
  ---
149
- url: https://stripe.com/docs/payments
150
- fetched: 2025-11-13
176
+ url: https://aptos.dev/build/guides/first-transaction
177
+ fetched: 2025-11-28
151
178
  ---
152
- # Payment Intents
179
+ # Your First Transaction
153
180
  ...
154
181
  ```
155
182
 
156
- Directory layout mirrors the target site's structure.
157
-
158
- ## Configuration File
159
-
160
- ### Simple Configuration (v1.0+)
183
+ With `--rich-metadata`, the frontmatter includes Open Graph, JSON-LD, and other structured metadata:
161
184
 
162
- ```yaml
163
- output_dir: ./docs
164
- rate_limit: 0.5
165
- sources:
166
- - stripe
167
- - nextjs
185
+ ```markdown
186
+ ---
187
+ url: https://aptos.dev/build/guides/first-transaction
188
+ fetched: 2025-11-28
189
+ title: Your First Transaction
190
+ description: Learn how to submit your first transaction on Aptos
191
+ author: Aptos Foundation
192
+ keywords: [aptos, blockchain, transaction, guide]
193
+ image: https://aptos.dev/img/docs-preview.png
194
+ type: article
195
+ site_name: Aptos Documentation
196
+ ---
197
+ # Your First Transaction
198
+ ...
168
199
  ```
169
200
 
170
- Run with:
171
- ```bash
172
- docpull --config config.yaml
173
- ```
201
+ Directory layout mirrors the target site's structure.
202
+
203
+ ## Configuration File
174
204
 
175
- ### NEW: Multi-Source Configuration (v1.2.0)
205
+ ### Multi-Source Configuration
176
206
 
177
207
  ```yaml
178
208
  sources:
@@ -181,6 +211,7 @@ sources:
181
211
  language: en
182
212
  max_file_size: 200kb
183
213
  create_index: true
214
+ rich_metadata: true # Extract Open Graph, JSON-LD metadata
184
215
 
185
216
  claude-code:
186
217
  url: https://code.claude.com/docs
@@ -210,38 +241,27 @@ docpull --sources-file config.yaml
210
241
 
211
242
  See `examples/` directory for more configuration examples.
212
243
 
213
- ## Custom Profiles
214
-
215
- Easily define profiles for frequently scraped sites.
216
-
217
- ```python
218
- from docpull.profiles.base import SiteProfile
219
-
220
- MY_PROFILE = SiteProfile(
221
- name="mysite",
222
- domains={"docs.mysite.com"},
223
- include_patterns=["/docs/", "/api/"],
224
- )
225
- ```
226
-
227
244
  ## Security
228
245
 
229
- - HTTPS-only
230
- - Blocks private network IPs
246
+ - HTTPS-only (HTTP rejected)
247
+ - **Mandatory robots.txt compliance** (cannot be disabled)
248
+ - Respects Crawl-delay directives
249
+ - Blocks private/internal network IPs
231
250
  - 50MB page size limit
232
- - Timeout controls
233
- - Validates content-type
234
- - Playwright sandboxing
251
+ - Timeout controls (30s connection, 5min download)
252
+ - Validates content-type headers
253
+ - Playwright sandboxing for JS rendering
254
+ - Path traversal protection
235
255
 
236
256
  ## Troubleshooting
237
257
 
238
258
  - **Installation issues**: Run `docpull --doctor` to diagnose problems
239
- - **Missing dependencies**: See [TROUBLESHOOTING.md](TROUBLESHOOTING.md) for common fixes
240
- - **Site requires JS**: install Playwright + `--js`
241
- - **Slow or rate limited**: lower concurrency or raise `--rate-limit`
242
- - **Large sites**: set `--max-pages`
243
-
244
- For detailed troubleshooting, see [TROUBLESHOOTING.md](TROUBLESHOOTING.md).
259
+ - **Missing dependencies**: `pip install docpull[all]` for all optional dependencies
260
+ - **Site requires JS**: `pip install docpull[js]` then `python -m playwright install chromium`
261
+ - **Slow or rate limited**: Lower `--max-concurrent` or raise `--rate-limit`
262
+ - **Large sites**: Set `--max-pages` to limit crawl size
263
+ - **Proxy issues**: Use `--proxy URL` or set `DOCPULL_PROXY` / `HTTPS_PROXY` env var
264
+ - **Transient failures**: Increase `--max-retries` (default: 3)
245
265
 
246
266
  ## v1.2.0 Feature Examples
247
267
 
@@ -297,9 +317,65 @@ See `examples/` directory for comprehensive configuration examples.
297
317
  - **After**: 1,250 files, 13 MB (58% reduction), full indexes generated
298
318
  - **One command** instead of 4+ separate commands with manual optimization
299
319
 
320
+ ## What's New in v1.5.0
321
+
322
+ This release focuses on network reliability, proxy support, and TOS compliance.
323
+
324
+ **New Features**:
325
+ - **Proxy Support**: HTTP, HTTPS, and SOCKS5 proxies
326
+ - Use `--proxy URL` or set `DOCPULL_PROXY` / `HTTPS_PROXY` environment variables
327
+ - Install SOCKS support: `pip install docpull[proxy]`
328
+ - **Retry with Exponential Backoff**: Automatic retries for transient failures
329
+ - `--max-retries N` (default: 3)
330
+ - `--retry-base-delay SECONDS` (default: 1.0)
331
+ - Handles 429, 500, 502, 503, 504 status codes
332
+ - **Custom User-Agent**: `--user-agent STRING` for custom identification
333
+ - **Better Encoding Detection**: Intelligent charset detection using charset-normalizer
334
+ - **Crawl-delay Compliance**: Automatically respects robots.txt Crawl-delay directives
335
+
336
+ **Security Enhancement**:
337
+ - **Mandatory robots.txt Compliance**: robots.txt is now always respected (cannot be disabled)
338
+ - Ensures TOS-friendly scraping behavior
339
+ - Automatically adjusts rate limiting based on Crawl-delay
340
+
341
+ **Codebase Simplification**:
342
+ - Removed built-in profiles (Stripe, etc.) - use URLs directly
343
+ - Consolidated utility modules
344
+ - Moved CONTRIBUTING.md, SECURITY.md to `.github/` directory
345
+
346
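+ **Backward Compatible**: Existing URL-based workflows continue to work unchanged. Invocations that relied on a built-in profile name (e.g. `docpull stripe`) must now pass the documentation URL instead.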
347
+
348
+ ## What's New in v1.3.0
349
+
350
+ This release adds rich structured metadata extraction for better AI/RAG integration.
351
+
352
+ **New Feature**:
353
+ - **Rich Metadata Extraction**: Extract Open Graph, JSON-LD, microdata, and other structured metadata during fetch
354
+ - Adds author, description, keywords, images, publish dates, and more to frontmatter
355
+ - Enhances AI/RAG systems with richer context
356
+ - Enabled with `--rich-metadata` flag or `rich_metadata: true` in config
357
+ - Powered by the extruct library
358
+
359
+ **Example enhanced frontmatter**:
360
+ ```yaml
361
+ ---
362
+ url: https://docs.example.com/guide
363
+ fetched: 2025-11-20
364
+ title: Getting Started Guide
365
+ description: Learn the basics of our platform
366
+ author: John Doe
367
+ keywords: [tutorial, guide, api]
368
+ image: https://docs.example.com/og-image.png
369
+ type: article
370
+ published_time: 2024-01-15T10:00:00Z
371
+ ---
372
+ ```
373
+
374
+ **Backward Compatible**: All existing workflows continue to work unchanged. Rich metadata is opt-in.
375
+
300
376
  ## What's New in v1.2.0
301
377
 
302
- This release adds 15 major features across 4 phases. See [CHANGELOG.md](CHANGELOG.md) for complete release notes.
378
+ This release adds 15 major features across 4 phases.
303
379
 
304
380
  **Highlights**:
305
381
  - Multi-source YAML configuration
@@ -318,7 +394,7 @@ This release adds 15 major features across 4 phases. See [CHANGELOG.md](CHANGELO
318
394
  - [PyPI](https://pypi.org/project/docpull/)
319
395
  - [GitHub](https://github.com/raintree-technology/docpull)
320
396
  - [Issues](https://github.com/raintree-technology/docpull/issues)
321
- - [Changelog](https://github.com/raintree-technology/docpull/blob/main/CHANGELOG.md)
397
+ - [Releases](https://github.com/raintree-technology/docpull/releases)
322
398
  - [Examples](https://github.com/raintree-technology/docpull/tree/main/examples)
323
399
 
324
400
  ## License
@@ -0,0 +1,13 @@
1
+ __version__ = "1.5.0"
2
+
3
+ from .fetchers.base import BaseFetcher
4
+ from .fetchers.generic import GenericFetcher
5
+ from .fetchers.generic_async import GenericAsyncFetcher
6
+ from .fetchers.parallel_base import ParallelFetcher
7
+
8
+ __all__ = [
9
+ "BaseFetcher",
10
+ "GenericFetcher",
11
+ "GenericAsyncFetcher",
12
+ "ParallelFetcher",
13
+ ]