docpull 1.2.1__tar.gz → 1.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docpull-1.2.1 → docpull-1.3.0}/CHANGELOG.md +75 -0
- {docpull-1.2.1 → docpull-1.3.0}/PKG-INFO +71 -7
- {docpull-1.2.1 → docpull-1.3.0}/README.md +69 -6
- docpull-1.3.0/docpull/__init__.py +15 -0
- {docpull-1.2.1 → docpull-1.3.0}/docpull/cli.py +8 -1
- {docpull-1.2.1 → docpull-1.3.0}/docpull/config.py +12 -7
- docpull-1.3.0/docpull/fetchers/__init__.py +9 -0
- {docpull-1.2.1 → docpull-1.3.0}/docpull/fetchers/base.py +58 -5
- {docpull-1.2.1 → docpull-1.3.0}/docpull/fetchers/generic_async.py +9 -1
- docpull-1.3.0/docpull/metadata_extractor.py +283 -0
- {docpull-1.2.1 → docpull-1.3.0}/docpull/profiles/__init__.py +5 -22
- {docpull-1.2.1 → docpull-1.3.0}/docpull/sources_config.py +1 -0
- {docpull-1.2.1 → docpull-1.3.0}/docpull.egg-info/SOURCES.txt +2 -14
- {docpull-1.2.1 → docpull-1.3.0}/pyproject.toml +6 -1
- {docpull-1.2.1 → docpull-1.3.0}/tests/test_config.py +3 -3
- docpull-1.3.0/tests/test_metadata_extractor.py +233 -0
- docpull-1.2.1/docpull/__init__.py +0 -29
- docpull-1.2.1/docpull/fetchers/__init__.py +0 -23
- docpull-1.2.1/docpull/fetchers/bun.py +0 -59
- docpull-1.2.1/docpull/fetchers/d3.py +0 -211
- docpull-1.2.1/docpull/fetchers/nextjs.py +0 -59
- docpull-1.2.1/docpull/fetchers/plaid.py +0 -89
- docpull-1.2.1/docpull/fetchers/react.py +0 -59
- docpull-1.2.1/docpull/fetchers/tailwind.py +0 -59
- docpull-1.2.1/docpull/fetchers/turborepo.py +0 -57
- docpull-1.2.1/docpull/profiles/bun.py +0 -14
- docpull-1.2.1/docpull/profiles/d3.py +0 -17
- docpull-1.2.1/docpull/profiles/nextjs.py +0 -15
- docpull-1.2.1/docpull/profiles/plaid.py +0 -16
- docpull-1.2.1/docpull/profiles/react.py +0 -14
- docpull-1.2.1/docpull/profiles/tailwind.py +0 -14
- docpull-1.2.1/docpull/profiles/turborepo.py +0 -14
- {docpull-1.2.1 → docpull-1.3.0}/.editorconfig +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/.pre-commit-config.yaml +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/CONTRIBUTING.md +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/LICENSE +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/MANIFEST.in +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/Makefile +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/SECURITY.md +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/TROUBLESHOOTING.md +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/docpull/__main__.py +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/docpull/archive.py +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/docpull/cache.py +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/docpull/doctor.py +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/docpull/fetchers/async_fetcher.py +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/docpull/fetchers/generic.py +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/docpull/fetchers/parallel_base.py +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/docpull/fetchers/stripe.py +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/docpull/formatters/__init__.py +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/docpull/formatters/base.py +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/docpull/formatters/json.py +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/docpull/formatters/markdown.py +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/docpull/formatters/sqlite.py +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/docpull/formatters/toon.py +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/docpull/hooks.py +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/docpull/indexer.py +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/docpull/metadata.py +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/docpull/naming.py +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/docpull/orchestrator.py +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/docpull/processors/__init__.py +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/docpull/processors/base.py +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/docpull/processors/content_filter.py +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/docpull/processors/deduplicator.py +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/docpull/processors/language_filter.py +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/docpull/processors/size_limiter.py +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/docpull/profiles/base.py +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/docpull/profiles/stripe.py +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/docpull/py.typed +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/docpull/utils/__init__.py +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/docpull/utils/file_utils.py +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/docpull/utils/logging_config.py +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/docpull/vcs.py +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/examples/README.md +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/examples/deduplication-strategies.yaml +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/examples/format-conversion.yaml +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/examples/incremental-updates.yaml +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/examples/multi-source-optimized.yaml +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/examples/selective-crawling.yaml +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/examples/simple-optimization.yaml +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/requirements.txt +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/setup.cfg +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/tests/test_orchestrator.py +0 -0
- {docpull-1.2.1 → docpull-1.3.0}/tests/test_sources_config.py +0 -0
|
@@ -5,6 +5,81 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [1.3.0] - 2025-11-20
|
|
9
|
+
|
|
10
|
+
### Added
|
|
11
|
+
|
|
12
|
+
**Rich Metadata Extraction**
|
|
13
|
+
- Extract structured metadata (Open Graph, JSON-LD, microdata) during fetch
|
|
14
|
+
- New `--rich-metadata` CLI flag to enable rich metadata extraction
|
|
15
|
+
- Enhanced frontmatter with author, description, keywords, images, publish dates, tags, and more
|
|
16
|
+
- Better context for AI/RAG systems with richer document metadata
|
|
17
|
+
- Powered by `extruct` library
|
|
18
|
+
- Opt-in feature, backward compatible with existing workflows
|
|
19
|
+
|
|
20
|
+
### Changed
|
|
21
|
+
|
|
22
|
+
**Simplified Profile System**
|
|
23
|
+
- Removed 7 built-in profiles (Next.js, React, Plaid, Tailwind, Bun, D3, Turborepo)
|
|
24
|
+
- Kept Stripe profile as reference implementation
|
|
25
|
+
- Generic fetcher works excellently for all documentation sites
|
|
26
|
+
- Users can create custom profiles or use URLs directly
|
|
27
|
+
- Reduced maintenance burden and codebase complexity
|
|
28
|
+
|
|
29
|
+
### Technical Details
|
|
30
|
+
|
|
31
|
+
**New Dependencies:**
|
|
32
|
+
- Added `extruct>=0.15.0` for structured metadata extraction
|
|
33
|
+
|
|
34
|
+
**New Files:**
|
|
35
|
+
- `docpull/metadata_extractor.py` - Rich metadata extraction module
|
|
36
|
+
- `tests/test_metadata_extractor.py` - Comprehensive test suite for metadata extraction
|
|
37
|
+
|
|
38
|
+
**Updated Files:**
|
|
39
|
+
- `docpull/fetchers/base.py` - Integrated rich metadata extraction into fetch pipeline
|
|
40
|
+
- `docpull/fetchers/generic_async.py` - Added `use_rich_metadata` parameter
|
|
41
|
+
- `docpull/config.py` - Added `rich_metadata` configuration option
|
|
42
|
+
- `docpull/sources_config.py` - Added `rich_metadata` field to SourceConfig
|
|
43
|
+
- `docpull/cli.py` - Added `--rich-metadata` CLI flag
|
|
44
|
+
- `docpull/profiles/__init__.py` - Simplified to single Stripe profile
|
|
45
|
+
|
|
46
|
+
**Removed Files:**
|
|
47
|
+
- Removed 7 profile files and 7 fetcher implementation files
|
|
48
|
+
|
|
49
|
+
**Version Bump:**
|
|
50
|
+
- Updated version from `1.2.1` to `1.3.0`
|
|
51
|
+
|
|
52
|
+
### Example Usage
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
# Extract rich metadata during fetch
|
|
56
|
+
docpull https://docs.anthropic.com --rich-metadata
|
|
57
|
+
|
|
58
|
+
# Combine with other features
|
|
59
|
+
docpull https://stripe.com/docs --rich-metadata --create-index --language en
|
|
60
|
+
|
|
61
|
+
# Multi-source configuration
|
|
62
|
+
docpull --sources-file config.yaml # with rich_metadata: true per source
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### Example Enhanced Frontmatter
|
|
66
|
+
|
|
67
|
+
```yaml
|
|
68
|
+
---
|
|
69
|
+
url: https://docs.example.com/guide
|
|
70
|
+
fetched: 2025-11-20
|
|
71
|
+
title: Getting Started Guide
|
|
72
|
+
description: Learn the basics of our platform
|
|
73
|
+
author: John Doe
|
|
74
|
+
keywords: [tutorial, guide, api]
|
|
75
|
+
image: https://docs.example.com/og-image.png
|
|
76
|
+
type: article
|
|
77
|
+
site_name: Example Docs
|
|
78
|
+
published_time: 2024-01-15T10:00:00Z
|
|
79
|
+
modified_time: 2024-01-20T15:30:00Z
|
|
80
|
+
---
|
|
81
|
+
```
|
|
82
|
+
|
|
8
83
|
## [1.2.0] - 2025-11-16
|
|
9
84
|
|
|
10
85
|
### Added - 15 Major New Features
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docpull
|
|
3
|
-
Version: 1.2.1
|
|
3
|
+
Version: 1.3.0
|
|
4
4
|
Summary: Pull documentation from the web and convert to clean markdown
|
|
5
5
|
Author-email: Zachary Roth <support@raintree.technology>
|
|
6
6
|
Maintainer-email: Raintree Technology <support@raintree.technology>
|
|
@@ -43,6 +43,7 @@ Requires-Dist: requests>=2.31.0
|
|
|
43
43
|
Requires-Dist: beautifulsoup4>=4.12.0
|
|
44
44
|
Requires-Dist: html2text>=2020.1.16
|
|
45
45
|
Requires-Dist: defusedxml>=0.7.1
|
|
46
|
+
Requires-Dist: extruct>=0.15.0
|
|
46
47
|
Requires-Dist: aiohttp>=3.9.0
|
|
47
48
|
Requires-Dist: rich>=13.0.0
|
|
48
49
|
Requires-Dist: pyyaml>=6.0
|
|
@@ -72,7 +73,9 @@ Dynamic: license-file
|
|
|
72
73
|
**Pull documentation from any website and convert it into clean, AI-ready Markdown.**
|
|
73
74
|
Fast, type-safe, secure, and optimized for building knowledge bases or training datasets.
|
|
74
75
|
|
|
75
|
-
**NEW in v1.
|
|
76
|
+
**NEW in v1.3.0**: Rich structured metadata extraction (Open Graph, JSON-LD) for enhanced AI/RAG integration.
|
|
77
|
+
|
|
78
|
+
**v1.2.0**: 15 major features including language filtering, deduplication, auto-indexing, multi-source configuration, and more. Real-world testing shows **58% size reduction** with automatic optimization.
|
|
76
79
|
|
|
77
80
|
[](https://www.python.org/downloads/)
|
|
78
81
|
[](https://badge.fury.io/py/docpull)
|
|
@@ -95,9 +98,15 @@ Unlike tools like wget or httrack, docpull extracts only the main content, remov
|
|
|
95
98
|
- Sitemap + link crawling
|
|
96
99
|
- Rate limiting, timeouts, content-type checks
|
|
97
100
|
- Saves docs in structured Markdown with YAML metadata
|
|
98
|
-
-
|
|
101
|
+
- Built-in Stripe profile as reference implementation (custom profiles easily added)
|
|
102
|
+
|
|
103
|
+
### NEW in v1.3.0: Rich Metadata Extraction
|
|
104
|
+
- **Structured Metadata**: Extract Open Graph, JSON-LD, and microdata during fetch
|
|
105
|
+
- **Enhanced Frontmatter**: Adds author, description, keywords, images, publish dates, and more
|
|
106
|
+
- **AI/RAG Ready**: Richer context for embeddings and retrieval systems
|
|
107
|
+
- **Opt-in Feature**: Enabled with `--rich-metadata` flag
|
|
99
108
|
|
|
100
|
-
###
|
|
109
|
+
### v1.2.0: Advanced Optimization
|
|
101
110
|
- **Language Filtering**: Auto-detect and filter by language (skip 352+ translation files)
|
|
102
111
|
- **Deduplication**: Remove duplicates with SHA-256 hashing (save 10+ MB on duplicate content)
|
|
103
112
|
- **Auto-Index Generation**: Create navigable INDEX.md with tree/TOC/categories/stats
|
|
@@ -129,6 +138,9 @@ docpull stripe # use a built-in profile
|
|
|
129
138
|
# NEW: Simple optimization (v1.2.0)
|
|
130
139
|
docpull https://code.claude.com/docs --language en --create-index
|
|
131
140
|
|
|
141
|
+
# NEW: Rich metadata extraction (v1.3.0)
|
|
142
|
+
docpull https://docs.anthropic.com --rich-metadata --create-index
|
|
143
|
+
|
|
132
144
|
# NEW: Advanced optimization (v1.2.0)
|
|
133
145
|
docpull https://aptos.dev \
|
|
134
146
|
--deduplicate \
|
|
@@ -189,6 +201,7 @@ fetcher.fetch()
|
|
|
189
201
|
- `--naming-strategy {full,short,flat,hierarchical}` – file naming strategy
|
|
190
202
|
- `--create-index` – generate INDEX.md with navigation
|
|
191
203
|
- `--extract-metadata` – extract metadata to metadata.json
|
|
204
|
+
- `--rich-metadata` – extract rich structured metadata (Open Graph, JSON-LD) during fetch
|
|
192
205
|
- `--update-only-changed` – only download changed files
|
|
193
206
|
- `--incremental` – enable incremental mode with resume
|
|
194
207
|
- `--git-commit` – auto-commit changes
|
|
@@ -222,6 +235,24 @@ fetched: 2025-11-13
|
|
|
222
235
|
...
|
|
223
236
|
```
|
|
224
237
|
|
|
238
|
+
With `--rich-metadata`, the frontmatter includes Open Graph, JSON-LD, and other structured metadata:
|
|
239
|
+
|
|
240
|
+
```markdown
|
|
241
|
+
---
|
|
242
|
+
url: https://stripe.com/docs/payments
|
|
243
|
+
fetched: 2025-11-13
|
|
244
|
+
title: Accept a payment
|
|
245
|
+
description: Learn how to accept payments with the Payment Intents API
|
|
246
|
+
author: Stripe
|
|
247
|
+
keywords: [payments, api, stripe, checkout]
|
|
248
|
+
image: https://stripe.com/img/docs-preview.png
|
|
249
|
+
type: article
|
|
250
|
+
site_name: Stripe Documentation
|
|
251
|
+
---
|
|
252
|
+
# Payment Intents
|
|
253
|
+
...
|
|
254
|
+
```
|
|
255
|
+
|
|
225
256
|
Directory layout mirrors the target site's structure.
|
|
226
257
|
|
|
227
258
|
## Configuration File
|
|
@@ -232,8 +263,8 @@ Directory layout mirrors the target site's structure.
|
|
|
232
263
|
output_dir: ./docs
|
|
233
264
|
rate_limit: 0.5
|
|
234
265
|
sources:
|
|
235
|
-
- stripe
|
|
236
|
-
-
|
|
266
|
+
- stripe # Built-in profile
|
|
267
|
+
- https://docs.example.com # Or any URL
|
|
237
268
|
```
|
|
238
269
|
|
|
239
270
|
Run with:
|
|
@@ -250,6 +281,7 @@ sources:
|
|
|
250
281
|
language: en
|
|
251
282
|
max_file_size: 200kb
|
|
252
283
|
create_index: true
|
|
284
|
+
rich_metadata: true # Extract Open Graph, JSON-LD metadata
|
|
253
285
|
|
|
254
286
|
claude-code:
|
|
255
287
|
url: https://code.claude.com/docs
|
|
@@ -281,7 +313,7 @@ See `examples/` directory for more configuration examples.
|
|
|
281
313
|
|
|
282
314
|
## Custom Profiles
|
|
283
315
|
|
|
284
|
-
|
|
316
|
+
docpull includes a Stripe profile as a reference. Create custom profiles for other sites:
|
|
285
317
|
|
|
286
318
|
```python
|
|
287
319
|
from docpull.profiles.base import SiteProfile
|
|
@@ -290,9 +322,13 @@ MY_PROFILE = SiteProfile(
|
|
|
290
322
|
name="mysite",
|
|
291
323
|
domains={"docs.mysite.com"},
|
|
292
324
|
include_patterns=["/docs/", "/api/"],
|
|
325
|
+
sitemap_url="https://docs.mysite.com/sitemap.xml",
|
|
326
|
+
rate_limit=0.5,
|
|
293
327
|
)
|
|
294
328
|
```
|
|
295
329
|
|
|
330
|
+
**Want to contribute profiles?** Submit a PR with your custom profile! Popular ones may be added to the core or a community profiles repository.
|
|
331
|
+
|
|
296
332
|
## Security
|
|
297
333
|
|
|
298
334
|
- HTTPS-only
|
|
@@ -366,6 +402,34 @@ See `examples/` directory for comprehensive configuration examples.
|
|
|
366
402
|
- **After**: 1,250 files, 13 MB (58% reduction), full indexes generated
|
|
367
403
|
- **One command** instead of 4+ separate commands with manual optimization
|
|
368
404
|
|
|
405
|
+
## What's New in v1.3.0
|
|
406
|
+
|
|
407
|
+
This release adds rich structured metadata extraction for better AI/RAG integration.
|
|
408
|
+
|
|
409
|
+
**New Feature**:
|
|
410
|
+
- **Rich Metadata Extraction**: Extract Open Graph, JSON-LD, microdata, and other structured metadata during fetch
|
|
411
|
+
- Adds author, description, keywords, images, publish dates, and more to frontmatter
|
|
412
|
+
- Enhances AI/RAG systems with richer context
|
|
413
|
+
- Enabled with `--rich-metadata` flag or `rich_metadata: true` in config
|
|
414
|
+
- Powered by the extruct library
|
|
415
|
+
|
|
416
|
+
**Example enhanced frontmatter**:
|
|
417
|
+
```yaml
|
|
418
|
+
---
|
|
419
|
+
url: https://docs.example.com/guide
|
|
420
|
+
fetched: 2025-11-20
|
|
421
|
+
title: Getting Started Guide
|
|
422
|
+
description: Learn the basics of our platform
|
|
423
|
+
author: John Doe
|
|
424
|
+
keywords: [tutorial, guide, api]
|
|
425
|
+
image: https://docs.example.com/og-image.png
|
|
426
|
+
type: article
|
|
427
|
+
published_time: 2024-01-15T10:00:00Z
|
|
428
|
+
---
|
|
429
|
+
```
|
|
430
|
+
|
|
431
|
+
**Backward Compatible**: All existing workflows continue to work unchanged. Rich metadata is opt-in.
|
|
432
|
+
|
|
369
433
|
## What's New in v1.2.0
|
|
370
434
|
|
|
371
435
|
This release adds 15 major features across 4 phases. See [CHANGELOG.md](CHANGELOG.md) for complete release notes.
|
|
@@ -3,7 +3,9 @@
|
|
|
3
3
|
**Pull documentation from any website and convert it into clean, AI-ready Markdown.**
|
|
4
4
|
Fast, type-safe, secure, and optimized for building knowledge bases or training datasets.
|
|
5
5
|
|
|
6
|
-
**NEW in v1.
|
|
6
|
+
**NEW in v1.3.0**: Rich structured metadata extraction (Open Graph, JSON-LD) for enhanced AI/RAG integration.
|
|
7
|
+
|
|
8
|
+
**v1.2.0**: 15 major features including language filtering, deduplication, auto-indexing, multi-source configuration, and more. Real-world testing shows **58% size reduction** with automatic optimization.
|
|
7
9
|
|
|
8
10
|
[](https://www.python.org/downloads/)
|
|
9
11
|
[](https://badge.fury.io/py/docpull)
|
|
@@ -26,9 +28,15 @@ Unlike tools like wget or httrack, docpull extracts only the main content, remov
|
|
|
26
28
|
- Sitemap + link crawling
|
|
27
29
|
- Rate limiting, timeouts, content-type checks
|
|
28
30
|
- Saves docs in structured Markdown with YAML metadata
|
|
29
|
-
-
|
|
31
|
+
- Built-in Stripe profile as reference implementation (custom profiles easily added)
|
|
32
|
+
|
|
33
|
+
### NEW in v1.3.0: Rich Metadata Extraction
|
|
34
|
+
- **Structured Metadata**: Extract Open Graph, JSON-LD, and microdata during fetch
|
|
35
|
+
- **Enhanced Frontmatter**: Adds author, description, keywords, images, publish dates, and more
|
|
36
|
+
- **AI/RAG Ready**: Richer context for embeddings and retrieval systems
|
|
37
|
+
- **Opt-in Feature**: Enabled with `--rich-metadata` flag
|
|
30
38
|
|
|
31
|
-
###
|
|
39
|
+
### v1.2.0: Advanced Optimization
|
|
32
40
|
- **Language Filtering**: Auto-detect and filter by language (skip 352+ translation files)
|
|
33
41
|
- **Deduplication**: Remove duplicates with SHA-256 hashing (save 10+ MB on duplicate content)
|
|
34
42
|
- **Auto-Index Generation**: Create navigable INDEX.md with tree/TOC/categories/stats
|
|
@@ -60,6 +68,9 @@ docpull stripe # use a built-in profile
|
|
|
60
68
|
# NEW: Simple optimization (v1.2.0)
|
|
61
69
|
docpull https://code.claude.com/docs --language en --create-index
|
|
62
70
|
|
|
71
|
+
# NEW: Rich metadata extraction (v1.3.0)
|
|
72
|
+
docpull https://docs.anthropic.com --rich-metadata --create-index
|
|
73
|
+
|
|
63
74
|
# NEW: Advanced optimization (v1.2.0)
|
|
64
75
|
docpull https://aptos.dev \
|
|
65
76
|
--deduplicate \
|
|
@@ -120,6 +131,7 @@ fetcher.fetch()
|
|
|
120
131
|
- `--naming-strategy {full,short,flat,hierarchical}` – file naming strategy
|
|
121
132
|
- `--create-index` – generate INDEX.md with navigation
|
|
122
133
|
- `--extract-metadata` – extract metadata to metadata.json
|
|
134
|
+
- `--rich-metadata` – extract rich structured metadata (Open Graph, JSON-LD) during fetch
|
|
123
135
|
- `--update-only-changed` – only download changed files
|
|
124
136
|
- `--incremental` – enable incremental mode with resume
|
|
125
137
|
- `--git-commit` – auto-commit changes
|
|
@@ -153,6 +165,24 @@ fetched: 2025-11-13
|
|
|
153
165
|
...
|
|
154
166
|
```
|
|
155
167
|
|
|
168
|
+
With `--rich-metadata`, the frontmatter includes Open Graph, JSON-LD, and other structured metadata:
|
|
169
|
+
|
|
170
|
+
```markdown
|
|
171
|
+
---
|
|
172
|
+
url: https://stripe.com/docs/payments
|
|
173
|
+
fetched: 2025-11-13
|
|
174
|
+
title: Accept a payment
|
|
175
|
+
description: Learn how to accept payments with the Payment Intents API
|
|
176
|
+
author: Stripe
|
|
177
|
+
keywords: [payments, api, stripe, checkout]
|
|
178
|
+
image: https://stripe.com/img/docs-preview.png
|
|
179
|
+
type: article
|
|
180
|
+
site_name: Stripe Documentation
|
|
181
|
+
---
|
|
182
|
+
# Payment Intents
|
|
183
|
+
...
|
|
184
|
+
```
|
|
185
|
+
|
|
156
186
|
Directory layout mirrors the target site's structure.
|
|
157
187
|
|
|
158
188
|
## Configuration File
|
|
@@ -163,8 +193,8 @@ Directory layout mirrors the target site's structure.
|
|
|
163
193
|
output_dir: ./docs
|
|
164
194
|
rate_limit: 0.5
|
|
165
195
|
sources:
|
|
166
|
-
- stripe
|
|
167
|
-
-
|
|
196
|
+
- stripe # Built-in profile
|
|
197
|
+
- https://docs.example.com # Or any URL
|
|
168
198
|
```
|
|
169
199
|
|
|
170
200
|
Run with:
|
|
@@ -181,6 +211,7 @@ sources:
|
|
|
181
211
|
language: en
|
|
182
212
|
max_file_size: 200kb
|
|
183
213
|
create_index: true
|
|
214
|
+
rich_metadata: true # Extract Open Graph, JSON-LD metadata
|
|
184
215
|
|
|
185
216
|
claude-code:
|
|
186
217
|
url: https://code.claude.com/docs
|
|
@@ -212,7 +243,7 @@ See `examples/` directory for more configuration examples.
|
|
|
212
243
|
|
|
213
244
|
## Custom Profiles
|
|
214
245
|
|
|
215
|
-
|
|
246
|
+
docpull includes a Stripe profile as a reference. Create custom profiles for other sites:
|
|
216
247
|
|
|
217
248
|
```python
|
|
218
249
|
from docpull.profiles.base import SiteProfile
|
|
@@ -221,9 +252,13 @@ MY_PROFILE = SiteProfile(
|
|
|
221
252
|
name="mysite",
|
|
222
253
|
domains={"docs.mysite.com"},
|
|
223
254
|
include_patterns=["/docs/", "/api/"],
|
|
255
|
+
sitemap_url="https://docs.mysite.com/sitemap.xml",
|
|
256
|
+
rate_limit=0.5,
|
|
224
257
|
)
|
|
225
258
|
```
|
|
226
259
|
|
|
260
|
+
**Want to contribute profiles?** Submit a PR with your custom profile! Popular ones may be added to the core or a community profiles repository.
|
|
261
|
+
|
|
227
262
|
## Security
|
|
228
263
|
|
|
229
264
|
- HTTPS-only
|
|
@@ -297,6 +332,34 @@ See `examples/` directory for comprehensive configuration examples.
|
|
|
297
332
|
- **After**: 1,250 files, 13 MB (58% reduction), full indexes generated
|
|
298
333
|
- **One command** instead of 4+ separate commands with manual optimization
|
|
299
334
|
|
|
335
|
+
## What's New in v1.3.0
|
|
336
|
+
|
|
337
|
+
This release adds rich structured metadata extraction for better AI/RAG integration.
|
|
338
|
+
|
|
339
|
+
**New Feature**:
|
|
340
|
+
- **Rich Metadata Extraction**: Extract Open Graph, JSON-LD, microdata, and other structured metadata during fetch
|
|
341
|
+
- Adds author, description, keywords, images, publish dates, and more to frontmatter
|
|
342
|
+
- Enhances AI/RAG systems with richer context
|
|
343
|
+
- Enabled with `--rich-metadata` flag or `rich_metadata: true` in config
|
|
344
|
+
- Powered by the extruct library
|
|
345
|
+
|
|
346
|
+
**Example enhanced frontmatter**:
|
|
347
|
+
```yaml
|
|
348
|
+
---
|
|
349
|
+
url: https://docs.example.com/guide
|
|
350
|
+
fetched: 2025-11-20
|
|
351
|
+
title: Getting Started Guide
|
|
352
|
+
description: Learn the basics of our platform
|
|
353
|
+
author: John Doe
|
|
354
|
+
keywords: [tutorial, guide, api]
|
|
355
|
+
image: https://docs.example.com/og-image.png
|
|
356
|
+
type: article
|
|
357
|
+
published_time: 2024-01-15T10:00:00Z
|
|
358
|
+
---
|
|
359
|
+
```
|
|
360
|
+
|
|
361
|
+
**Backward Compatible**: All existing workflows continue to work unchanged. Rich metadata is opt-in.
|
|
362
|
+
|
|
300
363
|
## What's New in v1.2.0
|
|
301
364
|
|
|
302
365
|
This release adds 15 major features across 4 phases. See [CHANGELOG.md](CHANGELOG.md) for complete release notes.
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
__version__ = "1.3.0"
|
|
2
|
+
|
|
3
|
+
from .fetchers.base import BaseFetcher
|
|
4
|
+
from .fetchers.generic import GenericFetcher
|
|
5
|
+
from .fetchers.generic_async import GenericAsyncFetcher
|
|
6
|
+
from .fetchers.parallel_base import ParallelFetcher
|
|
7
|
+
from .fetchers.stripe import StripeFetcher
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
"BaseFetcher",
|
|
11
|
+
"GenericFetcher",
|
|
12
|
+
"GenericAsyncFetcher",
|
|
13
|
+
"ParallelFetcher",
|
|
14
|
+
"StripeFetcher",
|
|
15
|
+
]
|
|
@@ -295,13 +295,18 @@ Examples:
|
|
|
295
295
|
)
|
|
296
296
|
|
|
297
297
|
# Index Generation
|
|
298
|
-
index_group = parser.add_argument_group("index generation")
|
|
298
|
+
index_group = parser.add_argument_group("index generation & metadata")
|
|
299
299
|
index_group.add_argument(
|
|
300
300
|
"--create-index", action="store_true", help="Create INDEX.md with file tree and navigation"
|
|
301
301
|
)
|
|
302
302
|
index_group.add_argument(
|
|
303
303
|
"--extract-metadata", action="store_true", help="Extract metadata to metadata.json"
|
|
304
304
|
)
|
|
305
|
+
index_group.add_argument(
|
|
306
|
+
"--rich-metadata",
|
|
307
|
+
action="store_true",
|
|
308
|
+
help="Extract rich structured metadata (Open Graph, JSON-LD) during fetch",
|
|
309
|
+
)
|
|
305
310
|
|
|
306
311
|
# Update Detection
|
|
307
312
|
cache_group = parser.add_argument_group("update detection & caching")
|
|
@@ -635,6 +640,7 @@ def run_generic_fetchers(args: argparse.Namespace) -> int:
|
|
|
635
640
|
max_concurrent=max_concurrent,
|
|
636
641
|
use_js=use_js,
|
|
637
642
|
show_progress=show_progress,
|
|
643
|
+
use_rich_metadata=args.rich_metadata,
|
|
638
644
|
)
|
|
639
645
|
fetcher.fetch() # This calls asyncio.run() internally
|
|
640
646
|
|
|
@@ -741,6 +747,7 @@ def run_multi_source_fetch(args: argparse.Namespace) -> int:
|
|
|
741
747
|
max_concurrent=source_config.max_concurrent or 10,
|
|
742
748
|
use_js=source_config.javascript,
|
|
743
749
|
show_progress=True,
|
|
750
|
+
use_rich_metadata=source_config.rich_metadata or False,
|
|
744
751
|
)
|
|
745
752
|
|
|
746
753
|
# Fetch
|
|
@@ -34,6 +34,7 @@ class FetcherConfig:
|
|
|
34
34
|
naming_strategy: str = "full",
|
|
35
35
|
create_index: bool = False,
|
|
36
36
|
extract_metadata: bool = False,
|
|
37
|
+
rich_metadata: bool = False,
|
|
37
38
|
update_only_changed: bool = False,
|
|
38
39
|
incremental: bool = False,
|
|
39
40
|
cache_dir: str = ".docpull-cache",
|
|
@@ -52,7 +53,7 @@ class FetcherConfig:
|
|
|
52
53
|
skip_existing: Skip existing files
|
|
53
54
|
log_level: Logging level
|
|
54
55
|
log_file: Optional log file path
|
|
55
|
-
sources: List of sources to fetch (e.g., ['stripe', '
|
|
56
|
+
sources: List of sources to fetch (profile names or URLs, e.g., ['stripe', 'https://docs.example.com'])
|
|
56
57
|
dry_run: Dry run mode (don't download files)
|
|
57
58
|
language: Include only this language (e.g., 'en')
|
|
58
59
|
exclude_languages: Exclude these languages
|
|
@@ -67,6 +68,7 @@ class FetcherConfig:
|
|
|
67
68
|
naming_strategy: File naming strategy (full, short, flat, hierarchical)
|
|
68
69
|
create_index: Create INDEX.md with navigation
|
|
69
70
|
extract_metadata: Extract metadata to metadata.json
|
|
71
|
+
rich_metadata: Extract rich structured metadata (Open Graph, JSON-LD) during fetch
|
|
70
72
|
update_only_changed: Only download changed files
|
|
71
73
|
incremental: Enable incremental mode
|
|
72
74
|
cache_dir: Cache directory for update detection
|
|
@@ -81,7 +83,7 @@ class FetcherConfig:
|
|
|
81
83
|
self.skip_existing = skip_existing
|
|
82
84
|
self.log_level = log_level
|
|
83
85
|
self.log_file = log_file
|
|
84
|
-
self.sources = sources or ["
|
|
86
|
+
self.sources = sources or ["stripe"]
|
|
85
87
|
self.dry_run = dry_run
|
|
86
88
|
|
|
87
89
|
# v1.2.0 features
|
|
@@ -98,6 +100,7 @@ class FetcherConfig:
|
|
|
98
100
|
self.naming_strategy = naming_strategy
|
|
99
101
|
self.create_index = create_index
|
|
100
102
|
self.extract_metadata = extract_metadata
|
|
103
|
+
self.rich_metadata = rich_metadata
|
|
101
104
|
self.update_only_changed = update_only_changed
|
|
102
105
|
self.incremental = incremental
|
|
103
106
|
self.cache_dir = Path(cache_dir)
|
|
@@ -131,11 +134,13 @@ class FetcherConfig:
|
|
|
131
134
|
if not isinstance(rate_limit, (int, float)) or rate_limit < 0 or rate_limit > 60:
|
|
132
135
|
raise ValueError("rate_limit must be between 0 and 60")
|
|
133
136
|
|
|
134
|
-
# Validate sources
|
|
135
|
-
valid_sources = {"
|
|
136
|
-
sources = config_dict.get("sources", ["
|
|
137
|
-
|
|
138
|
-
|
|
137
|
+
# Validate sources (built-in profiles or URLs)
|
|
138
|
+
valid_sources = {"stripe"}
|
|
139
|
+
sources = config_dict.get("sources", ["stripe"])
|
|
140
|
+
# Allow URLs or valid profile names
|
|
141
|
+
for source in sources:
|
|
142
|
+
if not (source in valid_sources or source.startswith("http://") or source.startswith("https://")):
|
|
143
|
+
raise ValueError(f"Invalid source: {source}. Must be 'stripe' or a URL")
|
|
139
144
|
|
|
140
145
|
# Validate log_level
|
|
141
146
|
log_level = config_dict.get("log_level", "INFO")
|
|
@@ -59,10 +59,12 @@ class BaseFetcher(ABC):
|
|
|
59
59
|
skip_existing: bool = True,
|
|
60
60
|
logger: Optional[logging.Logger] = None,
|
|
61
61
|
allowed_domains: Optional[set[str]] = None,
|
|
62
|
+
use_rich_metadata: bool = False,
|
|
62
63
|
) -> None:
|
|
63
64
|
self.output_dir = Path(output_dir).resolve()
|
|
64
65
|
self.rate_limit = rate_limit
|
|
65
66
|
self.skip_existing = skip_existing
|
|
67
|
+
self.use_rich_metadata = use_rich_metadata
|
|
66
68
|
self.logger = logger or logging.getLogger(f"{__name__}.{self.__class__.__name__}")
|
|
67
69
|
self.allowed_domains = allowed_domains
|
|
68
70
|
self.h2t = html2text.HTML2Text()
|
|
@@ -98,6 +100,14 @@ class BaseFetcher(ABC):
|
|
|
98
100
|
if user_agent is None:
|
|
99
101
|
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
|
|
100
102
|
self.session.headers.update({"User-Agent": user_agent})
|
|
103
|
+
|
|
104
|
+
# Initialize rich metadata extractor if enabled
|
|
105
|
+
self.rich_metadata_extractor = None
|
|
106
|
+
if self.use_rich_metadata:
|
|
107
|
+
from ..metadata_extractor import RichMetadataExtractor
|
|
108
|
+
|
|
109
|
+
self.rich_metadata_extractor = RichMetadataExtractor()
|
|
110
|
+
|
|
101
111
|
self.stats: FetcherStats = {
|
|
102
112
|
"fetched": 0,
|
|
103
113
|
"skipped": 0,
|
|
@@ -358,6 +368,14 @@ class BaseFetcher(ABC):
|
|
|
358
368
|
|
|
359
369
|
soup = BeautifulSoup(content, "html.parser")
|
|
360
370
|
|
|
371
|
+
# Extract rich metadata if enabled
|
|
372
|
+
rich_meta = None
|
|
373
|
+
if self.use_rich_metadata and self.rich_metadata_extractor:
|
|
374
|
+
try:
|
|
375
|
+
rich_meta = self.rich_metadata_extractor.extract(content.decode("utf-8"), url)
|
|
376
|
+
except Exception as e:
|
|
377
|
+
self.logger.debug(f"Rich metadata extraction failed for {url}: {e}")
|
|
378
|
+
|
|
361
379
|
for element in soup(["script", "style", "nav", "footer", "header"]):
|
|
362
380
|
element.decompose()
|
|
363
381
|
main_content = (
|
|
@@ -369,12 +387,47 @@ class BaseFetcher(ABC):
|
|
|
369
387
|
|
|
370
388
|
if main_content:
|
|
371
389
|
markdown = self.h2t.handle(str(main_content))
|
|
372
|
-
frontmatter = f"""---
|
|
373
|
-
url: {url}
|
|
374
|
-
fetched: {time.strftime('%Y-%m-%d')}
|
|
375
|
-
---
|
|
376
390
|
|
|
377
|
-
|
|
391
|
+
# Build frontmatter with optional rich metadata
|
|
392
|
+
frontmatter_parts = [
|
|
393
|
+
"---",
|
|
394
|
+
f"url: {url}",
|
|
395
|
+
f"fetched: {time.strftime('%Y-%m-%d')}",
|
|
396
|
+
]
|
|
397
|
+
|
|
398
|
+
if rich_meta:
|
|
399
|
+
# Add rich metadata fields if available
|
|
400
|
+
if rich_meta.get("title"):
|
|
401
|
+
frontmatter_parts.append(f"title: {rich_meta['title']}")
|
|
402
|
+
if rich_meta.get("description"):
|
|
403
|
+
# Escape any colons in description
|
|
404
|
+
desc = str(rich_meta["description"]).replace(":", "\\:")
|
|
405
|
+
frontmatter_parts.append(f"description: {desc}")
|
|
406
|
+
if rich_meta.get("author"):
|
|
407
|
+
frontmatter_parts.append(f"author: {rich_meta['author']}")
|
|
408
|
+
if rich_meta.get("keywords"):
|
|
409
|
+
keywords_str = ", ".join(rich_meta["keywords"])
|
|
410
|
+
frontmatter_parts.append(f"keywords: [{keywords_str}]")
|
|
411
|
+
if rich_meta.get("image"):
|
|
412
|
+
frontmatter_parts.append(f"image: {rich_meta['image']}")
|
|
413
|
+
if rich_meta.get("type"):
|
|
414
|
+
frontmatter_parts.append(f"type: {rich_meta['type']}")
|
|
415
|
+
if rich_meta.get("site_name"):
|
|
416
|
+
frontmatter_parts.append(f"site_name: {rich_meta['site_name']}")
|
|
417
|
+
if rich_meta.get("published_time"):
|
|
418
|
+
frontmatter_parts.append(f"published_time: {rich_meta['published_time']}")
|
|
419
|
+
if rich_meta.get("modified_time"):
|
|
420
|
+
frontmatter_parts.append(f"modified_time: {rich_meta['modified_time']}")
|
|
421
|
+
if rich_meta.get("section"):
|
|
422
|
+
frontmatter_parts.append(f"section: {rich_meta['section']}")
|
|
423
|
+
if rich_meta.get("tags"):
|
|
424
|
+
tags_str = ", ".join(rich_meta["tags"])
|
|
425
|
+
frontmatter_parts.append(f"tags: [{tags_str}]")
|
|
426
|
+
|
|
427
|
+
frontmatter_parts.append("---")
|
|
428
|
+
frontmatter_parts.append("") # Empty line after frontmatter
|
|
429
|
+
|
|
430
|
+
frontmatter = "\n".join(frontmatter_parts)
|
|
378
431
|
return frontmatter + markdown.strip()
|
|
379
432
|
else:
|
|
380
433
|
return f"# Error\n\nCould not find main content for {url}"
|
|
@@ -38,6 +38,7 @@ class GenericAsyncFetcher(BaseFetcher):
|
|
|
38
38
|
max_concurrent: int = 10,
|
|
39
39
|
use_js: bool = False,
|
|
40
40
|
show_progress: bool = True,
|
|
41
|
+
use_rich_metadata: bool = False,
|
|
41
42
|
) -> None:
|
|
42
43
|
"""
|
|
43
44
|
Initialize async generic fetcher.
|
|
@@ -54,8 +55,15 @@ class GenericAsyncFetcher(BaseFetcher):
|
|
|
54
55
|
max_concurrent: Maximum concurrent requests
|
|
55
56
|
use_js: Enable JavaScript rendering (requires playwright)
|
|
56
57
|
show_progress: Show progress bars
|
|
58
|
+
use_rich_metadata: Extract rich structured metadata (Open Graph, JSON-LD)
|
|
57
59
|
"""
|
|
58
|
-
super().__init__(
|
|
60
|
+
super().__init__(
|
|
61
|
+
output_dir,
|
|
62
|
+
rate_limit,
|
|
63
|
+
skip_existing=skip_existing,
|
|
64
|
+
logger=logger,
|
|
65
|
+
use_rich_metadata=use_rich_metadata,
|
|
66
|
+
)
|
|
59
67
|
|
|
60
68
|
# Determine if input is a URL or profile name
|
|
61
69
|
if url_or_profile.startswith(("http://", "https://")):
|