docpull 1.2.1__tar.gz → 1.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83) hide show
  1. {docpull-1.2.1 → docpull-1.3.0}/CHANGELOG.md +75 -0
  2. {docpull-1.2.1 → docpull-1.3.0}/PKG-INFO +71 -7
  3. {docpull-1.2.1 → docpull-1.3.0}/README.md +69 -6
  4. docpull-1.3.0/docpull/__init__.py +15 -0
  5. {docpull-1.2.1 → docpull-1.3.0}/docpull/cli.py +8 -1
  6. {docpull-1.2.1 → docpull-1.3.0}/docpull/config.py +12 -7
  7. docpull-1.3.0/docpull/fetchers/__init__.py +9 -0
  8. {docpull-1.2.1 → docpull-1.3.0}/docpull/fetchers/base.py +58 -5
  9. {docpull-1.2.1 → docpull-1.3.0}/docpull/fetchers/generic_async.py +9 -1
  10. docpull-1.3.0/docpull/metadata_extractor.py +283 -0
  11. {docpull-1.2.1 → docpull-1.3.0}/docpull/profiles/__init__.py +5 -22
  12. {docpull-1.2.1 → docpull-1.3.0}/docpull/sources_config.py +1 -0
  13. {docpull-1.2.1 → docpull-1.3.0}/docpull.egg-info/SOURCES.txt +2 -14
  14. {docpull-1.2.1 → docpull-1.3.0}/pyproject.toml +6 -1
  15. {docpull-1.2.1 → docpull-1.3.0}/tests/test_config.py +3 -3
  16. docpull-1.3.0/tests/test_metadata_extractor.py +233 -0
  17. docpull-1.2.1/docpull/__init__.py +0 -29
  18. docpull-1.2.1/docpull/fetchers/__init__.py +0 -23
  19. docpull-1.2.1/docpull/fetchers/bun.py +0 -59
  20. docpull-1.2.1/docpull/fetchers/d3.py +0 -211
  21. docpull-1.2.1/docpull/fetchers/nextjs.py +0 -59
  22. docpull-1.2.1/docpull/fetchers/plaid.py +0 -89
  23. docpull-1.2.1/docpull/fetchers/react.py +0 -59
  24. docpull-1.2.1/docpull/fetchers/tailwind.py +0 -59
  25. docpull-1.2.1/docpull/fetchers/turborepo.py +0 -57
  26. docpull-1.2.1/docpull/profiles/bun.py +0 -14
  27. docpull-1.2.1/docpull/profiles/d3.py +0 -17
  28. docpull-1.2.1/docpull/profiles/nextjs.py +0 -15
  29. docpull-1.2.1/docpull/profiles/plaid.py +0 -16
  30. docpull-1.2.1/docpull/profiles/react.py +0 -14
  31. docpull-1.2.1/docpull/profiles/tailwind.py +0 -14
  32. docpull-1.2.1/docpull/profiles/turborepo.py +0 -14
  33. {docpull-1.2.1 → docpull-1.3.0}/.editorconfig +0 -0
  34. {docpull-1.2.1 → docpull-1.3.0}/.pre-commit-config.yaml +0 -0
  35. {docpull-1.2.1 → docpull-1.3.0}/CONTRIBUTING.md +0 -0
  36. {docpull-1.2.1 → docpull-1.3.0}/LICENSE +0 -0
  37. {docpull-1.2.1 → docpull-1.3.0}/MANIFEST.in +0 -0
  38. {docpull-1.2.1 → docpull-1.3.0}/Makefile +0 -0
  39. {docpull-1.2.1 → docpull-1.3.0}/SECURITY.md +0 -0
  40. {docpull-1.2.1 → docpull-1.3.0}/TROUBLESHOOTING.md +0 -0
  41. {docpull-1.2.1 → docpull-1.3.0}/docpull/__main__.py +0 -0
  42. {docpull-1.2.1 → docpull-1.3.0}/docpull/archive.py +0 -0
  43. {docpull-1.2.1 → docpull-1.3.0}/docpull/cache.py +0 -0
  44. {docpull-1.2.1 → docpull-1.3.0}/docpull/doctor.py +0 -0
  45. {docpull-1.2.1 → docpull-1.3.0}/docpull/fetchers/async_fetcher.py +0 -0
  46. {docpull-1.2.1 → docpull-1.3.0}/docpull/fetchers/generic.py +0 -0
  47. {docpull-1.2.1 → docpull-1.3.0}/docpull/fetchers/parallel_base.py +0 -0
  48. {docpull-1.2.1 → docpull-1.3.0}/docpull/fetchers/stripe.py +0 -0
  49. {docpull-1.2.1 → docpull-1.3.0}/docpull/formatters/__init__.py +0 -0
  50. {docpull-1.2.1 → docpull-1.3.0}/docpull/formatters/base.py +0 -0
  51. {docpull-1.2.1 → docpull-1.3.0}/docpull/formatters/json.py +0 -0
  52. {docpull-1.2.1 → docpull-1.3.0}/docpull/formatters/markdown.py +0 -0
  53. {docpull-1.2.1 → docpull-1.3.0}/docpull/formatters/sqlite.py +0 -0
  54. {docpull-1.2.1 → docpull-1.3.0}/docpull/formatters/toon.py +0 -0
  55. {docpull-1.2.1 → docpull-1.3.0}/docpull/hooks.py +0 -0
  56. {docpull-1.2.1 → docpull-1.3.0}/docpull/indexer.py +0 -0
  57. {docpull-1.2.1 → docpull-1.3.0}/docpull/metadata.py +0 -0
  58. {docpull-1.2.1 → docpull-1.3.0}/docpull/naming.py +0 -0
  59. {docpull-1.2.1 → docpull-1.3.0}/docpull/orchestrator.py +0 -0
  60. {docpull-1.2.1 → docpull-1.3.0}/docpull/processors/__init__.py +0 -0
  61. {docpull-1.2.1 → docpull-1.3.0}/docpull/processors/base.py +0 -0
  62. {docpull-1.2.1 → docpull-1.3.0}/docpull/processors/content_filter.py +0 -0
  63. {docpull-1.2.1 → docpull-1.3.0}/docpull/processors/deduplicator.py +0 -0
  64. {docpull-1.2.1 → docpull-1.3.0}/docpull/processors/language_filter.py +0 -0
  65. {docpull-1.2.1 → docpull-1.3.0}/docpull/processors/size_limiter.py +0 -0
  66. {docpull-1.2.1 → docpull-1.3.0}/docpull/profiles/base.py +0 -0
  67. {docpull-1.2.1 → docpull-1.3.0}/docpull/profiles/stripe.py +0 -0
  68. {docpull-1.2.1 → docpull-1.3.0}/docpull/py.typed +0 -0
  69. {docpull-1.2.1 → docpull-1.3.0}/docpull/utils/__init__.py +0 -0
  70. {docpull-1.2.1 → docpull-1.3.0}/docpull/utils/file_utils.py +0 -0
  71. {docpull-1.2.1 → docpull-1.3.0}/docpull/utils/logging_config.py +0 -0
  72. {docpull-1.2.1 → docpull-1.3.0}/docpull/vcs.py +0 -0
  73. {docpull-1.2.1 → docpull-1.3.0}/examples/README.md +0 -0
  74. {docpull-1.2.1 → docpull-1.3.0}/examples/deduplication-strategies.yaml +0 -0
  75. {docpull-1.2.1 → docpull-1.3.0}/examples/format-conversion.yaml +0 -0
  76. {docpull-1.2.1 → docpull-1.3.0}/examples/incremental-updates.yaml +0 -0
  77. {docpull-1.2.1 → docpull-1.3.0}/examples/multi-source-optimized.yaml +0 -0
  78. {docpull-1.2.1 → docpull-1.3.0}/examples/selective-crawling.yaml +0 -0
  79. {docpull-1.2.1 → docpull-1.3.0}/examples/simple-optimization.yaml +0 -0
  80. {docpull-1.2.1 → docpull-1.3.0}/requirements.txt +0 -0
  81. {docpull-1.2.1 → docpull-1.3.0}/setup.cfg +0 -0
  82. {docpull-1.2.1 → docpull-1.3.0}/tests/test_orchestrator.py +0 -0
  83. {docpull-1.2.1 → docpull-1.3.0}/tests/test_sources_config.py +0 -0
@@ -5,6 +5,81 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [1.3.0] - 2025-11-20
9
+
10
+ ### Added
11
+
12
+ **Rich Metadata Extraction**
13
+ - Extract structured metadata (Open Graph, JSON-LD, microdata) during fetch
14
+ - New `--rich-metadata` CLI flag to enable rich metadata extraction
15
+ - Enhanced frontmatter with author, description, keywords, images, publish dates, tags, and more
16
+ - Better context for AI/RAG systems with richer document metadata
17
+ - Powered by `extruct` library
18
+ - Opt-in feature, backward compatible with existing workflows
19
+
20
+ ### Changed
21
+
22
+ **Simplified Profile System**
23
+ - Removed 7 built-in profiles (Next.js, React, Plaid, Tailwind, Bun, D3, Turborepo)
24
+ - Kept the Stripe profile as a reference implementation
25
+ - Generic fetcher works excellently for all documentation sites
26
+ - Users can create custom profiles or use URLs directly
27
+ - Reduced maintenance burden and codebase complexity
28
+
29
+ ### Technical Details
30
+
31
+ **New Dependencies:**
32
+ - Added `extruct>=0.15.0` for structured metadata extraction
33
+
34
+ **New Files:**
35
+ - `docpull/metadata_extractor.py` - Rich metadata extraction module
36
+ - `tests/test_metadata_extractor.py` - Comprehensive test suite for metadata extraction
37
+
38
+ **Updated Files:**
39
+ - `docpull/fetchers/base.py` - Integrated rich metadata extraction into fetch pipeline
40
+ - `docpull/fetchers/generic_async.py` - Added `use_rich_metadata` parameter
41
+ - `docpull/config.py` - Added `rich_metadata` configuration option
42
+ - `docpull/sources_config.py` - Added `rich_metadata` field to SourceConfig
43
+ - `docpull/cli.py` - Added `--rich-metadata` CLI flag
44
+ - `docpull/profiles/__init__.py` - Simplified to single Stripe profile
45
+
46
+ **Removed Files:**
47
+ - Removed 7 profile files and 7 fetcher implementation files
48
+
49
+ **Version Bump:**
50
+ - Updated version from `1.2.1` to `1.3.0`
51
+
52
+ ### Example Usage
53
+
54
+ ```bash
55
+ # Extract rich metadata during fetch
56
+ docpull https://docs.anthropic.com --rich-metadata
57
+
58
+ # Combine with other features
59
+ docpull https://stripe.com/docs --rich-metadata --create-index --language en
60
+
61
+ # Multi-source configuration
62
+ docpull --sources-file config.yaml # with rich_metadata: true per source
63
+ ```
64
+
65
+ ### Example Enhanced Frontmatter
66
+
67
+ ```yaml
68
+ ---
69
+ url: https://docs.example.com/guide
70
+ fetched: 2025-11-20
71
+ title: Getting Started Guide
72
+ description: Learn the basics of our platform
73
+ author: John Doe
74
+ keywords: [tutorial, guide, api]
75
+ image: https://docs.example.com/og-image.png
76
+ type: article
77
+ site_name: Example Docs
78
+ published_time: 2024-01-15T10:00:00Z
79
+ modified_time: 2024-01-20T15:30:00Z
80
+ ---
81
+ ```
82
+
8
83
  ## [1.2.0] - 2025-11-16
9
84
 
10
85
  ### Added - 15 Major New Features
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docpull
3
- Version: 1.2.1
3
+ Version: 1.3.0
4
4
  Summary: Pull documentation from the web and convert to clean markdown
5
5
  Author-email: Zachary Roth <support@raintree.technology>
6
6
  Maintainer-email: Raintree Technology <support@raintree.technology>
@@ -43,6 +43,7 @@ Requires-Dist: requests>=2.31.0
43
43
  Requires-Dist: beautifulsoup4>=4.12.0
44
44
  Requires-Dist: html2text>=2020.1.16
45
45
  Requires-Dist: defusedxml>=0.7.1
46
+ Requires-Dist: extruct>=0.15.0
46
47
  Requires-Dist: aiohttp>=3.9.0
47
48
  Requires-Dist: rich>=13.0.0
48
49
  Requires-Dist: pyyaml>=6.0
@@ -72,7 +73,9 @@ Dynamic: license-file
72
73
  **Pull documentation from any website and convert it into clean, AI-ready Markdown.**
73
74
  Fast, type-safe, secure, and optimized for building knowledge bases or training datasets.
74
75
 
75
- **NEW in v1.2.0**: 15 major features including language filtering, deduplication, auto-indexing, multi-source configuration, and more. Real-world testing shows **58% size reduction** with automatic optimization.
76
+ **NEW in v1.3.0**: Rich structured metadata extraction (Open Graph, JSON-LD) for enhanced AI/RAG integration.
77
+
78
+ **v1.2.0**: 15 major features including language filtering, deduplication, auto-indexing, multi-source configuration, and more. Real-world testing shows **58% size reduction** with automatic optimization.
76
79
 
77
80
  [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
78
81
  [![PyPI version](https://badge.fury.io/py/docpull.svg)](https://badge.fury.io/py/docpull)
@@ -95,9 +98,15 @@ Unlike tools like wget or httrack, docpull extracts only the main content, remov
95
98
  - Sitemap + link crawling
96
99
  - Rate limiting, timeouts, content-type checks
97
100
  - Saves docs in structured Markdown with YAML metadata
98
- - Optimized profiles for popular platforms (Stripe, Next.js, React, Plaid, Tailwind, etc.)
101
+ - Built-in Stripe profile as a reference implementation (custom profiles are easy to add)
102
+
103
+ ### NEW in v1.3.0: Rich Metadata Extraction
104
+ - **Structured Metadata**: Extract Open Graph, JSON-LD, and microdata during fetch
105
+ - **Enhanced Frontmatter**: Adds author, description, keywords, images, publish dates, and more
106
+ - **AI/RAG Ready**: Richer context for embeddings and retrieval systems
107
+ - **Opt-in Feature**: Enabled with `--rich-metadata` flag
99
108
 
100
- ### NEW in v1.2.0: Advanced Optimization
109
+ ### v1.2.0: Advanced Optimization
101
110
  - **Language Filtering**: Auto-detect and filter by language (skip 352+ translation files)
102
111
  - **Deduplication**: Remove duplicates with SHA-256 hashing (save 10+ MB on duplicate content)
103
112
  - **Auto-Index Generation**: Create navigable INDEX.md with tree/TOC/categories/stats
@@ -129,6 +138,9 @@ docpull stripe # use a built-in profile
129
138
  # NEW: Simple optimization (v1.2.0)
130
139
  docpull https://code.claude.com/docs --language en --create-index
131
140
 
141
+ # NEW: Rich metadata extraction (v1.3.0)
142
+ docpull https://docs.anthropic.com --rich-metadata --create-index
143
+
132
144
  # NEW: Advanced optimization (v1.2.0)
133
145
  docpull https://aptos.dev \
134
146
  --deduplicate \
@@ -189,6 +201,7 @@ fetcher.fetch()
189
201
  - `--naming-strategy {full,short,flat,hierarchical}` – file naming strategy
190
202
  - `--create-index` – generate INDEX.md with navigation
191
203
  - `--extract-metadata` – extract metadata to metadata.json
204
+ - `--rich-metadata` – extract rich structured metadata (Open Graph, JSON-LD) during fetch
192
205
  - `--update-only-changed` – only download changed files
193
206
  - `--incremental` – enable incremental mode with resume
194
207
  - `--git-commit` – auto-commit changes
@@ -222,6 +235,24 @@ fetched: 2025-11-13
222
235
  ...
223
236
  ```
224
237
 
238
+ With `--rich-metadata`, the frontmatter includes Open Graph, JSON-LD, and other structured metadata:
239
+
240
+ ```markdown
241
+ ---
242
+ url: https://stripe.com/docs/payments
243
+ fetched: 2025-11-13
244
+ title: Accept a payment
245
+ description: Learn how to accept payments with the Payment Intents API
246
+ author: Stripe
247
+ keywords: [payments, api, stripe, checkout]
248
+ image: https://stripe.com/img/docs-preview.png
249
+ type: article
250
+ site_name: Stripe Documentation
251
+ ---
252
+ # Payment Intents
253
+ ...
254
+ ```
255
+
225
256
  Directory layout mirrors the target site's structure.
226
257
 
227
258
  ## Configuration File
@@ -232,8 +263,8 @@ Directory layout mirrors the target site's structure.
232
263
  output_dir: ./docs
233
264
  rate_limit: 0.5
234
265
  sources:
235
- - stripe
236
- - nextjs
266
+ - stripe # Built-in profile
267
+ - https://docs.example.com # Or any URL
237
268
  ```
238
269
 
239
270
  Run with:
@@ -250,6 +281,7 @@ sources:
250
281
  language: en
251
282
  max_file_size: 200kb
252
283
  create_index: true
284
+ rich_metadata: true # Extract Open Graph, JSON-LD metadata
253
285
 
254
286
  claude-code:
255
287
  url: https://code.claude.com/docs
@@ -281,7 +313,7 @@ See `examples/` directory for more configuration examples.
281
313
 
282
314
  ## Custom Profiles
283
315
 
284
- Easily define profiles for frequently scraped sites.
316
+ docpull includes a Stripe profile as a reference. Create custom profiles for other sites:
285
317
 
286
318
  ```python
287
319
  from docpull.profiles.base import SiteProfile
@@ -290,9 +322,13 @@ MY_PROFILE = SiteProfile(
290
322
  name="mysite",
291
323
  domains={"docs.mysite.com"},
292
324
  include_patterns=["/docs/", "/api/"],
325
+ sitemap_url="https://docs.mysite.com/sitemap.xml",
326
+ rate_limit=0.5,
293
327
  )
294
328
  ```
295
329
 
330
+ **Want to contribute profiles?** Submit a PR with your custom profile! Popular ones may be added to the core or a community profiles repository.
331
+
296
332
  ## Security
297
333
 
298
334
  - HTTPS-only
@@ -366,6 +402,34 @@ See `examples/` directory for comprehensive configuration examples.
366
402
  - **After**: 1,250 files, 13 MB (58% reduction), full indexes generated
367
403
  - **One command** instead of 4+ separate commands with manual optimization
368
404
 
405
+ ## What's New in v1.3.0
406
+
407
+ This release adds rich structured metadata extraction for better AI/RAG integration.
408
+
409
+ **New Feature**:
410
+ - **Rich Metadata Extraction**: Extract Open Graph, JSON-LD, microdata, and other structured metadata during fetch
411
+ - Adds author, description, keywords, images, publish dates, and more to frontmatter
412
+ - Enhances AI/RAG systems with richer context
413
+ - Enabled with `--rich-metadata` flag or `rich_metadata: true` in config
414
+ - Powered by the extruct library
415
+
416
+ **Example enhanced frontmatter**:
417
+ ```yaml
418
+ ---
419
+ url: https://docs.example.com/guide
420
+ fetched: 2025-11-20
421
+ title: Getting Started Guide
422
+ description: Learn the basics of our platform
423
+ author: John Doe
424
+ keywords: [tutorial, guide, api]
425
+ image: https://docs.example.com/og-image.png
426
+ type: article
427
+ published_time: 2024-01-15T10:00:00Z
428
+ ---
429
+ ```
430
+
431
+ **Backward Compatible**: All existing workflows continue to work unchanged. Rich metadata is opt-in.
432
+
369
433
  ## What's New in v1.2.0
370
434
 
371
435
  This release adds 15 major features across 4 phases. See [CHANGELOG.md](CHANGELOG.md) for complete release notes.
@@ -3,7 +3,9 @@
3
3
  **Pull documentation from any website and convert it into clean, AI-ready Markdown.**
4
4
  Fast, type-safe, secure, and optimized for building knowledge bases or training datasets.
5
5
 
6
- **NEW in v1.2.0**: 15 major features including language filtering, deduplication, auto-indexing, multi-source configuration, and more. Real-world testing shows **58% size reduction** with automatic optimization.
6
+ **NEW in v1.3.0**: Rich structured metadata extraction (Open Graph, JSON-LD) for enhanced AI/RAG integration.
7
+
8
+ **v1.2.0**: 15 major features including language filtering, deduplication, auto-indexing, multi-source configuration, and more. Real-world testing shows **58% size reduction** with automatic optimization.
7
9
 
8
10
  [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
9
11
  [![PyPI version](https://badge.fury.io/py/docpull.svg)](https://badge.fury.io/py/docpull)
@@ -26,9 +28,15 @@ Unlike tools like wget or httrack, docpull extracts only the main content, remov
26
28
  - Sitemap + link crawling
27
29
  - Rate limiting, timeouts, content-type checks
28
30
  - Saves docs in structured Markdown with YAML metadata
29
- - Optimized profiles for popular platforms (Stripe, Next.js, React, Plaid, Tailwind, etc.)
31
+ - Built-in Stripe profile as a reference implementation (custom profiles are easy to add)
32
+
33
+ ### NEW in v1.3.0: Rich Metadata Extraction
34
+ - **Structured Metadata**: Extract Open Graph, JSON-LD, and microdata during fetch
35
+ - **Enhanced Frontmatter**: Adds author, description, keywords, images, publish dates, and more
36
+ - **AI/RAG Ready**: Richer context for embeddings and retrieval systems
37
+ - **Opt-in Feature**: Enabled with `--rich-metadata` flag
30
38
 
31
- ### NEW in v1.2.0: Advanced Optimization
39
+ ### v1.2.0: Advanced Optimization
32
40
  - **Language Filtering**: Auto-detect and filter by language (skip 352+ translation files)
33
41
  - **Deduplication**: Remove duplicates with SHA-256 hashing (save 10+ MB on duplicate content)
34
42
  - **Auto-Index Generation**: Create navigable INDEX.md with tree/TOC/categories/stats
@@ -60,6 +68,9 @@ docpull stripe # use a built-in profile
60
68
  # NEW: Simple optimization (v1.2.0)
61
69
  docpull https://code.claude.com/docs --language en --create-index
62
70
 
71
+ # NEW: Rich metadata extraction (v1.3.0)
72
+ docpull https://docs.anthropic.com --rich-metadata --create-index
73
+
63
74
  # NEW: Advanced optimization (v1.2.0)
64
75
  docpull https://aptos.dev \
65
76
  --deduplicate \
@@ -120,6 +131,7 @@ fetcher.fetch()
120
131
  - `--naming-strategy {full,short,flat,hierarchical}` – file naming strategy
121
132
  - `--create-index` – generate INDEX.md with navigation
122
133
  - `--extract-metadata` – extract metadata to metadata.json
134
+ - `--rich-metadata` – extract rich structured metadata (Open Graph, JSON-LD) during fetch
123
135
  - `--update-only-changed` – only download changed files
124
136
  - `--incremental` – enable incremental mode with resume
125
137
  - `--git-commit` – auto-commit changes
@@ -153,6 +165,24 @@ fetched: 2025-11-13
153
165
  ...
154
166
  ```
155
167
 
168
+ With `--rich-metadata`, the frontmatter includes Open Graph, JSON-LD, and other structured metadata:
169
+
170
+ ```markdown
171
+ ---
172
+ url: https://stripe.com/docs/payments
173
+ fetched: 2025-11-13
174
+ title: Accept a payment
175
+ description: Learn how to accept payments with the Payment Intents API
176
+ author: Stripe
177
+ keywords: [payments, api, stripe, checkout]
178
+ image: https://stripe.com/img/docs-preview.png
179
+ type: article
180
+ site_name: Stripe Documentation
181
+ ---
182
+ # Payment Intents
183
+ ...
184
+ ```
185
+
156
186
  Directory layout mirrors the target site's structure.
157
187
 
158
188
  ## Configuration File
@@ -163,8 +193,8 @@ Directory layout mirrors the target site's structure.
163
193
  output_dir: ./docs
164
194
  rate_limit: 0.5
165
195
  sources:
166
- - stripe
167
- - nextjs
196
+ - stripe # Built-in profile
197
+ - https://docs.example.com # Or any URL
168
198
  ```
169
199
 
170
200
  Run with:
@@ -181,6 +211,7 @@ sources:
181
211
  language: en
182
212
  max_file_size: 200kb
183
213
  create_index: true
214
+ rich_metadata: true # Extract Open Graph, JSON-LD metadata
184
215
 
185
216
  claude-code:
186
217
  url: https://code.claude.com/docs
@@ -212,7 +243,7 @@ See `examples/` directory for more configuration examples.
212
243
 
213
244
  ## Custom Profiles
214
245
 
215
- Easily define profiles for frequently scraped sites.
246
+ docpull includes a Stripe profile as a reference. Create custom profiles for other sites:
216
247
 
217
248
  ```python
218
249
  from docpull.profiles.base import SiteProfile
@@ -221,9 +252,13 @@ MY_PROFILE = SiteProfile(
221
252
  name="mysite",
222
253
  domains={"docs.mysite.com"},
223
254
  include_patterns=["/docs/", "/api/"],
255
+ sitemap_url="https://docs.mysite.com/sitemap.xml",
256
+ rate_limit=0.5,
224
257
  )
225
258
  ```
226
259
 
260
+ **Want to contribute profiles?** Submit a PR with your custom profile! Popular ones may be added to the core or a community profiles repository.
261
+
227
262
  ## Security
228
263
 
229
264
  - HTTPS-only
@@ -297,6 +332,34 @@ See `examples/` directory for comprehensive configuration examples.
297
332
  - **After**: 1,250 files, 13 MB (58% reduction), full indexes generated
298
333
  - **One command** instead of 4+ separate commands with manual optimization
299
334
 
335
+ ## What's New in v1.3.0
336
+
337
+ This release adds rich structured metadata extraction for better AI/RAG integration.
338
+
339
+ **New Feature**:
340
+ - **Rich Metadata Extraction**: Extract Open Graph, JSON-LD, microdata, and other structured metadata during fetch
341
+ - Adds author, description, keywords, images, publish dates, and more to frontmatter
342
+ - Enhances AI/RAG systems with richer context
343
+ - Enabled with `--rich-metadata` flag or `rich_metadata: true` in config
344
+ - Powered by the extruct library
345
+
346
+ **Example enhanced frontmatter**:
347
+ ```yaml
348
+ ---
349
+ url: https://docs.example.com/guide
350
+ fetched: 2025-11-20
351
+ title: Getting Started Guide
352
+ description: Learn the basics of our platform
353
+ author: John Doe
354
+ keywords: [tutorial, guide, api]
355
+ image: https://docs.example.com/og-image.png
356
+ type: article
357
+ published_time: 2024-01-15T10:00:00Z
358
+ ---
359
+ ```
360
+
361
+ **Backward Compatible**: All existing workflows continue to work unchanged. Rich metadata is opt-in.
362
+
300
363
  ## What's New in v1.2.0
301
364
 
302
365
  This release adds 15 major features across 4 phases. See [CHANGELOG.md](CHANGELOG.md) for complete release notes.
@@ -0,0 +1,15 @@
1
+ __version__ = "1.3.0"
2
+
3
+ from .fetchers.base import BaseFetcher
4
+ from .fetchers.generic import GenericFetcher
5
+ from .fetchers.generic_async import GenericAsyncFetcher
6
+ from .fetchers.parallel_base import ParallelFetcher
7
+ from .fetchers.stripe import StripeFetcher
8
+
9
+ __all__ = [
10
+ "BaseFetcher",
11
+ "GenericFetcher",
12
+ "GenericAsyncFetcher",
13
+ "ParallelFetcher",
14
+ "StripeFetcher",
15
+ ]
@@ -295,13 +295,18 @@ Examples:
295
295
  )
296
296
 
297
297
  # Index Generation
298
- index_group = parser.add_argument_group("index generation")
298
+ index_group = parser.add_argument_group("index generation & metadata")
299
299
  index_group.add_argument(
300
300
  "--create-index", action="store_true", help="Create INDEX.md with file tree and navigation"
301
301
  )
302
302
  index_group.add_argument(
303
303
  "--extract-metadata", action="store_true", help="Extract metadata to metadata.json"
304
304
  )
305
+ index_group.add_argument(
306
+ "--rich-metadata",
307
+ action="store_true",
308
+ help="Extract rich structured metadata (Open Graph, JSON-LD) during fetch",
309
+ )
305
310
 
306
311
  # Update Detection
307
312
  cache_group = parser.add_argument_group("update detection & caching")
@@ -635,6 +640,7 @@ def run_generic_fetchers(args: argparse.Namespace) -> int:
635
640
  max_concurrent=max_concurrent,
636
641
  use_js=use_js,
637
642
  show_progress=show_progress,
643
+ use_rich_metadata=args.rich_metadata,
638
644
  )
639
645
  fetcher.fetch() # This calls asyncio.run() internally
640
646
 
@@ -741,6 +747,7 @@ def run_multi_source_fetch(args: argparse.Namespace) -> int:
741
747
  max_concurrent=source_config.max_concurrent or 10,
742
748
  use_js=source_config.javascript,
743
749
  show_progress=True,
750
+ use_rich_metadata=source_config.rich_metadata or False,
744
751
  )
745
752
 
746
753
  # Fetch
@@ -34,6 +34,7 @@ class FetcherConfig:
34
34
  naming_strategy: str = "full",
35
35
  create_index: bool = False,
36
36
  extract_metadata: bool = False,
37
+ rich_metadata: bool = False,
37
38
  update_only_changed: bool = False,
38
39
  incremental: bool = False,
39
40
  cache_dir: str = ".docpull-cache",
@@ -52,7 +53,7 @@ class FetcherConfig:
52
53
  skip_existing: Skip existing files
53
54
  log_level: Logging level
54
55
  log_file: Optional log file path
55
- sources: List of sources to fetch (e.g., ['stripe', 'plaid'])
56
+ sources: List of sources to fetch (profile names or URLs, e.g., ['stripe', 'https://docs.example.com'])
56
57
  dry_run: Dry run mode (don't download files)
57
58
  language: Include only this language (e.g., 'en')
58
59
  exclude_languages: Exclude these languages
@@ -67,6 +68,7 @@ class FetcherConfig:
67
68
  naming_strategy: File naming strategy (full, short, flat, hierarchical)
68
69
  create_index: Create INDEX.md with navigation
69
70
  extract_metadata: Extract metadata to metadata.json
71
+ rich_metadata: Extract rich structured metadata (Open Graph, JSON-LD) during fetch
70
72
  update_only_changed: Only download changed files
71
73
  incremental: Enable incremental mode
72
74
  cache_dir: Cache directory for update detection
@@ -81,7 +83,7 @@ class FetcherConfig:
81
83
  self.skip_existing = skip_existing
82
84
  self.log_level = log_level
83
85
  self.log_file = log_file
84
- self.sources = sources or ["plaid", "stripe"]
86
+ self.sources = sources or ["stripe"]
85
87
  self.dry_run = dry_run
86
88
 
87
89
  # v1.2.0 features
@@ -98,6 +100,7 @@ class FetcherConfig:
98
100
  self.naming_strategy = naming_strategy
99
101
  self.create_index = create_index
100
102
  self.extract_metadata = extract_metadata
103
+ self.rich_metadata = rich_metadata
101
104
  self.update_only_changed = update_only_changed
102
105
  self.incremental = incremental
103
106
  self.cache_dir = Path(cache_dir)
@@ -131,11 +134,13 @@ class FetcherConfig:
131
134
  if not isinstance(rate_limit, (int, float)) or rate_limit < 0 or rate_limit > 60:
132
135
  raise ValueError("rate_limit must be between 0 and 60")
133
136
 
134
- # Validate sources
135
- valid_sources = {"bun", "d3", "nextjs", "plaid", "react", "stripe", "tailwind", "turborepo"}
136
- sources = config_dict.get("sources", ["plaid", "stripe"])
137
- if not all(s in valid_sources for s in sources):
138
- raise ValueError(f"Invalid sources. Must be from: {valid_sources}")
137
+ # Validate sources (built-in profiles or URLs)
138
+ valid_sources = {"stripe"}
139
+ sources = config_dict.get("sources", ["stripe"])
140
+ # Allow URLs or valid profile names
141
+ for source in sources:
142
+ if not (source in valid_sources or source.startswith("http://") or source.startswith("https://")):
143
+ raise ValueError(f"Invalid source: {source}. Must be 'stripe' or a URL")
139
144
 
140
145
  # Validate log_level
141
146
  log_level = config_dict.get("log_level", "INFO")
@@ -0,0 +1,9 @@
1
+ from .base import BaseFetcher
2
+ from .parallel_base import ParallelFetcher
3
+ from .stripe import StripeFetcher
4
+
5
+ __all__ = [
6
+ "BaseFetcher",
7
+ "ParallelFetcher",
8
+ "StripeFetcher",
9
+ ]
@@ -59,10 +59,12 @@ class BaseFetcher(ABC):
59
59
  skip_existing: bool = True,
60
60
  logger: Optional[logging.Logger] = None,
61
61
  allowed_domains: Optional[set[str]] = None,
62
+ use_rich_metadata: bool = False,
62
63
  ) -> None:
63
64
  self.output_dir = Path(output_dir).resolve()
64
65
  self.rate_limit = rate_limit
65
66
  self.skip_existing = skip_existing
67
+ self.use_rich_metadata = use_rich_metadata
66
68
  self.logger = logger or logging.getLogger(f"{__name__}.{self.__class__.__name__}")
67
69
  self.allowed_domains = allowed_domains
68
70
  self.h2t = html2text.HTML2Text()
@@ -98,6 +100,14 @@ class BaseFetcher(ABC):
98
100
  if user_agent is None:
99
101
  user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
100
102
  self.session.headers.update({"User-Agent": user_agent})
103
+
104
+ # Initialize rich metadata extractor if enabled
105
+ self.rich_metadata_extractor = None
106
+ if self.use_rich_metadata:
107
+ from ..metadata_extractor import RichMetadataExtractor
108
+
109
+ self.rich_metadata_extractor = RichMetadataExtractor()
110
+
101
111
  self.stats: FetcherStats = {
102
112
  "fetched": 0,
103
113
  "skipped": 0,
@@ -358,6 +368,14 @@ class BaseFetcher(ABC):
358
368
 
359
369
  soup = BeautifulSoup(content, "html.parser")
360
370
 
371
+ # Extract rich metadata if enabled
372
+ rich_meta = None
373
+ if self.use_rich_metadata and self.rich_metadata_extractor:
374
+ try:
375
+ rich_meta = self.rich_metadata_extractor.extract(content.decode("utf-8"), url)
376
+ except Exception as e:
377
+ self.logger.debug(f"Rich metadata extraction failed for {url}: {e}")
378
+
361
379
  for element in soup(["script", "style", "nav", "footer", "header"]):
362
380
  element.decompose()
363
381
  main_content = (
@@ -369,12 +387,47 @@ class BaseFetcher(ABC):
369
387
 
370
388
  if main_content:
371
389
  markdown = self.h2t.handle(str(main_content))
372
- frontmatter = f"""---
373
- url: {url}
374
- fetched: {time.strftime('%Y-%m-%d')}
375
- ---
376
390
 
377
- """
391
+ # Build frontmatter with optional rich metadata
392
+ frontmatter_parts = [
393
+ "---",
394
+ f"url: {url}",
395
+ f"fetched: {time.strftime('%Y-%m-%d')}",
396
+ ]
397
+
398
+ if rich_meta:
399
+ # Add rich metadata fields if available
400
+ if rich_meta.get("title"):
401
+ frontmatter_parts.append(f"title: {rich_meta['title']}")
402
+ if rich_meta.get("description"):
403
+ # Escape any colons in description
404
+ desc = str(rich_meta["description"]).replace(":", "\\:")
405
+ frontmatter_parts.append(f"description: {desc}")
406
+ if rich_meta.get("author"):
407
+ frontmatter_parts.append(f"author: {rich_meta['author']}")
408
+ if rich_meta.get("keywords"):
409
+ keywords_str = ", ".join(rich_meta["keywords"])
410
+ frontmatter_parts.append(f"keywords: [{keywords_str}]")
411
+ if rich_meta.get("image"):
412
+ frontmatter_parts.append(f"image: {rich_meta['image']}")
413
+ if rich_meta.get("type"):
414
+ frontmatter_parts.append(f"type: {rich_meta['type']}")
415
+ if rich_meta.get("site_name"):
416
+ frontmatter_parts.append(f"site_name: {rich_meta['site_name']}")
417
+ if rich_meta.get("published_time"):
418
+ frontmatter_parts.append(f"published_time: {rich_meta['published_time']}")
419
+ if rich_meta.get("modified_time"):
420
+ frontmatter_parts.append(f"modified_time: {rich_meta['modified_time']}")
421
+ if rich_meta.get("section"):
422
+ frontmatter_parts.append(f"section: {rich_meta['section']}")
423
+ if rich_meta.get("tags"):
424
+ tags_str = ", ".join(rich_meta["tags"])
425
+ frontmatter_parts.append(f"tags: [{tags_str}]")
426
+
427
+ frontmatter_parts.append("---")
428
+ frontmatter_parts.append("") # Empty line after frontmatter
429
+
430
+ frontmatter = "\n".join(frontmatter_parts)
378
431
  return frontmatter + markdown.strip()
379
432
  else:
380
433
  return f"# Error\n\nCould not find main content for {url}"
@@ -38,6 +38,7 @@ class GenericAsyncFetcher(BaseFetcher):
38
38
  max_concurrent: int = 10,
39
39
  use_js: bool = False,
40
40
  show_progress: bool = True,
41
+ use_rich_metadata: bool = False,
41
42
  ) -> None:
42
43
  """
43
44
  Initialize async generic fetcher.
@@ -54,8 +55,15 @@ class GenericAsyncFetcher(BaseFetcher):
54
55
  max_concurrent: Maximum concurrent requests
55
56
  use_js: Enable JavaScript rendering (requires playwright)
56
57
  show_progress: Show progress bars
58
+ use_rich_metadata: Extract rich structured metadata (Open Graph, JSON-LD)
57
59
  """
58
- super().__init__(output_dir, rate_limit, skip_existing=skip_existing, logger=logger)
60
+ super().__init__(
61
+ output_dir,
62
+ rate_limit,
63
+ skip_existing=skip_existing,
64
+ logger=logger,
65
+ use_rich_metadata=use_rich_metadata,
66
+ )
59
67
 
60
68
  # Determine if input is a URL or profile name
61
69
  if url_or_profile.startswith(("http://", "https://")):