docpull 1.2.0__tar.gz → 1.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. docpull-1.3.0/.editorconfig +30 -0
  2. docpull-1.3.0/.pre-commit-config.yaml +30 -0
  3. docpull-1.3.0/CHANGELOG.md +403 -0
  4. docpull-1.3.0/CONTRIBUTING.md +189 -0
  5. docpull-1.3.0/MANIFEST.in +49 -0
  6. docpull-1.3.0/Makefile +44 -0
  7. {docpull-1.2.0 → docpull-1.3.0}/PKG-INFO +72 -7
  8. {docpull-1.2.0 → docpull-1.3.0}/README.md +69 -6
  9. docpull-1.3.0/SECURITY.md +206 -0
  10. docpull-1.3.0/TROUBLESHOOTING.md +348 -0
  11. docpull-1.3.0/docpull/__init__.py +15 -0
  12. {docpull-1.2.0 → docpull-1.3.0}/docpull/archive.py +1 -1
  13. {docpull-1.2.0 → docpull-1.3.0}/docpull/cache.py +58 -26
  14. {docpull-1.2.0 → docpull-1.3.0}/docpull/cli.py +12 -5
  15. {docpull-1.2.0 → docpull-1.3.0}/docpull/config.py +12 -7
  16. docpull-1.3.0/docpull/fetchers/__init__.py +9 -0
  17. {docpull-1.2.0 → docpull-1.3.0}/docpull/fetchers/base.py +60 -6
  18. {docpull-1.2.0 → docpull-1.3.0}/docpull/fetchers/generic_async.py +9 -1
  19. {docpull-1.2.0 → docpull-1.3.0}/docpull/fetchers/stripe.py +3 -14
  20. {docpull-1.2.0 → docpull-1.3.0}/docpull/formatters/__init__.py +10 -4
  21. {docpull-1.2.0 → docpull-1.3.0}/docpull/formatters/base.py +34 -5
  22. {docpull-1.2.0 → docpull-1.3.0}/docpull/formatters/json.py +12 -10
  23. {docpull-1.2.0 → docpull-1.3.0}/docpull/formatters/markdown.py +4 -2
  24. {docpull-1.2.0 → docpull-1.3.0}/docpull/formatters/sqlite.py +12 -10
  25. {docpull-1.2.0 → docpull-1.3.0}/docpull/formatters/toon.py +5 -3
  26. {docpull-1.2.0 → docpull-1.3.0}/docpull/hooks.py +12 -7
  27. {docpull-1.2.0 → docpull-1.3.0}/docpull/indexer.py +42 -19
  28. {docpull-1.2.0 → docpull-1.3.0}/docpull/metadata.py +53 -13
  29. docpull-1.3.0/docpull/metadata_extractor.py +283 -0
  30. {docpull-1.2.0 → docpull-1.3.0}/docpull/orchestrator.py +11 -10
  31. {docpull-1.2.0 → docpull-1.3.0}/docpull/processors/__init__.py +2 -1
  32. {docpull-1.2.0 → docpull-1.3.0}/docpull/processors/base.py +7 -6
  33. {docpull-1.2.0 → docpull-1.3.0}/docpull/processors/content_filter.py +51 -8
  34. {docpull-1.2.0 → docpull-1.3.0}/docpull/processors/deduplicator.py +33 -13
  35. {docpull-1.2.0 → docpull-1.3.0}/docpull/processors/language_filter.py +33 -22
  36. {docpull-1.2.0 → docpull-1.3.0}/docpull/processors/size_limiter.py +19 -12
  37. {docpull-1.2.0 → docpull-1.3.0}/docpull/profiles/__init__.py +5 -22
  38. {docpull-1.2.0 → docpull-1.3.0}/docpull/sources_config.py +195 -9
  39. {docpull-1.2.0 → docpull-1.3.0}/docpull/vcs.py +1 -1
  40. {docpull-1.2.0 → docpull-1.3.0}/docpull.egg-info/SOURCES.txt +18 -24
  41. docpull-1.3.0/examples/README.md +280 -0
  42. docpull-1.3.0/examples/deduplication-strategies.yaml +29 -0
  43. docpull-1.3.0/examples/format-conversion.yaml +25 -0
  44. docpull-1.3.0/examples/incremental-updates.yaml +26 -0
  45. docpull-1.3.0/examples/multi-source-optimized.yaml +45 -0
  46. docpull-1.3.0/examples/selective-crawling.yaml +26 -0
  47. docpull-1.3.0/examples/simple-optimization.yaml +14 -0
  48. {docpull-1.2.0 → docpull-1.3.0}/pyproject.toml +8 -2
  49. docpull-1.3.0/requirements.txt +34 -0
  50. {docpull-1.2.0 → docpull-1.3.0}/tests/test_config.py +3 -6
  51. docpull-1.3.0/tests/test_metadata_extractor.py +233 -0
  52. {docpull-1.2.0 → docpull-1.3.0}/tests/test_orchestrator.py +2 -1
  53. docpull-1.2.0/docpull/__init__.py +0 -29
  54. docpull-1.2.0/docpull/fetchers/__init__.py +0 -23
  55. docpull-1.2.0/docpull/fetchers/bun.py +0 -59
  56. docpull-1.2.0/docpull/fetchers/d3.py +0 -211
  57. docpull-1.2.0/docpull/fetchers/nextjs.py +0 -50
  58. docpull-1.2.0/docpull/fetchers/plaid.py +0 -92
  59. docpull-1.2.0/docpull/fetchers/react.py +0 -59
  60. docpull-1.2.0/docpull/fetchers/tailwind.py +0 -59
  61. docpull-1.2.0/docpull/fetchers/turborepo.py +0 -57
  62. docpull-1.2.0/docpull/profiles/bun.py +0 -14
  63. docpull-1.2.0/docpull/profiles/d3.py +0 -17
  64. docpull-1.2.0/docpull/profiles/nextjs.py +0 -15
  65. docpull-1.2.0/docpull/profiles/plaid.py +0 -16
  66. docpull-1.2.0/docpull/profiles/react.py +0 -14
  67. docpull-1.2.0/docpull/profiles/tailwind.py +0 -14
  68. docpull-1.2.0/docpull/profiles/turborepo.py +0 -14
  69. docpull-1.2.0/docpull.egg-info/PKG-INFO +0 -394
  70. docpull-1.2.0/docpull.egg-info/dependency_links.txt +0 -1
  71. docpull-1.2.0/docpull.egg-info/entry_points.txt +0 -2
  72. docpull-1.2.0/docpull.egg-info/requires.txt +0 -29
  73. docpull-1.2.0/docpull.egg-info/top_level.txt +0 -1
  74. docpull-1.2.0/tests/test_async_fetcher.py +0 -147
  75. docpull-1.2.0/tests/test_fetchers.py +0 -57
  76. docpull-1.2.0/tests/test_formatters.py +0 -276
  77. docpull-1.2.0/tests/test_processors.py +0 -424
  78. {docpull-1.2.0 → docpull-1.3.0}/LICENSE +0 -0
  79. {docpull-1.2.0 → docpull-1.3.0}/docpull/__main__.py +0 -0
  80. {docpull-1.2.0 → docpull-1.3.0}/docpull/doctor.py +0 -0
  81. {docpull-1.2.0 → docpull-1.3.0}/docpull/fetchers/async_fetcher.py +0 -0
  82. {docpull-1.2.0 → docpull-1.3.0}/docpull/fetchers/generic.py +0 -0
  83. {docpull-1.2.0 → docpull-1.3.0}/docpull/fetchers/parallel_base.py +0 -0
  84. {docpull-1.2.0 → docpull-1.3.0}/docpull/naming.py +0 -0
  85. {docpull-1.2.0 → docpull-1.3.0}/docpull/profiles/base.py +0 -0
  86. {docpull-1.2.0 → docpull-1.3.0}/docpull/profiles/stripe.py +0 -0
  87. {docpull-1.2.0 → docpull-1.3.0}/docpull/py.typed +0 -0
  88. {docpull-1.2.0 → docpull-1.3.0}/docpull/utils/__init__.py +0 -0
  89. {docpull-1.2.0 → docpull-1.3.0}/docpull/utils/file_utils.py +0 -0
  90. {docpull-1.2.0 → docpull-1.3.0}/docpull/utils/logging_config.py +0 -0
  91. {docpull-1.2.0 → docpull-1.3.0}/setup.cfg +0 -0
  92. {docpull-1.2.0 → docpull-1.3.0}/tests/test_sources_config.py +0 -0
@@ -0,0 +1,30 @@
1
+ # EditorConfig helps maintain consistent coding styles
2
+ # https://editorconfig.org
3
+
4
+ root = true
5
+
6
+ [*]
7
+ charset = utf-8
8
+ end_of_line = lf
9
+ insert_final_newline = true
10
+ trim_trailing_whitespace = true
11
+
12
+ [*.{py,pyi}]
13
+ indent_style = space
14
+ indent_size = 4
15
+ max_line_length = 110
16
+
17
+ [*.{yml,yaml}]
18
+ indent_style = space
19
+ indent_size = 2
20
+
21
+ [*.{json,toml}]
22
+ indent_style = space
23
+ indent_size = 2
24
+
25
+ [*.md]
26
+ trim_trailing_whitespace = false
27
+ max_line_length = off
28
+
29
+ [Makefile]
30
+ indent_style = tab
@@ -0,0 +1,30 @@
1
+ repos:
2
+ - repo: https://github.com/pre-commit/pre-commit-hooks
3
+ rev: v4.5.0
4
+ hooks:
5
+ - id: trailing-whitespace
6
+ - id: end-of-file-fixer
7
+ - id: check-yaml
8
+ - id: check-added-large-files
9
+ - id: check-merge-conflict
10
+ - id: debug-statements
11
+ - id: mixed-line-ending
12
+
13
+ - repo: https://github.com/astral-sh/ruff-pre-commit
14
+ rev: v0.1.9
15
+ hooks:
16
+ - id: ruff
17
+ args: [--fix]
18
+ - id: ruff-format
19
+
20
+ - repo: https://github.com/pre-commit/mirrors-mypy
21
+ rev: v1.8.0
22
+ hooks:
23
+ - id: mypy
24
+ additional_dependencies: [types-requests, types-PyYAML]
25
+ args: [--ignore-missing-imports]
26
+ exclude: ^tests/
27
+ verbose: true
28
+ # Allow failures for now - mypy is informational
29
+ # Remove this line once type issues are fixed
30
+ stages: [manual]
@@ -0,0 +1,403 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [1.3.0] - 2025-11-20
9
+
10
+ ### Added
11
+
12
+ **Rich Metadata Extraction**
13
+ - Extract structured metadata (Open Graph, JSON-LD, microdata) during fetch
14
+ - New `--rich-metadata` CLI flag to enable rich metadata extraction
15
+ - Enhanced frontmatter with author, description, keywords, images, publish dates, tags, and more
16
+ - Better context for AI/RAG systems with richer document metadata
17
+ - Powered by `extruct` library
18
+ - Opt-in feature, backward compatible with existing workflows
19
+
20
+ ### Changed
21
+
22
+ **Simplified Profile System**
23
+ - Removed 7 built-in profiles (Next.js, React, Plaid, Tailwind, Bun, D3, Turborepo)
24
+ - Kept Stripe profile as reference implementation
25
+ - Generic fetcher works excellently for all documentation sites
26
+ - Users can create custom profiles or use URLs directly
27
+ - Reduced maintenance burden and codebase complexity
28
+
29
+ ### Technical Details
30
+
31
+ **New Dependencies:**
32
+ - Added `extruct>=0.15.0` for structured metadata extraction
33
+
34
+ **New Files:**
35
+ - `docpull/metadata_extractor.py` - Rich metadata extraction module
36
+ - `tests/test_metadata_extractor.py` - Comprehensive test suite for metadata extraction
37
+
38
+ **Updated Files:**
39
+ - `docpull/fetchers/base.py` - Integrated rich metadata extraction into fetch pipeline
40
+ - `docpull/fetchers/generic_async.py` - Added `use_rich_metadata` parameter
41
+ - `docpull/config.py` - Added `rich_metadata` configuration option
42
+ - `docpull/sources_config.py` - Added `rich_metadata` field to SourceConfig
43
+ - `docpull/cli.py` - Added `--rich-metadata` CLI flag
44
+ - `docpull/profiles/__init__.py` - Simplified to single Stripe profile
45
+
46
+ **Removed Files:**
47
+ - Removed 7 profile files and 7 fetcher implementation files
48
+
49
+ **Version Bump:**
50
+ - Updated version from `1.2.1` to `1.3.0`
51
+
52
+ ### Example Usage
53
+
54
+ ```bash
55
+ # Extract rich metadata during fetch
56
+ docpull https://docs.anthropic.com --rich-metadata
57
+
58
+ # Combine with other features
59
+ docpull https://stripe.com/docs --rich-metadata --create-index --language en
60
+
61
+ # Multi-source configuration
62
+ docpull --sources-file config.yaml # with rich_metadata: true per source
63
+ ```
64
+
65
+ ### Example Enhanced Frontmatter
66
+
67
+ ```yaml
68
+ ---
69
+ url: https://docs.example.com/guide
70
+ fetched: 2025-11-20
71
+ title: Getting Started Guide
72
+ description: Learn the basics of our platform
73
+ author: John Doe
74
+ keywords: [tutorial, guide, api]
75
+ image: https://docs.example.com/og-image.png
76
+ type: article
77
+ site_name: Example Docs
78
+ published_time: 2024-01-15T10:00:00Z
79
+ modified_time: 2024-01-20T15:30:00Z
80
+ ---
81
+ ```
82
+
83
+ ## [1.2.0] - 2025-11-16
84
+
85
+ ### Added - 15 Major New Features
86
+
87
+ This release represents a massive expansion of docpull's capabilities, adding 15 major features across 4 phases. Based on real-world usage pulling 1,914 files (31 MB) from Anthropic, Claude Code, Aptos, and Shelby documentation, these features enable automatic optimization reducing output to ~13 MB (58% reduction).
88
+
89
+ **Note**: All new features are backward compatible. Existing workflows continue to work unchanged.
90
+
91
+ #### Phase 1: Essential Optimizations (Top Priority Features)
92
+
93
+ **1. Language Filtering** (`--language`, `--exclude-languages`)
94
+ - Filter documentation by language code during download or post-process
95
+ - Automatic detection from URL patterns (`/en/`, `_en_`, `docs_en_`, etc.)
96
+ - **Real-world impact**: Claude Code docs downloaded in 9 languages = 352 unnecessary files for English-only users
97
+ - Example: `docpull https://code.claude.com/docs --language en`
98
+
99
+ **2. Deduplication** (`--deduplicate`, `--keep-variant`, `--remove-patterns`)
100
+ - Remove duplicate files based on SHA-256 content hash
101
+ - Keep specific variants (e.g., `mainnet` vs `testnet/devnet`)
102
+ - Configurable keep strategies: first, last, shortest, longest, pattern
103
+ - **Real-world impact**: Aptos docs had 456 Move reference files across 3 environments (2/3 duplicates = 304 files, ~10 MB saved)
104
+ - Example: `docpull https://aptos.dev --deduplicate --keep-variant mainnet`
105
+
106
+ **3. Auto-Index Generation** (`--create-index`, `--index-styles`, `--per-directory-index`)
107
+ - Generate INDEX.md with file tree, table of contents, categories, and statistics
108
+ - Per-directory indexes for nested documentation
109
+ - **Real-world impact**: Makes 1,914 files actually navigable and usable
110
+ - Index styles: tree, toc (table of contents), categories, stats
111
+ - Example: `docpull https://docs.anthropic.com --create-index`
112
+
113
+ **4. Size Limits** (`--max-file-size`, `--max-total-size`, `--size-limit-action`)
114
+ - Skip, truncate, or warn on oversized files
115
+ - Prevent runaway downloads with total size limits
116
+ - **Real-world impact**: Some REST API docs were 308 KB with full JSON responses
117
+ - Actions: skip (default), truncate (keep first N bytes), warn (log only)
118
+ - Example: `docpull https://aptos.dev --max-file-size 200kb --max-total-size 500mb`
119
+
120
+ **5. Multi-Source Configuration** (`--sources-file`, `--generate-sources-config`)
121
+ - Configure multiple documentation sources in a single YAML file
122
+ - Per-source settings for language, deduplication, size limits, etc.
123
+ - **Real-world impact**: One command instead of 4+ separate commands with manual optimization
124
+ - Repeatable, version-controlled documentation workflows
125
+ - Example: `docpull --sources-file my-docs.yaml`
126
+
127
+ #### Phase 2: Content Control
128
+
129
+ **6. Selective Crawling** (`--include-paths`, `--exclude-paths`)
130
+ - Only download URLs matching include patterns
131
+ - Skip entire branches matching exclude patterns
132
+ - Glob-style pattern matching (`*/api/*`, `*/guides/*`)
133
+ - Early termination for excluded branches (faster crawling)
134
+ - Example: `docpull https://aptos.dev --include-paths "build/guides/*" --exclude-paths "*/changelog"`
135
+
136
+ **7. Content Filtering** (`--exclude-sections`)
137
+ - Remove verbose sections by header name (Examples, Changelog, Full Response, etc.)
138
+ - Regex-based content filtering and truncation (future expansion)
139
+ - Keep schemas and reference docs, remove bloated examples
140
+ - Applied during post-processing after download
141
+ - Example: `docpull https://aptos.dev --exclude-sections "Examples" "Full Response" "Changelog"`
142
+
143
+ **8. Format Conversion** (`--format`)
144
+ - **markdown** (default): Standard markdown with YAML frontmatter
145
+ - **toon**: Terser Object Oriented Notation (40-60% size reduction, optimized for LLMs)
146
+ - **json**: Structured JSON with sections, headers, and metadata
147
+ - **sqlite**: Searchable database with FTS5 full-text search
148
+ - Example: `docpull https://docs.anthropic.com --format toon` or `--format sqlite`
149
+
150
+ **9. Smart Naming** (`--naming-strategy`)
151
+ - **full** (default): Preserve complete path structure with domain prefix
152
+ - **short**: Remove domain prefix, keep directory structure
153
+ - **flat**: Single directory with descriptive hyphenated names
154
+ - **hierarchical**: Smart hierarchy based on common documentation patterns
155
+ - Example: `docpull https://docs.anthropic.com --naming-strategy hierarchical`
156
+
157
+ #### Phase 3: Advanced Features
158
+
159
+ **10. Metadata Extraction** (`--extract-metadata`)
160
+ - Extract titles, URLs, word counts, categories, last updated dates
161
+ - Aggregate statistics: total files, total size, file types, categories
162
+ - Output to metadata.json for analysis, search indexing, or documentation audits
163
+ - Example: `docpull https://docs.anthropic.com --extract-metadata`
164
+
165
+ **11. Update Detection** (`--check-updates`, `--update-only-changed`)
166
+ - Check which files have changed without downloading
167
+ - Only fetch modified files based on checksums, ETags, Last-Modified headers
168
+ - Manifest tracking with automatic cache management
169
+ - Saves bandwidth and time on regular documentation updates
170
+ - Example: `docpull https://docs.anthropic.com --update-only-changed`
171
+
172
+ **12. Incremental Mode** (`--incremental`, `--resume`, `--cache-dir`, `--clear-cache`)
173
+ - Resume interrupted downloads from checkpoint
174
+ - State persistence across sessions
175
+ - Cache directory for manifests and state files
176
+ - Essential for large documentation sets (1000+ files)
177
+ - Example: `docpull https://aptos.dev --incremental --resume`
178
+
179
+ #### Phase 4: Extensibility
180
+
181
+ **13. Hooks & Plugins** (`--post-process-hook`, `--pre-fetch-hook`)
182
+ - Python plugin system for custom processing
183
+ - Hook types: `pre_fetch`, `post_fetch`, `post_process`, `filter`
184
+ - Decorator-based hook registration (`@hook(HookType.POST_PROCESS)`)
185
+ - Load hooks from Python files
186
+ - Example: `docpull https://docs.anthropic.com --post-process-hook ./optimize.py`
187
+
188
+ **14. Git Integration** (`--git-commit`, `--git-message`, `--git-tag`, `--git-author`)
189
+ - Automatically commit documentation changes after successful fetch
190
+ - Customizable commit messages with templates (`{date}`, `{datetime}`)
191
+ - Optional tagging for versioned snapshots
192
+ - Track documentation evolution over time
193
+ - Example: `docpull --sources-file sources.yaml --git-commit --git-message "Update docs - {date}"`
194
+
195
+ **15. Archive Mode** (`--archive`, `--archive-format`, `--archive-name`)
196
+ - Create compressed archives of documentation
197
+ - Formats: tar.gz (default), tar.bz2, tar.xz, zip
198
+ - Date-stamped archives for distribution
199
+ - Single-file documentation bundles
200
+ - Example: `docpull https://docs.anthropic.com --archive --archive-format tar.gz`
201
+
202
+ ### Added - New Modules
203
+
204
+ - `docpull/processors/`: Post-processing pipeline
205
+ - `base.py`: BaseProcessor interface and ProcessorPipeline
206
+ - `language_filter.py`: Language filtering processor
207
+ - `deduplicator.py`: Deduplication processor with hash-based detection
208
+ - `size_limiter.py`: Size limit enforcement
209
+ - `content_filter.py`: Section and content filtering
210
+ - `docpull/formatters/`: Output format converters
211
+ - `base.py`: BaseFormatter interface
212
+ - `markdown.py`: Markdown formatter (default)
213
+ - `toon.py`: TOON format converter (compact for LLMs)
214
+ - `json.py`: JSON formatter with structured sections
215
+ - `sqlite.py`: SQLite database with FTS5 search
216
+ - `docpull/indexer.py`: Auto-index generation with tree/TOC/categories/stats
217
+ - `docpull/naming.py`: Smart naming strategies (full, short, flat, hierarchical)
218
+ - `docpull/metadata.py`: Metadata extraction and aggregation
219
+ - `docpull/cache.py`: Cache management for update detection and incremental fetching
220
+ - `docpull/hooks.py`: Plugin/hook system with decorator support
221
+ - `docpull/vcs.py`: Git integration (commit, tag, status, diff)
222
+ - `docpull/archive.py`: Archive creation (tarball, zip)
223
+ - `docpull/sources_config.py`: Multi-source YAML configuration with per-source settings
224
+ - Enhanced `docpull/cli.py`: Integrated all new CLI arguments with organized argument groups
225
+
226
+ ### Changed - New Required Dependencies
227
+
228
+ **IMPORTANT**: This release adds new required dependencies for enhanced functionality.
229
+
230
+ 1. **PyYAML is now a REQUIRED dependency** (was optional in v1.1.0)
231
+ - Required for `--sources-file` multi-source configuration
232
+ - Automatically installed with: `pip install --upgrade docpull`
233
+
234
+ 2. **GitPython is now a REQUIRED dependency** (new in v1.2.0)
235
+ - Required for `--git-commit` git integration features
236
+ - Automatically installed with: `pip install --upgrade docpull`
237
+
238
+ **Backward Compatibility**: All existing CLI commands and workflows continue to work. New features are purely additive.
239
+
240
+ ### Changed - Improvements
241
+
242
+ - CLI organized into logical argument groups (Multi-Source, Language Filtering, Deduplication, Size Limits, Content Filtering, Output Format, Index Generation, Metadata, Update Detection, Hooks, Git Integration, Archive Mode)
243
+ - Enhanced configuration schema to support all 15 new features
244
+ - Better error messages and validation throughout
245
+ - Structured logging with feature-specific messages
246
+ - Comprehensive documentation and examples
247
+
248
+ ### Performance Improvements
249
+
250
+ Real-world optimization results from testing with 1,914 files (31 MB):
251
+ - **Language filtering**: -352 files, -5 to -10 MB (Claude Code docs in 9 languages → English only)
252
+ - **Deduplication**: -304 files, -10 MB (Aptos Move references across 3 environments)
253
+ - **Size limits**: -3 to -5 MB (Skip verbose API examples over 200 KB)
254
+ - **Content filtering**: Additional KB savings by removing Changelog/Examples sections
255
+ - **Combined optimizations**: 31 MB → ~13 MB (58% reduction)
256
+
257
+ ### Documentation
258
+
259
+ - Comprehensive CHANGELOG with feature descriptions and real-world impact
260
+ - Updated README with all 15 features and usage examples
261
+ - Migration guide for v1.x users
262
+ - Example `sources.yaml` configuration file
263
+ - Hook development guide and examples
264
+
265
+ ### Testing
266
+
267
+ - Unit tests for all new processor modules (language_filter, deduplicator, size_limiter, content_filter)
268
+ - Unit tests for all formatters (markdown, TOON, JSON, SQLite)
269
+ - Unit tests for indexer, naming, metadata extraction, cache management
270
+ - Unit tests for hooks system, git integration, archive creation
271
+ - Integration tests for multi-source workflows
272
+ - Mock-based tests for external dependencies (git, sqlite)
273
+
274
+ ---
275
+
276
+ ## Example: What You Can Now Do
277
+
278
+ ### Before v1.2.0 (Manual Process):
279
+ ```bash
280
+ docpull https://docs.anthropic.com --output-dir ./docs/anthropic
281
+ docpull https://code.claude.com/docs --output-dir ./docs/claude-code
282
+ docpull https://aptos.dev --output-dir ./docs/aptos
283
+ docpull https://shelby.xyz --output-dir ./docs/shelby
284
+ # Then manually run optimization scripts
285
+ # Result: 31 MB, 1,914 files, no navigation
286
+ ```
287
+
288
+ ### After v1.2.0 (One Command):
289
+ ```bash
290
+ docpull --sources-file docs-config.yaml
291
+ ```
292
+
293
+ **docs-config.yaml:**
294
+ ```yaml
295
+ sources:
296
+ anthropic:
297
+ url: https://docs.anthropic.com
298
+ language: en
299
+ max_file_size: 200kb
300
+ create_index: true
301
+
302
+ claude-code:
303
+ url: https://code.claude.com/docs
304
+ language: en # Skips 352 translation files!
305
+ create_index: true
306
+
307
+ aptos:
308
+ url: https://aptos.dev
309
+ deduplicate: true
310
+ keep_variant: mainnet # Skips 304 duplicates!
311
+ max_file_size: 200kb
312
+ include_paths: ["build/*"]
313
+
314
+ shelby:
315
+ url: https://docs.shelby.xyz
316
+ create_index: true
317
+
318
+ git_commit: true
319
+ git_message: "Update docs - {date}"
320
+ ```
321
+
322
+ **Result**: ~13 MB (58% smaller), all indexes created, one command, repeatable, version-controlled!
323
+
324
+ ---
325
+
326
+ ## [1.1.0] - 2025-11-14
327
+
328
+ ### Added
329
+ - `--doctor` command for diagnosing installation and dependency issues
330
+ - Checks all core dependencies (requests, beautifulsoup4, html2text, defusedxml, aiohttp, rich)
331
+ - Checks optional dependencies (PyYAML, Playwright) with installation suggestions
332
+ - Tests network connectivity
333
+ - Verifies output directory write permissions
334
+ - Works even when dependencies are missing
335
+ - `requirements.txt` file for transparent dependency listing
336
+ - Comprehensive `TROUBLESHOOTING.md` documentation with:
337
+ - Installation troubleshooting (missing dependencies, pipx issues)
338
+ - Runtime issue solutions (YAML config errors, JavaScript rendering)
339
+ - Diagnostic tools usage guide
340
+ - Common error messages reference table
341
+ - Quick reference commands
342
+
343
+ ### Changed
344
+ - Improved error handling for missing dependencies
345
+ - Early dependency checking at CLI entry point
346
+ - Clear, actionable error messages with installation instructions
347
+ - Specific recommendations for pipx, pip, and development installations
348
+ - Enhanced YAML configuration error handling
349
+ - Auto-fallback to JSON when PyYAML is not installed
350
+ - Clear error messages for YAML-related import errors
351
+ - Helpful suggestions for installing optional dependencies
352
+ - Updated README.md with:
353
+ - `--doctor` command in Quick Start section
354
+ - Reference to TROUBLESHOOTING.md
355
+ - Better troubleshooting guidance
356
+
357
+ ### Fixed
358
+ - Improved user experience when dependencies are missing (no more confusing tracebacks)
359
+ - Better handling of optional dependency errors (PyYAML, Playwright)
360
+
361
+ ## [1.0.0] - 2025-11-07
362
+
363
+ ### Added
364
+ - Initial release of docpull
365
+ - Support for fetching documentation from multiple sources:
366
+ - Stripe API documentation
367
+ - Plaid API documentation
368
+ - Next.js documentation
369
+ - D3.js documentation (devdocs.io)
370
+ - Bun runtime documentation
371
+ - Tailwind CSS documentation
372
+ - React documentation
373
+ - CLI interface with config file support (YAML/JSON)
374
+ - Parallel fetching with ThreadPoolExecutor for improved performance
375
+ - Security features:
376
+ - Path traversal protection
377
+ - XXE (XML External Entity) protection
378
+ - File size limits (50MB default)
379
+ - Redirect limits (5 hops)
380
+ - Request timeouts (30s)
381
+ - HTTPS enforcement with certificate verification
382
+ - Rate limiting to respect server resources
383
+ - Structured logging with configurable levels
384
+ - YAML frontmatter metadata in generated markdown files
385
+ - Config file generation command
386
+ - Extensible fetcher architecture for easy addition of new sources
387
+ - Comprehensive documentation and examples
388
+
389
+ ### Changed
390
+ - Cleaned up README to remove emojis and update to organization URLs
391
+ - Applied 2025 PyPI best practices to packaging configuration
392
+ - Reorganized project structure for better maintainability
393
+
394
+ ### Security
395
+ - Implemented multiple security layers for safe web scraping
396
+ - Added security scanning with Bandit and pip-audit
397
+ - Created GitHub Actions workflow for automated security checks
398
+ - Documented security features in SECURITY.md
399
+
400
+ ---
401
+
402
+ [1.1.0]: https://github.com/raintree-technology/docpull/releases/tag/v1.1.0
403
+ [1.0.0]: https://github.com/raintree-technology/docpull/releases/tag/v1.0.0
@@ -0,0 +1,189 @@
1
+ # Contributing to docpull
2
+
3
+ Thank you for your interest in contributing to docpull! This document provides guidelines and instructions for contributing.
4
+
5
+ ## Development Setup
6
+
7
+ 1. **Fork and clone the repository**
8
+
9
+ ```bash
10
+ git clone https://github.com/YOUR_USERNAME/docpull.git
11
+ cd docpull
12
+ ```
13
+
14
+ 2. **Set up development environment**
15
+
16
+ ```bash
17
+ # Create virtual environment
18
+ python -m venv .venv
19
+ source .venv/bin/activate # On Windows: .venv\Scripts\activate
20
+
21
+ # Install with development dependencies
22
+ pip install -e ".[dev]"
23
+
24
+ # Install pre-commit hooks
25
+ pre-commit install
26
+ ```
27
+
28
+ 3. **Verify setup**
29
+
30
+ ```bash
31
+ # Run tests
32
+ make test
33
+
34
+ # Run linting
35
+ make lint
36
+
37
+ # Run formatting
38
+ make format
39
+ ```
40
+
41
+ ## Development Workflow
42
+
43
+ 1. **Create a feature branch**
44
+
45
+ ```bash
46
+ git checkout -b feature/your-feature-name
47
+ # or
48
+ git checkout -b fix/your-bug-fix
49
+ ```
50
+
51
+ 2. **Make your changes**
52
+
53
+ - Write clear, readable code
54
+ - Follow existing code style (enforced by pre-commit hooks)
55
+ - Add tests for new functionality
56
+ - Update documentation as needed
57
+ - Update CHANGELOG.md with your changes
58
+
59
+ 3. **Run tests and linting**
60
+
61
+ ```bash
62
+ # Run all tests
63
+ make test
64
+
65
+ # Run linting
66
+ make lint
67
+
68
+ # Format code
69
+ make format
70
+
71
+ # Clean artifacts
72
+ make clean
73
+ ```
74
+
75
+ 4. **Commit your changes**
76
+
77
+ ```bash
78
+ git add .
79
+ git commit -m "feat: add new feature"
80
+ # or
81
+ git commit -m "fix: resolve bug in X"
82
+ ```
83
+
84
+ Use [Conventional Commits](https://www.conventionalcommits.org/) format:
85
+ - `feat:` - New features
86
+ - `fix:` - Bug fixes
87
+ - `docs:` - Documentation changes
88
+ - `test:` - Test additions/changes
89
+ - `refactor:` - Code refactoring
90
+ - `chore:` - Maintenance tasks
91
+ - `ci:` - CI/CD changes
92
+
93
+ 5. **Push and create a pull request**
94
+
95
+ ```bash
96
+ git push origin feature/your-feature-name
97
+ ```
98
+
99
+ Then open a pull request on GitHub.
100
+
101
+ ## Code Style
102
+
103
+ - **Python**: Follow PEP 8 (enforced by Ruff and Black)
104
+ - **Line length**: 110 characters
105
+ - **Type hints**: Required for all functions
106
+ - **Docstrings**: Required for public APIs
107
+
108
+ Pre-commit hooks will automatically:
109
+ - Fix trailing whitespace
110
+ - Lint and format code with Ruff (`ruff --fix` and `ruff-format`)
111
+ - Sort imports
112
+ - Check for common issues
113
+
114
+ ## Testing
115
+
116
+ - Write tests for all new features
117
+ - Maintain or improve test coverage
118
+ - Run tests with: `make test`
119
+ - Check coverage with: `pytest --cov=docpull --cov-report=html`
120
+
121
+ ## Documentation
122
+
123
+ - Update README.md for user-facing changes
124
+ - Update CHANGELOG.md with all changes
125
+ - Add docstrings to new functions/classes
126
+ - Update TROUBLESHOOTING.md for common issues
127
+
128
+ ## Pull Request Process
129
+
130
+ 1. **Ensure all tests pass**
131
+ 2. **Update CHANGELOG.md** with your changes
132
+ 3. **Update documentation** as needed
133
+ 4. **Fill out the PR template** completely
134
+ 5. **Wait for review** - maintainers will review your PR
135
+ 6. **Address feedback** - make requested changes
136
+ 7. **Merge** - once approved, maintainers will merge
137
+
138
+ ## Release Process
139
+
140
+ Releases are automated via GitHub Actions:
141
+
142
+ 1. Maintainer triggers release workflow
143
+ 2. Workflow updates version numbers
144
+ 3. Workflow updates CHANGELOG.md
145
+ 4. Workflow creates git tag
146
+ 5. Workflow creates GitHub release
147
+ 6. Publish workflow automatically deploys to PyPI
148
+
149
+ ## Adding a New Fetcher
150
+
151
+ To add support for a new documentation source:
152
+
153
+ 1. Create `docpull/fetchers/yoursite.py`:
154
+
155
+ ```python
156
+ from .base import BaseFetcher
157
+
158
+ class YourSiteFetcher(BaseFetcher):
159
+ """Fetch documentation from YourSite."""
160
+
161
+ def __init__(self, output_dir: str = "yoursite-docs"):
162
+ super().__init__(output_dir)
163
+ self.base_url = "https://docs.yoursite.com"
164
+
165
+ def fetch_all(self) -> None:
166
+ # Implement fetching logic
167
+ pass
168
+ ```
169
+
170
+ 2. Register in `docpull/__init__.py`
171
+ 3. Add tests in `tests/test_yoursite.py`
172
+ 4. Update README.md with usage example
173
+ 5. Update CHANGELOG.md
174
+
175
+ ## Getting Help
176
+
177
+ - Open an issue for bugs or feature requests
178
+ - Check existing issues before creating new ones
179
+ - Use `docpull --doctor` to diagnose issues
180
+ - See TROUBLESHOOTING.md for common problems
181
+
182
+ ## Code of Conduct
183
+
184
+ - Be respectful and inclusive
185
+ - Provide constructive feedback
186
+ - Focus on the code, not the person
187
+ - Help others learn and grow
188
+
189
+ Thank you for contributing to docpull!
@@ -0,0 +1,49 @@
1
+ # Include documentation files
2
+ include README.md
3
+ include LICENSE
4
+ include CHANGELOG.md
5
+ include SECURITY.md
6
+ include CONTRIBUTING.md
7
+ include TROUBLESHOOTING.md
8
+
9
+ # Include configuration files
10
+ include pyproject.toml
11
+ include requirements.txt
12
+ include Makefile
13
+ include .editorconfig
14
+
15
+ # Include pre-commit configuration
16
+ include .pre-commit-config.yaml
17
+
18
+ # Include example configurations
19
+ recursive-include examples *.yaml
20
+ include examples/README.md
21
+
22
+ # Include PEP 561 typing marker (py.typed)
23
+ include docpull/py.typed
24
+
25
+ # Include all Python files in profiles
26
+ recursive-include docpull/profiles *.py
27
+
28
+ # Exclude unnecessary files
29
+ global-exclude __pycache__
30
+ global-exclude *.py[cod]
31
+ global-exclude *~
32
+ global-exclude .DS_Store
33
+ global-exclude *.swp
34
+ global-exclude *.swo
35
+
36
+ # Exclude test artifacts and caches
37
+ prune tests/__pycache__
38
+ prune .pytest_cache
39
+ prune .mypy_cache
40
+ prune .ruff_cache
41
+ prune htmlcov
42
+ prune .docpull-cache
43
+ prune .git
44
+ prune .github
45
+
46
+ # Exclude build artifacts
47
+ prune build
48
+ prune dist
49
+ prune *.egg-info