docpull 1.2.0__tar.gz → 1.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. docpull-1.2.1/.editorconfig +30 -0
  2. docpull-1.2.1/.pre-commit-config.yaml +30 -0
  3. docpull-1.2.1/CHANGELOG.md +328 -0
  4. docpull-1.2.1/CONTRIBUTING.md +189 -0
  5. docpull-1.2.1/MANIFEST.in +49 -0
  6. docpull-1.2.1/Makefile +44 -0
  7. {docpull-1.2.0 → docpull-1.2.1}/PKG-INFO +2 -1
  8. docpull-1.2.1/SECURITY.md +206 -0
  9. docpull-1.2.1/TROUBLESHOOTING.md +348 -0
  10. {docpull-1.2.0 → docpull-1.2.1}/docpull/archive.py +1 -1
  11. {docpull-1.2.0 → docpull-1.2.1}/docpull/cache.py +58 -26
  12. {docpull-1.2.0 → docpull-1.2.1}/docpull/cli.py +4 -4
  13. {docpull-1.2.0 → docpull-1.2.1}/docpull/fetchers/base.py +2 -1
  14. docpull-1.2.1/docpull/fetchers/nextjs.py +59 -0
  15. {docpull-1.2.0 → docpull-1.2.1}/docpull/fetchers/plaid.py +10 -13
  16. {docpull-1.2.0 → docpull-1.2.1}/docpull/fetchers/stripe.py +3 -14
  17. {docpull-1.2.0 → docpull-1.2.1}/docpull/formatters/__init__.py +10 -4
  18. {docpull-1.2.0 → docpull-1.2.1}/docpull/formatters/base.py +34 -5
  19. {docpull-1.2.0 → docpull-1.2.1}/docpull/formatters/json.py +12 -10
  20. {docpull-1.2.0 → docpull-1.2.1}/docpull/formatters/markdown.py +4 -2
  21. {docpull-1.2.0 → docpull-1.2.1}/docpull/formatters/sqlite.py +12 -10
  22. {docpull-1.2.0 → docpull-1.2.1}/docpull/formatters/toon.py +5 -3
  23. {docpull-1.2.0 → docpull-1.2.1}/docpull/hooks.py +12 -7
  24. {docpull-1.2.0 → docpull-1.2.1}/docpull/indexer.py +42 -19
  25. {docpull-1.2.0 → docpull-1.2.1}/docpull/metadata.py +53 -13
  26. {docpull-1.2.0 → docpull-1.2.1}/docpull/orchestrator.py +11 -10
  27. {docpull-1.2.0 → docpull-1.2.1}/docpull/processors/__init__.py +2 -1
  28. {docpull-1.2.0 → docpull-1.2.1}/docpull/processors/base.py +7 -6
  29. {docpull-1.2.0 → docpull-1.2.1}/docpull/processors/content_filter.py +51 -8
  30. {docpull-1.2.0 → docpull-1.2.1}/docpull/processors/deduplicator.py +33 -13
  31. {docpull-1.2.0 → docpull-1.2.1}/docpull/processors/language_filter.py +33 -22
  32. {docpull-1.2.0 → docpull-1.2.1}/docpull/processors/size_limiter.py +19 -12
  33. {docpull-1.2.0 → docpull-1.2.1}/docpull/sources_config.py +194 -9
  34. {docpull-1.2.0 → docpull-1.2.1}/docpull/vcs.py +1 -1
  35. {docpull-1.2.0 → docpull-1.2.1}/docpull.egg-info/SOURCES.txt +16 -10
  36. docpull-1.2.1/examples/README.md +280 -0
  37. docpull-1.2.1/examples/deduplication-strategies.yaml +29 -0
  38. docpull-1.2.1/examples/format-conversion.yaml +25 -0
  39. docpull-1.2.1/examples/incremental-updates.yaml +26 -0
  40. docpull-1.2.1/examples/multi-source-optimized.yaml +45 -0
  41. docpull-1.2.1/examples/selective-crawling.yaml +26 -0
  42. docpull-1.2.1/examples/simple-optimization.yaml +14 -0
  43. {docpull-1.2.0 → docpull-1.2.1}/pyproject.toml +3 -2
  44. docpull-1.2.1/requirements.txt +34 -0
  45. {docpull-1.2.0 → docpull-1.2.1}/tests/test_config.py +0 -3
  46. {docpull-1.2.0 → docpull-1.2.1}/tests/test_orchestrator.py +2 -1
  47. docpull-1.2.0/docpull/fetchers/nextjs.py +0 -50
  48. docpull-1.2.0/docpull.egg-info/PKG-INFO +0 -394
  49. docpull-1.2.0/docpull.egg-info/dependency_links.txt +0 -1
  50. docpull-1.2.0/docpull.egg-info/entry_points.txt +0 -2
  51. docpull-1.2.0/docpull.egg-info/requires.txt +0 -29
  52. docpull-1.2.0/docpull.egg-info/top_level.txt +0 -1
  53. docpull-1.2.0/tests/test_async_fetcher.py +0 -147
  54. docpull-1.2.0/tests/test_fetchers.py +0 -57
  55. docpull-1.2.0/tests/test_formatters.py +0 -276
  56. docpull-1.2.0/tests/test_processors.py +0 -424
  57. {docpull-1.2.0 → docpull-1.2.1}/LICENSE +0 -0
  58. {docpull-1.2.0 → docpull-1.2.1}/README.md +0 -0
  59. {docpull-1.2.0 → docpull-1.2.1}/docpull/__init__.py +0 -0
  60. {docpull-1.2.0 → docpull-1.2.1}/docpull/__main__.py +0 -0
  61. {docpull-1.2.0 → docpull-1.2.1}/docpull/config.py +0 -0
  62. {docpull-1.2.0 → docpull-1.2.1}/docpull/doctor.py +0 -0
  63. {docpull-1.2.0 → docpull-1.2.1}/docpull/fetchers/__init__.py +0 -0
  64. {docpull-1.2.0 → docpull-1.2.1}/docpull/fetchers/async_fetcher.py +0 -0
  65. {docpull-1.2.0 → docpull-1.2.1}/docpull/fetchers/bun.py +0 -0
  66. {docpull-1.2.0 → docpull-1.2.1}/docpull/fetchers/d3.py +0 -0
  67. {docpull-1.2.0 → docpull-1.2.1}/docpull/fetchers/generic.py +0 -0
  68. {docpull-1.2.0 → docpull-1.2.1}/docpull/fetchers/generic_async.py +0 -0
  69. {docpull-1.2.0 → docpull-1.2.1}/docpull/fetchers/parallel_base.py +0 -0
  70. {docpull-1.2.0 → docpull-1.2.1}/docpull/fetchers/react.py +0 -0
  71. {docpull-1.2.0 → docpull-1.2.1}/docpull/fetchers/tailwind.py +0 -0
  72. {docpull-1.2.0 → docpull-1.2.1}/docpull/fetchers/turborepo.py +0 -0
  73. {docpull-1.2.0 → docpull-1.2.1}/docpull/naming.py +0 -0
  74. {docpull-1.2.0 → docpull-1.2.1}/docpull/profiles/__init__.py +0 -0
  75. {docpull-1.2.0 → docpull-1.2.1}/docpull/profiles/base.py +0 -0
  76. {docpull-1.2.0 → docpull-1.2.1}/docpull/profiles/bun.py +0 -0
  77. {docpull-1.2.0 → docpull-1.2.1}/docpull/profiles/d3.py +0 -0
  78. {docpull-1.2.0 → docpull-1.2.1}/docpull/profiles/nextjs.py +0 -0
  79. {docpull-1.2.0 → docpull-1.2.1}/docpull/profiles/plaid.py +0 -0
  80. {docpull-1.2.0 → docpull-1.2.1}/docpull/profiles/react.py +0 -0
  81. {docpull-1.2.0 → docpull-1.2.1}/docpull/profiles/stripe.py +0 -0
  82. {docpull-1.2.0 → docpull-1.2.1}/docpull/profiles/tailwind.py +0 -0
  83. {docpull-1.2.0 → docpull-1.2.1}/docpull/profiles/turborepo.py +0 -0
  84. {docpull-1.2.0 → docpull-1.2.1}/docpull/py.typed +0 -0
  85. {docpull-1.2.0 → docpull-1.2.1}/docpull/utils/__init__.py +0 -0
  86. {docpull-1.2.0 → docpull-1.2.1}/docpull/utils/file_utils.py +0 -0
  87. {docpull-1.2.0 → docpull-1.2.1}/docpull/utils/logging_config.py +0 -0
  88. {docpull-1.2.0 → docpull-1.2.1}/setup.cfg +0 -0
  89. {docpull-1.2.0 → docpull-1.2.1}/tests/test_sources_config.py +0 -0
@@ -0,0 +1,30 @@
1
+ # EditorConfig helps maintain consistent coding styles
2
+ # https://editorconfig.org
3
+
4
+ root = true
5
+
6
+ [*]
7
+ charset = utf-8
8
+ end_of_line = lf
9
+ insert_final_newline = true
10
+ trim_trailing_whitespace = true
11
+
12
+ [*.{py,pyi}]
13
+ indent_style = space
14
+ indent_size = 4
15
+ max_line_length = 110
16
+
17
+ [*.{yml,yaml}]
18
+ indent_style = space
19
+ indent_size = 2
20
+
21
+ [*.{json,toml}]
22
+ indent_style = space
23
+ indent_size = 2
24
+
25
+ [*.md]
26
+ trim_trailing_whitespace = false
27
+ max_line_length = off
28
+
29
+ [Makefile]
30
+ indent_style = tab
@@ -0,0 +1,30 @@
1
+ repos:
2
+ - repo: https://github.com/pre-commit/pre-commit-hooks
3
+ rev: v4.5.0
4
+ hooks:
5
+ - id: trailing-whitespace
6
+ - id: end-of-file-fixer
7
+ - id: check-yaml
8
+ - id: check-added-large-files
9
+ - id: check-merge-conflict
10
+ - id: debug-statements
11
+ - id: mixed-line-ending
12
+
13
+ - repo: https://github.com/astral-sh/ruff-pre-commit
14
+ rev: v0.1.9
15
+ hooks:
16
+ - id: ruff
17
+ args: [--fix]
18
+ - id: ruff-format
19
+
20
+ - repo: https://github.com/pre-commit/mirrors-mypy
21
+ rev: v1.8.0
22
+ hooks:
23
+ - id: mypy
24
+ additional_dependencies: [types-requests, types-PyYAML]
25
+ args: [--ignore-missing-imports]
26
+ exclude: ^tests/
27
+ verbose: true
28
+ # Runs only on demand (`pre-commit run --hook-stage manual mypy`) - mypy is informational for now
29
+ # Remove the `stages` line below once type issues are fixed so mypy runs on every commit
30
+ stages: [manual]
@@ -0,0 +1,328 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [1.2.0] - 2025-11-16
9
+
10
+ ### Added - 15 Major New Features
11
+
12
+ This release represents a massive expansion of docpull's capabilities, adding 15 major features across 4 phases. Based on real-world usage pulling 1,914 files (31 MB) from Anthropic, Claude Code, Aptos, and Shelby documentation, these features enable automatic optimization reducing output to ~13 MB (58% reduction).
13
+
14
+ **Note**: All new features are backward compatible. Existing workflows continue to work unchanged.
15
+
16
+ #### Phase 1: Essential Optimizations (Top Priority Features)
17
+
18
+ **1. Language Filtering** (`--language`, `--exclude-languages`)
19
+ - Filter documentation by language code during download or post-process
20
+ - Automatic detection from URL patterns (`/en/`, `_en_`, `docs_en_`, etc.)
21
+ - **Real-world impact**: Claude Code docs downloaded in 9 languages = 352 unnecessary files for English-only users
22
+ - Example: `docpull https://code.claude.com/docs --language en`
23
+
24
+ **2. Deduplication** (`--deduplicate`, `--keep-variant`, `--remove-patterns`)
25
+ - Remove duplicate files based on SHA-256 content hash
26
+ - Keep specific variants (e.g., `mainnet` vs `testnet/devnet`)
27
+ - Configurable keep strategies: first, last, shortest, longest, pattern
28
+ - **Real-world impact**: Aptos docs had 456 Move reference files across 3 environments (2/3 duplicates = 304 files, ~10 MB saved)
29
+ - Example: `docpull https://aptos.dev --deduplicate --keep-variant mainnet`
30
+
31
+ **3. Auto-Index Generation** (`--create-index`, `--index-styles`, `--per-directory-index`)
32
+ - Generate INDEX.md with file tree, table of contents, categories, and statistics
33
+ - Per-directory indexes for nested documentation
34
+ - **Real-world impact**: Makes 1,914 files actually navigable and usable
35
+ - Index styles: tree, toc (table of contents), categories, stats
36
+ - Example: `docpull https://docs.anthropic.com --create-index`
37
+
38
+ **4. Size Limits** (`--max-file-size`, `--max-total-size`, `--size-limit-action`)
39
+ - Skip, truncate, or warn on oversized files
40
+ - Prevent runaway downloads with total size limits
41
+ - **Real-world impact**: Some REST API docs were 308 KB with full JSON responses
42
+ - Actions: skip (default), truncate (keep first N bytes), warn (log only)
43
+ - Example: `docpull https://aptos.dev --max-file-size 200kb --max-total-size 500mb`
44
+
45
+ **5. Multi-Source Configuration** (`--sources-file`, `--generate-sources-config`)
46
+ - Configure multiple documentation sources in a single YAML file
47
+ - Per-source settings for language, deduplication, size limits, etc.
48
+ - **Real-world impact**: One command instead of 4+ separate commands with manual optimization
49
+ - Repeatable, version-controlled documentation workflows
50
+ - Example: `docpull --sources-file my-docs.yaml`
51
+
52
+ #### Phase 2: Content Control
53
+
54
+ **6. Selective Crawling** (`--include-paths`, `--exclude-paths`)
55
+ - Only download URLs matching include patterns
56
+ - Skip entire branches matching exclude patterns
57
+ - Glob-style pattern matching (`*/api/*`, `*/guides/*`)
58
+ - Early termination for excluded branches (faster crawling)
59
+ - Example: `docpull https://aptos.dev --include-paths "build/guides/*" --exclude-paths "*/changelog"`
60
+
61
+ **7. Content Filtering** (`--exclude-sections`)
62
+ - Remove verbose sections by header name (Examples, Changelog, Full Response, etc.)
63
+ - Regex-based content filtering and truncation (future expansion)
64
+ - Keep schemas and reference docs, remove bloated examples
65
+ - Applied during post-processing after download
66
+ - Example: `docpull https://aptos.dev --exclude-sections "Examples" "Full Response" "Changelog"`
67
+
68
+ **8. Format Conversion** (`--format`)
69
+ - **markdown** (default): Standard markdown with YAML frontmatter
70
+ - **toon**: Terser Object Oriented Notation (40-60% size reduction, optimized for LLMs)
71
+ - **json**: Structured JSON with sections, headers, and metadata
72
+ - **sqlite**: Searchable database with FTS5 full-text search
73
+ - Example: `docpull https://docs.anthropic.com --format toon` or `--format sqlite`
74
+
75
+ **9. Smart Naming** (`--naming-strategy`)
76
+ - **full** (default): Preserve complete path structure with domain prefix
77
+ - **short**: Remove domain prefix, keep directory structure
78
+ - **flat**: Single directory with descriptive hyphenated names
79
+ - **hierarchical**: Smart hierarchy based on common documentation patterns
80
+ - Example: `docpull https://docs.anthropic.com --naming-strategy hierarchical`
81
+
82
+ #### Phase 3: Advanced Features
83
+
84
+ **10. Metadata Extraction** (`--extract-metadata`)
85
+ - Extract titles, URLs, word counts, categories, last updated dates
86
+ - Aggregate statistics: total files, total size, file types, categories
87
+ - Output to metadata.json for analysis, search indexing, or documentation audits
88
+ - Example: `docpull https://docs.anthropic.com --extract-metadata`
89
+
90
+ **11. Update Detection** (`--check-updates`, `--update-only-changed`)
91
+ - Check which files have changed without downloading
92
+ - Only fetch modified files based on checksums, ETags, Last-Modified headers
93
+ - Manifest tracking with automatic cache management
94
+ - Saves bandwidth and time on regular documentation updates
95
+ - Example: `docpull https://docs.anthropic.com --update-only-changed`
96
+
97
+ **12. Incremental Mode** (`--incremental`, `--resume`, `--cache-dir`, `--clear-cache`)
98
+ - Resume interrupted downloads from checkpoint
99
+ - State persistence across sessions
100
+ - Cache directory for manifests and state files
101
+ - Essential for large documentation sets (1000+ files)
102
+ - Example: `docpull https://aptos.dev --incremental --resume`
103
+
104
+ #### Phase 4: Extensibility
105
+
106
+ **13. Hooks & Plugins** (`--post-process-hook`, `--pre-fetch-hook`)
107
+ - Python plugin system for custom processing
108
+ - Hook types: `pre_fetch`, `post_fetch`, `post_process`, `filter`
109
+ - Decorator-based hook registration (`@hook(HookType.POST_PROCESS)`)
110
+ - Load hooks from Python files
111
+ - Example: `docpull https://docs.anthropic.com --post-process-hook ./optimize.py`
112
+
113
+ **14. Git Integration** (`--git-commit`, `--git-message`, `--git-tag`, `--git-author`)
114
+ - Automatically commit documentation changes after successful fetch
115
+ - Customizable commit messages with templates (`{date}`, `{datetime}`)
116
+ - Optional tagging for versioned snapshots
117
+ - Track documentation evolution over time
118
+ - Example: `docpull --sources-file sources.yaml --git-commit --git-message "Update docs - {date}"`
119
+
120
+ **15. Archive Mode** (`--archive`, `--archive-format`, `--archive-name`)
121
+ - Create compressed archives of documentation
122
+ - Formats: tar.gz (default), tar.bz2, tar.xz, zip
123
+ - Date-stamped archives for distribution
124
+ - Single-file documentation bundles
125
+ - Example: `docpull https://docs.anthropic.com --archive --archive-format tar.gz`
126
+
127
+ ### Added - New Modules
128
+
129
+ - `docpull/processors/`: Post-processing pipeline
130
+ - `base.py`: BaseProcessor interface and ProcessorPipeline
131
+ - `language_filter.py`: Language filtering processor
132
+ - `deduplicator.py`: Deduplication processor with hash-based detection
133
+ - `size_limiter.py`: Size limit enforcement
134
+ - `content_filter.py`: Section and content filtering
135
+ - `docpull/formatters/`: Output format converters
136
+ - `base.py`: BaseFormatter interface
137
+ - `markdown.py`: Markdown formatter (default)
138
+ - `toon.py`: TOON format converter (compact for LLMs)
139
+ - `json.py`: JSON formatter with structured sections
140
+ - `sqlite.py`: SQLite database with FTS5 search
141
+ - `docpull/indexer.py`: Auto-index generation with tree/TOC/categories/stats
142
+ - `docpull/naming.py`: Smart naming strategies (full, short, flat, hierarchical)
143
+ - `docpull/metadata.py`: Metadata extraction and aggregation
144
+ - `docpull/cache.py`: Cache management for update detection and incremental fetching
145
+ - `docpull/hooks.py`: Plugin/hook system with decorator support
146
+ - `docpull/vcs.py`: Git integration (commit, tag, status, diff)
147
+ - `docpull/archive.py`: Archive creation (tarball, zip)
148
+ - `docpull/sources_config.py`: Multi-source YAML configuration with per-source settings
149
+ - Enhanced `docpull/cli.py`: Integrated all new CLI arguments with organized argument groups
150
+
151
+ ### Changed - New Required Dependencies
152
+
153
+ **IMPORTANT**: This release adds new required dependencies for enhanced functionality.
154
+
155
+ 1. **PyYAML is now a REQUIRED dependency** (was optional in v1.1.0)
156
+ - Required for `--sources-file` multi-source configuration
157
+ - Automatically installed with: `pip install --upgrade docpull`
158
+
159
+ 2. **GitPython is now a REQUIRED dependency** (new in v1.2.0)
160
+ - Required for `--git-commit` git integration features
161
+ - Automatically installed with: `pip install --upgrade docpull`
162
+
163
+ **Backward Compatibility**: All existing CLI commands and workflows continue to work. New features are purely additive.
164
+
165
+ ### Changed - Improvements
166
+
167
+ - CLI organized into logical argument groups (Multi-Source, Language Filtering, Deduplication, Size Limits, Content Filtering, Output Format, Index Generation, Metadata, Update Detection, Hooks, Git Integration, Archive Mode)
168
+ - Enhanced configuration schema to support all 15 new features
169
+ - Better error messages and validation throughout
170
+ - Structured logging with feature-specific messages
171
+ - Comprehensive documentation and examples
172
+
173
+ ### Performance Improvements
174
+
175
+ Real-world optimization results from testing with 1,914 files (31 MB):
176
+ - **Language filtering**: -352 files, 5-10 MB saved (Claude Code docs in 9 languages → English only)
177
+ - **Deduplication**: -304 files, -10 MB (Aptos Move references across 3 environments)
178
+ - **Size limits**: 3-5 MB saved (Skip verbose API examples over 200 KB)
179
+ - **Content filtering**: Additional KB savings by removing Changelog/Examples sections
180
+ - **Combined optimizations**: 31 MB → ~13 MB (58% reduction)
181
+
182
+ ### Documentation
183
+
184
+ - Comprehensive CHANGELOG with feature descriptions and real-world impact
185
+ - Updated README with all 15 features and usage examples
186
+ - Migration guide for v1.x users
187
+ - Example `sources.yaml` configuration file
188
+ - Hook development guide and examples
189
+
190
+ ### Testing
191
+
192
+ - Unit tests for all new processor modules (language_filter, deduplicator, size_limiter, content_filter)
193
+ - Unit tests for all formatters (markdown, TOON, JSON, SQLite)
194
+ - Unit tests for indexer, naming, metadata extraction, cache management
195
+ - Unit tests for hooks system, git integration, archive creation
196
+ - Integration tests for multi-source workflows
197
+ - Mock-based tests for external dependencies (git, sqlite)
198
+
199
+ ---
200
+
201
+ ## Example: What You Can Now Do
202
+
203
+ ### Before v1.2.0 (Manual Process):
204
+ ```bash
205
+ docpull https://docs.anthropic.com --output-dir ./docs/anthropic
206
+ docpull https://code.claude.com/docs --output-dir ./docs/claude-code
207
+ docpull https://aptos.dev --output-dir ./docs/aptos
208
+ docpull https://docs.shelby.xyz --output-dir ./docs/shelby
209
+ # Then manually run optimization scripts
210
+ # Result: 31 MB, 1,914 files, no navigation
211
+ ```
212
+
213
+ ### After v1.2.0 (One Command):
214
+ ```bash
215
+ docpull --sources-file docs-config.yaml
216
+ ```
217
+
218
+ **docs-config.yaml:**
219
+ ```yaml
220
+ sources:
221
+ anthropic:
222
+ url: https://docs.anthropic.com
223
+ language: en
224
+ max_file_size: 200kb
225
+ create_index: true
226
+
227
+ claude-code:
228
+ url: https://code.claude.com/docs
229
+ language: en # Skips 352 translation files!
230
+ create_index: true
231
+
232
+ aptos:
233
+ url: https://aptos.dev
234
+ deduplicate: true
235
+ keep_variant: mainnet # Skips 304 duplicates!
236
+ max_file_size: 200kb
237
+ include_paths: ["build/*"]
238
+
239
+ shelby:
240
+ url: https://docs.shelby.xyz
241
+ create_index: true
242
+
243
+ git_commit: true
244
+ git_message: "Update docs - {date}"
245
+ ```
246
+
247
+ **Result**: ~13 MB (58% smaller), all indexes created, one command, repeatable, version-controlled!
248
+
249
+ ---
250
+
251
+ ## [1.1.0] - 2025-11-14
252
+
253
+ ### Added
254
+ - `--doctor` command for diagnosing installation and dependency issues
255
+ - Checks all core dependencies (requests, beautifulsoup4, html2text, defusedxml, aiohttp, rich)
256
+ - Checks optional dependencies (PyYAML, Playwright) with installation suggestions
257
+ - Tests network connectivity
258
+ - Verifies output directory write permissions
259
+ - Works even when dependencies are missing
260
+ - `requirements.txt` file for transparent dependency listing
261
+ - Comprehensive `TROUBLESHOOTING.md` documentation with:
262
+ - Installation troubleshooting (missing dependencies, pipx issues)
263
+ - Runtime issue solutions (YAML config errors, JavaScript rendering)
264
+ - Diagnostic tools usage guide
265
+ - Common error messages reference table
266
+ - Quick reference commands
267
+
268
+ ### Changed
269
+ - Improved error handling for missing dependencies
270
+ - Early dependency checking at CLI entry point
271
+ - Clear, actionable error messages with installation instructions
272
+ - Specific recommendations for pipx, pip, and development installations
273
+ - Enhanced YAML configuration error handling
274
+ - Auto-fallback to JSON when PyYAML is not installed
275
+ - Clear error messages for YAML-related import errors
276
+ - Helpful suggestions for installing optional dependencies
277
+ - Updated README.md with:
278
+ - `--doctor` command in Quick Start section
279
+ - Reference to TROUBLESHOOTING.md
280
+ - Better troubleshooting guidance
281
+
282
+ ### Fixed
283
+ - Improved user experience when dependencies are missing (no more confusing tracebacks)
284
+ - Better handling of optional dependency errors (PyYAML, Playwright)
285
+
286
+ ## [1.0.0] - 2025-11-07
287
+
288
+ ### Added
289
+ - Initial release of docpull
290
+ - Support for fetching documentation from multiple sources:
291
+ - Stripe API documentation
292
+ - Plaid API documentation
293
+ - Next.js documentation
294
+ - D3.js documentation (devdocs.io)
295
+ - Bun runtime documentation
296
+ - Tailwind CSS documentation
297
+ - React documentation
298
+ - CLI interface with config file support (YAML/JSON)
299
+ - Parallel fetching with ThreadPoolExecutor for improved performance
300
+ - Security features:
301
+ - Path traversal protection
302
+ - XXE (XML External Entity) protection
303
+ - File size limits (50MB default)
304
+ - Redirect limits (5 hops)
305
+ - Request timeouts (30s)
306
+ - HTTPS enforcement with certificate verification
307
+ - Rate limiting to respect server resources
308
+ - Structured logging with configurable levels
309
+ - YAML frontmatter metadata in generated markdown files
310
+ - Config file generation command
311
+ - Extensible fetcher architecture for easy addition of new sources
312
+ - Comprehensive documentation and examples
313
+
314
+ ### Changed
315
+ - Cleaned up README to remove emojis and update to organization URLs
316
+ - Applied 2025 PyPI best practices to packaging configuration
317
+ - Reorganized project structure for better maintainability
318
+
319
+ ### Security
320
+ - Implemented multiple security layers for safe web scraping
321
+ - Added security scanning with Bandit and pip-audit
322
+ - Created GitHub Actions workflow for automated security checks
323
+ - Documented security features in SECURITY.md
324
+
325
+ ---
326
+
327
+ [1.2.0]: https://github.com/raintree-technology/docpull/releases/tag/v1.2.0
+ [1.1.0]: https://github.com/raintree-technology/docpull/releases/tag/v1.1.0
328
+ [1.0.0]: https://github.com/raintree-technology/docpull/releases/tag/v1.0.0
@@ -0,0 +1,189 @@
1
+ # Contributing to docpull
2
+
3
+ Thank you for your interest in contributing to docpull! This document provides guidelines and instructions for contributing.
4
+
5
+ ## Development Setup
6
+
7
+ 1. **Fork and clone the repository**
8
+
9
+ ```bash
10
+ git clone https://github.com/YOUR_USERNAME/docpull.git
11
+ cd docpull
12
+ ```
13
+
14
+ 2. **Set up development environment**
15
+
16
+ ```bash
17
+ # Create virtual environment
18
+ python -m venv .venv
19
+ source .venv/bin/activate # On Windows: .venv\Scripts\activate
20
+
21
+ # Install with development dependencies
22
+ pip install -e ".[dev]"
23
+
24
+ # Install pre-commit hooks
25
+ pre-commit install
26
+ ```
27
+
28
+ 3. **Verify setup**
29
+
30
+ ```bash
31
+ # Run tests
32
+ make test
33
+
34
+ # Run linting
35
+ make lint
36
+
37
+ # Run formatting
38
+ make format
39
+ ```
40
+
41
+ ## Development Workflow
42
+
43
+ 1. **Create a feature branch**
44
+
45
+ ```bash
46
+ git checkout -b feature/your-feature-name
47
+ # or
48
+ git checkout -b fix/your-bug-fix
49
+ ```
50
+
51
+ 2. **Make your changes**
52
+
53
+ - Write clear, readable code
54
+ - Follow existing code style (enforced by pre-commit hooks)
55
+ - Add tests for new functionality
56
+ - Update documentation as needed
57
+ - Update CHANGELOG.md with your changes
58
+
59
+ 3. **Run tests and linting**
60
+
61
+ ```bash
62
+ # Run all tests
63
+ make test
64
+
65
+ # Run linting
66
+ make lint
67
+
68
+ # Format code
69
+ make format
70
+
71
+ # Clean artifacts
72
+ make clean
73
+ ```
74
+
75
+ 4. **Commit your changes**
76
+
77
+ ```bash
78
+ git add .
79
+ git commit -m "feat: add new feature"
80
+ # or
81
+ git commit -m "fix: resolve bug in X"
82
+ ```
83
+
84
+ Use [Conventional Commits](https://www.conventionalcommits.org/) format:
85
+ - `feat:` - New features
86
+ - `fix:` - Bug fixes
87
+ - `docs:` - Documentation changes
88
+ - `test:` - Test additions/changes
89
+ - `refactor:` - Code refactoring
90
+ - `chore:` - Maintenance tasks
91
+ - `ci:` - CI/CD changes
92
+
93
+ 5. **Push and create a pull request**
94
+
95
+ ```bash
96
+ git push origin feature/your-feature-name
97
+ ```
98
+
99
+ Then open a pull request on GitHub.
100
+
101
+ ## Code Style
102
+
103
+ - **Python**: Follow PEP 8 (enforced by Ruff and Black)
104
+ - **Line length**: 110 characters
105
+ - **Type hints**: Required for all functions
106
+ - **Docstrings**: Required for public APIs
107
+
108
+ Pre-commit hooks will automatically:
109
+ - Fix trailing whitespace
110
+ - Lint and format code with Ruff (`ruff --fix` and `ruff-format`)
111
+ - Sort imports
112
+ - Check for common issues
113
+
114
+ ## Testing
115
+
116
+ - Write tests for all new features
117
+ - Maintain or improve test coverage
118
+ - Run tests with: `make test`
119
+ - Check coverage with: `pytest --cov=docpull --cov-report=html`
120
+
121
+ ## Documentation
122
+
123
+ - Update README.md for user-facing changes
124
+ - Update CHANGELOG.md with all changes
125
+ - Add docstrings to new functions/classes
126
+ - Update TROUBLESHOOTING.md for common issues
127
+
128
+ ## Pull Request Process
129
+
130
+ 1. **Ensure all tests pass**
131
+ 2. **Update CHANGELOG.md** with your changes
132
+ 3. **Update documentation** as needed
133
+ 4. **Fill out the PR template** completely
134
+ 5. **Wait for review** - maintainers will review your PR
135
+ 6. **Address feedback** - make requested changes
136
+ 7. **Merge** - once approved, maintainers will merge
137
+
138
+ ## Release Process
139
+
140
+ Releases are automated via GitHub Actions:
141
+
142
+ 1. Maintainer triggers release workflow
143
+ 2. Workflow updates version numbers
144
+ 3. Workflow updates CHANGELOG.md
145
+ 4. Workflow creates git tag
146
+ 5. Workflow creates GitHub release
147
+ 6. Publish workflow automatically deploys to PyPI
148
+
149
+ ## Adding a New Fetcher
150
+
151
+ To add support for a new documentation source:
152
+
153
+ 1. Create `docpull/fetchers/yoursite.py`:
154
+
155
+ ```python
156
+ from .base import BaseFetcher
157
+
158
+ class YourSiteFetcher(BaseFetcher):
159
+ """Fetch documentation from YourSite."""
160
+
161
+ def __init__(self, output_dir: str = "yoursite-docs"):
162
+ super().__init__(output_dir)
163
+ self.base_url = "https://docs.yoursite.com"
164
+
165
+ def fetch_all(self) -> None:
166
+ # Implement fetching logic
167
+ pass
168
+ ```
169
+
170
+ 2. Register in `docpull/__init__.py`
171
+ 3. Add tests in `tests/test_yoursite.py`
172
+ 4. Update README.md with usage example
173
+ 5. Update CHANGELOG.md
174
+
175
+ ## Getting Help
176
+
177
+ - Open an issue for bugs or feature requests
178
+ - Check existing issues before creating new ones
179
+ - Use `docpull --doctor` to diagnose issues
180
+ - See TROUBLESHOOTING.md for common problems
181
+
182
+ ## Code of Conduct
183
+
184
+ - Be respectful and inclusive
185
+ - Provide constructive feedback
186
+ - Focus on the code, not the person
187
+ - Help others learn and grow
188
+
189
+ Thank you for contributing to docpull!
@@ -0,0 +1,49 @@
1
+ # Include documentation files
2
+ include README.md
3
+ include LICENSE
4
+ include CHANGELOG.md
5
+ include SECURITY.md
6
+ include CONTRIBUTING.md
7
+ include TROUBLESHOOTING.md
8
+
9
+ # Include configuration files
10
+ include pyproject.toml
11
+ include requirements.txt
12
+ include Makefile
13
+ include .editorconfig
14
+
15
+ # Include pre-commit configuration
16
+ include .pre-commit-config.yaml
17
+
18
+ # Include example configurations
19
+ recursive-include examples *.yaml
20
+ include examples/README.md
21
+
22
+ # Include type stub marker
23
+ include docpull/py.typed
24
+
25
+ # Include all Python files in profiles
26
+ recursive-include docpull/profiles *.py
27
+
28
+ # Exclude unnecessary files
29
+ global-exclude __pycache__
30
+ global-exclude *.py[cod]
31
+ global-exclude *~
32
+ global-exclude .DS_Store
33
+ global-exclude *.swp
34
+ global-exclude *.swo
35
+
36
+ # Exclude test artifacts and caches
37
+ prune tests/__pycache__
38
+ prune .pytest_cache
39
+ prune .mypy_cache
40
+ prune .ruff_cache
41
+ prune htmlcov
42
+ prune .docpull-cache
43
+ prune .git
44
+ prune .github
45
+
46
+ # Exclude build artifacts
47
+ prune build
48
+ prune dist
49
+ prune *.egg-info
docpull-1.2.1/Makefile ADDED
@@ -0,0 +1,44 @@
1
+ .PHONY: clean clean-pyc clean-build clean-test help
2
+
3
+ help:
4
+ @echo "clean - remove all build, test, coverage and Python artifacts"
5
+ @echo "clean-build - remove build artifacts"
6
+ @echo "clean-pyc - remove Python file artifacts"
7
+ @echo "clean-test - remove test and coverage artifacts"
8
+ @echo "test - run tests with pytest"
9
+ @echo "lint - check style with ruff"
10
+ @echo "format - format code with black"
11
+
12
+ clean: clean-build clean-pyc clean-test
13
+
14
+ clean-build:
15
+ rm -rf build/
16
+ rm -rf dist/
17
+ rm -rf .eggs/
18
+ find . -path ./.venv -prune -o -name '*.egg-info' -exec rm -rf {} + || true
19
+ find . -path ./.venv -prune -o -name '*.egg' -exec rm -f {} + || true
20
+
21
+ clean-pyc:
22
+ find . -path ./.venv -prune -o -name '*.pyc' -exec rm -f {} + || true
23
+ find . -path ./.venv -prune -o -name '*.pyo' -exec rm -f {} + || true
24
+ find . -path ./.venv -prune -o -name '*~' -exec rm -f {} + || true
25
+ find . -path ./.venv -prune -o -name '__pycache__' -exec rm -rf {} + || true
26
+
27
+ clean-test:
28
+ rm -rf .pytest_cache/
29
+ rm -rf .mypy_cache/
30
+ rm -rf .ruff_cache/
31
+ rm -rf htmlcov/
32
+ rm -rf .coverage
33
+ rm -rf coverage.xml
34
+ rm -rf docs/
35
+ rm -rf test-docs/
36
+
37
+ test:
38
+ pytest
39
+
40
+ lint:
41
+ ruff check .
42
+
43
+ format:
44
+ black .
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docpull
3
- Version: 1.2.0
3
+ Version: 1.2.1
4
4
  Summary: Pull documentation from the web and convert to clean markdown
5
5
  Author-email: Zachary Roth <support@raintree.technology>
6
6
  Maintainer-email: Raintree Technology <support@raintree.technology>
@@ -33,6 +33,7 @@ Classifier: Programming Language :: Python :: 3.10
33
33
  Classifier: Programming Language :: Python :: 3.11
34
34
  Classifier: Programming Language :: Python :: 3.12
35
35
  Classifier: Programming Language :: Python :: 3.13
36
+ Classifier: Programming Language :: Python :: 3.14
36
37
  Classifier: Programming Language :: Python :: 3 :: Only
37
38
  Classifier: Typing :: Typed
38
39
  Requires-Python: >=3.9