docpull 1.2.0__tar.gz → 1.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docpull-1.2.1/.editorconfig +30 -0
- docpull-1.2.1/.pre-commit-config.yaml +30 -0
- docpull-1.2.1/CHANGELOG.md +328 -0
- docpull-1.2.1/CONTRIBUTING.md +189 -0
- docpull-1.2.1/MANIFEST.in +49 -0
- docpull-1.2.1/Makefile +44 -0
- {docpull-1.2.0 → docpull-1.2.1}/PKG-INFO +2 -1
- docpull-1.2.1/SECURITY.md +206 -0
- docpull-1.2.1/TROUBLESHOOTING.md +348 -0
- {docpull-1.2.0 → docpull-1.2.1}/docpull/archive.py +1 -1
- {docpull-1.2.0 → docpull-1.2.1}/docpull/cache.py +58 -26
- {docpull-1.2.0 → docpull-1.2.1}/docpull/cli.py +4 -4
- {docpull-1.2.0 → docpull-1.2.1}/docpull/fetchers/base.py +2 -1
- docpull-1.2.1/docpull/fetchers/nextjs.py +59 -0
- {docpull-1.2.0 → docpull-1.2.1}/docpull/fetchers/plaid.py +10 -13
- {docpull-1.2.0 → docpull-1.2.1}/docpull/fetchers/stripe.py +3 -14
- {docpull-1.2.0 → docpull-1.2.1}/docpull/formatters/__init__.py +10 -4
- {docpull-1.2.0 → docpull-1.2.1}/docpull/formatters/base.py +34 -5
- {docpull-1.2.0 → docpull-1.2.1}/docpull/formatters/json.py +12 -10
- {docpull-1.2.0 → docpull-1.2.1}/docpull/formatters/markdown.py +4 -2
- {docpull-1.2.0 → docpull-1.2.1}/docpull/formatters/sqlite.py +12 -10
- {docpull-1.2.0 → docpull-1.2.1}/docpull/formatters/toon.py +5 -3
- {docpull-1.2.0 → docpull-1.2.1}/docpull/hooks.py +12 -7
- {docpull-1.2.0 → docpull-1.2.1}/docpull/indexer.py +42 -19
- {docpull-1.2.0 → docpull-1.2.1}/docpull/metadata.py +53 -13
- {docpull-1.2.0 → docpull-1.2.1}/docpull/orchestrator.py +11 -10
- {docpull-1.2.0 → docpull-1.2.1}/docpull/processors/__init__.py +2 -1
- {docpull-1.2.0 → docpull-1.2.1}/docpull/processors/base.py +7 -6
- {docpull-1.2.0 → docpull-1.2.1}/docpull/processors/content_filter.py +51 -8
- {docpull-1.2.0 → docpull-1.2.1}/docpull/processors/deduplicator.py +33 -13
- {docpull-1.2.0 → docpull-1.2.1}/docpull/processors/language_filter.py +33 -22
- {docpull-1.2.0 → docpull-1.2.1}/docpull/processors/size_limiter.py +19 -12
- {docpull-1.2.0 → docpull-1.2.1}/docpull/sources_config.py +194 -9
- {docpull-1.2.0 → docpull-1.2.1}/docpull/vcs.py +1 -1
- {docpull-1.2.0 → docpull-1.2.1}/docpull.egg-info/SOURCES.txt +16 -10
- docpull-1.2.1/examples/README.md +280 -0
- docpull-1.2.1/examples/deduplication-strategies.yaml +29 -0
- docpull-1.2.1/examples/format-conversion.yaml +25 -0
- docpull-1.2.1/examples/incremental-updates.yaml +26 -0
- docpull-1.2.1/examples/multi-source-optimized.yaml +45 -0
- docpull-1.2.1/examples/selective-crawling.yaml +26 -0
- docpull-1.2.1/examples/simple-optimization.yaml +14 -0
- {docpull-1.2.0 → docpull-1.2.1}/pyproject.toml +3 -2
- docpull-1.2.1/requirements.txt +34 -0
- {docpull-1.2.0 → docpull-1.2.1}/tests/test_config.py +0 -3
- {docpull-1.2.0 → docpull-1.2.1}/tests/test_orchestrator.py +2 -1
- docpull-1.2.0/docpull/fetchers/nextjs.py +0 -50
- docpull-1.2.0/docpull.egg-info/PKG-INFO +0 -394
- docpull-1.2.0/docpull.egg-info/dependency_links.txt +0 -1
- docpull-1.2.0/docpull.egg-info/entry_points.txt +0 -2
- docpull-1.2.0/docpull.egg-info/requires.txt +0 -29
- docpull-1.2.0/docpull.egg-info/top_level.txt +0 -1
- docpull-1.2.0/tests/test_async_fetcher.py +0 -147
- docpull-1.2.0/tests/test_fetchers.py +0 -57
- docpull-1.2.0/tests/test_formatters.py +0 -276
- docpull-1.2.0/tests/test_processors.py +0 -424
- {docpull-1.2.0 → docpull-1.2.1}/LICENSE +0 -0
- {docpull-1.2.0 → docpull-1.2.1}/README.md +0 -0
- {docpull-1.2.0 → docpull-1.2.1}/docpull/__init__.py +0 -0
- {docpull-1.2.0 → docpull-1.2.1}/docpull/__main__.py +0 -0
- {docpull-1.2.0 → docpull-1.2.1}/docpull/config.py +0 -0
- {docpull-1.2.0 → docpull-1.2.1}/docpull/doctor.py +0 -0
- {docpull-1.2.0 → docpull-1.2.1}/docpull/fetchers/__init__.py +0 -0
- {docpull-1.2.0 → docpull-1.2.1}/docpull/fetchers/async_fetcher.py +0 -0
- {docpull-1.2.0 → docpull-1.2.1}/docpull/fetchers/bun.py +0 -0
- {docpull-1.2.0 → docpull-1.2.1}/docpull/fetchers/d3.py +0 -0
- {docpull-1.2.0 → docpull-1.2.1}/docpull/fetchers/generic.py +0 -0
- {docpull-1.2.0 → docpull-1.2.1}/docpull/fetchers/generic_async.py +0 -0
- {docpull-1.2.0 → docpull-1.2.1}/docpull/fetchers/parallel_base.py +0 -0
- {docpull-1.2.0 → docpull-1.2.1}/docpull/fetchers/react.py +0 -0
- {docpull-1.2.0 → docpull-1.2.1}/docpull/fetchers/tailwind.py +0 -0
- {docpull-1.2.0 → docpull-1.2.1}/docpull/fetchers/turborepo.py +0 -0
- {docpull-1.2.0 → docpull-1.2.1}/docpull/naming.py +0 -0
- {docpull-1.2.0 → docpull-1.2.1}/docpull/profiles/__init__.py +0 -0
- {docpull-1.2.0 → docpull-1.2.1}/docpull/profiles/base.py +0 -0
- {docpull-1.2.0 → docpull-1.2.1}/docpull/profiles/bun.py +0 -0
- {docpull-1.2.0 → docpull-1.2.1}/docpull/profiles/d3.py +0 -0
- {docpull-1.2.0 → docpull-1.2.1}/docpull/profiles/nextjs.py +0 -0
- {docpull-1.2.0 → docpull-1.2.1}/docpull/profiles/plaid.py +0 -0
- {docpull-1.2.0 → docpull-1.2.1}/docpull/profiles/react.py +0 -0
- {docpull-1.2.0 → docpull-1.2.1}/docpull/profiles/stripe.py +0 -0
- {docpull-1.2.0 → docpull-1.2.1}/docpull/profiles/tailwind.py +0 -0
- {docpull-1.2.0 → docpull-1.2.1}/docpull/profiles/turborepo.py +0 -0
- {docpull-1.2.0 → docpull-1.2.1}/docpull/py.typed +0 -0
- {docpull-1.2.0 → docpull-1.2.1}/docpull/utils/__init__.py +0 -0
- {docpull-1.2.0 → docpull-1.2.1}/docpull/utils/file_utils.py +0 -0
- {docpull-1.2.0 → docpull-1.2.1}/docpull/utils/logging_config.py +0 -0
- {docpull-1.2.0 → docpull-1.2.1}/setup.cfg +0 -0
- {docpull-1.2.0 → docpull-1.2.1}/tests/test_sources_config.py +0 -0
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# EditorConfig helps maintain consistent coding styles
|
|
2
|
+
# https://editorconfig.org
|
|
3
|
+
|
|
4
|
+
root = true
|
|
5
|
+
|
|
6
|
+
[*]
|
|
7
|
+
charset = utf-8
|
|
8
|
+
end_of_line = lf
|
|
9
|
+
insert_final_newline = true
|
|
10
|
+
trim_trailing_whitespace = true
|
|
11
|
+
|
|
12
|
+
[*.{py,pyi}]
|
|
13
|
+
indent_style = space
|
|
14
|
+
indent_size = 4
|
|
15
|
+
max_line_length = 110
|
|
16
|
+
|
|
17
|
+
[*.{yml,yaml}]
|
|
18
|
+
indent_style = space
|
|
19
|
+
indent_size = 2
|
|
20
|
+
|
|
21
|
+
[*.{json,toml}]
|
|
22
|
+
indent_style = space
|
|
23
|
+
indent_size = 2
|
|
24
|
+
|
|
25
|
+
[*.md]
|
|
26
|
+
trim_trailing_whitespace = false
|
|
27
|
+
max_line_length = off
|
|
28
|
+
|
|
29
|
+
[Makefile]
|
|
30
|
+
indent_style = tab
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
repos:
|
|
2
|
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
|
3
|
+
rev: v4.5.0
|
|
4
|
+
hooks:
|
|
5
|
+
- id: trailing-whitespace
|
|
6
|
+
- id: end-of-file-fixer
|
|
7
|
+
- id: check-yaml
|
|
8
|
+
- id: check-added-large-files
|
|
9
|
+
- id: check-merge-conflict
|
|
10
|
+
- id: debug-statements
|
|
11
|
+
- id: mixed-line-ending
|
|
12
|
+
|
|
13
|
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
14
|
+
rev: v0.1.9
|
|
15
|
+
hooks:
|
|
16
|
+
- id: ruff
|
|
17
|
+
args: [--fix]
|
|
18
|
+
- id: ruff-format
|
|
19
|
+
|
|
20
|
+
- repo: https://github.com/pre-commit/mirrors-mypy
|
|
21
|
+
rev: v1.8.0
|
|
22
|
+
hooks:
|
|
23
|
+
- id: mypy
|
|
24
|
+
additional_dependencies: [types-requests, types-PyYAML]
|
|
25
|
+
args: [--ignore-missing-imports]
|
|
26
|
+
exclude: ^tests/
|
|
27
|
+
verbose: true
|
|
28
|
+
# Run mypy only on demand for now (manual stage) - mypy is informational
|
|
29
|
+
# Remove the `stages: [manual]` line below once type issues are fixed
|
|
30
|
+
stages: [manual]
|
|
@@ -0,0 +1,328 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [1.2.0] - 2025-11-16
|
|
9
|
+
|
|
10
|
+
### Added - 15 Major New Features
|
|
11
|
+
|
|
12
|
+
This release represents a massive expansion of docpull's capabilities, adding 15 major features across 4 phases. Based on real-world usage pulling 1,914 files (31 MB) from Anthropic, Claude Code, Aptos, and Shelby documentation, these features enable automatic optimization reducing output to ~13 MB (58% reduction).
|
|
13
|
+
|
|
14
|
+
**Note**: All new features are backward compatible. Existing workflows continue to work unchanged.
|
|
15
|
+
|
|
16
|
+
#### Phase 1: Essential Optimizations (Top Priority Features)
|
|
17
|
+
|
|
18
|
+
**1. Language Filtering** (`--language`, `--exclude-languages`)
|
|
19
|
+
- Filter documentation by language code during download or post-process
|
|
20
|
+
- Automatic detection from URL patterns (`/en/`, `_en_`, `docs_en_`, etc.)
|
|
21
|
+
- **Real-world impact**: Claude Code docs downloaded in 9 languages = 352 unnecessary files for English-only users
|
|
22
|
+
- Example: `docpull https://code.claude.com/docs --language en`
|
|
23
|
+
|
|
24
|
+
**2. Deduplication** (`--deduplicate`, `--keep-variant`, `--remove-patterns`)
|
|
25
|
+
- Remove duplicate files based on SHA-256 content hash
|
|
26
|
+
- Keep specific variants (e.g., `mainnet` vs `testnet/devnet`)
|
|
27
|
+
- Configurable keep strategies: first, last, shortest, longest, pattern
|
|
28
|
+
- **Real-world impact**: Aptos docs had 456 Move reference files across 3 environments (2/3 duplicates = 304 files, ~10 MB saved)
|
|
29
|
+
- Example: `docpull https://aptos.dev --deduplicate --keep-variant mainnet`
|
|
30
|
+
|
|
31
|
+
**3. Auto-Index Generation** (`--create-index`, `--index-styles`, `--per-directory-index`)
|
|
32
|
+
- Generate INDEX.md with file tree, table of contents, categories, and statistics
|
|
33
|
+
- Per-directory indexes for nested documentation
|
|
34
|
+
- **Real-world impact**: Makes 1,914 files actually navigable and usable
|
|
35
|
+
- Index styles: tree, toc (table of contents), categories, stats
|
|
36
|
+
- Example: `docpull https://docs.anthropic.com --create-index`
|
|
37
|
+
|
|
38
|
+
**4. Size Limits** (`--max-file-size`, `--max-total-size`, `--size-limit-action`)
|
|
39
|
+
- Skip, truncate, or warn on oversized files
|
|
40
|
+
- Prevent runaway downloads with total size limits
|
|
41
|
+
- **Real-world impact**: Some REST API docs were 308 KB with full JSON responses
|
|
42
|
+
- Actions: skip (default), truncate (keep first N bytes), warn (log only)
|
|
43
|
+
- Example: `docpull https://aptos.dev --max-file-size 200kb --max-total-size 500mb`
|
|
44
|
+
|
|
45
|
+
**5. Multi-Source Configuration** (`--sources-file`, `--generate-sources-config`)
|
|
46
|
+
- Configure multiple documentation sources in a single YAML file
|
|
47
|
+
- Per-source settings for language, deduplication, size limits, etc.
|
|
48
|
+
- **Real-world impact**: One command instead of 4+ separate commands with manual optimization
|
|
49
|
+
- Repeatable, version-controlled documentation workflows
|
|
50
|
+
- Example: `docpull --sources-file my-docs.yaml`
|
|
51
|
+
|
|
52
|
+
#### Phase 2: Content Control
|
|
53
|
+
|
|
54
|
+
**6. Selective Crawling** (`--include-paths`, `--exclude-paths`)
|
|
55
|
+
- Only download URLs matching include patterns
|
|
56
|
+
- Skip entire branches matching exclude patterns
|
|
57
|
+
- Glob-style pattern matching (`*/api/*`, `*/guides/*`)
|
|
58
|
+
- Early termination for excluded branches (faster crawling)
|
|
59
|
+
- Example: `docpull https://aptos.dev --include-paths "build/guides/*" --exclude-paths "*/changelog"`
|
|
60
|
+
|
|
61
|
+
**7. Content Filtering** (`--exclude-sections`)
|
|
62
|
+
- Remove verbose sections by header name (Examples, Changelog, Full Response, etc.)
|
|
63
|
+
- Regex-based content filtering and truncation (future expansion)
|
|
64
|
+
- Keep schemas and reference docs, remove bloated examples
|
|
65
|
+
- Applied during post-processing after download
|
|
66
|
+
- Example: `docpull https://aptos.dev --exclude-sections "Examples" "Full Response" "Changelog"`
|
|
67
|
+
|
|
68
|
+
**8. Format Conversion** (`--format`)
|
|
69
|
+
- **markdown** (default): Standard markdown with YAML frontmatter
|
|
70
|
+
- **toon**: Terser Object Oriented Notation (40-60% size reduction, optimized for LLMs)
|
|
71
|
+
- **json**: Structured JSON with sections, headers, and metadata
|
|
72
|
+
- **sqlite**: Searchable database with FTS5 full-text search
|
|
73
|
+
- Example: `docpull https://docs.anthropic.com --format toon` or `--format sqlite`
|
|
74
|
+
|
|
75
|
+
**9. Smart Naming** (`--naming-strategy`)
|
|
76
|
+
- **full** (default): Preserve complete path structure with domain prefix
|
|
77
|
+
- **short**: Remove domain prefix, keep directory structure
|
|
78
|
+
- **flat**: Single directory with descriptive hyphenated names
|
|
79
|
+
- **hierarchical**: Smart hierarchy based on common documentation patterns
|
|
80
|
+
- Example: `docpull https://docs.anthropic.com --naming-strategy hierarchical`
|
|
81
|
+
|
|
82
|
+
#### Phase 3: Advanced Features
|
|
83
|
+
|
|
84
|
+
**10. Metadata Extraction** (`--extract-metadata`)
|
|
85
|
+
- Extract titles, URLs, word counts, categories, last updated dates
|
|
86
|
+
- Aggregate statistics: total files, total size, file types, categories
|
|
87
|
+
- Output to metadata.json for analysis, search indexing, or documentation audits
|
|
88
|
+
- Example: `docpull https://docs.anthropic.com --extract-metadata`
|
|
89
|
+
|
|
90
|
+
**11. Update Detection** (`--check-updates`, `--update-only-changed`)
|
|
91
|
+
- Check which files have changed without downloading
|
|
92
|
+
- Only fetch modified files based on checksums, ETags, Last-Modified headers
|
|
93
|
+
- Manifest tracking with automatic cache management
|
|
94
|
+
- Saves bandwidth and time on regular documentation updates
|
|
95
|
+
- Example: `docpull https://docs.anthropic.com --update-only-changed`
|
|
96
|
+
|
|
97
|
+
**12. Incremental Mode** (`--incremental`, `--resume`, `--cache-dir`, `--clear-cache`)
|
|
98
|
+
- Resume interrupted downloads from checkpoint
|
|
99
|
+
- State persistence across sessions
|
|
100
|
+
- Cache directory for manifests and state files
|
|
101
|
+
- Essential for large documentation sets (1000+ files)
|
|
102
|
+
- Example: `docpull https://aptos.dev --incremental --resume`
|
|
103
|
+
|
|
104
|
+
#### Phase 4: Extensibility
|
|
105
|
+
|
|
106
|
+
**13. Hooks & Plugins** (`--post-process-hook`, `--pre-fetch-hook`)
|
|
107
|
+
- Python plugin system for custom processing
|
|
108
|
+
- Hook types: `pre_fetch`, `post_fetch`, `post_process`, `filter`
|
|
109
|
+
- Decorator-based hook registration (`@hook(HookType.POST_PROCESS)`)
|
|
110
|
+
- Load hooks from Python files
|
|
111
|
+
- Example: `docpull https://docs.anthropic.com --post-process-hook ./optimize.py`
|
|
112
|
+
|
|
113
|
+
**14. Git Integration** (`--git-commit`, `--git-message`, `--git-tag`, `--git-author`)
|
|
114
|
+
- Automatically commit documentation changes after successful fetch
|
|
115
|
+
- Customizable commit messages with templates (`{date}`, `{datetime}`)
|
|
116
|
+
- Optional tagging for versioned snapshots
|
|
117
|
+
- Track documentation evolution over time
|
|
118
|
+
- Example: `docpull --sources-file sources.yaml --git-commit --git-message "Update docs - {date}"`
|
|
119
|
+
|
|
120
|
+
**15. Archive Mode** (`--archive`, `--archive-format`, `--archive-name`)
|
|
121
|
+
- Create compressed archives of documentation
|
|
122
|
+
- Formats: tar.gz (default), tar.bz2, tar.xz, zip
|
|
123
|
+
- Date-stamped archives for distribution
|
|
124
|
+
- Single-file documentation bundles
|
|
125
|
+
- Example: `docpull https://docs.anthropic.com --archive --archive-format tar.gz`
|
|
126
|
+
|
|
127
|
+
### Added - New Modules
|
|
128
|
+
|
|
129
|
+
- `docpull/processors/`: Post-processing pipeline
|
|
130
|
+
- `base.py`: BaseProcessor interface and ProcessorPipeline
|
|
131
|
+
- `language_filter.py`: Language filtering processor
|
|
132
|
+
- `deduplicator.py`: Deduplication processor with hash-based detection
|
|
133
|
+
- `size_limiter.py`: Size limit enforcement
|
|
134
|
+
- `content_filter.py`: Section and content filtering
|
|
135
|
+
- `docpull/formatters/`: Output format converters
|
|
136
|
+
- `base.py`: BaseFormatter interface
|
|
137
|
+
- `markdown.py`: Markdown formatter (default)
|
|
138
|
+
- `toon.py`: TOON format converter (compact for LLMs)
|
|
139
|
+
- `json.py`: JSON formatter with structured sections
|
|
140
|
+
- `sqlite.py`: SQLite database with FTS5 search
|
|
141
|
+
- `docpull/indexer.py`: Auto-index generation with tree/TOC/categories/stats
|
|
142
|
+
- `docpull/naming.py`: Smart naming strategies (full, short, flat, hierarchical)
|
|
143
|
+
- `docpull/metadata.py`: Metadata extraction and aggregation
|
|
144
|
+
- `docpull/cache.py`: Cache management for update detection and incremental fetching
|
|
145
|
+
- `docpull/hooks.py`: Plugin/hook system with decorator support
|
|
146
|
+
- `docpull/vcs.py`: Git integration (commit, tag, status, diff)
|
|
147
|
+
- `docpull/archive.py`: Archive creation (tarball, zip)
|
|
148
|
+
- `docpull/sources_config.py`: Multi-source YAML configuration with per-source settings
|
|
149
|
+
- Enhanced `docpull/cli.py`: Integrated all new CLI arguments with organized argument groups
|
|
150
|
+
|
|
151
|
+
### Changed - New Required Dependencies
|
|
152
|
+
|
|
153
|
+
**IMPORTANT**: This release adds new required dependencies for enhanced functionality.
|
|
154
|
+
|
|
155
|
+
1. **PyYAML is now a REQUIRED dependency** (was optional in v1.1.0)
|
|
156
|
+
- Required for `--sources-file` multi-source configuration
|
|
157
|
+
- Automatically installed with: `pip install --upgrade docpull`
|
|
158
|
+
|
|
159
|
+
2. **GitPython is now a REQUIRED dependency** (new in v1.2.0)
|
|
160
|
+
- Required for `--git-commit` git integration features
|
|
161
|
+
- Automatically installed with: `pip install --upgrade docpull`
|
|
162
|
+
|
|
163
|
+
**Backward Compatibility**: All existing CLI commands and workflows continue to work. New features are purely additive.
|
|
164
|
+
|
|
165
|
+
### Changed - Improvements
|
|
166
|
+
|
|
167
|
+
- CLI organized into logical argument groups (Multi-Source, Language Filtering, Deduplication, Size Limits, Content Filtering, Output Format, Index Generation, Metadata, Update Detection, Hooks, Git Integration, Archive Mode)
|
|
168
|
+
- Enhanced configuration schema to support all 15 new features
|
|
169
|
+
- Better error messages and validation throughout
|
|
170
|
+
- Structured logging with feature-specific messages
|
|
171
|
+
- Comprehensive documentation and examples
|
|
172
|
+
|
|
173
|
+
### Performance Improvements
|
|
174
|
+
|
|
175
|
+
Real-world optimization results from testing with 1,914 files (31 MB):
|
|
176
|
+
- **Language filtering**: -352 files, 5-10 MB saved (Claude Code docs in 9 languages → English only)
|
|
177
|
+
- **Deduplication**: -304 files, -10 MB (Aptos Move references across 3 environments)
|
|
178
|
+
- **Size limits**: 3-5 MB saved (skips verbose API examples over 200 KB)
|
|
179
|
+
- **Content filtering**: Additional KB savings by removing Changelog/Examples sections
|
|
180
|
+
- **Combined optimizations**: 31 MB → ~13 MB (58% reduction)
|
|
181
|
+
|
|
182
|
+
### Documentation
|
|
183
|
+
|
|
184
|
+
- Comprehensive CHANGELOG with feature descriptions and real-world impact
|
|
185
|
+
- Updated README with all 15 features and usage examples
|
|
186
|
+
- Migration guide for v1.x users
|
|
187
|
+
- Example `sources.yaml` configuration file
|
|
188
|
+
- Hook development guide and examples
|
|
189
|
+
|
|
190
|
+
### Testing
|
|
191
|
+
|
|
192
|
+
- Unit tests for all new processor modules (language_filter, deduplicator, size_limiter, content_filter)
|
|
193
|
+
- Unit tests for all formatters (markdown, TOON, JSON, SQLite)
|
|
194
|
+
- Unit tests for indexer, naming, metadata extraction, cache management
|
|
195
|
+
- Unit tests for hooks system, git integration, archive creation
|
|
196
|
+
- Integration tests for multi-source workflows
|
|
197
|
+
- Mock-based tests for external dependencies (git, sqlite)
|
|
198
|
+
|
|
199
|
+
---
|
|
200
|
+
|
|
201
|
+
## Example: What You Can Now Do
|
|
202
|
+
|
|
203
|
+
### Before v1.2.0 (Manual Process):
|
|
204
|
+
```bash
|
|
205
|
+
docpull https://docs.anthropic.com --output-dir ./docs/anthropic
|
|
206
|
+
docpull https://code.claude.com/docs --output-dir ./docs/claude-code
|
|
207
|
+
docpull https://aptos.dev --output-dir ./docs/aptos
|
|
208
|
+
docpull https://shelby.xyz --output-dir ./docs/shelby
|
|
209
|
+
# Then manually run optimization scripts
|
|
210
|
+
# Result: 31 MB, 1,914 files, no navigation
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
### After v1.2.0 (One Command):
|
|
214
|
+
```bash
|
|
215
|
+
docpull --sources-file docs-config.yaml
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
**docs-config.yaml:**
|
|
219
|
+
```yaml
|
|
220
|
+
sources:
|
|
221
|
+
anthropic:
|
|
222
|
+
url: https://docs.anthropic.com
|
|
223
|
+
language: en
|
|
224
|
+
max_file_size: 200kb
|
|
225
|
+
create_index: true
|
|
226
|
+
|
|
227
|
+
claude-code:
|
|
228
|
+
url: https://code.claude.com/docs
|
|
229
|
+
language: en # Skips 352 translation files!
|
|
230
|
+
create_index: true
|
|
231
|
+
|
|
232
|
+
aptos:
|
|
233
|
+
url: https://aptos.dev
|
|
234
|
+
deduplicate: true
|
|
235
|
+
keep_variant: mainnet # Skips 304 duplicates!
|
|
236
|
+
max_file_size: 200kb
|
|
237
|
+
include_paths: ["build/*"]
|
|
238
|
+
|
|
239
|
+
shelby:
|
|
240
|
+
url: https://docs.shelby.xyz
|
|
241
|
+
create_index: true
|
|
242
|
+
|
|
243
|
+
git_commit: true
|
|
244
|
+
git_message: "Update docs - {date}"
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
**Result**: ~13 MB (58% smaller), all indexes created, one command, repeatable, version-controlled!
|
|
248
|
+
|
|
249
|
+
---
|
|
250
|
+
|
|
251
|
+
## [1.1.0] - 2025-11-14
|
|
252
|
+
|
|
253
|
+
### Added
|
|
254
|
+
- `--doctor` command for diagnosing installation and dependency issues
|
|
255
|
+
- Checks all core dependencies (requests, beautifulsoup4, html2text, defusedxml, aiohttp, rich)
|
|
256
|
+
- Checks optional dependencies (PyYAML, Playwright) with installation suggestions
|
|
257
|
+
- Tests network connectivity
|
|
258
|
+
- Verifies output directory write permissions
|
|
259
|
+
- Works even when dependencies are missing
|
|
260
|
+
- `requirements.txt` file for transparent dependency listing
|
|
261
|
+
- Comprehensive `TROUBLESHOOTING.md` documentation with:
|
|
262
|
+
- Installation troubleshooting (missing dependencies, pipx issues)
|
|
263
|
+
- Runtime issue solutions (YAML config errors, JavaScript rendering)
|
|
264
|
+
- Diagnostic tools usage guide
|
|
265
|
+
- Common error messages reference table
|
|
266
|
+
- Quick reference commands
|
|
267
|
+
|
|
268
|
+
### Changed
|
|
269
|
+
- Improved error handling for missing dependencies
|
|
270
|
+
- Early dependency checking at CLI entry point
|
|
271
|
+
- Clear, actionable error messages with installation instructions
|
|
272
|
+
- Specific recommendations for pipx, pip, and development installations
|
|
273
|
+
- Enhanced YAML configuration error handling
|
|
274
|
+
- Auto-fallback to JSON when PyYAML is not installed
|
|
275
|
+
- Clear error messages for YAML-related import errors
|
|
276
|
+
- Helpful suggestions for installing optional dependencies
|
|
277
|
+
- Updated README.md with:
|
|
278
|
+
- `--doctor` command in Quick Start section
|
|
279
|
+
- Reference to TROUBLESHOOTING.md
|
|
280
|
+
- Better troubleshooting guidance
|
|
281
|
+
|
|
282
|
+
### Fixed
|
|
283
|
+
- Improved user experience when dependencies are missing (no more confusing tracebacks)
|
|
284
|
+
- Better handling of optional dependency errors (PyYAML, Playwright)
|
|
285
|
+
|
|
286
|
+
## [1.0.0] - 2025-11-07
|
|
287
|
+
|
|
288
|
+
### Added
|
|
289
|
+
- Initial release of docpull
|
|
290
|
+
- Support for fetching documentation from multiple sources:
|
|
291
|
+
- Stripe API documentation
|
|
292
|
+
- Plaid API documentation
|
|
293
|
+
- Next.js documentation
|
|
294
|
+
- D3.js documentation (devdocs.io)
|
|
295
|
+
- Bun runtime documentation
|
|
296
|
+
- Tailwind CSS documentation
|
|
297
|
+
- React documentation
|
|
298
|
+
- CLI interface with config file support (YAML/JSON)
|
|
299
|
+
- Parallel fetching with ThreadPoolExecutor for improved performance
|
|
300
|
+
- Security features:
|
|
301
|
+
- Path traversal protection
|
|
302
|
+
- XXE (XML External Entity) protection
|
|
303
|
+
- File size limits (50MB default)
|
|
304
|
+
- Redirect limits (5 hops)
|
|
305
|
+
- Request timeouts (30s)
|
|
306
|
+
- HTTPS enforcement with certificate verification
|
|
307
|
+
- Rate limiting to respect server resources
|
|
308
|
+
- Structured logging with configurable levels
|
|
309
|
+
- YAML frontmatter metadata in generated markdown files
|
|
310
|
+
- Config file generation command
|
|
311
|
+
- Extensible fetcher architecture for easy addition of new sources
|
|
312
|
+
- Comprehensive documentation and examples
|
|
313
|
+
|
|
314
|
+
### Changed
|
|
315
|
+
- Cleaned up README to remove emojis and update to organization URLs
|
|
316
|
+
- Applied 2025 PyPI best practices to packaging configuration
|
|
317
|
+
- Reorganized project structure for better maintainability
|
|
318
|
+
|
|
319
|
+
### Security
|
|
320
|
+
- Implemented multiple security layers for safe web scraping
|
|
321
|
+
- Added security scanning with Bandit and pip-audit
|
|
322
|
+
- Created GitHub Actions workflow for automated security checks
|
|
323
|
+
- Documented security features in SECURITY.md
|
|
324
|
+
|
|
325
|
+
---
|
|
326
|
+
|
|
327
|
+
[1.2.0]: https://github.com/raintree-technology/docpull/releases/tag/v1.2.0
[1.1.0]: https://github.com/raintree-technology/docpull/releases/tag/v1.1.0
|
|
328
|
+
[1.0.0]: https://github.com/raintree-technology/docpull/releases/tag/v1.0.0
|
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
# Contributing to docpull
|
|
2
|
+
|
|
3
|
+
Thank you for your interest in contributing to docpull! This document provides guidelines and instructions for contributing.
|
|
4
|
+
|
|
5
|
+
## Development Setup
|
|
6
|
+
|
|
7
|
+
1. **Fork and clone the repository**
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
git clone https://github.com/YOUR_USERNAME/docpull.git
|
|
11
|
+
cd docpull
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
2. **Set up development environment**
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
# Create virtual environment
|
|
18
|
+
python -m venv .venv
|
|
19
|
+
source .venv/bin/activate # On Windows: .venv\Scripts\activate
|
|
20
|
+
|
|
21
|
+
# Install with development dependencies
|
|
22
|
+
pip install -e ".[dev]"
|
|
23
|
+
|
|
24
|
+
# Install pre-commit hooks
|
|
25
|
+
pre-commit install
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
3. **Verify setup**
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
# Run tests
|
|
32
|
+
make test
|
|
33
|
+
|
|
34
|
+
# Run linting
|
|
35
|
+
make lint
|
|
36
|
+
|
|
37
|
+
# Run formatting
|
|
38
|
+
make format
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Development Workflow
|
|
42
|
+
|
|
43
|
+
1. **Create a feature branch**
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
git checkout -b feature/your-feature-name
|
|
47
|
+
# or
|
|
48
|
+
git checkout -b fix/your-bug-fix
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
2. **Make your changes**
|
|
52
|
+
|
|
53
|
+
- Write clear, readable code
|
|
54
|
+
- Follow existing code style (enforced by pre-commit hooks)
|
|
55
|
+
- Add tests for new functionality
|
|
56
|
+
- Update documentation as needed
|
|
57
|
+
- Update CHANGELOG.md with your changes
|
|
58
|
+
|
|
59
|
+
3. **Run tests and linting**
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
# Run all tests
|
|
63
|
+
make test
|
|
64
|
+
|
|
65
|
+
# Run linting
|
|
66
|
+
make lint
|
|
67
|
+
|
|
68
|
+
# Format code
|
|
69
|
+
make format
|
|
70
|
+
|
|
71
|
+
# Clean artifacts
|
|
72
|
+
make clean
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
4. **Commit your changes**
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
git add .
|
|
79
|
+
git commit -m "feat: add new feature"
|
|
80
|
+
# or
|
|
81
|
+
git commit -m "fix: resolve bug in X"
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
Use [Conventional Commits](https://www.conventionalcommits.org/) format:
|
|
85
|
+
- `feat:` - New features
|
|
86
|
+
- `fix:` - Bug fixes
|
|
87
|
+
- `docs:` - Documentation changes
|
|
88
|
+
- `test:` - Test additions/changes
|
|
89
|
+
- `refactor:` - Code refactoring
|
|
90
|
+
- `chore:` - Maintenance tasks
|
|
91
|
+
- `ci:` - CI/CD changes
|
|
92
|
+
|
|
93
|
+
5. **Push and create a pull request**
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
git push origin feature/your-feature-name
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
Then open a pull request on GitHub.
|
|
100
|
+
|
|
101
|
+
## Code Style
|
|
102
|
+
|
|
103
|
+
- **Python**: Follow PEP 8 (enforced by Ruff)
|
|
104
|
+
- **Line length**: 110 characters
|
|
105
|
+
- **Type hints**: Required for all functions
|
|
106
|
+
- **Docstrings**: Required for public APIs
|
|
107
|
+
|
|
108
|
+
Pre-commit hooks will automatically:
|
|
109
|
+
- Fix trailing whitespace
|
|
110
|
+
- Lint and format code with Ruff (`ruff --fix` and `ruff-format`)
|
|
111
|
+
- Sort imports
|
|
112
|
+
- Check for common issues
|
|
113
|
+
|
|
114
|
+
## Testing
|
|
115
|
+
|
|
116
|
+
- Write tests for all new features
|
|
117
|
+
- Maintain or improve test coverage
|
|
118
|
+
- Run tests with: `make test`
|
|
119
|
+
- Check coverage with: `pytest --cov=docpull --cov-report=html`
|
|
120
|
+
|
|
121
|
+
## Documentation
|
|
122
|
+
|
|
123
|
+
- Update README.md for user-facing changes
|
|
124
|
+
- Update CHANGELOG.md with all changes
|
|
125
|
+
- Add docstrings to new functions/classes
|
|
126
|
+
- Update TROUBLESHOOTING.md for common issues
|
|
127
|
+
|
|
128
|
+
## Pull Request Process
|
|
129
|
+
|
|
130
|
+
1. **Ensure all tests pass**
|
|
131
|
+
2. **Update CHANGELOG.md** with your changes
|
|
132
|
+
3. **Update documentation** as needed
|
|
133
|
+
4. **Fill out the PR template** completely
|
|
134
|
+
5. **Wait for review** - maintainers will review your PR
|
|
135
|
+
6. **Address feedback** - make requested changes
|
|
136
|
+
7. **Merge** - once approved, maintainers will merge
|
|
137
|
+
|
|
138
|
+
## Release Process
|
|
139
|
+
|
|
140
|
+
Releases are automated via GitHub Actions:
|
|
141
|
+
|
|
142
|
+
1. Maintainer triggers release workflow
|
|
143
|
+
2. Workflow updates version numbers
|
|
144
|
+
3. Workflow updates CHANGELOG.md
|
|
145
|
+
4. Workflow creates git tag
|
|
146
|
+
5. Workflow creates GitHub release
|
|
147
|
+
6. Publish workflow automatically deploys to PyPI
|
|
148
|
+
|
|
149
|
+
## Adding a New Fetcher
|
|
150
|
+
|
|
151
|
+
To add support for a new documentation source:
|
|
152
|
+
|
|
153
|
+
1. Create `docpull/fetchers/yoursite.py`:
|
|
154
|
+
|
|
155
|
+
```python
|
|
156
|
+
from .base import BaseFetcher
|
|
157
|
+
|
|
158
|
+
class YourSiteFetcher(BaseFetcher):
|
|
159
|
+
"""Fetch documentation from YourSite."""
|
|
160
|
+
|
|
161
|
+
def __init__(self, output_dir: str = "yoursite-docs"):
|
|
162
|
+
super().__init__(output_dir)
|
|
163
|
+
self.base_url = "https://docs.yoursite.com"
|
|
164
|
+
|
|
165
|
+
def fetch_all(self) -> None:
|
|
166
|
+
# Implement fetching logic
|
|
167
|
+
pass
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
2. Register in `docpull/__init__.py`
|
|
171
|
+
3. Add tests in `tests/test_yoursite.py`
|
|
172
|
+
4. Update README.md with usage example
|
|
173
|
+
5. Update CHANGELOG.md
|
|
174
|
+
|
|
175
|
+
## Getting Help
|
|
176
|
+
|
|
177
|
+
- Open an issue for bugs or feature requests
|
|
178
|
+
- Check existing issues before creating new ones
|
|
179
|
+
- Use `docpull --doctor` to diagnose issues
|
|
180
|
+
- See TROUBLESHOOTING.md for common problems
|
|
181
|
+
|
|
182
|
+
## Code of Conduct
|
|
183
|
+
|
|
184
|
+
- Be respectful and inclusive
|
|
185
|
+
- Provide constructive feedback
|
|
186
|
+
- Focus on the code, not the person
|
|
187
|
+
- Help others learn and grow
|
|
188
|
+
|
|
189
|
+
Thank you for contributing to docpull!
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# Include documentation files
|
|
2
|
+
include README.md
|
|
3
|
+
include LICENSE
|
|
4
|
+
include CHANGELOG.md
|
|
5
|
+
include SECURITY.md
|
|
6
|
+
include CONTRIBUTING.md
|
|
7
|
+
include TROUBLESHOOTING.md
|
|
8
|
+
|
|
9
|
+
# Include configuration files
|
|
10
|
+
include pyproject.toml
|
|
11
|
+
include requirements.txt
|
|
12
|
+
include Makefile
|
|
13
|
+
include .editorconfig
|
|
14
|
+
|
|
15
|
+
# Include pre-commit configuration
|
|
16
|
+
include .pre-commit-config.yaml
|
|
17
|
+
|
|
18
|
+
# Include example configurations
|
|
19
|
+
recursive-include examples *.yaml
|
|
20
|
+
include examples/README.md
|
|
21
|
+
|
|
22
|
+
# Include type stub marker
|
|
23
|
+
include docpull/py.typed
|
|
24
|
+
|
|
25
|
+
# Include all Python files in profiles
|
|
26
|
+
recursive-include docpull/profiles *.py
|
|
27
|
+
|
|
28
|
+
# Exclude unnecessary files
|
|
29
|
+
global-exclude __pycache__
|
|
30
|
+
global-exclude *.py[cod]
|
|
31
|
+
global-exclude *~
|
|
32
|
+
global-exclude .DS_Store
|
|
33
|
+
global-exclude *.swp
|
|
34
|
+
global-exclude *.swo
|
|
35
|
+
|
|
36
|
+
# Exclude test artifacts and caches
|
|
37
|
+
prune tests/__pycache__
|
|
38
|
+
prune .pytest_cache
|
|
39
|
+
prune .mypy_cache
|
|
40
|
+
prune .ruff_cache
|
|
41
|
+
prune htmlcov
|
|
42
|
+
prune .docpull-cache
|
|
43
|
+
prune .git
|
|
44
|
+
prune .github
|
|
45
|
+
|
|
46
|
+
# Exclude build artifacts
|
|
47
|
+
prune build
|
|
48
|
+
prune dist
|
|
49
|
+
prune *.egg-info
|
docpull-1.2.1/Makefile
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
.PHONY: clean clean-pyc clean-build clean-test help
|
|
2
|
+
|
|
3
|
+
help:
|
|
4
|
+
@echo "clean - remove all build, test, coverage and Python artifacts"
|
|
5
|
+
@echo "clean-build - remove build artifacts"
|
|
6
|
+
@echo "clean-pyc - remove Python file artifacts"
|
|
7
|
+
@echo "clean-test - remove test and coverage artifacts"
|
|
8
|
+
@echo "test - run tests with pytest"
|
|
9
|
+
@echo "lint - check style with ruff"
|
|
10
|
+
@echo "format - format code with black"
|
|
11
|
+
|
|
12
|
+
clean: clean-build clean-pyc clean-test
|
|
13
|
+
|
|
14
|
+
clean-build:
|
|
15
|
+
rm -rf build/
|
|
16
|
+
rm -rf dist/
|
|
17
|
+
rm -rf .eggs/
|
|
18
|
+
find . -path ./.venv -prune -o -name '*.egg-info' -exec rm -rf {} + || true
|
|
19
|
+
find . -path ./.venv -prune -o -name '*.egg' -exec rm -f {} + || true
|
|
20
|
+
|
|
21
|
+
clean-pyc:
|
|
22
|
+
find . -path ./.venv -prune -o -name '*.pyc' -exec rm -f {} + || true
|
|
23
|
+
find . -path ./.venv -prune -o -name '*.pyo' -exec rm -f {} + || true
|
|
24
|
+
find . -path ./.venv -prune -o -name '*~' -exec rm -f {} + || true
|
|
25
|
+
find . -path ./.venv -prune -o -name '__pycache__' -exec rm -rf {} + || true
|
|
26
|
+
|
|
27
|
+
clean-test:
|
|
28
|
+
rm -rf .pytest_cache/
|
|
29
|
+
rm -rf .mypy_cache/
|
|
30
|
+
rm -rf .ruff_cache/
|
|
31
|
+
rm -rf htmlcov/
|
|
32
|
+
rm -rf .coverage
|
|
33
|
+
rm -rf coverage.xml
|
|
34
|
+
rm -rf docs/
|
|
35
|
+
rm -rf test-docs/
|
|
36
|
+
|
|
37
|
+
test:
|
|
38
|
+
pytest
|
|
39
|
+
|
|
40
|
+
lint:
|
|
41
|
+
ruff check .
|
|
42
|
+
|
|
43
|
+
format:
|
|
44
|
+
black .
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docpull
|
|
3
|
-
Version: 1.2.0
|
|
3
|
+
Version: 1.2.1
|
|
4
4
|
Summary: Pull documentation from the web and convert to clean markdown
|
|
5
5
|
Author-email: Zachary Roth <support@raintree.technology>
|
|
6
6
|
Maintainer-email: Raintree Technology <support@raintree.technology>
|
|
@@ -33,6 +33,7 @@ Classifier: Programming Language :: Python :: 3.10
|
|
|
33
33
|
Classifier: Programming Language :: Python :: 3.11
|
|
34
34
|
Classifier: Programming Language :: Python :: 3.12
|
|
35
35
|
Classifier: Programming Language :: Python :: 3.13
|
|
36
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
36
37
|
Classifier: Programming Language :: Python :: 3 :: Only
|
|
37
38
|
Classifier: Typing :: Typed
|
|
38
39
|
Requires-Python: >=3.9
|