docpull 1.5.0__tar.gz → 2.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docpull-2.2.0/PKG-INFO +208 -0
- docpull-2.2.0/README.md +131 -0
- {docpull-1.5.0 → docpull-2.2.0}/pyproject.toml +28 -6
- docpull-2.2.0/src/docpull/__init__.py +56 -0
- docpull-2.2.0/src/docpull/cache/__init__.py +12 -0
- docpull-2.2.0/src/docpull/cache/manager.py +502 -0
- docpull-2.2.0/src/docpull/cache/streaming_dedup.py +136 -0
- docpull-2.2.0/src/docpull/cli.py +522 -0
- docpull-2.2.0/src/docpull/concurrency/__init__.py +15 -0
- docpull-2.2.0/src/docpull/concurrency/browser_pool.py +336 -0
- docpull-2.2.0/src/docpull/concurrency/manager.py +111 -0
- docpull-2.2.0/src/docpull/conversion/__init__.py +15 -0
- docpull-2.2.0/src/docpull/conversion/extractor.py +247 -0
- docpull-2.2.0/src/docpull/conversion/markdown.py +203 -0
- docpull-2.2.0/src/docpull/conversion/protocols.py +46 -0
- docpull-2.2.0/src/docpull/core/__init__.py +5 -0
- docpull-2.2.0/src/docpull/core/fetcher.py +648 -0
- docpull-2.2.0/src/docpull/discovery/__init__.py +46 -0
- docpull-2.2.0/src/docpull/discovery/composite.py +127 -0
- docpull-2.2.0/src/docpull/discovery/crawler.py +255 -0
- docpull-2.2.0/src/docpull/discovery/filters.py +231 -0
- docpull-2.2.0/src/docpull/discovery/link_extractors/__init__.py +22 -0
- docpull-2.2.0/src/docpull/discovery/link_extractors/browser.py +294 -0
- docpull-2.2.0/src/docpull/discovery/link_extractors/enhanced.py +315 -0
- docpull-2.2.0/src/docpull/discovery/link_extractors/protocols.py +33 -0
- docpull-2.2.0/src/docpull/discovery/link_extractors/static.py +160 -0
- docpull-2.2.0/src/docpull/discovery/protocols.py +52 -0
- docpull-2.2.0/src/docpull/discovery/sitemap.py +287 -0
- {docpull-1.5.0 → docpull-2.2.0/src}/docpull/doctor.py +5 -4
- docpull-2.2.0/src/docpull/http/__init__.py +13 -0
- docpull-2.2.0/src/docpull/http/client.py +353 -0
- docpull-2.2.0/src/docpull/http/protocols.py +78 -0
- docpull-2.2.0/src/docpull/http/rate_limiter.py +259 -0
- {docpull-1.5.0 → docpull-2.2.0/src}/docpull/metadata_extractor.py +16 -14
- docpull-2.2.0/src/docpull/models/__init__.py +42 -0
- docpull-2.2.0/src/docpull/models/config.py +340 -0
- docpull-2.2.0/src/docpull/models/events.py +162 -0
- docpull-2.2.0/src/docpull/models/profiles.py +103 -0
- docpull-2.2.0/src/docpull/pipeline/__init__.py +5 -0
- docpull-2.2.0/src/docpull/pipeline/base.py +189 -0
- docpull-2.2.0/src/docpull/pipeline/steps/__init__.py +21 -0
- docpull-2.2.0/src/docpull/pipeline/steps/browser_fetch.py +141 -0
- docpull-2.2.0/src/docpull/pipeline/steps/convert.py +134 -0
- docpull-2.2.0/src/docpull/pipeline/steps/dedup.py +96 -0
- docpull-2.2.0/src/docpull/pipeline/steps/fetch.py +192 -0
- docpull-2.2.0/src/docpull/pipeline/steps/metadata.py +139 -0
- docpull-2.2.0/src/docpull/pipeline/steps/save.py +167 -0
- docpull-2.2.0/src/docpull/pipeline/steps/save_json.py +191 -0
- docpull-2.2.0/src/docpull/pipeline/steps/save_sqlite.py +171 -0
- docpull-2.2.0/src/docpull/pipeline/steps/validate.py +140 -0
- docpull-2.2.0/src/docpull/security/__init__.py +6 -0
- docpull-2.2.0/src/docpull/security/robots.py +193 -0
- docpull-2.2.0/src/docpull/security/url_validator.py +175 -0
- docpull-2.2.0/src/docpull.egg-info/PKG-INFO +208 -0
- docpull-2.2.0/src/docpull.egg-info/SOURCES.txt +67 -0
- {docpull-1.5.0 → docpull-2.2.0/src}/docpull.egg-info/requires.txt +1 -0
- docpull-2.2.0/tests/test_link_extractors.py +270 -0
- docpull-2.2.0/tests/test_v2_conversion.py +293 -0
- docpull-2.2.0/tests/test_v2_discovery.py +356 -0
- docpull-2.2.0/tests/test_v2_integration.py +360 -0
- docpull-2.2.0/tests/test_v2_pipeline.py +370 -0
- docpull-1.5.0/PKG-INFO +0 -478
- docpull-1.5.0/README.md +0 -402
- docpull-1.5.0/docpull/__init__.py +0 -13
- docpull-1.5.0/docpull/archive.py +0 -186
- docpull-1.5.0/docpull/cache.py +0 -256
- docpull-1.5.0/docpull/cli.py +0 -782
- docpull-1.5.0/docpull/config.py +0 -332
- docpull-1.5.0/docpull/fetchers/__init__.py +0 -11
- docpull-1.5.0/docpull/fetchers/async_fetcher.py +0 -463
- docpull-1.5.0/docpull/fetchers/base.py +0 -686
- docpull-1.5.0/docpull/fetchers/generic.py +0 -215
- docpull-1.5.0/docpull/fetchers/generic_async.py +0 -324
- docpull-1.5.0/docpull/fetchers/parallel_base.py +0 -93
- docpull-1.5.0/docpull/file_utils.py +0 -97
- docpull-1.5.0/docpull/formatters/__init__.py +0 -50
- docpull-1.5.0/docpull/formatters/base.py +0 -102
- docpull-1.5.0/docpull/formatters/json.py +0 -100
- docpull-1.5.0/docpull/formatters/markdown.py +0 -49
- docpull-1.5.0/docpull/formatters/sqlite.py +0 -266
- docpull-1.5.0/docpull/formatters/toon.py +0 -90
- docpull-1.5.0/docpull/hooks.py +0 -222
- docpull-1.5.0/docpull/indexer.py +0 -410
- docpull-1.5.0/docpull/metadata.py +0 -224
- docpull-1.5.0/docpull/naming.py +0 -259
- docpull-1.5.0/docpull/orchestrator.py +0 -254
- docpull-1.5.0/docpull/processors/__init__.py +0 -18
- docpull-1.5.0/docpull/processors/base.py +0 -151
- docpull-1.5.0/docpull/processors/content_filter.py +0 -292
- docpull-1.5.0/docpull/processors/deduplicator.py +0 -233
- docpull-1.5.0/docpull/processors/language_filter.py +0 -181
- docpull-1.5.0/docpull/processors/size_limiter.py +0 -221
- docpull-1.5.0/docpull/sources_config.py +0 -446
- docpull-1.5.0/docpull/vcs.py +0 -224
- docpull-1.5.0/docpull.egg-info/PKG-INFO +0 -478
- docpull-1.5.0/docpull.egg-info/SOURCES.txt +0 -49
- docpull-1.5.0/tests/test_config.py +0 -39
- docpull-1.5.0/tests/test_metadata_extractor.py +0 -233
- docpull-1.5.0/tests/test_orchestrator.py +0 -331
- docpull-1.5.0/tests/test_sources_config.py +0 -348
- {docpull-1.5.0 → docpull-2.2.0}/LICENSE +0 -0
- {docpull-1.5.0 → docpull-2.2.0}/setup.cfg +0 -0
- {docpull-1.5.0 → docpull-2.2.0/src}/docpull/__main__.py +0 -0
- {docpull-1.5.0 → docpull-2.2.0/src}/docpull/logging_config.py +0 -0
- {docpull-1.5.0 → docpull-2.2.0/src}/docpull/py.typed +0 -0
- {docpull-1.5.0 → docpull-2.2.0/src}/docpull.egg-info/dependency_links.txt +0 -0
- {docpull-1.5.0 → docpull-2.2.0/src}/docpull.egg-info/entry_points.txt +0 -0
- {docpull-1.5.0 → docpull-2.2.0/src}/docpull.egg-info/top_level.txt +0 -0
docpull-2.2.0/PKG-INFO
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: docpull
|
|
3
|
+
Version: 2.2.0
|
|
4
|
+
Summary: Pull documentation from the web and convert to clean markdown
|
|
5
|
+
Author-email: Zachary Roth <support@raintree.technology>
|
|
6
|
+
Maintainer-email: Raintree Technology <support@raintree.technology>
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
Project-URL: Homepage, https://github.com/raintree-technology/docpull
|
|
9
|
+
Project-URL: Documentation, https://github.com/raintree-technology/docpull#readme
|
|
10
|
+
Project-URL: Repository, https://github.com/raintree-technology/docpull
|
|
11
|
+
Project-URL: Source Code, https://github.com/raintree-technology/docpull
|
|
12
|
+
Project-URL: Bug Tracker, https://github.com/raintree-technology/docpull/issues
|
|
13
|
+
Project-URL: Releases, https://github.com/raintree-technology/docpull/releases
|
|
14
|
+
Keywords: python,markdown,documentation,web-scraping,developer-tools,claude,ai-training-data
|
|
15
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
16
|
+
Classifier: Intended Audience :: Developers
|
|
17
|
+
Classifier: Intended Audience :: Information Technology
|
|
18
|
+
Classifier: Intended Audience :: Science/Research
|
|
19
|
+
Classifier: Intended Audience :: Education
|
|
20
|
+
Classifier: Environment :: Console
|
|
21
|
+
Classifier: Topic :: Documentation
|
|
22
|
+
Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
|
|
23
|
+
Classifier: Topic :: Software Development :: Documentation
|
|
24
|
+
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
25
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
26
|
+
Classifier: Topic :: Utilities
|
|
27
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
28
|
+
Classifier: Natural Language :: English
|
|
29
|
+
Classifier: Operating System :: OS Independent
|
|
30
|
+
Classifier: Programming Language :: Python :: 3
|
|
31
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
32
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
33
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
34
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
35
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
36
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
37
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
38
|
+
Classifier: Typing :: Typed
|
|
39
|
+
Requires-Python: >=3.10
|
|
40
|
+
Description-Content-Type: text/markdown
|
|
41
|
+
License-File: LICENSE
|
|
42
|
+
Requires-Dist: requests>=2.31.0
|
|
43
|
+
Requires-Dist: beautifulsoup4>=4.12.0
|
|
44
|
+
Requires-Dist: html2text>=2020.1.16
|
|
45
|
+
Requires-Dist: defusedxml>=0.7.1
|
|
46
|
+
Requires-Dist: extruct>=0.15.0
|
|
47
|
+
Requires-Dist: aiohttp>=3.9.0
|
|
48
|
+
Requires-Dist: rich>=13.0.0
|
|
49
|
+
Requires-Dist: pyyaml>=6.0
|
|
50
|
+
Requires-Dist: gitpython>=3.1.40
|
|
51
|
+
Requires-Dist: pydantic>=2.0
|
|
52
|
+
Provides-Extra: js
|
|
53
|
+
Requires-Dist: playwright>=1.40.0; extra == "js"
|
|
54
|
+
Provides-Extra: proxy
|
|
55
|
+
Requires-Dist: aiohttp-socks>=0.8.0; extra == "proxy"
|
|
56
|
+
Provides-Extra: normalize
|
|
57
|
+
Requires-Dist: url-normalize>=1.4.0; extra == "normalize"
|
|
58
|
+
Provides-Extra: all
|
|
59
|
+
Requires-Dist: playwright>=1.40.0; extra == "all"
|
|
60
|
+
Requires-Dist: aiohttp-socks>=0.8.0; extra == "all"
|
|
61
|
+
Requires-Dist: url-normalize>=1.4.0; extra == "all"
|
|
62
|
+
Provides-Extra: dev
|
|
63
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
64
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
65
|
+
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
|
|
66
|
+
Requires-Dist: black>=23.0.0; extra == "dev"
|
|
67
|
+
Requires-Dist: mypy>=1.0.0; extra == "dev"
|
|
68
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
69
|
+
Requires-Dist: bandit>=1.7.0; extra == "dev"
|
|
70
|
+
Requires-Dist: pip-audit>=2.0.0; extra == "dev"
|
|
71
|
+
Requires-Dist: pre-commit>=3.0.0; extra == "dev"
|
|
72
|
+
Requires-Dist: types-requests>=2.31.0; extra == "dev"
|
|
73
|
+
Requires-Dist: types-beautifulsoup4>=4.12.0; extra == "dev"
|
|
74
|
+
Requires-Dist: types-defusedxml>=0.7.0; extra == "dev"
|
|
75
|
+
Requires-Dist: types-pyyaml>=6.0.0; extra == "dev"
|
|
76
|
+
Dynamic: license-file
|
|
77
|
+
|
|
78
|
+
# docpull
|
|
79
|
+
|
|
80
|
+
**Pull documentation from any website and convert it to clean, AI-ready Markdown.**
|
|
81
|
+
|
|
82
|
+
[Python 3.10+](https://www.python.org/downloads/)
|
|
83
|
+
[PyPI version](https://badge.fury.io/py/docpull)
|
|
84
|
+
[Downloads](https://pepy.tech/project/docpull)
|
|
85
|
+
[License: MIT](https://github.com/raintree-technology/docpull/blob/main/LICENSE)
|
|
86
|
+
|
|
87
|
+
## Install
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
pip install docpull
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## Usage
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
# Basic fetch
|
|
97
|
+
docpull https://docs.example.com
|
|
98
|
+
|
|
99
|
+
# With options
|
|
100
|
+
docpull https://aptos.dev --max-pages 100 --output-dir ./docs
|
|
101
|
+
|
|
102
|
+
# Filter paths
|
|
103
|
+
docpull https://docs.example.com --include-paths "/api/*" --exclude-paths "/changelog/*"
|
|
104
|
+
|
|
105
|
+
# Enable caching for incremental updates
|
|
106
|
+
docpull https://docs.example.com --cache
|
|
107
|
+
|
|
108
|
+
# JavaScript-heavy sites
|
|
109
|
+
pip install docpull[js]
|
|
110
|
+
docpull https://spa-site.com --js
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
## Profiles
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
docpull https://site.com --profile rag # Optimized for RAG/LLM (default)
|
|
117
|
+
docpull https://site.com --profile mirror # Full site archive with caching
|
|
118
|
+
docpull https://site.com --profile quick # Fast sampling (50 pages, depth 2)
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
## Options
|
|
122
|
+
|
|
123
|
+
```
|
|
124
|
+
Crawl:
|
|
125
|
+
--max-pages N Maximum pages to fetch
|
|
126
|
+
--max-depth N Maximum crawl depth
|
|
127
|
+
--include-paths P Only crawl matching URL patterns
|
|
128
|
+
--exclude-paths P Skip matching URL patterns
|
|
129
|
+
--js Enable JavaScript rendering
|
|
130
|
+
|
|
131
|
+
Cache:
|
|
132
|
+
--cache Enable caching for incremental updates
|
|
133
|
+
--cache-dir DIR Cache directory (default: .docpull-cache)
|
|
134
|
+
--cache-ttl DAYS Days before cache expires (default: 30)
|
|
135
|
+
|
|
136
|
+
Content:
|
|
137
|
+
--streaming-dedup Real-time duplicate detection
|
|
138
|
+
--language CODE Filter by language (e.g., en)
|
|
139
|
+
|
|
140
|
+
Output:
|
|
141
|
+
--output-dir, -o DIR Output directory (default: ./docs)
|
|
142
|
+
--dry-run Show what would be fetched
|
|
143
|
+
--verbose, -v Verbose output
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
See `docpull --help` for all options.
|
|
147
|
+
|
|
148
|
+
## Python API
|
|
149
|
+
|
|
150
|
+
```python
|
|
151
|
+
import asyncio
|
|
152
|
+
from docpull import Fetcher, DocpullConfig, ProfileName, EventType
|
|
153
|
+
|
|
154
|
+
async def main():
|
|
155
|
+
config = DocpullConfig(
|
|
156
|
+
url="https://docs.example.com",
|
|
157
|
+
profile=ProfileName.RAG,
|
|
158
|
+
crawl={"max_pages": 100},
|
|
159
|
+
cache={"enabled": True},
|
|
160
|
+
)
|
|
161
|
+
|
|
162
|
+
async with Fetcher(config) as fetcher:
|
|
163
|
+
async for event in fetcher.run():
|
|
164
|
+
if event.type == EventType.FETCH_PROGRESS:
|
|
165
|
+
print(f"{event.current}/{event.total}: {event.url}")
|
|
166
|
+
|
|
167
|
+
print(f"Done: {fetcher.stats.pages_fetched} pages")
|
|
168
|
+
|
|
169
|
+
asyncio.run(main())
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
## Output
|
|
173
|
+
|
|
174
|
+
Each page becomes a Markdown file with YAML frontmatter:
|
|
175
|
+
|
|
176
|
+
```markdown
|
|
177
|
+
---
|
|
178
|
+
title: "Getting Started"
|
|
179
|
+
source: https://docs.example.com/guide
|
|
180
|
+
---
|
|
181
|
+
|
|
182
|
+
# Getting Started
|
|
183
|
+
...
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
## Security
|
|
187
|
+
|
|
188
|
+
- HTTPS-only, mandatory robots.txt compliance
|
|
189
|
+
- Blocks private/internal network IPs
|
|
190
|
+
- Path traversal and XXE protection
|
|
191
|
+
|
|
192
|
+
## Troubleshooting
|
|
193
|
+
|
|
194
|
+
```bash
|
|
195
|
+
docpull --doctor # Check installation
|
|
196
|
+
docpull URL --verbose # Verbose output
|
|
197
|
+
docpull URL --dry-run # Test without downloading
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
## Links
|
|
201
|
+
|
|
202
|
+
- [PyPI](https://pypi.org/project/docpull/)
|
|
203
|
+
- [GitHub](https://github.com/raintree-technology/docpull)
|
|
204
|
+
- [Changelog](https://github.com/raintree-technology/docpull/blob/main/docs/CHANGELOG.md)
|
|
205
|
+
|
|
206
|
+
## License
|
|
207
|
+
|
|
208
|
+
MIT
|
docpull-2.2.0/README.md
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
# docpull
|
|
2
|
+
|
|
3
|
+
**Pull documentation from any website and convert it to clean, AI-ready Markdown.**
|
|
4
|
+
|
|
5
|
+
[Python 3.10+](https://www.python.org/downloads/)
|
|
6
|
+
[PyPI version](https://badge.fury.io/py/docpull)
|
|
7
|
+
[Downloads](https://pepy.tech/project/docpull)
|
|
8
|
+
[License: MIT](https://github.com/raintree-technology/docpull/blob/main/LICENSE)
|
|
9
|
+
|
|
10
|
+
## Install
|
|
11
|
+
|
|
12
|
+
```bash
|
|
13
|
+
pip install docpull
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
## Usage
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
# Basic fetch
|
|
20
|
+
docpull https://docs.example.com
|
|
21
|
+
|
|
22
|
+
# With options
|
|
23
|
+
docpull https://aptos.dev --max-pages 100 --output-dir ./docs
|
|
24
|
+
|
|
25
|
+
# Filter paths
|
|
26
|
+
docpull https://docs.example.com --include-paths "/api/*" --exclude-paths "/changelog/*"
|
|
27
|
+
|
|
28
|
+
# Enable caching for incremental updates
|
|
29
|
+
docpull https://docs.example.com --cache
|
|
30
|
+
|
|
31
|
+
# JavaScript-heavy sites
|
|
32
|
+
pip install docpull[js]
|
|
33
|
+
docpull https://spa-site.com --js
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Profiles
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
docpull https://site.com --profile rag # Optimized for RAG/LLM (default)
|
|
40
|
+
docpull https://site.com --profile mirror # Full site archive with caching
|
|
41
|
+
docpull https://site.com --profile quick # Fast sampling (50 pages, depth 2)
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Options
|
|
45
|
+
|
|
46
|
+
```
|
|
47
|
+
Crawl:
|
|
48
|
+
--max-pages N Maximum pages to fetch
|
|
49
|
+
--max-depth N Maximum crawl depth
|
|
50
|
+
--include-paths P Only crawl matching URL patterns
|
|
51
|
+
--exclude-paths P Skip matching URL patterns
|
|
52
|
+
--js Enable JavaScript rendering
|
|
53
|
+
|
|
54
|
+
Cache:
|
|
55
|
+
--cache Enable caching for incremental updates
|
|
56
|
+
--cache-dir DIR Cache directory (default: .docpull-cache)
|
|
57
|
+
--cache-ttl DAYS Days before cache expires (default: 30)
|
|
58
|
+
|
|
59
|
+
Content:
|
|
60
|
+
--streaming-dedup Real-time duplicate detection
|
|
61
|
+
--language CODE Filter by language (e.g., en)
|
|
62
|
+
|
|
63
|
+
Output:
|
|
64
|
+
--output-dir, -o DIR Output directory (default: ./docs)
|
|
65
|
+
--dry-run Show what would be fetched
|
|
66
|
+
--verbose, -v Verbose output
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
See `docpull --help` for all options.
|
|
70
|
+
|
|
71
|
+
## Python API
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
import asyncio
|
|
75
|
+
from docpull import Fetcher, DocpullConfig, ProfileName, EventType
|
|
76
|
+
|
|
77
|
+
async def main():
|
|
78
|
+
config = DocpullConfig(
|
|
79
|
+
url="https://docs.example.com",
|
|
80
|
+
profile=ProfileName.RAG,
|
|
81
|
+
crawl={"max_pages": 100},
|
|
82
|
+
cache={"enabled": True},
|
|
83
|
+
)
|
|
84
|
+
|
|
85
|
+
async with Fetcher(config) as fetcher:
|
|
86
|
+
async for event in fetcher.run():
|
|
87
|
+
if event.type == EventType.FETCH_PROGRESS:
|
|
88
|
+
print(f"{event.current}/{event.total}: {event.url}")
|
|
89
|
+
|
|
90
|
+
print(f"Done: {fetcher.stats.pages_fetched} pages")
|
|
91
|
+
|
|
92
|
+
asyncio.run(main())
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
## Output
|
|
96
|
+
|
|
97
|
+
Each page becomes a Markdown file with YAML frontmatter:
|
|
98
|
+
|
|
99
|
+
```markdown
|
|
100
|
+
---
|
|
101
|
+
title: "Getting Started"
|
|
102
|
+
source: https://docs.example.com/guide
|
|
103
|
+
---
|
|
104
|
+
|
|
105
|
+
# Getting Started
|
|
106
|
+
...
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
## Security
|
|
110
|
+
|
|
111
|
+
- HTTPS-only, mandatory robots.txt compliance
|
|
112
|
+
- Blocks private/internal network IPs
|
|
113
|
+
- Path traversal and XXE protection
|
|
114
|
+
|
|
115
|
+
## Troubleshooting
|
|
116
|
+
|
|
117
|
+
```bash
|
|
118
|
+
docpull --doctor # Check installation
|
|
119
|
+
docpull URL --verbose # Verbose output
|
|
120
|
+
docpull URL --dry-run # Test without downloading
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
## Links
|
|
124
|
+
|
|
125
|
+
- [PyPI](https://pypi.org/project/docpull/)
|
|
126
|
+
- [GitHub](https://github.com/raintree-technology/docpull)
|
|
127
|
+
- [Changelog](https://github.com/raintree-technology/docpull/blob/main/docs/CHANGELOG.md)
|
|
128
|
+
|
|
129
|
+
## License
|
|
130
|
+
|
|
131
|
+
MIT
|
|
@@ -4,11 +4,11 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "docpull"
|
|
7
|
-
version = "1.5.0"
|
|
7
|
+
version = "2.2.0"
|
|
8
8
|
dynamic = []
|
|
9
9
|
description = "Pull documentation from the web and convert to clean markdown"
|
|
10
10
|
readme = {file = "README.md", content-type = "text/markdown"}
|
|
11
|
-
requires-python = ">=3.9"
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
12
|
license = "MIT"
|
|
13
13
|
license-files = ["LICENSE"]
|
|
14
14
|
authors = [
|
|
@@ -72,6 +72,7 @@ dependencies = [
|
|
|
72
72
|
"rich>=13.0.0",
|
|
73
73
|
"pyyaml>=6.0",
|
|
74
74
|
"gitpython>=3.1.40",
|
|
75
|
+
"pydantic>=2.0",
|
|
75
76
|
]
|
|
76
77
|
|
|
77
78
|
[project.optional-dependencies]
|
|
@@ -117,7 +118,7 @@ Repository = "https://github.com/raintree-technology/docpull"
|
|
|
117
118
|
"Releases" = "https://github.com/raintree-technology/docpull/releases"
|
|
118
119
|
|
|
119
120
|
[tool.setuptools.packages.find]
|
|
120
|
-
where = ["."]
|
|
121
|
+
where = ["src"]
|
|
121
122
|
include = ["docpull*"]
|
|
122
123
|
|
|
123
124
|
[tool.setuptools.package-data]
|
|
@@ -133,10 +134,10 @@ target-version = "py39"
|
|
|
133
134
|
|
|
134
135
|
[tool.ruff.lint]
|
|
135
136
|
select = ["E", "F", "W", "I", "N", "UP", "B", "A", "C4", "SIM"]
|
|
136
|
-
ignore = []
|
|
137
|
+
ignore = ["A003"] # Allow 'type' and 'format' as field names in data models
|
|
137
138
|
|
|
138
139
|
[tool.mypy]
|
|
139
|
-
python_version = "3.9"
|
|
140
|
+
python_version = "3.10"
|
|
140
141
|
warn_return_any = true
|
|
141
142
|
warn_unused_configs = true
|
|
142
143
|
disallow_untyped_defs = true
|
|
@@ -144,7 +145,14 @@ disallow_any_unimported = true
|
|
|
144
145
|
no_implicit_optional = true
|
|
145
146
|
strict_equality = true
|
|
146
147
|
warn_redundant_casts = true
|
|
148
|
+
ignore_missing_imports = true
|
|
147
149
|
exclude = ["tests/"]
|
|
150
|
+
plugins = ["pydantic.mypy"]
|
|
151
|
+
|
|
152
|
+
[tool.pydantic-mypy]
|
|
153
|
+
init_forbid_extra = true
|
|
154
|
+
init_typed = true
|
|
155
|
+
warn_required_dynamic_aliases = true
|
|
148
156
|
|
|
149
157
|
[[tool.mypy.overrides]]
|
|
150
158
|
module = "playwright.*"
|
|
@@ -154,6 +162,20 @@ ignore_missing_imports = true
|
|
|
154
162
|
module = "extruct.*"
|
|
155
163
|
ignore_missing_imports = true
|
|
156
164
|
|
|
165
|
+
[[tool.mypy.overrides]]
|
|
166
|
+
module = "url_normalize"
|
|
167
|
+
ignore_missing_imports = true
|
|
168
|
+
|
|
169
|
+
[[tool.mypy.overrides]]
|
|
170
|
+
module = "docpull.models.*"
|
|
171
|
+
disallow_any_unimported = false
|
|
172
|
+
warn_return_any = false
|
|
173
|
+
|
|
174
|
+
[[tool.mypy.overrides]]
|
|
175
|
+
module = "docpull.concurrency.browser_pool"
|
|
176
|
+
disallow_any_unimported = false
|
|
177
|
+
warn_return_any = false
|
|
178
|
+
|
|
157
179
|
[[tool.mypy.overrides]]
|
|
158
180
|
module = "tests.*"
|
|
159
181
|
disallow_untyped_defs = false
|
|
@@ -175,7 +197,7 @@ markers = [
|
|
|
175
197
|
]
|
|
176
198
|
|
|
177
199
|
[tool.coverage.run]
|
|
178
|
-
source = ["docpull"]
|
|
200
|
+
source = ["src/docpull"]
|
|
179
201
|
omit = ["tests/*", "*/test_*.py"]
|
|
180
202
|
|
|
181
203
|
[tool.coverage.report]
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""
|
|
2
|
+
docpull - Fetch and convert documentation from any URL to markdown.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
from docpull import Fetcher, DocpullConfig, ProfileName
|
|
6
|
+
|
|
7
|
+
config = DocpullConfig(
|
|
8
|
+
url="https://docs.example.com",
|
|
9
|
+
profile=ProfileName.RAG,
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
async with Fetcher(config) as fetcher:
|
|
13
|
+
async for event in fetcher.run():
|
|
14
|
+
print(event)
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
__version__ = "2.2.0"
|
|
18
|
+
|
|
19
|
+
from .cache import CacheManager, StreamingDeduplicator
|
|
20
|
+
from .core.fetcher import Fetcher, fetch_blocking
|
|
21
|
+
from .models.config import (
|
|
22
|
+
CacheConfig,
|
|
23
|
+
ContentFilterConfig,
|
|
24
|
+
CrawlConfig,
|
|
25
|
+
DocpullConfig,
|
|
26
|
+
IntegrationConfig,
|
|
27
|
+
NetworkConfig,
|
|
28
|
+
OutputConfig,
|
|
29
|
+
PerformanceConfig,
|
|
30
|
+
ProfileName,
|
|
31
|
+
)
|
|
32
|
+
from .models.events import EventType, FetchEvent, FetchStats
|
|
33
|
+
|
|
34
|
+
__all__ = [
|
|
35
|
+
"__version__",
|
|
36
|
+
# Core
|
|
37
|
+
"Fetcher",
|
|
38
|
+
"fetch_blocking",
|
|
39
|
+
# Config
|
|
40
|
+
"DocpullConfig",
|
|
41
|
+
"ProfileName",
|
|
42
|
+
"CrawlConfig",
|
|
43
|
+
"ContentFilterConfig",
|
|
44
|
+
"OutputConfig",
|
|
45
|
+
"NetworkConfig",
|
|
46
|
+
"PerformanceConfig",
|
|
47
|
+
"IntegrationConfig",
|
|
48
|
+
"CacheConfig",
|
|
49
|
+
# Events
|
|
50
|
+
"EventType",
|
|
51
|
+
"FetchEvent",
|
|
52
|
+
"FetchStats",
|
|
53
|
+
# Cache
|
|
54
|
+
"CacheManager",
|
|
55
|
+
"StreamingDeduplicator",
|
|
56
|
+
]
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""Caching and deduplication for docpull."""
|
|
2
|
+
|
|
3
|
+
from .manager import DEFAULT_TTL_DAYS, CacheManager, CacheState, ManifestEntry
|
|
4
|
+
from .streaming_dedup import StreamingDeduplicator
|
|
5
|
+
|
|
6
|
+
__all__ = [
|
|
7
|
+
"CacheManager",
|
|
8
|
+
"CacheState",
|
|
9
|
+
"ManifestEntry",
|
|
10
|
+
"StreamingDeduplicator",
|
|
11
|
+
"DEFAULT_TTL_DAYS",
|
|
12
|
+
]
|