docpull 1.3.0__tar.gz → 2.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docpull-2.0.0/PKG-INFO +207 -0
- docpull-2.0.0/README.md +130 -0
- {docpull-1.3.0 → docpull-2.0.0}/pyproject.toml +35 -5
- docpull-2.0.0/src/docpull/__init__.py +56 -0
- docpull-2.0.0/src/docpull/cache/__init__.py +12 -0
- docpull-2.0.0/src/docpull/cache/manager.py +388 -0
- docpull-2.0.0/src/docpull/cache/streaming_dedup.py +135 -0
- docpull-2.0.0/src/docpull/cli.py +408 -0
- docpull-2.0.0/src/docpull/concurrency/__init__.py +15 -0
- docpull-2.0.0/src/docpull/concurrency/browser_pool.py +337 -0
- docpull-2.0.0/src/docpull/concurrency/manager.py +111 -0
- docpull-2.0.0/src/docpull/conversion/__init__.py +15 -0
- docpull-2.0.0/src/docpull/conversion/extractor.py +246 -0
- docpull-2.0.0/src/docpull/conversion/markdown.py +201 -0
- docpull-2.0.0/src/docpull/conversion/protocols.py +46 -0
- docpull-2.0.0/src/docpull/core/__init__.py +5 -0
- docpull-2.0.0/src/docpull/core/fetcher.py +501 -0
- docpull-2.0.0/src/docpull/discovery/__init__.py +29 -0
- docpull-2.0.0/src/docpull/discovery/composite.py +127 -0
- docpull-2.0.0/src/docpull/discovery/crawler.py +242 -0
- docpull-2.0.0/src/docpull/discovery/filters.py +230 -0
- docpull-2.0.0/src/docpull/discovery/protocols.py +52 -0
- docpull-2.0.0/src/docpull/discovery/sitemap.py +258 -0
- docpull-2.0.0/src/docpull/http/__init__.py +12 -0
- docpull-2.0.0/src/docpull/http/client.py +321 -0
- docpull-2.0.0/src/docpull/http/protocols.py +76 -0
- docpull-2.0.0/src/docpull/http/rate_limiter.py +148 -0
- {docpull-1.3.0 → docpull-2.0.0/src}/docpull/metadata_extractor.py +3 -3
- docpull-2.0.0/src/docpull/models/__init__.py +37 -0
- docpull-2.0.0/src/docpull/models/config.py +265 -0
- docpull-2.0.0/src/docpull/models/events.py +145 -0
- docpull-2.0.0/src/docpull/models/profiles.py +101 -0
- docpull-2.0.0/src/docpull/pipeline/__init__.py +5 -0
- docpull-2.0.0/src/docpull/pipeline/base.py +187 -0
- docpull-2.0.0/src/docpull/pipeline/steps/__init__.py +17 -0
- docpull-2.0.0/src/docpull/pipeline/steps/browser_fetch.py +141 -0
- docpull-2.0.0/src/docpull/pipeline/steps/convert.py +134 -0
- docpull-2.0.0/src/docpull/pipeline/steps/dedup.py +96 -0
- docpull-2.0.0/src/docpull/pipeline/steps/fetch.py +192 -0
- docpull-2.0.0/src/docpull/pipeline/steps/metadata.py +139 -0
- docpull-2.0.0/src/docpull/pipeline/steps/save.py +167 -0
- docpull-2.0.0/src/docpull/pipeline/steps/validate.py +140 -0
- docpull-2.0.0/src/docpull/security/__init__.py +6 -0
- docpull-2.0.0/src/docpull/security/robots.py +192 -0
- docpull-2.0.0/src/docpull/security/url_validator.py +174 -0
- docpull-2.0.0/src/docpull.egg-info/PKG-INFO +207 -0
- docpull-2.0.0/src/docpull.egg-info/SOURCES.txt +59 -0
- docpull-2.0.0/src/docpull.egg-info/dependency_links.txt +1 -0
- docpull-2.0.0/src/docpull.egg-info/entry_points.txt +2 -0
- docpull-2.0.0/src/docpull.egg-info/requires.txt +39 -0
- docpull-2.0.0/src/docpull.egg-info/top_level.txt +1 -0
- docpull-2.0.0/tests/test_v2_conversion.py +294 -0
- docpull-2.0.0/tests/test_v2_discovery.py +355 -0
- docpull-2.0.0/tests/test_v2_integration.py +359 -0
- docpull-2.0.0/tests/test_v2_pipeline.py +369 -0
- docpull-1.3.0/.editorconfig +0 -30
- docpull-1.3.0/.pre-commit-config.yaml +0 -30
- docpull-1.3.0/CHANGELOG.md +0 -403
- docpull-1.3.0/CONTRIBUTING.md +0 -189
- docpull-1.3.0/MANIFEST.in +0 -49
- docpull-1.3.0/Makefile +0 -44
- docpull-1.3.0/PKG-INFO +0 -459
- docpull-1.3.0/README.md +0 -389
- docpull-1.3.0/SECURITY.md +0 -206
- docpull-1.3.0/TROUBLESHOOTING.md +0 -348
- docpull-1.3.0/docpull/__init__.py +0 -15
- docpull-1.3.0/docpull/archive.py +0 -186
- docpull-1.3.0/docpull/cache.py +0 -256
- docpull-1.3.0/docpull/cli.py +0 -851
- docpull-1.3.0/docpull/config.py +0 -316
- docpull-1.3.0/docpull/fetchers/__init__.py +0 -9
- docpull-1.3.0/docpull/fetchers/async_fetcher.py +0 -322
- docpull-1.3.0/docpull/fetchers/base.py +0 -502
- docpull-1.3.0/docpull/fetchers/generic.py +0 -255
- docpull-1.3.0/docpull/fetchers/generic_async.py +0 -290
- docpull-1.3.0/docpull/fetchers/parallel_base.py +0 -93
- docpull-1.3.0/docpull/fetchers/stripe.py +0 -49
- docpull-1.3.0/docpull/formatters/__init__.py +0 -50
- docpull-1.3.0/docpull/formatters/base.py +0 -102
- docpull-1.3.0/docpull/formatters/json.py +0 -100
- docpull-1.3.0/docpull/formatters/markdown.py +0 -49
- docpull-1.3.0/docpull/formatters/sqlite.py +0 -266
- docpull-1.3.0/docpull/formatters/toon.py +0 -90
- docpull-1.3.0/docpull/hooks.py +0 -222
- docpull-1.3.0/docpull/indexer.py +0 -410
- docpull-1.3.0/docpull/metadata.py +0 -224
- docpull-1.3.0/docpull/naming.py +0 -259
- docpull-1.3.0/docpull/orchestrator.py +0 -254
- docpull-1.3.0/docpull/processors/__init__.py +0 -18
- docpull-1.3.0/docpull/processors/base.py +0 -151
- docpull-1.3.0/docpull/processors/content_filter.py +0 -292
- docpull-1.3.0/docpull/processors/deduplicator.py +0 -233
- docpull-1.3.0/docpull/processors/language_filter.py +0 -181
- docpull-1.3.0/docpull/processors/size_limiter.py +0 -221
- docpull-1.3.0/docpull/profiles/__init__.py +0 -53
- docpull-1.3.0/docpull/profiles/base.py +0 -64
- docpull-1.3.0/docpull/profiles/stripe.py +0 -14
- docpull-1.3.0/docpull/sources_config.py +0 -446
- docpull-1.3.0/docpull/utils/__init__.py +0 -6
- docpull-1.3.0/docpull/utils/file_utils.py +0 -97
- docpull-1.3.0/docpull/vcs.py +0 -224
- docpull-1.3.0/docpull.egg-info/SOURCES.txt +0 -64
- docpull-1.3.0/examples/README.md +0 -280
- docpull-1.3.0/examples/deduplication-strategies.yaml +0 -29
- docpull-1.3.0/examples/format-conversion.yaml +0 -25
- docpull-1.3.0/examples/incremental-updates.yaml +0 -26
- docpull-1.3.0/examples/multi-source-optimized.yaml +0 -45
- docpull-1.3.0/examples/selective-crawling.yaml +0 -26
- docpull-1.3.0/examples/simple-optimization.yaml +0 -14
- docpull-1.3.0/requirements.txt +0 -34
- docpull-1.3.0/tests/test_config.py +0 -43
- docpull-1.3.0/tests/test_metadata_extractor.py +0 -233
- docpull-1.3.0/tests/test_orchestrator.py +0 -331
- docpull-1.3.0/tests/test_sources_config.py +0 -348
- {docpull-1.3.0 → docpull-2.0.0}/LICENSE +0 -0
- {docpull-1.3.0 → docpull-2.0.0}/setup.cfg +0 -0
- {docpull-1.3.0 → docpull-2.0.0/src}/docpull/__main__.py +0 -0
- {docpull-1.3.0 → docpull-2.0.0/src}/docpull/doctor.py +0 -0
- {docpull-1.3.0/docpull/utils → docpull-2.0.0/src/docpull}/logging_config.py +0 -0
- {docpull-1.3.0 → docpull-2.0.0/src}/docpull/py.typed +0 -0
docpull-2.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: docpull
|
|
3
|
+
Version: 2.0.0
|
|
4
|
+
Summary: Pull documentation from the web and convert to clean markdown
|
|
5
|
+
Author-email: Zachary Roth <support@raintree.technology>
|
|
6
|
+
Maintainer-email: Raintree Technology <support@raintree.technology>
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
Project-URL: Homepage, https://github.com/raintree-technology/docpull
|
|
9
|
+
Project-URL: Documentation, https://github.com/raintree-technology/docpull#readme
|
|
10
|
+
Project-URL: Repository, https://github.com/raintree-technology/docpull
|
|
11
|
+
Project-URL: Source Code, https://github.com/raintree-technology/docpull
|
|
12
|
+
Project-URL: Bug Tracker, https://github.com/raintree-technology/docpull/issues
|
|
13
|
+
Project-URL: Releases, https://github.com/raintree-technology/docpull/releases
|
|
14
|
+
Keywords: python,markdown,documentation,web-scraping,developer-tools,claude,ai-training-data
|
|
15
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
16
|
+
Classifier: Intended Audience :: Developers
|
|
17
|
+
Classifier: Intended Audience :: Information Technology
|
|
18
|
+
Classifier: Intended Audience :: Science/Research
|
|
19
|
+
Classifier: Intended Audience :: Education
|
|
20
|
+
Classifier: Environment :: Console
|
|
21
|
+
Classifier: Topic :: Documentation
|
|
22
|
+
Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
|
|
23
|
+
Classifier: Topic :: Software Development :: Documentation
|
|
24
|
+
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
25
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
26
|
+
Classifier: Topic :: Utilities
|
|
27
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
28
|
+
Classifier: Natural Language :: English
|
|
29
|
+
Classifier: Operating System :: OS Independent
|
|
30
|
+
Classifier: Programming Language :: Python :: 3
|
|
31
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
32
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
33
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
34
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
35
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
36
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
37
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
38
|
+
Classifier: Typing :: Typed
|
|
39
|
+
Requires-Python: >=3.9
|
|
40
|
+
Description-Content-Type: text/markdown
|
|
41
|
+
License-File: LICENSE
|
|
42
|
+
Requires-Dist: requests>=2.31.0
|
|
43
|
+
Requires-Dist: beautifulsoup4>=4.12.0
|
|
44
|
+
Requires-Dist: html2text>=2020.1.16
|
|
45
|
+
Requires-Dist: defusedxml>=0.7.1
|
|
46
|
+
Requires-Dist: extruct>=0.15.0
|
|
47
|
+
Requires-Dist: aiohttp>=3.9.0
|
|
48
|
+
Requires-Dist: rich>=13.0.0
|
|
49
|
+
Requires-Dist: pyyaml>=6.0
|
|
50
|
+
Requires-Dist: gitpython>=3.1.40
|
|
51
|
+
Requires-Dist: pydantic>=2.0
|
|
52
|
+
Provides-Extra: js
|
|
53
|
+
Requires-Dist: playwright>=1.40.0; extra == "js"
|
|
54
|
+
Provides-Extra: proxy
|
|
55
|
+
Requires-Dist: aiohttp-socks>=0.8.0; extra == "proxy"
|
|
56
|
+
Provides-Extra: normalize
|
|
57
|
+
Requires-Dist: url-normalize>=1.4.0; extra == "normalize"
|
|
58
|
+
Provides-Extra: all
|
|
59
|
+
Requires-Dist: playwright>=1.40.0; extra == "all"
|
|
60
|
+
Requires-Dist: aiohttp-socks>=0.8.0; extra == "all"
|
|
61
|
+
Requires-Dist: url-normalize>=1.4.0; extra == "all"
|
|
62
|
+
Provides-Extra: dev
|
|
63
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
64
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
65
|
+
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
|
|
66
|
+
Requires-Dist: black>=23.0.0; extra == "dev"
|
|
67
|
+
Requires-Dist: mypy>=1.0.0; extra == "dev"
|
|
68
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
69
|
+
Requires-Dist: bandit>=1.7.0; extra == "dev"
|
|
70
|
+
Requires-Dist: pip-audit>=2.0.0; extra == "dev"
|
|
71
|
+
Requires-Dist: pre-commit>=3.0.0; extra == "dev"
|
|
72
|
+
Requires-Dist: types-requests>=2.31.0; extra == "dev"
|
|
73
|
+
Requires-Dist: types-beautifulsoup4>=4.12.0; extra == "dev"
|
|
74
|
+
Requires-Dist: types-defusedxml>=0.7.0; extra == "dev"
|
|
75
|
+
Requires-Dist: types-pyyaml>=6.0.0; extra == "dev"
|
|
76
|
+
Dynamic: license-file
|
|
77
|
+
|
|
78
|
+
# docpull
|
|
79
|
+
|
|
80
|
+
**Pull documentation from any website and convert it to clean, AI-ready Markdown.**
|
|
81
|
+
|
|
82
|
+
[Python 3.9+](https://www.python.org/downloads/)
|
|
83
|
+
[PyPI version](https://badge.fury.io/py/docpull)
|
|
84
|
+
[License: MIT](https://github.com/raintree-technology/docpull/blob/main/LICENSE)
|
|
85
|
+
|
|
86
|
+
## Install
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
pip install docpull
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
## Usage
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
# Basic fetch
|
|
96
|
+
docpull https://docs.example.com
|
|
97
|
+
|
|
98
|
+
# With options
|
|
99
|
+
docpull https://aptos.dev --max-pages 100 --output-dir ./docs
|
|
100
|
+
|
|
101
|
+
# Filter paths
|
|
102
|
+
docpull https://docs.example.com --include-paths "/api/*" --exclude-paths "/changelog/*"
|
|
103
|
+
|
|
104
|
+
# Enable caching for incremental updates
|
|
105
|
+
docpull https://docs.example.com --cache
|
|
106
|
+
|
|
107
|
+
# JavaScript-heavy sites
|
|
108
|
+
pip install docpull[js]
|
|
109
|
+
docpull https://spa-site.com --js
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
## Profiles
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
docpull https://site.com --profile rag # Optimized for RAG/LLM (default)
|
|
116
|
+
docpull https://site.com --profile mirror # Full site archive with caching
|
|
117
|
+
docpull https://site.com --profile quick # Fast sampling (50 pages, depth 2)
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
## Options
|
|
121
|
+
|
|
122
|
+
```
|
|
123
|
+
Crawl:
|
|
124
|
+
--max-pages N Maximum pages to fetch
|
|
125
|
+
--max-depth N Maximum crawl depth
|
|
126
|
+
--include-paths P Only crawl matching URL patterns
|
|
127
|
+
--exclude-paths P Skip matching URL patterns
|
|
128
|
+
--js Enable JavaScript rendering
|
|
129
|
+
|
|
130
|
+
Cache:
|
|
131
|
+
--cache Enable caching for incremental updates
|
|
132
|
+
--cache-dir DIR Cache directory (default: .docpull-cache)
|
|
133
|
+
--cache-ttl DAYS Days before cache expires (default: 30)
|
|
134
|
+
|
|
135
|
+
Content:
|
|
136
|
+
--streaming-dedup Real-time duplicate detection
|
|
137
|
+
--language CODE Filter by language (e.g., en)
|
|
138
|
+
|
|
139
|
+
Output:
|
|
140
|
+
--output-dir, -o DIR Output directory (default: ./docs)
|
|
141
|
+
--dry-run Show what would be fetched
|
|
142
|
+
--verbose, -v Verbose output
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
See `docpull --help` for all options.
|
|
146
|
+
|
|
147
|
+
## Python API
|
|
148
|
+
|
|
149
|
+
```python
|
|
150
|
+
import asyncio
|
|
151
|
+
from docpull import Fetcher, DocpullConfig, ProfileName, EventType
|
|
152
|
+
|
|
153
|
+
async def main():
|
|
154
|
+
config = DocpullConfig(
|
|
155
|
+
url="https://docs.example.com",
|
|
156
|
+
profile=ProfileName.RAG,
|
|
157
|
+
crawl={"max_pages": 100},
|
|
158
|
+
cache={"enabled": True},
|
|
159
|
+
)
|
|
160
|
+
|
|
161
|
+
async with Fetcher(config) as fetcher:
|
|
162
|
+
async for event in fetcher.run():
|
|
163
|
+
if event.type == EventType.FETCH_PROGRESS:
|
|
164
|
+
print(f"{event.current}/{event.total}: {event.url}")
|
|
165
|
+
|
|
166
|
+
print(f"Done: {fetcher.stats.pages_fetched} pages")
|
|
167
|
+
|
|
168
|
+
asyncio.run(main())
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
## Output
|
|
172
|
+
|
|
173
|
+
Each page becomes a Markdown file with YAML frontmatter:
|
|
174
|
+
|
|
175
|
+
```markdown
|
|
176
|
+
---
|
|
177
|
+
title: "Getting Started"
|
|
178
|
+
source: https://docs.example.com/guide
|
|
179
|
+
---
|
|
180
|
+
|
|
181
|
+
# Getting Started
|
|
182
|
+
...
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
## Security
|
|
186
|
+
|
|
187
|
+
- HTTPS-only, mandatory robots.txt compliance
|
|
188
|
+
- Blocks private/internal network IPs
|
|
189
|
+
- Path traversal and XXE protection
|
|
190
|
+
|
|
191
|
+
## Troubleshooting
|
|
192
|
+
|
|
193
|
+
```bash
|
|
194
|
+
docpull --doctor # Check installation
|
|
195
|
+
docpull URL --verbose # Verbose output
|
|
196
|
+
docpull URL --dry-run # Test without downloading
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
## Links
|
|
200
|
+
|
|
201
|
+
- [PyPI](https://pypi.org/project/docpull/)
|
|
202
|
+
- [GitHub](https://github.com/raintree-technology/docpull)
|
|
203
|
+
- [Changelog](https://github.com/raintree-technology/docpull/blob/main/docs/CHANGELOG.md)
|
|
204
|
+
|
|
205
|
+
## License
|
|
206
|
+
|
|
207
|
+
MIT
|
docpull-2.0.0/README.md
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
# docpull
|
|
2
|
+
|
|
3
|
+
**Pull documentation from any website and convert it to clean, AI-ready Markdown.**
|
|
4
|
+
|
|
5
|
+
[Python 3.9+](https://www.python.org/downloads/)
|
|
6
|
+
[PyPI version](https://badge.fury.io/py/docpull)
|
|
7
|
+
[License: MIT](https://github.com/raintree-technology/docpull/blob/main/LICENSE)
|
|
8
|
+
|
|
9
|
+
## Install
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
pip install docpull
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
## Usage
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
# Basic fetch
|
|
19
|
+
docpull https://docs.example.com
|
|
20
|
+
|
|
21
|
+
# With options
|
|
22
|
+
docpull https://aptos.dev --max-pages 100 --output-dir ./docs
|
|
23
|
+
|
|
24
|
+
# Filter paths
|
|
25
|
+
docpull https://docs.example.com --include-paths "/api/*" --exclude-paths "/changelog/*"
|
|
26
|
+
|
|
27
|
+
# Enable caching for incremental updates
|
|
28
|
+
docpull https://docs.example.com --cache
|
|
29
|
+
|
|
30
|
+
# JavaScript-heavy sites
|
|
31
|
+
pip install docpull[js]
|
|
32
|
+
docpull https://spa-site.com --js
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## Profiles
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
docpull https://site.com --profile rag # Optimized for RAG/LLM (default)
|
|
39
|
+
docpull https://site.com --profile mirror # Full site archive with caching
|
|
40
|
+
docpull https://site.com --profile quick # Fast sampling (50 pages, depth 2)
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## Options
|
|
44
|
+
|
|
45
|
+
```
|
|
46
|
+
Crawl:
|
|
47
|
+
--max-pages N Maximum pages to fetch
|
|
48
|
+
--max-depth N Maximum crawl depth
|
|
49
|
+
--include-paths P Only crawl matching URL patterns
|
|
50
|
+
--exclude-paths P Skip matching URL patterns
|
|
51
|
+
--js Enable JavaScript rendering
|
|
52
|
+
|
|
53
|
+
Cache:
|
|
54
|
+
--cache Enable caching for incremental updates
|
|
55
|
+
--cache-dir DIR Cache directory (default: .docpull-cache)
|
|
56
|
+
--cache-ttl DAYS Days before cache expires (default: 30)
|
|
57
|
+
|
|
58
|
+
Content:
|
|
59
|
+
--streaming-dedup Real-time duplicate detection
|
|
60
|
+
--language CODE Filter by language (e.g., en)
|
|
61
|
+
|
|
62
|
+
Output:
|
|
63
|
+
--output-dir, -o DIR Output directory (default: ./docs)
|
|
64
|
+
--dry-run Show what would be fetched
|
|
65
|
+
--verbose, -v Verbose output
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
See `docpull --help` for all options.
|
|
69
|
+
|
|
70
|
+
## Python API
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
import asyncio
|
|
74
|
+
from docpull import Fetcher, DocpullConfig, ProfileName, EventType
|
|
75
|
+
|
|
76
|
+
async def main():
|
|
77
|
+
config = DocpullConfig(
|
|
78
|
+
url="https://docs.example.com",
|
|
79
|
+
profile=ProfileName.RAG,
|
|
80
|
+
crawl={"max_pages": 100},
|
|
81
|
+
cache={"enabled": True},
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
async with Fetcher(config) as fetcher:
|
|
85
|
+
async for event in fetcher.run():
|
|
86
|
+
if event.type == EventType.FETCH_PROGRESS:
|
|
87
|
+
print(f"{event.current}/{event.total}: {event.url}")
|
|
88
|
+
|
|
89
|
+
print(f"Done: {fetcher.stats.pages_fetched} pages")
|
|
90
|
+
|
|
91
|
+
asyncio.run(main())
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
## Output
|
|
95
|
+
|
|
96
|
+
Each page becomes a Markdown file with YAML frontmatter:
|
|
97
|
+
|
|
98
|
+
```markdown
|
|
99
|
+
---
|
|
100
|
+
title: "Getting Started"
|
|
101
|
+
source: https://docs.example.com/guide
|
|
102
|
+
---
|
|
103
|
+
|
|
104
|
+
# Getting Started
|
|
105
|
+
...
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
## Security
|
|
109
|
+
|
|
110
|
+
- HTTPS-only, mandatory robots.txt compliance
|
|
111
|
+
- Blocks private/internal network IPs
|
|
112
|
+
- Path traversal and XXE protection
|
|
113
|
+
|
|
114
|
+
## Troubleshooting
|
|
115
|
+
|
|
116
|
+
```bash
|
|
117
|
+
docpull --doctor # Check installation
|
|
118
|
+
docpull URL --verbose # Verbose output
|
|
119
|
+
docpull URL --dry-run # Test without downloading
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
## Links
|
|
123
|
+
|
|
124
|
+
- [PyPI](https://pypi.org/project/docpull/)
|
|
125
|
+
- [GitHub](https://github.com/raintree-technology/docpull)
|
|
126
|
+
- [Changelog](https://github.com/raintree-technology/docpull/blob/main/docs/CHANGELOG.md)
|
|
127
|
+
|
|
128
|
+
## License
|
|
129
|
+
|
|
130
|
+
MIT
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "docpull"
|
|
7
|
-
version = "1.3.0"
|
|
7
|
+
version = "2.0.0"
|
|
8
8
|
dynamic = []
|
|
9
9
|
description = "Pull documentation from the web and convert to clean markdown"
|
|
10
10
|
readme = {file = "README.md", content-type = "text/markdown"}
|
|
@@ -72,14 +72,23 @@ dependencies = [
|
|
|
72
72
|
"rich>=13.0.0",
|
|
73
73
|
"pyyaml>=6.0",
|
|
74
74
|
"gitpython>=3.1.40",
|
|
75
|
+
"pydantic>=2.0",
|
|
75
76
|
]
|
|
76
77
|
|
|
77
78
|
[project.optional-dependencies]
|
|
78
79
|
js = [
|
|
79
80
|
"playwright>=1.40.0",
|
|
80
81
|
]
|
|
82
|
+
proxy = [
|
|
83
|
+
"aiohttp-socks>=0.8.0",
|
|
84
|
+
]
|
|
85
|
+
normalize = [
|
|
86
|
+
"url-normalize>=1.4.0",
|
|
87
|
+
]
|
|
81
88
|
all = [
|
|
82
89
|
"playwright>=1.40.0",
|
|
90
|
+
"aiohttp-socks>=0.8.0",
|
|
91
|
+
"url-normalize>=1.4.0",
|
|
83
92
|
]
|
|
84
93
|
dev = [
|
|
85
94
|
"pytest>=7.0.0",
|
|
@@ -106,10 +115,10 @@ Documentation = "https://github.com/raintree-technology/docpull#readme"
|
|
|
106
115
|
Repository = "https://github.com/raintree-technology/docpull"
|
|
107
116
|
"Source Code" = "https://github.com/raintree-technology/docpull"
|
|
108
117
|
"Bug Tracker" = "https://github.com/raintree-technology/docpull/issues"
|
|
109
|
-
"
|
|
118
|
+
"Releases" = "https://github.com/raintree-technology/docpull/releases"
|
|
110
119
|
|
|
111
120
|
[tool.setuptools.packages.find]
|
|
112
|
-
where = ["."]
|
|
121
|
+
where = ["src"]
|
|
113
122
|
include = ["docpull*"]
|
|
114
123
|
|
|
115
124
|
[tool.setuptools.package-data]
|
|
@@ -125,7 +134,7 @@ target-version = "py39"
|
|
|
125
134
|
|
|
126
135
|
[tool.ruff.lint]
|
|
127
136
|
select = ["E", "F", "W", "I", "N", "UP", "B", "A", "C4", "SIM"]
|
|
128
|
-
ignore = []
|
|
137
|
+
ignore = ["A003"] # Allow 'type' and 'format' as field names in data models
|
|
129
138
|
|
|
130
139
|
[tool.mypy]
|
|
131
140
|
python_version = "3.9"
|
|
@@ -136,7 +145,14 @@ disallow_any_unimported = true
|
|
|
136
145
|
no_implicit_optional = true
|
|
137
146
|
strict_equality = true
|
|
138
147
|
warn_redundant_casts = true
|
|
148
|
+
ignore_missing_imports = true
|
|
139
149
|
exclude = ["tests/"]
|
|
150
|
+
plugins = ["pydantic.mypy"]
|
|
151
|
+
|
|
152
|
+
[tool.pydantic-mypy]
|
|
153
|
+
init_forbid_extra = true
|
|
154
|
+
init_typed = true
|
|
155
|
+
warn_required_dynamic_aliases = true
|
|
140
156
|
|
|
141
157
|
[[tool.mypy.overrides]]
|
|
142
158
|
module = "playwright.*"
|
|
@@ -146,6 +162,20 @@ ignore_missing_imports = true
|
|
|
146
162
|
module = "extruct.*"
|
|
147
163
|
ignore_missing_imports = true
|
|
148
164
|
|
|
165
|
+
[[tool.mypy.overrides]]
|
|
166
|
+
module = "url_normalize"
|
|
167
|
+
ignore_missing_imports = true
|
|
168
|
+
|
|
169
|
+
[[tool.mypy.overrides]]
|
|
170
|
+
module = "docpull.models.*"
|
|
171
|
+
disallow_any_unimported = false
|
|
172
|
+
warn_return_any = false
|
|
173
|
+
|
|
174
|
+
[[tool.mypy.overrides]]
|
|
175
|
+
module = "docpull.concurrency.browser_pool"
|
|
176
|
+
disallow_any_unimported = false
|
|
177
|
+
warn_return_any = false
|
|
178
|
+
|
|
149
179
|
[[tool.mypy.overrides]]
|
|
150
180
|
module = "tests.*"
|
|
151
181
|
disallow_untyped_defs = false
|
|
@@ -167,7 +197,7 @@ markers = [
|
|
|
167
197
|
]
|
|
168
198
|
|
|
169
199
|
[tool.coverage.run]
|
|
170
|
-
source = ["docpull"]
|
|
200
|
+
source = ["src/docpull"]
|
|
171
201
|
omit = ["tests/*", "*/test_*.py"]
|
|
172
202
|
|
|
173
203
|
[tool.coverage.report]
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""
|
|
2
|
+
docpull - Fetch and convert documentation from any URL to markdown.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
from docpull import Fetcher, DocpullConfig, ProfileName
|
|
6
|
+
|
|
7
|
+
config = DocpullConfig(
|
|
8
|
+
url="https://docs.example.com",
|
|
9
|
+
profile=ProfileName.RAG,
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
async with Fetcher(config) as fetcher:
|
|
13
|
+
async for event in fetcher.run():
|
|
14
|
+
print(event)
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
__version__ = "2.0.0"
|
|
18
|
+
|
|
19
|
+
from .cache import CacheManager, StreamingDeduplicator
|
|
20
|
+
from .core.fetcher import Fetcher, fetch_blocking
|
|
21
|
+
from .models.config import (
|
|
22
|
+
CacheConfig,
|
|
23
|
+
ContentFilterConfig,
|
|
24
|
+
CrawlConfig,
|
|
25
|
+
DocpullConfig,
|
|
26
|
+
IntegrationConfig,
|
|
27
|
+
NetworkConfig,
|
|
28
|
+
OutputConfig,
|
|
29
|
+
PerformanceConfig,
|
|
30
|
+
ProfileName,
|
|
31
|
+
)
|
|
32
|
+
from .models.events import EventType, FetchEvent, FetchStats
|
|
33
|
+
|
|
34
|
+
__all__ = [
|
|
35
|
+
"__version__",
|
|
36
|
+
# Core
|
|
37
|
+
"Fetcher",
|
|
38
|
+
"fetch_blocking",
|
|
39
|
+
# Config
|
|
40
|
+
"DocpullConfig",
|
|
41
|
+
"ProfileName",
|
|
42
|
+
"CrawlConfig",
|
|
43
|
+
"ContentFilterConfig",
|
|
44
|
+
"OutputConfig",
|
|
45
|
+
"NetworkConfig",
|
|
46
|
+
"PerformanceConfig",
|
|
47
|
+
"IntegrationConfig",
|
|
48
|
+
"CacheConfig",
|
|
49
|
+
# Events
|
|
50
|
+
"EventType",
|
|
51
|
+
"FetchEvent",
|
|
52
|
+
"FetchStats",
|
|
53
|
+
# Cache
|
|
54
|
+
"CacheManager",
|
|
55
|
+
"StreamingDeduplicator",
|
|
56
|
+
]
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""Caching and deduplication for docpull."""
|
|
2
|
+
|
|
3
|
+
from .manager import DEFAULT_TTL_DAYS, CacheManager, CacheState, ManifestEntry
|
|
4
|
+
from .streaming_dedup import StreamingDeduplicator
|
|
5
|
+
|
|
6
|
+
__all__ = [
|
|
7
|
+
"CacheManager",
|
|
8
|
+
"CacheState",
|
|
9
|
+
"ManifestEntry",
|
|
10
|
+
"StreamingDeduplicator",
|
|
11
|
+
"DEFAULT_TTL_DAYS",
|
|
12
|
+
]
|