docpull 1.0.1__tar.gz → 1.0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docpull-1.0.2/PKG-INFO +215 -0
- docpull-1.0.2/README.md +148 -0
- {docpull-1.0.1 → docpull-1.0.2}/docpull/__init__.py +1 -1
- docpull-1.0.2/docpull.egg-info/PKG-INFO +215 -0
- {docpull-1.0.1 → docpull-1.0.2}/pyproject.toml +1 -1
- docpull-1.0.1/PKG-INFO +0 -440
- docpull-1.0.1/README.md +0 -373
- docpull-1.0.1/docpull.egg-info/PKG-INFO +0 -440
- {docpull-1.0.1 → docpull-1.0.2}/LICENSE +0 -0
- {docpull-1.0.1 → docpull-1.0.2}/docpull/__main__.py +0 -0
- {docpull-1.0.1 → docpull-1.0.2}/docpull/cli.py +0 -0
- {docpull-1.0.1 → docpull-1.0.2}/docpull/config.py +0 -0
- {docpull-1.0.1 → docpull-1.0.2}/docpull/fetchers/__init__.py +0 -0
- {docpull-1.0.1 → docpull-1.0.2}/docpull/fetchers/async_fetcher.py +0 -0
- {docpull-1.0.1 → docpull-1.0.2}/docpull/fetchers/base.py +0 -0
- {docpull-1.0.1 → docpull-1.0.2}/docpull/fetchers/bun.py +0 -0
- {docpull-1.0.1 → docpull-1.0.2}/docpull/fetchers/d3.py +0 -0
- {docpull-1.0.1 → docpull-1.0.2}/docpull/fetchers/generic.py +0 -0
- {docpull-1.0.1 → docpull-1.0.2}/docpull/fetchers/generic_async.py +0 -0
- {docpull-1.0.1 → docpull-1.0.2}/docpull/fetchers/nextjs.py +0 -0
- {docpull-1.0.1 → docpull-1.0.2}/docpull/fetchers/parallel_base.py +0 -0
- {docpull-1.0.1 → docpull-1.0.2}/docpull/fetchers/plaid.py +0 -0
- {docpull-1.0.1 → docpull-1.0.2}/docpull/fetchers/react.py +0 -0
- {docpull-1.0.1 → docpull-1.0.2}/docpull/fetchers/stripe.py +0 -0
- {docpull-1.0.1 → docpull-1.0.2}/docpull/fetchers/tailwind.py +0 -0
- {docpull-1.0.1 → docpull-1.0.2}/docpull/fetchers/turborepo.py +0 -0
- {docpull-1.0.1 → docpull-1.0.2}/docpull/profiles/__init__.py +0 -0
- {docpull-1.0.1 → docpull-1.0.2}/docpull/profiles/base.py +0 -0
- {docpull-1.0.1 → docpull-1.0.2}/docpull/profiles/bun.py +0 -0
- {docpull-1.0.1 → docpull-1.0.2}/docpull/profiles/d3.py +0 -0
- {docpull-1.0.1 → docpull-1.0.2}/docpull/profiles/nextjs.py +0 -0
- {docpull-1.0.1 → docpull-1.0.2}/docpull/profiles/plaid.py +0 -0
- {docpull-1.0.1 → docpull-1.0.2}/docpull/profiles/react.py +0 -0
- {docpull-1.0.1 → docpull-1.0.2}/docpull/profiles/stripe.py +0 -0
- {docpull-1.0.1 → docpull-1.0.2}/docpull/profiles/tailwind.py +0 -0
- {docpull-1.0.1 → docpull-1.0.2}/docpull/profiles/turborepo.py +0 -0
- {docpull-1.0.1 → docpull-1.0.2}/docpull/py.typed +0 -0
- {docpull-1.0.1 → docpull-1.0.2}/docpull/utils/__init__.py +0 -0
- {docpull-1.0.1 → docpull-1.0.2}/docpull/utils/file_utils.py +0 -0
- {docpull-1.0.1 → docpull-1.0.2}/docpull/utils/logging_config.py +0 -0
- {docpull-1.0.1 → docpull-1.0.2}/docpull.egg-info/SOURCES.txt +0 -0
- {docpull-1.0.1 → docpull-1.0.2}/docpull.egg-info/dependency_links.txt +0 -0
- {docpull-1.0.1 → docpull-1.0.2}/docpull.egg-info/entry_points.txt +0 -0
- {docpull-1.0.1 → docpull-1.0.2}/docpull.egg-info/requires.txt +0 -0
- {docpull-1.0.1 → docpull-1.0.2}/docpull.egg-info/top_level.txt +0 -0
- {docpull-1.0.1 → docpull-1.0.2}/setup.cfg +0 -0
- {docpull-1.0.1 → docpull-1.0.2}/tests/test_async_fetcher.py +0 -0
- {docpull-1.0.1 → docpull-1.0.2}/tests/test_config.py +0 -0
- {docpull-1.0.1 → docpull-1.0.2}/tests/test_fetchers.py +0 -0
docpull-1.0.2/PKG-INFO
ADDED
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: docpull
|
|
3
|
+
Version: 1.0.2
|
|
4
|
+
Summary: Pull documentation from the web and convert to clean markdown
|
|
5
|
+
Author-email: Zachary Roth <support@raintree.technology>
|
|
6
|
+
Maintainer-email: Raintree Technology <support@raintree.technology>
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
Project-URL: Homepage, https://github.com/raintree-technology/docpull
|
|
9
|
+
Project-URL: Documentation, https://github.com/raintree-technology/docpull#readme
|
|
10
|
+
Project-URL: Repository, https://github.com/raintree-technology/docpull
|
|
11
|
+
Project-URL: Source Code, https://github.com/raintree-technology/docpull
|
|
12
|
+
Project-URL: Bug Tracker, https://github.com/raintree-technology/docpull/issues
|
|
13
|
+
Project-URL: Changelog, https://github.com/raintree-technology/docpull/blob/main/CHANGELOG.md
|
|
14
|
+
Keywords: python,markdown,documentation,web-scraping,developer-tools,claude,ai-training-data
|
|
15
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
16
|
+
Classifier: Intended Audience :: Developers
|
|
17
|
+
Classifier: Intended Audience :: Information Technology
|
|
18
|
+
Classifier: Intended Audience :: Science/Research
|
|
19
|
+
Classifier: Intended Audience :: Education
|
|
20
|
+
Classifier: Environment :: Console
|
|
21
|
+
Classifier: Topic :: Documentation
|
|
22
|
+
Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
|
|
23
|
+
Classifier: Topic :: Software Development :: Documentation
|
|
24
|
+
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
25
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
26
|
+
Classifier: Topic :: Utilities
|
|
27
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
28
|
+
Classifier: Natural Language :: English
|
|
29
|
+
Classifier: Operating System :: OS Independent
|
|
30
|
+
Classifier: Programming Language :: Python :: 3
|
|
31
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
32
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
33
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
34
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
35
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
36
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
37
|
+
Classifier: Typing :: Typed
|
|
38
|
+
Requires-Python: >=3.9
|
|
39
|
+
Description-Content-Type: text/markdown
|
|
40
|
+
License-File: LICENSE
|
|
41
|
+
Requires-Dist: requests>=2.31.0
|
|
42
|
+
Requires-Dist: beautifulsoup4>=4.12.0
|
|
43
|
+
Requires-Dist: html2text>=2020.1.16
|
|
44
|
+
Requires-Dist: defusedxml>=0.7.1
|
|
45
|
+
Requires-Dist: aiohttp>=3.9.0
|
|
46
|
+
Requires-Dist: rich>=13.0.0
|
|
47
|
+
Provides-Extra: yaml
|
|
48
|
+
Requires-Dist: pyyaml>=6.0; extra == "yaml"
|
|
49
|
+
Provides-Extra: js
|
|
50
|
+
Requires-Dist: playwright>=1.40.0; extra == "js"
|
|
51
|
+
Provides-Extra: all
|
|
52
|
+
Requires-Dist: pyyaml>=6.0; extra == "all"
|
|
53
|
+
Requires-Dist: playwright>=1.40.0; extra == "all"
|
|
54
|
+
Provides-Extra: dev
|
|
55
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
56
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
57
|
+
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
|
|
58
|
+
Requires-Dist: black>=23.0.0; extra == "dev"
|
|
59
|
+
Requires-Dist: mypy>=1.0.0; extra == "dev"
|
|
60
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
61
|
+
Requires-Dist: bandit>=1.7.0; extra == "dev"
|
|
62
|
+
Requires-Dist: pip-audit>=2.0.0; extra == "dev"
|
|
63
|
+
Requires-Dist: types-requests>=2.31.0; extra == "dev"
|
|
64
|
+
Requires-Dist: types-beautifulsoup4>=4.12.0; extra == "dev"
|
|
65
|
+
Requires-Dist: types-aiohttp>=3.9.0; extra == "dev"
|
|
66
|
+
Dynamic: license-file
|
|
67
|
+
|
|
68
|
+
# docpull
|
|
69
|
+
|
|
70
|
+
**Pull documentation from any website and convert it into clean, AI-ready Markdown.**
|
|
71
|
+
Fast, type-safe, secure, and optimized for building knowledge bases or training datasets.
|
|
72
|
+
|
|
73
|
+
[](https://www.python.org/downloads/)
|
|
74
|
+
[](https://badge.fury.io/py/docpull)
|
|
75
|
+
[](https://github.com/raintree-technology/docpull/blob/main/LICENSE)
|
|
76
|
+
[](https://github.com/psf/black)
|
|
77
|
+
[](http://mypy-lang.org/)
|
|
78
|
+
[](https://github.com/PyCQA/bandit)
|
|
79
|
+
|
|
80
|
+
## Why docpull?
|
|
81
|
+
|
|
82
|
+
Unlike tools such as wget or HTTrack, docpull extracts only the main content, removing ads, navbars, and clutter. Output is clean Markdown with optional YAML frontmatter—ideal for RAG systems, offline docs, or ML pipelines.
|
|
83
|
+
|
|
84
|
+
## Key Features
|
|
85
|
+
|
|
86
|
+
- Works on any documentation site
|
|
87
|
+
- Smart extraction of main content
|
|
88
|
+
- Async + parallel fetching (up to 10× faster)
|
|
89
|
+
- Optional JavaScript rendering via Playwright
|
|
90
|
+
- Sitemap + link crawling
|
|
91
|
+
- URL-based filtering (include/exclude)
|
|
92
|
+
- Rate limiting, timeouts, content-type checks
|
|
93
|
+
- Saves docs in structured Markdown with YAML metadata
|
|
94
|
+
- Optimized profiles for popular platforms (Stripe, Next.js, React, Plaid, Tailwind, etc.)
|
|
95
|
+
|
|
96
|
+
## Quick Start
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
pip install docpull
|
|
100
|
+
docpull https://aptos.dev
|
|
101
|
+
docpull stripe # use a built-in profile
|
|
102
|
+
docpull https://site.com/docs --max-pages 100 --max-concurrent 20
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
### JavaScript-heavy sites
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
pip install docpull[js]
|
|
109
|
+
python -m playwright install chromium
|
|
110
|
+
docpull https://site.com --js
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
## Python API
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
from docpull import GenericAsyncFetcher
|
|
117
|
+
|
|
118
|
+
fetcher = GenericAsyncFetcher(
|
|
119
|
+
url_or_profile="https://aptos.dev",
|
|
120
|
+
output_dir="./docs",
|
|
121
|
+
max_pages=100,
|
|
122
|
+
max_concurrent=20,
|
|
123
|
+
)
|
|
124
|
+
fetcher.fetch()
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
## Common Options
|
|
128
|
+
|
|
129
|
+
- `--max-pages N` – limit crawl size
|
|
130
|
+
- `--max-depth N` – restrict link depth
|
|
131
|
+
- `--max-concurrent N` – control parallel fetches
|
|
132
|
+
- `--js` – enable Playwright rendering
|
|
133
|
+
- `--output-dir DIR`
|
|
134
|
+
- `--rate-limit X`
|
|
135
|
+
- `--no-skip-existing`
|
|
136
|
+
- `--dry-run`
|
|
137
|
+
|
|
138
|
+
## Performance
|
|
139
|
+
|
|
140
|
+
Async fetching drastically reduces runtime:
|
|
141
|
+
|
|
142
|
+
| Pages | Sync | Async | Speedup |
|
|
143
|
+
|-------|------|-------|---------|
|
|
144
|
+
| 50 | ~50s | ~6s | 8× faster |
|
|
145
|
+
|
|
146
|
+
Higher concurrency yields even better results.
|
|
147
|
+
|
|
148
|
+
## Output Format
|
|
149
|
+
|
|
150
|
+
Each downloaded page becomes a Markdown file:
|
|
151
|
+
|
|
152
|
+
```markdown
|
|
153
|
+
---
|
|
154
|
+
url: https://stripe.com/docs/payments
|
|
155
|
+
fetched: 2025-11-13
|
|
156
|
+
---
|
|
157
|
+
# Payment Intents
|
|
158
|
+
...
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
Directory layout mirrors the target site's structure.
|
|
162
|
+
|
|
163
|
+
## Configuration File (Optional)
|
|
164
|
+
|
|
165
|
+
```yaml
|
|
166
|
+
output_dir: ./docs
|
|
167
|
+
rate_limit: 0.5
|
|
168
|
+
sources:
|
|
169
|
+
- stripe
|
|
170
|
+
- nextjs
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
Run with:
|
|
174
|
+
```bash
|
|
175
|
+
docpull --config config.yaml
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
## Custom Profiles
|
|
179
|
+
|
|
180
|
+
Easily define profiles for frequently scraped sites.
|
|
181
|
+
|
|
182
|
+
```python
|
|
183
|
+
from docpull.profiles.base import SiteProfile
|
|
184
|
+
|
|
185
|
+
MY_PROFILE = SiteProfile(
|
|
186
|
+
name="mysite",
|
|
187
|
+
domains={"docs.mysite.com"},
|
|
188
|
+
include_patterns=["/docs/", "/api/"],
|
|
189
|
+
)
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
## Security
|
|
193
|
+
|
|
194
|
+
- HTTPS-only
|
|
195
|
+
- Blocks private network IPs
|
|
196
|
+
- 50MB page size limit
|
|
197
|
+
- Timeout controls
|
|
198
|
+
- Validates content-type
|
|
199
|
+
- Playwright sandboxing
|
|
200
|
+
|
|
201
|
+
## Troubleshooting
|
|
202
|
+
|
|
203
|
+
- **Site requires JS**: install Playwright + `--js`
|
|
204
|
+
- **Slow or rate limited**: lower concurrency or raise `--rate-limit`
|
|
205
|
+
- **Large sites**: set `--max-pages`
|
|
206
|
+
|
|
207
|
+
## Links
|
|
208
|
+
|
|
209
|
+
- [PyPI](https://pypi.org/project/docpull/)
|
|
210
|
+
- [GitHub](https://github.com/raintree-technology/docpull)
|
|
211
|
+
- [Issues](https://github.com/raintree-technology/docpull/issues)
|
|
212
|
+
|
|
213
|
+
## License
|
|
214
|
+
|
|
215
|
+
MIT License - see [LICENSE](LICENSE) file for details
|
docpull-1.0.2/README.md
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
# docpull
|
|
2
|
+
|
|
3
|
+
**Pull documentation from any website and convert it into clean, AI-ready Markdown.**
|
|
4
|
+
Fast, type-safe, secure, and optimized for building knowledge bases or training datasets.
|
|
5
|
+
|
|
6
|
+
[](https://www.python.org/downloads/)
|
|
7
|
+
[](https://badge.fury.io/py/docpull)
|
|
8
|
+
[](https://github.com/raintree-technology/docpull/blob/main/LICENSE)
|
|
9
|
+
[](https://github.com/psf/black)
|
|
10
|
+
[](http://mypy-lang.org/)
|
|
11
|
+
[](https://github.com/PyCQA/bandit)
|
|
12
|
+
|
|
13
|
+
## Why docpull?
|
|
14
|
+
|
|
15
|
+
Unlike tools such as wget or HTTrack, docpull extracts only the main content, removing ads, navbars, and clutter. Output is clean Markdown with optional YAML frontmatter—ideal for RAG systems, offline docs, or ML pipelines.
|
|
16
|
+
|
|
17
|
+
## Key Features
|
|
18
|
+
|
|
19
|
+
- Works on any documentation site
|
|
20
|
+
- Smart extraction of main content
|
|
21
|
+
- Async + parallel fetching (up to 10× faster)
|
|
22
|
+
- Optional JavaScript rendering via Playwright
|
|
23
|
+
- Sitemap + link crawling
|
|
24
|
+
- URL-based filtering (include/exclude)
|
|
25
|
+
- Rate limiting, timeouts, content-type checks
|
|
26
|
+
- Saves docs in structured Markdown with YAML metadata
|
|
27
|
+
- Optimized profiles for popular platforms (Stripe, Next.js, React, Plaid, Tailwind, etc.)
|
|
28
|
+
|
|
29
|
+
## Quick Start
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
pip install docpull
|
|
33
|
+
docpull https://aptos.dev
|
|
34
|
+
docpull stripe # use a built-in profile
|
|
35
|
+
docpull https://site.com/docs --max-pages 100 --max-concurrent 20
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
### JavaScript-heavy sites
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
pip install docpull[js]
|
|
42
|
+
python -m playwright install chromium
|
|
43
|
+
docpull https://site.com --js
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## Python API
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
from docpull import GenericAsyncFetcher
|
|
50
|
+
|
|
51
|
+
fetcher = GenericAsyncFetcher(
|
|
52
|
+
url_or_profile="https://aptos.dev",
|
|
53
|
+
output_dir="./docs",
|
|
54
|
+
max_pages=100,
|
|
55
|
+
max_concurrent=20,
|
|
56
|
+
)
|
|
57
|
+
fetcher.fetch()
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Common Options
|
|
61
|
+
|
|
62
|
+
- `--max-pages N` – limit crawl size
|
|
63
|
+
- `--max-depth N` – restrict link depth
|
|
64
|
+
- `--max-concurrent N` – control parallel fetches
|
|
65
|
+
- `--js` – enable Playwright rendering
|
|
66
|
+
- `--output-dir DIR`
|
|
67
|
+
- `--rate-limit X`
|
|
68
|
+
- `--no-skip-existing`
|
|
69
|
+
- `--dry-run`
|
|
70
|
+
|
|
71
|
+
## Performance
|
|
72
|
+
|
|
73
|
+
Async fetching drastically reduces runtime:
|
|
74
|
+
|
|
75
|
+
| Pages | Sync | Async | Speedup |
|
|
76
|
+
|-------|------|-------|---------|
|
|
77
|
+
| 50 | ~50s | ~6s | 8× faster |
|
|
78
|
+
|
|
79
|
+
Higher concurrency yields even better results.
|
|
80
|
+
|
|
81
|
+
## Output Format
|
|
82
|
+
|
|
83
|
+
Each downloaded page becomes a Markdown file:
|
|
84
|
+
|
|
85
|
+
```markdown
|
|
86
|
+
---
|
|
87
|
+
url: https://stripe.com/docs/payments
|
|
88
|
+
fetched: 2025-11-13
|
|
89
|
+
---
|
|
90
|
+
# Payment Intents
|
|
91
|
+
...
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
Directory layout mirrors the target site's structure.
|
|
95
|
+
|
|
96
|
+
## Configuration File (Optional)
|
|
97
|
+
|
|
98
|
+
```yaml
|
|
99
|
+
output_dir: ./docs
|
|
100
|
+
rate_limit: 0.5
|
|
101
|
+
sources:
|
|
102
|
+
- stripe
|
|
103
|
+
- nextjs
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
Run with:
|
|
107
|
+
```bash
|
|
108
|
+
docpull --config config.yaml
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## Custom Profiles
|
|
112
|
+
|
|
113
|
+
Easily define profiles for frequently scraped sites.
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
from docpull.profiles.base import SiteProfile
|
|
117
|
+
|
|
118
|
+
MY_PROFILE = SiteProfile(
|
|
119
|
+
name="mysite",
|
|
120
|
+
domains={"docs.mysite.com"},
|
|
121
|
+
include_patterns=["/docs/", "/api/"],
|
|
122
|
+
)
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
## Security
|
|
126
|
+
|
|
127
|
+
- HTTPS-only
|
|
128
|
+
- Blocks private network IPs
|
|
129
|
+
- 50MB page size limit
|
|
130
|
+
- Timeout controls
|
|
131
|
+
- Validates content-type
|
|
132
|
+
- Playwright sandboxing
|
|
133
|
+
|
|
134
|
+
## Troubleshooting
|
|
135
|
+
|
|
136
|
+
- **Site requires JS**: install Playwright + `--js`
|
|
137
|
+
- **Slow or rate limited**: lower concurrency or raise `--rate-limit`
|
|
138
|
+
- **Large sites**: set `--max-pages`
|
|
139
|
+
|
|
140
|
+
## Links
|
|
141
|
+
|
|
142
|
+
- [PyPI](https://pypi.org/project/docpull/)
|
|
143
|
+
- [GitHub](https://github.com/raintree-technology/docpull)
|
|
144
|
+
- [Issues](https://github.com/raintree-technology/docpull/issues)
|
|
145
|
+
|
|
146
|
+
## License
|
|
147
|
+
|
|
148
|
+
MIT License - see [LICENSE](LICENSE) file for details
|
|
docpull-1.0.2/docpull.egg-info/PKG-INFO
ADDED
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: docpull
|
|
3
|
+
Version: 1.0.2
|
|
4
|
+
Summary: Pull documentation from the web and convert to clean markdown
|
|
5
|
+
Author-email: Zachary Roth <support@raintree.technology>
|
|
6
|
+
Maintainer-email: Raintree Technology <support@raintree.technology>
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
Project-URL: Homepage, https://github.com/raintree-technology/docpull
|
|
9
|
+
Project-URL: Documentation, https://github.com/raintree-technology/docpull#readme
|
|
10
|
+
Project-URL: Repository, https://github.com/raintree-technology/docpull
|
|
11
|
+
Project-URL: Source Code, https://github.com/raintree-technology/docpull
|
|
12
|
+
Project-URL: Bug Tracker, https://github.com/raintree-technology/docpull/issues
|
|
13
|
+
Project-URL: Changelog, https://github.com/raintree-technology/docpull/blob/main/CHANGELOG.md
|
|
14
|
+
Keywords: python,markdown,documentation,web-scraping,developer-tools,claude,ai-training-data
|
|
15
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
16
|
+
Classifier: Intended Audience :: Developers
|
|
17
|
+
Classifier: Intended Audience :: Information Technology
|
|
18
|
+
Classifier: Intended Audience :: Science/Research
|
|
19
|
+
Classifier: Intended Audience :: Education
|
|
20
|
+
Classifier: Environment :: Console
|
|
21
|
+
Classifier: Topic :: Documentation
|
|
22
|
+
Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
|
|
23
|
+
Classifier: Topic :: Software Development :: Documentation
|
|
24
|
+
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
25
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
26
|
+
Classifier: Topic :: Utilities
|
|
27
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
28
|
+
Classifier: Natural Language :: English
|
|
29
|
+
Classifier: Operating System :: OS Independent
|
|
30
|
+
Classifier: Programming Language :: Python :: 3
|
|
31
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
32
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
33
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
34
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
35
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
36
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
37
|
+
Classifier: Typing :: Typed
|
|
38
|
+
Requires-Python: >=3.9
|
|
39
|
+
Description-Content-Type: text/markdown
|
|
40
|
+
License-File: LICENSE
|
|
41
|
+
Requires-Dist: requests>=2.31.0
|
|
42
|
+
Requires-Dist: beautifulsoup4>=4.12.0
|
|
43
|
+
Requires-Dist: html2text>=2020.1.16
|
|
44
|
+
Requires-Dist: defusedxml>=0.7.1
|
|
45
|
+
Requires-Dist: aiohttp>=3.9.0
|
|
46
|
+
Requires-Dist: rich>=13.0.0
|
|
47
|
+
Provides-Extra: yaml
|
|
48
|
+
Requires-Dist: pyyaml>=6.0; extra == "yaml"
|
|
49
|
+
Provides-Extra: js
|
|
50
|
+
Requires-Dist: playwright>=1.40.0; extra == "js"
|
|
51
|
+
Provides-Extra: all
|
|
52
|
+
Requires-Dist: pyyaml>=6.0; extra == "all"
|
|
53
|
+
Requires-Dist: playwright>=1.40.0; extra == "all"
|
|
54
|
+
Provides-Extra: dev
|
|
55
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
56
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
57
|
+
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
|
|
58
|
+
Requires-Dist: black>=23.0.0; extra == "dev"
|
|
59
|
+
Requires-Dist: mypy>=1.0.0; extra == "dev"
|
|
60
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
61
|
+
Requires-Dist: bandit>=1.7.0; extra == "dev"
|
|
62
|
+
Requires-Dist: pip-audit>=2.0.0; extra == "dev"
|
|
63
|
+
Requires-Dist: types-requests>=2.31.0; extra == "dev"
|
|
64
|
+
Requires-Dist: types-beautifulsoup4>=4.12.0; extra == "dev"
|
|
65
|
+
Requires-Dist: types-aiohttp>=3.9.0; extra == "dev"
|
|
66
|
+
Dynamic: license-file
|
|
67
|
+
|
|
68
|
+
# docpull
|
|
69
|
+
|
|
70
|
+
**Pull documentation from any website and convert it into clean, AI-ready Markdown.**
|
|
71
|
+
Fast, type-safe, secure, and optimized for building knowledge bases or training datasets.
|
|
72
|
+
|
|
73
|
+
[](https://www.python.org/downloads/)
|
|
74
|
+
[](https://badge.fury.io/py/docpull)
|
|
75
|
+
[](https://github.com/raintree-technology/docpull/blob/main/LICENSE)
|
|
76
|
+
[](https://github.com/psf/black)
|
|
77
|
+
[](http://mypy-lang.org/)
|
|
78
|
+
[](https://github.com/PyCQA/bandit)
|
|
79
|
+
|
|
80
|
+
## Why docpull?
|
|
81
|
+
|
|
82
|
+
Unlike tools such as wget or HTTrack, docpull extracts only the main content, removing ads, navbars, and clutter. Output is clean Markdown with optional YAML frontmatter—ideal for RAG systems, offline docs, or ML pipelines.
|
|
83
|
+
|
|
84
|
+
## Key Features
|
|
85
|
+
|
|
86
|
+
- Works on any documentation site
|
|
87
|
+
- Smart extraction of main content
|
|
88
|
+
- Async + parallel fetching (up to 10× faster)
|
|
89
|
+
- Optional JavaScript rendering via Playwright
|
|
90
|
+
- Sitemap + link crawling
|
|
91
|
+
- URL-based filtering (include/exclude)
|
|
92
|
+
- Rate limiting, timeouts, content-type checks
|
|
93
|
+
- Saves docs in structured Markdown with YAML metadata
|
|
94
|
+
- Optimized profiles for popular platforms (Stripe, Next.js, React, Plaid, Tailwind, etc.)
|
|
95
|
+
|
|
96
|
+
## Quick Start
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
pip install docpull
|
|
100
|
+
docpull https://aptos.dev
|
|
101
|
+
docpull stripe # use a built-in profile
|
|
102
|
+
docpull https://site.com/docs --max-pages 100 --max-concurrent 20
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
### JavaScript-heavy sites
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
pip install docpull[js]
|
|
109
|
+
python -m playwright install chromium
|
|
110
|
+
docpull https://site.com --js
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
## Python API
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
from docpull import GenericAsyncFetcher
|
|
117
|
+
|
|
118
|
+
fetcher = GenericAsyncFetcher(
|
|
119
|
+
url_or_profile="https://aptos.dev",
|
|
120
|
+
output_dir="./docs",
|
|
121
|
+
max_pages=100,
|
|
122
|
+
max_concurrent=20,
|
|
123
|
+
)
|
|
124
|
+
fetcher.fetch()
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
## Common Options
|
|
128
|
+
|
|
129
|
+
- `--max-pages N` – limit crawl size
|
|
130
|
+
- `--max-depth N` – restrict link depth
|
|
131
|
+
- `--max-concurrent N` – control parallel fetches
|
|
132
|
+
- `--js` – enable Playwright rendering
|
|
133
|
+
- `--output-dir DIR`
|
|
134
|
+
- `--rate-limit X`
|
|
135
|
+
- `--no-skip-existing`
|
|
136
|
+
- `--dry-run`
|
|
137
|
+
|
|
138
|
+
## Performance
|
|
139
|
+
|
|
140
|
+
Async fetching drastically reduces runtime:
|
|
141
|
+
|
|
142
|
+
| Pages | Sync | Async | Speedup |
|
|
143
|
+
|-------|------|-------|---------|
|
|
144
|
+
| 50 | ~50s | ~6s | 8× faster |
|
|
145
|
+
|
|
146
|
+
Higher concurrency yields even better results.
|
|
147
|
+
|
|
148
|
+
## Output Format
|
|
149
|
+
|
|
150
|
+
Each downloaded page becomes a Markdown file:
|
|
151
|
+
|
|
152
|
+
```markdown
|
|
153
|
+
---
|
|
154
|
+
url: https://stripe.com/docs/payments
|
|
155
|
+
fetched: 2025-11-13
|
|
156
|
+
---
|
|
157
|
+
# Payment Intents
|
|
158
|
+
...
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
Directory layout mirrors the target site's structure.
|
|
162
|
+
|
|
163
|
+
## Configuration File (Optional)
|
|
164
|
+
|
|
165
|
+
```yaml
|
|
166
|
+
output_dir: ./docs
|
|
167
|
+
rate_limit: 0.5
|
|
168
|
+
sources:
|
|
169
|
+
- stripe
|
|
170
|
+
- nextjs
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
Run with:
|
|
174
|
+
```bash
|
|
175
|
+
docpull --config config.yaml
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
## Custom Profiles
|
|
179
|
+
|
|
180
|
+
Easily define profiles for frequently scraped sites.
|
|
181
|
+
|
|
182
|
+
```python
|
|
183
|
+
from docpull.profiles.base import SiteProfile
|
|
184
|
+
|
|
185
|
+
MY_PROFILE = SiteProfile(
|
|
186
|
+
name="mysite",
|
|
187
|
+
domains={"docs.mysite.com"},
|
|
188
|
+
include_patterns=["/docs/", "/api/"],
|
|
189
|
+
)
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
## Security
|
|
193
|
+
|
|
194
|
+
- HTTPS-only
|
|
195
|
+
- Blocks private network IPs
|
|
196
|
+
- 50MB page size limit
|
|
197
|
+
- Timeout controls
|
|
198
|
+
- Validates content-type
|
|
199
|
+
- Playwright sandboxing
|
|
200
|
+
|
|
201
|
+
## Troubleshooting
|
|
202
|
+
|
|
203
|
+
- **Site requires JS**: install Playwright + `--js`
|
|
204
|
+
- **Slow or rate limited**: lower concurrency or raise `--rate-limit`
|
|
205
|
+
- **Large sites**: set `--max-pages`
|
|
206
|
+
|
|
207
|
+
## Links
|
|
208
|
+
|
|
209
|
+
- [PyPI](https://pypi.org/project/docpull/)
|
|
210
|
+
- [GitHub](https://github.com/raintree-technology/docpull)
|
|
211
|
+
- [Issues](https://github.com/raintree-technology/docpull/issues)
|
|
212
|
+
|
|
213
|
+
## License
|
|
214
|
+
|
|
215
|
+
MIT License - see [LICENSE](LICENSE) file for details
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "docpull"
|
|
7
|
-
version = "1.0.1"
|
|
7
|
+
version = "1.0.2"
|
|
8
8
|
description = "Pull documentation from the web and convert to clean markdown"
|
|
9
9
|
readme = {file = "README.md", content-type = "text/markdown"}
|
|
10
10
|
requires-python = ">=3.9"
|