docpull 1.0.1__tar.gz → 1.0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. docpull-1.0.2/PKG-INFO +215 -0
  2. docpull-1.0.2/README.md +148 -0
  3. {docpull-1.0.1 → docpull-1.0.2}/docpull/__init__.py +1 -1
  4. docpull-1.0.2/docpull.egg-info/PKG-INFO +215 -0
  5. {docpull-1.0.1 → docpull-1.0.2}/pyproject.toml +1 -1
  6. docpull-1.0.1/PKG-INFO +0 -440
  7. docpull-1.0.1/README.md +0 -373
  8. docpull-1.0.1/docpull.egg-info/PKG-INFO +0 -440
  9. {docpull-1.0.1 → docpull-1.0.2}/LICENSE +0 -0
  10. {docpull-1.0.1 → docpull-1.0.2}/docpull/__main__.py +0 -0
  11. {docpull-1.0.1 → docpull-1.0.2}/docpull/cli.py +0 -0
  12. {docpull-1.0.1 → docpull-1.0.2}/docpull/config.py +0 -0
  13. {docpull-1.0.1 → docpull-1.0.2}/docpull/fetchers/__init__.py +0 -0
  14. {docpull-1.0.1 → docpull-1.0.2}/docpull/fetchers/async_fetcher.py +0 -0
  15. {docpull-1.0.1 → docpull-1.0.2}/docpull/fetchers/base.py +0 -0
  16. {docpull-1.0.1 → docpull-1.0.2}/docpull/fetchers/bun.py +0 -0
  17. {docpull-1.0.1 → docpull-1.0.2}/docpull/fetchers/d3.py +0 -0
  18. {docpull-1.0.1 → docpull-1.0.2}/docpull/fetchers/generic.py +0 -0
  19. {docpull-1.0.1 → docpull-1.0.2}/docpull/fetchers/generic_async.py +0 -0
  20. {docpull-1.0.1 → docpull-1.0.2}/docpull/fetchers/nextjs.py +0 -0
  21. {docpull-1.0.1 → docpull-1.0.2}/docpull/fetchers/parallel_base.py +0 -0
  22. {docpull-1.0.1 → docpull-1.0.2}/docpull/fetchers/plaid.py +0 -0
  23. {docpull-1.0.1 → docpull-1.0.2}/docpull/fetchers/react.py +0 -0
  24. {docpull-1.0.1 → docpull-1.0.2}/docpull/fetchers/stripe.py +0 -0
  25. {docpull-1.0.1 → docpull-1.0.2}/docpull/fetchers/tailwind.py +0 -0
  26. {docpull-1.0.1 → docpull-1.0.2}/docpull/fetchers/turborepo.py +0 -0
  27. {docpull-1.0.1 → docpull-1.0.2}/docpull/profiles/__init__.py +0 -0
  28. {docpull-1.0.1 → docpull-1.0.2}/docpull/profiles/base.py +0 -0
  29. {docpull-1.0.1 → docpull-1.0.2}/docpull/profiles/bun.py +0 -0
  30. {docpull-1.0.1 → docpull-1.0.2}/docpull/profiles/d3.py +0 -0
  31. {docpull-1.0.1 → docpull-1.0.2}/docpull/profiles/nextjs.py +0 -0
  32. {docpull-1.0.1 → docpull-1.0.2}/docpull/profiles/plaid.py +0 -0
  33. {docpull-1.0.1 → docpull-1.0.2}/docpull/profiles/react.py +0 -0
  34. {docpull-1.0.1 → docpull-1.0.2}/docpull/profiles/stripe.py +0 -0
  35. {docpull-1.0.1 → docpull-1.0.2}/docpull/profiles/tailwind.py +0 -0
  36. {docpull-1.0.1 → docpull-1.0.2}/docpull/profiles/turborepo.py +0 -0
  37. {docpull-1.0.1 → docpull-1.0.2}/docpull/py.typed +0 -0
  38. {docpull-1.0.1 → docpull-1.0.2}/docpull/utils/__init__.py +0 -0
  39. {docpull-1.0.1 → docpull-1.0.2}/docpull/utils/file_utils.py +0 -0
  40. {docpull-1.0.1 → docpull-1.0.2}/docpull/utils/logging_config.py +0 -0
  41. {docpull-1.0.1 → docpull-1.0.2}/docpull.egg-info/SOURCES.txt +0 -0
  42. {docpull-1.0.1 → docpull-1.0.2}/docpull.egg-info/dependency_links.txt +0 -0
  43. {docpull-1.0.1 → docpull-1.0.2}/docpull.egg-info/entry_points.txt +0 -0
  44. {docpull-1.0.1 → docpull-1.0.2}/docpull.egg-info/requires.txt +0 -0
  45. {docpull-1.0.1 → docpull-1.0.2}/docpull.egg-info/top_level.txt +0 -0
  46. {docpull-1.0.1 → docpull-1.0.2}/setup.cfg +0 -0
  47. {docpull-1.0.1 → docpull-1.0.2}/tests/test_async_fetcher.py +0 -0
  48. {docpull-1.0.1 → docpull-1.0.2}/tests/test_config.py +0 -0
  49. {docpull-1.0.1 → docpull-1.0.2}/tests/test_fetchers.py +0 -0
docpull-1.0.2/PKG-INFO ADDED
@@ -0,0 +1,215 @@
1
+ Metadata-Version: 2.4
2
+ Name: docpull
3
+ Version: 1.0.2
4
+ Summary: Pull documentation from the web and convert to clean markdown
5
+ Author-email: Zachary Roth <support@raintree.technology>
6
+ Maintainer-email: Raintree Technology <support@raintree.technology>
7
+ License-Expression: MIT
8
+ Project-URL: Homepage, https://github.com/raintree-technology/docpull
9
+ Project-URL: Documentation, https://github.com/raintree-technology/docpull#readme
10
+ Project-URL: Repository, https://github.com/raintree-technology/docpull
11
+ Project-URL: Source Code, https://github.com/raintree-technology/docpull
12
+ Project-URL: Bug Tracker, https://github.com/raintree-technology/docpull/issues
13
+ Project-URL: Changelog, https://github.com/raintree-technology/docpull/blob/main/CHANGELOG.md
14
+ Keywords: python,markdown,documentation,web-scraping,developer-tools,claude,ai-training-data
15
+ Classifier: Development Status :: 5 - Production/Stable
16
+ Classifier: Intended Audience :: Developers
17
+ Classifier: Intended Audience :: Information Technology
18
+ Classifier: Intended Audience :: Science/Research
19
+ Classifier: Intended Audience :: Education
20
+ Classifier: Environment :: Console
21
+ Classifier: Topic :: Documentation
22
+ Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
23
+ Classifier: Topic :: Software Development :: Documentation
24
+ Classifier: Topic :: Text Processing :: Markup :: HTML
25
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
26
+ Classifier: Topic :: Utilities
27
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
28
+ Classifier: Natural Language :: English
29
+ Classifier: Operating System :: OS Independent
30
+ Classifier: Programming Language :: Python :: 3
31
+ Classifier: Programming Language :: Python :: 3.9
32
+ Classifier: Programming Language :: Python :: 3.10
33
+ Classifier: Programming Language :: Python :: 3.11
34
+ Classifier: Programming Language :: Python :: 3.12
35
+ Classifier: Programming Language :: Python :: 3.13
36
+ Classifier: Programming Language :: Python :: 3 :: Only
37
+ Classifier: Typing :: Typed
38
+ Requires-Python: >=3.9
39
+ Description-Content-Type: text/markdown
40
+ License-File: LICENSE
41
+ Requires-Dist: requests>=2.31.0
42
+ Requires-Dist: beautifulsoup4>=4.12.0
43
+ Requires-Dist: html2text>=2020.1.16
44
+ Requires-Dist: defusedxml>=0.7.1
45
+ Requires-Dist: aiohttp>=3.9.0
46
+ Requires-Dist: rich>=13.0.0
47
+ Provides-Extra: yaml
48
+ Requires-Dist: pyyaml>=6.0; extra == "yaml"
49
+ Provides-Extra: js
50
+ Requires-Dist: playwright>=1.40.0; extra == "js"
51
+ Provides-Extra: all
52
+ Requires-Dist: pyyaml>=6.0; extra == "all"
53
+ Requires-Dist: playwright>=1.40.0; extra == "all"
54
+ Provides-Extra: dev
55
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
56
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
57
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
58
+ Requires-Dist: black>=23.0.0; extra == "dev"
59
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
60
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
61
+ Requires-Dist: bandit>=1.7.0; extra == "dev"
62
+ Requires-Dist: pip-audit>=2.0.0; extra == "dev"
63
+ Requires-Dist: types-requests>=2.31.0; extra == "dev"
64
+ Requires-Dist: types-beautifulsoup4>=4.12.0; extra == "dev"
65
+ Requires-Dist: types-aiohttp>=3.9.0; extra == "dev"
66
+ Dynamic: license-file
67
+
68
+ # docpull
69
+
70
+ **Pulls documentation from any website and converts it into clean, AI-ready Markdown.**
71
+ Fast, type-safe, secure, and optimized for building knowledge bases or training datasets.
72
+
73
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
74
+ [![PyPI version](https://badge.fury.io/py/docpull.svg)](https://badge.fury.io/py/docpull)
75
+ [![License: MIT](https://img.shields.io/github/license/raintree-technology/docpull)](https://github.com/raintree-technology/docpull/blob/main/LICENSE)
76
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
77
+ [![Type checked: mypy](https://img.shields.io/badge/type%20checked-mypy-blue.svg)](http://mypy-lang.org/)
78
+ [![Security: bandit](https://img.shields.io/badge/security-bandit-yellow.svg)](https://github.com/PyCQA/bandit)
79
+
80
+ ## Why docpull?
81
+
82
+ Unlike wget or httrack, docpull extracts only the main content, removing ads, navbars, and clutter. Output is clean Markdown with optional YAML frontmatter—ideal for RAG systems, offline docs, or ML pipelines.
83
+
84
+ ## Key Features
85
+
86
+ - Works on any documentation site
87
+ - Smart extraction of main content
88
+ - Async + parallel fetching (up to 10× faster)
89
+ - Optional JavaScript rendering via Playwright
90
+ - Sitemap + link crawling
91
+ - URL-based filtering (include/exclude)
92
+ - Rate limiting, timeouts, content-type checks
93
+ - Saves docs in structured Markdown with YAML metadata
94
+ - Optimized profiles for popular platforms (Stripe, Next.js, React, Plaid, Tailwind, etc.)
95
+
96
+ ## Quick Start
97
+
98
+ ```bash
99
+ pip install docpull
100
+ docpull https://aptos.dev
101
+ docpull stripe # use a built-in profile
102
+ docpull https://site.com/docs --max-pages 100 --max-concurrent 20
103
+ ```
104
+
105
+ ### JavaScript-heavy sites
106
+
107
+ ```bash
108
+ pip install docpull[js]
109
+ python -m playwright install chromium
110
+ docpull https://site.com --js
111
+ ```
112
+
113
+ ## Python API
114
+
115
+ ```python
116
+ from docpull import GenericAsyncFetcher
117
+
118
+ fetcher = GenericAsyncFetcher(
119
+ url_or_profile="https://aptos.dev",
120
+ output_dir="./docs",
121
+ max_pages=100,
122
+ max_concurrent=20,
123
+ )
124
+ fetcher.fetch()
125
+ ```
126
+
127
+ ## Common Options
128
+
129
+ - `--max-pages N` – limit crawl size
130
+ - `--max-depth N` – restrict link depth
131
+ - `--max-concurrent N` – control parallel fetches
132
+ - `--js` – enable Playwright rendering
133
+ - `--output-dir DIR`
134
+ - `--rate-limit X`
135
+ - `--no-skip-existing`
136
+ - `--dry-run`
137
+
138
+ ## Performance
139
+
140
+ Async fetching drastically reduces runtime:
141
+
142
+ | Pages | Sync | Async | Speedup |
143
+ |-------|------|-------|---------|
144
+ | 50 | ~50s | ~6s | 8× faster |
145
+
146
+ Higher concurrency yields even better results.
147
+
148
+ ## Output Format
149
+
150
+ Each downloaded page becomes a Markdown file:
151
+
152
+ ```markdown
153
+ ---
154
+ url: https://stripe.com/docs/payments
155
+ fetched: 2025-11-13
156
+ ---
157
+ # Payment Intents
158
+ ...
159
+ ```
160
+
161
+ Directory layout mirrors the target site's structure.
162
+
163
+ ## Configuration File (Optional)
164
+
165
+ ```yaml
166
+ output_dir: ./docs
167
+ rate_limit: 0.5
168
+ sources:
169
+ - stripe
170
+ - nextjs
171
+ ```
172
+
173
+ Run with:
174
+ ```bash
175
+ docpull --config config.yaml
176
+ ```
177
+
178
+ ## Custom Profiles
179
+
180
+ Easily define profiles for frequently scraped sites.
181
+
182
+ ```python
183
+ from docpull.profiles.base import SiteProfile
184
+
185
+ MY_PROFILE = SiteProfile(
186
+ name="mysite",
187
+ domains={"docs.mysite.com"},
188
+ include_patterns=["/docs/", "/api/"],
189
+ )
190
+ ```
191
+
192
+ ## Security
193
+
194
+ - HTTPS-only
195
+ - Blocks private network IPs
196
+ - 50MB page size limit
197
+ - Timeout controls
198
+ - Validates content-type
199
+ - Playwright sandboxing
200
+
201
+ ## Troubleshooting
202
+
203
+ - **Site requires JS**: install Playwright + `--js`
204
+ - **Slow or rate limited**: lower concurrency or raise `--rate-limit`
205
+ - **Large sites**: set `--max-pages`
206
+
207
+ ## Links
208
+
209
+ - [PyPI](https://pypi.org/project/docpull/)
210
+ - [GitHub](https://github.com/raintree-technology/docpull)
211
+ - [Issues](https://github.com/raintree-technology/docpull/issues)
212
+
213
+ ## License
214
+
215
+ MIT License - see [LICENSE](LICENSE) file for details
@@ -0,0 +1,148 @@
1
+ # docpull
2
+
3
+ **Pulls documentation from any website and converts it into clean, AI-ready Markdown.**
4
+ Fast, type-safe, secure, and optimized for building knowledge bases or training datasets.
5
+
6
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
7
+ [![PyPI version](https://badge.fury.io/py/docpull.svg)](https://badge.fury.io/py/docpull)
8
+ [![License: MIT](https://img.shields.io/github/license/raintree-technology/docpull)](https://github.com/raintree-technology/docpull/blob/main/LICENSE)
9
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
10
+ [![Type checked: mypy](https://img.shields.io/badge/type%20checked-mypy-blue.svg)](http://mypy-lang.org/)
11
+ [![Security: bandit](https://img.shields.io/badge/security-bandit-yellow.svg)](https://github.com/PyCQA/bandit)
12
+
13
+ ## Why docpull?
14
+
15
+ Unlike wget or httrack, docpull extracts only the main content, removing ads, navbars, and clutter. Output is clean Markdown with optional YAML frontmatter—ideal for RAG systems, offline docs, or ML pipelines.
16
+
17
+ ## Key Features
18
+
19
+ - Works on any documentation site
20
+ - Smart extraction of main content
21
+ - Async + parallel fetching (up to 10× faster)
22
+ - Optional JavaScript rendering via Playwright
23
+ - Sitemap + link crawling
24
+ - URL-based filtering (include/exclude)
25
+ - Rate limiting, timeouts, content-type checks
26
+ - Saves docs in structured Markdown with YAML metadata
27
+ - Optimized profiles for popular platforms (Stripe, Next.js, React, Plaid, Tailwind, etc.)
28
+
29
+ ## Quick Start
30
+
31
+ ```bash
32
+ pip install docpull
33
+ docpull https://aptos.dev
34
+ docpull stripe # use a built-in profile
35
+ docpull https://site.com/docs --max-pages 100 --max-concurrent 20
36
+ ```
37
+
38
+ ### JavaScript-heavy sites
39
+
40
+ ```bash
41
+ pip install docpull[js]
42
+ python -m playwright install chromium
43
+ docpull https://site.com --js
44
+ ```
45
+
46
+ ## Python API
47
+
48
+ ```python
49
+ from docpull import GenericAsyncFetcher
50
+
51
+ fetcher = GenericAsyncFetcher(
52
+ url_or_profile="https://aptos.dev",
53
+ output_dir="./docs",
54
+ max_pages=100,
55
+ max_concurrent=20,
56
+ )
57
+ fetcher.fetch()
58
+ ```
59
+
60
+ ## Common Options
61
+
62
+ - `--max-pages N` – limit crawl size
63
+ - `--max-depth N` – restrict link depth
64
+ - `--max-concurrent N` – control parallel fetches
65
+ - `--js` – enable Playwright rendering
66
+ - `--output-dir DIR`
67
+ - `--rate-limit X`
68
+ - `--no-skip-existing`
69
+ - `--dry-run`
70
+
71
+ ## Performance
72
+
73
+ Async fetching drastically reduces runtime:
74
+
75
+ | Pages | Sync | Async | Speedup |
76
+ |-------|------|-------|---------|
77
+ | 50 | ~50s | ~6s | 8× faster |
78
+
79
+ Higher concurrency yields even better results.
80
+
81
+ ## Output Format
82
+
83
+ Each downloaded page becomes a Markdown file:
84
+
85
+ ```markdown
86
+ ---
87
+ url: https://stripe.com/docs/payments
88
+ fetched: 2025-11-13
89
+ ---
90
+ # Payment Intents
91
+ ...
92
+ ```
93
+
94
+ Directory layout mirrors the target site's structure.
95
+
96
+ ## Configuration File (Optional)
97
+
98
+ ```yaml
99
+ output_dir: ./docs
100
+ rate_limit: 0.5
101
+ sources:
102
+ - stripe
103
+ - nextjs
104
+ ```
105
+
106
+ Run with:
107
+ ```bash
108
+ docpull --config config.yaml
109
+ ```
110
+
111
+ ## Custom Profiles
112
+
113
+ Easily define profiles for frequently scraped sites.
114
+
115
+ ```python
116
+ from docpull.profiles.base import SiteProfile
117
+
118
+ MY_PROFILE = SiteProfile(
119
+ name="mysite",
120
+ domains={"docs.mysite.com"},
121
+ include_patterns=["/docs/", "/api/"],
122
+ )
123
+ ```
124
+
125
+ ## Security
126
+
127
+ - HTTPS-only
128
+ - Blocks private network IPs
129
+ - 50MB page size limit
130
+ - Timeout controls
131
+ - Validates content-type
132
+ - Playwright sandboxing
133
+
134
+ ## Troubleshooting
135
+
136
+ - **Site requires JS**: install Playwright + `--js`
137
+ - **Slow or rate limited**: lower concurrency or raise `--rate-limit`
138
+ - **Large sites**: set `--max-pages`
139
+
140
+ ## Links
141
+
142
+ - [PyPI](https://pypi.org/project/docpull/)
143
+ - [GitHub](https://github.com/raintree-technology/docpull)
144
+ - [Issues](https://github.com/raintree-technology/docpull/issues)
145
+
146
+ ## License
147
+
148
+ MIT License - see [LICENSE](LICENSE) file for details
@@ -1,4 +1,4 @@
1
- __version__ = "1.0.1"
1
+ __version__ = "1.0.2"
2
2
 
3
3
  from .fetchers.base import BaseFetcher
4
4
  from .fetchers.bun import BunFetcher
@@ -0,0 +1,215 @@
1
+ Metadata-Version: 2.4
2
+ Name: docpull
3
+ Version: 1.0.2
4
+ Summary: Pull documentation from the web and convert to clean markdown
5
+ Author-email: Zachary Roth <support@raintree.technology>
6
+ Maintainer-email: Raintree Technology <support@raintree.technology>
7
+ License-Expression: MIT
8
+ Project-URL: Homepage, https://github.com/raintree-technology/docpull
9
+ Project-URL: Documentation, https://github.com/raintree-technology/docpull#readme
10
+ Project-URL: Repository, https://github.com/raintree-technology/docpull
11
+ Project-URL: Source Code, https://github.com/raintree-technology/docpull
12
+ Project-URL: Bug Tracker, https://github.com/raintree-technology/docpull/issues
13
+ Project-URL: Changelog, https://github.com/raintree-technology/docpull/blob/main/CHANGELOG.md
14
+ Keywords: python,markdown,documentation,web-scraping,developer-tools,claude,ai-training-data
15
+ Classifier: Development Status :: 5 - Production/Stable
16
+ Classifier: Intended Audience :: Developers
17
+ Classifier: Intended Audience :: Information Technology
18
+ Classifier: Intended Audience :: Science/Research
19
+ Classifier: Intended Audience :: Education
20
+ Classifier: Environment :: Console
21
+ Classifier: Topic :: Documentation
22
+ Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
23
+ Classifier: Topic :: Software Development :: Documentation
24
+ Classifier: Topic :: Text Processing :: Markup :: HTML
25
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
26
+ Classifier: Topic :: Utilities
27
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
28
+ Classifier: Natural Language :: English
29
+ Classifier: Operating System :: OS Independent
30
+ Classifier: Programming Language :: Python :: 3
31
+ Classifier: Programming Language :: Python :: 3.9
32
+ Classifier: Programming Language :: Python :: 3.10
33
+ Classifier: Programming Language :: Python :: 3.11
34
+ Classifier: Programming Language :: Python :: 3.12
35
+ Classifier: Programming Language :: Python :: 3.13
36
+ Classifier: Programming Language :: Python :: 3 :: Only
37
+ Classifier: Typing :: Typed
38
+ Requires-Python: >=3.9
39
+ Description-Content-Type: text/markdown
40
+ License-File: LICENSE
41
+ Requires-Dist: requests>=2.31.0
42
+ Requires-Dist: beautifulsoup4>=4.12.0
43
+ Requires-Dist: html2text>=2020.1.16
44
+ Requires-Dist: defusedxml>=0.7.1
45
+ Requires-Dist: aiohttp>=3.9.0
46
+ Requires-Dist: rich>=13.0.0
47
+ Provides-Extra: yaml
48
+ Requires-Dist: pyyaml>=6.0; extra == "yaml"
49
+ Provides-Extra: js
50
+ Requires-Dist: playwright>=1.40.0; extra == "js"
51
+ Provides-Extra: all
52
+ Requires-Dist: pyyaml>=6.0; extra == "all"
53
+ Requires-Dist: playwright>=1.40.0; extra == "all"
54
+ Provides-Extra: dev
55
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
56
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
57
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
58
+ Requires-Dist: black>=23.0.0; extra == "dev"
59
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
60
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
61
+ Requires-Dist: bandit>=1.7.0; extra == "dev"
62
+ Requires-Dist: pip-audit>=2.0.0; extra == "dev"
63
+ Requires-Dist: types-requests>=2.31.0; extra == "dev"
64
+ Requires-Dist: types-beautifulsoup4>=4.12.0; extra == "dev"
65
+ Requires-Dist: types-aiohttp>=3.9.0; extra == "dev"
66
+ Dynamic: license-file
67
+
68
+ # docpull
69
+
70
+ **Pulls documentation from any website and converts it into clean, AI-ready Markdown.**
71
+ Fast, type-safe, secure, and optimized for building knowledge bases or training datasets.
72
+
73
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
74
+ [![PyPI version](https://badge.fury.io/py/docpull.svg)](https://badge.fury.io/py/docpull)
75
+ [![License: MIT](https://img.shields.io/github/license/raintree-technology/docpull)](https://github.com/raintree-technology/docpull/blob/main/LICENSE)
76
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
77
+ [![Type checked: mypy](https://img.shields.io/badge/type%20checked-mypy-blue.svg)](http://mypy-lang.org/)
78
+ [![Security: bandit](https://img.shields.io/badge/security-bandit-yellow.svg)](https://github.com/PyCQA/bandit)
79
+
80
+ ## Why docpull?
81
+
82
+ Unlike wget or httrack, docpull extracts only the main content, removing ads, navbars, and clutter. Output is clean Markdown with optional YAML frontmatter—ideal for RAG systems, offline docs, or ML pipelines.
83
+
84
+ ## Key Features
85
+
86
+ - Works on any documentation site
87
+ - Smart extraction of main content
88
+ - Async + parallel fetching (up to 10× faster)
89
+ - Optional JavaScript rendering via Playwright
90
+ - Sitemap + link crawling
91
+ - URL-based filtering (include/exclude)
92
+ - Rate limiting, timeouts, content-type checks
93
+ - Saves docs in structured Markdown with YAML metadata
94
+ - Optimized profiles for popular platforms (Stripe, Next.js, React, Plaid, Tailwind, etc.)
95
+
96
+ ## Quick Start
97
+
98
+ ```bash
99
+ pip install docpull
100
+ docpull https://aptos.dev
101
+ docpull stripe # use a built-in profile
102
+ docpull https://site.com/docs --max-pages 100 --max-concurrent 20
103
+ ```
104
+
105
+ ### JavaScript-heavy sites
106
+
107
+ ```bash
108
+ pip install docpull[js]
109
+ python -m playwright install chromium
110
+ docpull https://site.com --js
111
+ ```
112
+
113
+ ## Python API
114
+
115
+ ```python
116
+ from docpull import GenericAsyncFetcher
117
+
118
+ fetcher = GenericAsyncFetcher(
119
+ url_or_profile="https://aptos.dev",
120
+ output_dir="./docs",
121
+ max_pages=100,
122
+ max_concurrent=20,
123
+ )
124
+ fetcher.fetch()
125
+ ```
126
+
127
+ ## Common Options
128
+
129
+ - `--max-pages N` – limit crawl size
130
+ - `--max-depth N` – restrict link depth
131
+ - `--max-concurrent N` – control parallel fetches
132
+ - `--js` – enable Playwright rendering
133
+ - `--output-dir DIR`
134
+ - `--rate-limit X`
135
+ - `--no-skip-existing`
136
+ - `--dry-run`
137
+
138
+ ## Performance
139
+
140
+ Async fetching drastically reduces runtime:
141
+
142
+ | Pages | Sync | Async | Speedup |
143
+ |-------|------|-------|---------|
144
+ | 50 | ~50s | ~6s | 8× faster |
145
+
146
+ Higher concurrency yields even better results.
147
+
148
+ ## Output Format
149
+
150
+ Each downloaded page becomes a Markdown file:
151
+
152
+ ```markdown
153
+ ---
154
+ url: https://stripe.com/docs/payments
155
+ fetched: 2025-11-13
156
+ ---
157
+ # Payment Intents
158
+ ...
159
+ ```
160
+
161
+ Directory layout mirrors the target site's structure.
162
+
163
+ ## Configuration File (Optional)
164
+
165
+ ```yaml
166
+ output_dir: ./docs
167
+ rate_limit: 0.5
168
+ sources:
169
+ - stripe
170
+ - nextjs
171
+ ```
172
+
173
+ Run with:
174
+ ```bash
175
+ docpull --config config.yaml
176
+ ```
177
+
178
+ ## Custom Profiles
179
+
180
+ Easily define profiles for frequently scraped sites.
181
+
182
+ ```python
183
+ from docpull.profiles.base import SiteProfile
184
+
185
+ MY_PROFILE = SiteProfile(
186
+ name="mysite",
187
+ domains={"docs.mysite.com"},
188
+ include_patterns=["/docs/", "/api/"],
189
+ )
190
+ ```
191
+
192
+ ## Security
193
+
194
+ - HTTPS-only
195
+ - Blocks private network IPs
196
+ - 50MB page size limit
197
+ - Timeout controls
198
+ - Validates content-type
199
+ - Playwright sandboxing
200
+
201
+ ## Troubleshooting
202
+
203
+ - **Site requires JS**: install Playwright + `--js`
204
+ - **Slow or rate limited**: lower concurrency or raise `--rate-limit`
205
+ - **Large sites**: set `--max-pages`
206
+
207
+ ## Links
208
+
209
+ - [PyPI](https://pypi.org/project/docpull/)
210
+ - [GitHub](https://github.com/raintree-technology/docpull)
211
+ - [Issues](https://github.com/raintree-technology/docpull/issues)
212
+
213
+ ## License
214
+
215
+ MIT License - see [LICENSE](LICENSE) file for details
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "docpull"
7
- version = "1.0.1"
7
+ version = "1.0.2"
8
8
  description = "Pull documentation from the web and convert to clean markdown"
9
9
  readme = {file = "README.md", content-type = "text/markdown"}
10
10
  requires-python = ">=3.9"