docpull 1.5.0__tar.gz → 2.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108)
  1. docpull-2.2.0/PKG-INFO +208 -0
  2. docpull-2.2.0/README.md +131 -0
  3. {docpull-1.5.0 → docpull-2.2.0}/pyproject.toml +28 -6
  4. docpull-2.2.0/src/docpull/__init__.py +56 -0
  5. docpull-2.2.0/src/docpull/cache/__init__.py +12 -0
  6. docpull-2.2.0/src/docpull/cache/manager.py +502 -0
  7. docpull-2.2.0/src/docpull/cache/streaming_dedup.py +136 -0
  8. docpull-2.2.0/src/docpull/cli.py +522 -0
  9. docpull-2.2.0/src/docpull/concurrency/__init__.py +15 -0
  10. docpull-2.2.0/src/docpull/concurrency/browser_pool.py +336 -0
  11. docpull-2.2.0/src/docpull/concurrency/manager.py +111 -0
  12. docpull-2.2.0/src/docpull/conversion/__init__.py +15 -0
  13. docpull-2.2.0/src/docpull/conversion/extractor.py +247 -0
  14. docpull-2.2.0/src/docpull/conversion/markdown.py +203 -0
  15. docpull-2.2.0/src/docpull/conversion/protocols.py +46 -0
  16. docpull-2.2.0/src/docpull/core/__init__.py +5 -0
  17. docpull-2.2.0/src/docpull/core/fetcher.py +648 -0
  18. docpull-2.2.0/src/docpull/discovery/__init__.py +46 -0
  19. docpull-2.2.0/src/docpull/discovery/composite.py +127 -0
  20. docpull-2.2.0/src/docpull/discovery/crawler.py +255 -0
  21. docpull-2.2.0/src/docpull/discovery/filters.py +231 -0
  22. docpull-2.2.0/src/docpull/discovery/link_extractors/__init__.py +22 -0
  23. docpull-2.2.0/src/docpull/discovery/link_extractors/browser.py +294 -0
  24. docpull-2.2.0/src/docpull/discovery/link_extractors/enhanced.py +315 -0
  25. docpull-2.2.0/src/docpull/discovery/link_extractors/protocols.py +33 -0
  26. docpull-2.2.0/src/docpull/discovery/link_extractors/static.py +160 -0
  27. docpull-2.2.0/src/docpull/discovery/protocols.py +52 -0
  28. docpull-2.2.0/src/docpull/discovery/sitemap.py +287 -0
  29. {docpull-1.5.0 → docpull-2.2.0/src}/docpull/doctor.py +5 -4
  30. docpull-2.2.0/src/docpull/http/__init__.py +13 -0
  31. docpull-2.2.0/src/docpull/http/client.py +353 -0
  32. docpull-2.2.0/src/docpull/http/protocols.py +78 -0
  33. docpull-2.2.0/src/docpull/http/rate_limiter.py +259 -0
  34. {docpull-1.5.0 → docpull-2.2.0/src}/docpull/metadata_extractor.py +16 -14
  35. docpull-2.2.0/src/docpull/models/__init__.py +42 -0
  36. docpull-2.2.0/src/docpull/models/config.py +340 -0
  37. docpull-2.2.0/src/docpull/models/events.py +162 -0
  38. docpull-2.2.0/src/docpull/models/profiles.py +103 -0
  39. docpull-2.2.0/src/docpull/pipeline/__init__.py +5 -0
  40. docpull-2.2.0/src/docpull/pipeline/base.py +189 -0
  41. docpull-2.2.0/src/docpull/pipeline/steps/__init__.py +21 -0
  42. docpull-2.2.0/src/docpull/pipeline/steps/browser_fetch.py +141 -0
  43. docpull-2.2.0/src/docpull/pipeline/steps/convert.py +134 -0
  44. docpull-2.2.0/src/docpull/pipeline/steps/dedup.py +96 -0
  45. docpull-2.2.0/src/docpull/pipeline/steps/fetch.py +192 -0
  46. docpull-2.2.0/src/docpull/pipeline/steps/metadata.py +139 -0
  47. docpull-2.2.0/src/docpull/pipeline/steps/save.py +167 -0
  48. docpull-2.2.0/src/docpull/pipeline/steps/save_json.py +191 -0
  49. docpull-2.2.0/src/docpull/pipeline/steps/save_sqlite.py +171 -0
  50. docpull-2.2.0/src/docpull/pipeline/steps/validate.py +140 -0
  51. docpull-2.2.0/src/docpull/security/__init__.py +6 -0
  52. docpull-2.2.0/src/docpull/security/robots.py +193 -0
  53. docpull-2.2.0/src/docpull/security/url_validator.py +175 -0
  54. docpull-2.2.0/src/docpull.egg-info/PKG-INFO +208 -0
  55. docpull-2.2.0/src/docpull.egg-info/SOURCES.txt +67 -0
  56. {docpull-1.5.0 → docpull-2.2.0/src}/docpull.egg-info/requires.txt +1 -0
  57. docpull-2.2.0/tests/test_link_extractors.py +270 -0
  58. docpull-2.2.0/tests/test_v2_conversion.py +293 -0
  59. docpull-2.2.0/tests/test_v2_discovery.py +356 -0
  60. docpull-2.2.0/tests/test_v2_integration.py +360 -0
  61. docpull-2.2.0/tests/test_v2_pipeline.py +370 -0
  62. docpull-1.5.0/PKG-INFO +0 -478
  63. docpull-1.5.0/README.md +0 -402
  64. docpull-1.5.0/docpull/__init__.py +0 -13
  65. docpull-1.5.0/docpull/archive.py +0 -186
  66. docpull-1.5.0/docpull/cache.py +0 -256
  67. docpull-1.5.0/docpull/cli.py +0 -782
  68. docpull-1.5.0/docpull/config.py +0 -332
  69. docpull-1.5.0/docpull/fetchers/__init__.py +0 -11
  70. docpull-1.5.0/docpull/fetchers/async_fetcher.py +0 -463
  71. docpull-1.5.0/docpull/fetchers/base.py +0 -686
  72. docpull-1.5.0/docpull/fetchers/generic.py +0 -215
  73. docpull-1.5.0/docpull/fetchers/generic_async.py +0 -324
  74. docpull-1.5.0/docpull/fetchers/parallel_base.py +0 -93
  75. docpull-1.5.0/docpull/file_utils.py +0 -97
  76. docpull-1.5.0/docpull/formatters/__init__.py +0 -50
  77. docpull-1.5.0/docpull/formatters/base.py +0 -102
  78. docpull-1.5.0/docpull/formatters/json.py +0 -100
  79. docpull-1.5.0/docpull/formatters/markdown.py +0 -49
  80. docpull-1.5.0/docpull/formatters/sqlite.py +0 -266
  81. docpull-1.5.0/docpull/formatters/toon.py +0 -90
  82. docpull-1.5.0/docpull/hooks.py +0 -222
  83. docpull-1.5.0/docpull/indexer.py +0 -410
  84. docpull-1.5.0/docpull/metadata.py +0 -224
  85. docpull-1.5.0/docpull/naming.py +0 -259
  86. docpull-1.5.0/docpull/orchestrator.py +0 -254
  87. docpull-1.5.0/docpull/processors/__init__.py +0 -18
  88. docpull-1.5.0/docpull/processors/base.py +0 -151
  89. docpull-1.5.0/docpull/processors/content_filter.py +0 -292
  90. docpull-1.5.0/docpull/processors/deduplicator.py +0 -233
  91. docpull-1.5.0/docpull/processors/language_filter.py +0 -181
  92. docpull-1.5.0/docpull/processors/size_limiter.py +0 -221
  93. docpull-1.5.0/docpull/sources_config.py +0 -446
  94. docpull-1.5.0/docpull/vcs.py +0 -224
  95. docpull-1.5.0/docpull.egg-info/PKG-INFO +0 -478
  96. docpull-1.5.0/docpull.egg-info/SOURCES.txt +0 -49
  97. docpull-1.5.0/tests/test_config.py +0 -39
  98. docpull-1.5.0/tests/test_metadata_extractor.py +0 -233
  99. docpull-1.5.0/tests/test_orchestrator.py +0 -331
  100. docpull-1.5.0/tests/test_sources_config.py +0 -348
  101. {docpull-1.5.0 → docpull-2.2.0}/LICENSE +0 -0
  102. {docpull-1.5.0 → docpull-2.2.0}/setup.cfg +0 -0
  103. {docpull-1.5.0 → docpull-2.2.0/src}/docpull/__main__.py +0 -0
  104. {docpull-1.5.0 → docpull-2.2.0/src}/docpull/logging_config.py +0 -0
  105. {docpull-1.5.0 → docpull-2.2.0/src}/docpull/py.typed +0 -0
  106. {docpull-1.5.0 → docpull-2.2.0/src}/docpull.egg-info/dependency_links.txt +0 -0
  107. {docpull-1.5.0 → docpull-2.2.0/src}/docpull.egg-info/entry_points.txt +0 -0
  108. {docpull-1.5.0 → docpull-2.2.0/src}/docpull.egg-info/top_level.txt +0 -0
docpull-2.2.0/PKG-INFO ADDED
@@ -0,0 +1,208 @@
1
+ Metadata-Version: 2.4
2
+ Name: docpull
3
+ Version: 2.2.0
4
+ Summary: Pull documentation from the web and convert to clean markdown
5
+ Author-email: Zachary Roth <support@raintree.technology>
6
+ Maintainer-email: Raintree Technology <support@raintree.technology>
7
+ License-Expression: MIT
8
+ Project-URL: Homepage, https://github.com/raintree-technology/docpull
9
+ Project-URL: Documentation, https://github.com/raintree-technology/docpull#readme
10
+ Project-URL: Repository, https://github.com/raintree-technology/docpull
11
+ Project-URL: Source Code, https://github.com/raintree-technology/docpull
12
+ Project-URL: Bug Tracker, https://github.com/raintree-technology/docpull/issues
13
+ Project-URL: Releases, https://github.com/raintree-technology/docpull/releases
14
+ Keywords: python,markdown,documentation,web-scraping,developer-tools,claude,ai-training-data
15
+ Classifier: Development Status :: 5 - Production/Stable
16
+ Classifier: Intended Audience :: Developers
17
+ Classifier: Intended Audience :: Information Technology
18
+ Classifier: Intended Audience :: Science/Research
19
+ Classifier: Intended Audience :: Education
20
+ Classifier: Environment :: Console
21
+ Classifier: Topic :: Documentation
22
+ Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
23
+ Classifier: Topic :: Software Development :: Documentation
24
+ Classifier: Topic :: Text Processing :: Markup :: HTML
25
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
26
+ Classifier: Topic :: Utilities
27
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
28
+ Classifier: Natural Language :: English
29
+ Classifier: Operating System :: OS Independent
30
+ Classifier: Programming Language :: Python :: 3
31
+ Classifier: Programming Language :: Python :: 3.9
32
+ Classifier: Programming Language :: Python :: 3.10
33
+ Classifier: Programming Language :: Python :: 3.11
34
+ Classifier: Programming Language :: Python :: 3.12
35
+ Classifier: Programming Language :: Python :: 3.13
36
+ Classifier: Programming Language :: Python :: 3.14
37
+ Classifier: Programming Language :: Python :: 3 :: Only
38
+ Classifier: Typing :: Typed
39
+ Requires-Python: >=3.10
40
+ Description-Content-Type: text/markdown
41
+ License-File: LICENSE
42
+ Requires-Dist: requests>=2.31.0
43
+ Requires-Dist: beautifulsoup4>=4.12.0
44
+ Requires-Dist: html2text>=2020.1.16
45
+ Requires-Dist: defusedxml>=0.7.1
46
+ Requires-Dist: extruct>=0.15.0
47
+ Requires-Dist: aiohttp>=3.9.0
48
+ Requires-Dist: rich>=13.0.0
49
+ Requires-Dist: pyyaml>=6.0
50
+ Requires-Dist: gitpython>=3.1.40
51
+ Requires-Dist: pydantic>=2.0
52
+ Provides-Extra: js
53
+ Requires-Dist: playwright>=1.40.0; extra == "js"
54
+ Provides-Extra: proxy
55
+ Requires-Dist: aiohttp-socks>=0.8.0; extra == "proxy"
56
+ Provides-Extra: normalize
57
+ Requires-Dist: url-normalize>=1.4.0; extra == "normalize"
58
+ Provides-Extra: all
59
+ Requires-Dist: playwright>=1.40.0; extra == "all"
60
+ Requires-Dist: aiohttp-socks>=0.8.0; extra == "all"
61
+ Requires-Dist: url-normalize>=1.4.0; extra == "all"
62
+ Provides-Extra: dev
63
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
64
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
65
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
66
+ Requires-Dist: black>=23.0.0; extra == "dev"
67
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
68
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
69
+ Requires-Dist: bandit>=1.7.0; extra == "dev"
70
+ Requires-Dist: pip-audit>=2.0.0; extra == "dev"
71
+ Requires-Dist: pre-commit>=3.0.0; extra == "dev"
72
+ Requires-Dist: types-requests>=2.31.0; extra == "dev"
73
+ Requires-Dist: types-beautifulsoup4>=4.12.0; extra == "dev"
74
+ Requires-Dist: types-defusedxml>=0.7.0; extra == "dev"
75
+ Requires-Dist: types-pyyaml>=6.0.0; extra == "dev"
76
+ Dynamic: license-file
77
+
78
+ # docpull
79
+
80
+ **Pull documentation from any website and convert it to clean, AI-ready Markdown.**
81
+
82
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
83
+ [![PyPI version](https://badge.fury.io/py/docpull.svg)](https://badge.fury.io/py/docpull)
84
+ [![Downloads](https://pepy.tech/badge/docpull)](https://pepy.tech/project/docpull)
85
+ [![License: MIT](https://img.shields.io/github/license/raintree-technology/docpull)](https://github.com/raintree-technology/docpull/blob/main/LICENSE)
86
+
87
+ ## Install
88
+
89
+ ```bash
90
+ pip install docpull
91
+ ```
92
+
93
+ ## Usage
94
+
95
+ ```bash
96
+ # Basic fetch
97
+ docpull https://docs.example.com
98
+
99
+ # With options
100
+ docpull https://aptos.dev --max-pages 100 --output-dir ./docs
101
+
102
+ # Filter paths
103
+ docpull https://docs.example.com --include-paths "/api/*" --exclude-paths "/changelog/*"
104
+
105
+ # Enable caching for incremental updates
106
+ docpull https://docs.example.com --cache
107
+
108
+ # JavaScript-heavy sites
109
+ pip install docpull[js]
110
+ docpull https://spa-site.com --js
111
+ ```
112
+
113
+ ## Profiles
114
+
115
+ ```bash
116
+ docpull https://site.com --profile rag # Optimized for RAG/LLM (default)
117
+ docpull https://site.com --profile mirror # Full site archive with caching
118
+ docpull https://site.com --profile quick # Fast sampling (50 pages, depth 2)
119
+ ```
120
+
121
+ ## Options
122
+
123
+ ```
124
+ Crawl:
125
+ --max-pages N Maximum pages to fetch
126
+ --max-depth N Maximum crawl depth
127
+ --include-paths P Only crawl matching URL patterns
128
+ --exclude-paths P Skip matching URL patterns
129
+ --js Enable JavaScript rendering
130
+
131
+ Cache:
132
+ --cache Enable caching for incremental updates
133
+ --cache-dir DIR Cache directory (default: .docpull-cache)
134
+ --cache-ttl DAYS Days before cache expires (default: 30)
135
+
136
+ Content:
137
+ --streaming-dedup Real-time duplicate detection
138
+ --language CODE Filter by language (e.g., en)
139
+
140
+ Output:
141
+ --output-dir, -o DIR Output directory (default: ./docs)
142
+ --dry-run Show what would be fetched
143
+ --verbose, -v Verbose output
144
+ ```
145
+
146
+ See `docpull --help` for all options.
147
+
148
+ ## Python API
149
+
150
+ ```python
151
+ import asyncio
152
+ from docpull import Fetcher, DocpullConfig, ProfileName, EventType
153
+
154
+ async def main():
155
+     config = DocpullConfig(
156
+         url="https://docs.example.com",
157
+         profile=ProfileName.RAG,
158
+         crawl={"max_pages": 100},
159
+         cache={"enabled": True},
160
+     )
161
+
162
+     async with Fetcher(config) as fetcher:
163
+         async for event in fetcher.run():
164
+             if event.type == EventType.FETCH_PROGRESS:
165
+                 print(f"{event.current}/{event.total}: {event.url}")
166
+
167
+     print(f"Done: {fetcher.stats.pages_fetched} pages")
168
+
169
+ asyncio.run(main())
170
+ ```
171
+
172
+ ## Output
173
+
174
+ Each page becomes a Markdown file with YAML frontmatter:
175
+
176
+ ```markdown
177
+ ---
178
+ title: "Getting Started"
179
+ source: https://docs.example.com/guide
180
+ ---
181
+
182
+ # Getting Started
183
+ ...
184
+ ```
185
+
186
+ ## Security
187
+
188
+ - HTTPS-only, mandatory robots.txt compliance
189
+ - Blocks private/internal network IPs
190
+ - Path traversal and XXE protection
191
+
192
+ ## Troubleshooting
193
+
194
+ ```bash
195
+ docpull --doctor # Check installation
196
+ docpull URL --verbose # Verbose output
197
+ docpull URL --dry-run # Test without downloading
198
+ ```
199
+
200
+ ## Links
201
+
202
+ - [PyPI](https://pypi.org/project/docpull/)
203
+ - [GitHub](https://github.com/raintree-technology/docpull)
204
+ - [Changelog](https://github.com/raintree-technology/docpull/blob/main/docs/CHANGELOG.md)
205
+
206
+ ## License
207
+
208
+ MIT
@@ -0,0 +1,131 @@
1
+ # docpull
2
+
3
+ **Pull documentation from any website and convert it to clean, AI-ready Markdown.**
4
+
5
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10+-blue.svg)](https://www.python.org/downloads/)
6
+ [![PyPI version](https://badge.fury.io/py/docpull.svg)](https://badge.fury.io/py/docpull)
7
+ [![Downloads](https://pepy.tech/badge/docpull)](https://pepy.tech/project/docpull)
8
+ [![License: MIT](https://img.shields.io/github/license/raintree-technology/docpull)](https://github.com/raintree-technology/docpull/blob/main/LICENSE)
9
+
10
+ ## Install
11
+
12
+ ```bash
13
+ pip install docpull
14
+ ```
15
+
16
+ ## Usage
17
+
18
+ ```bash
19
+ # Basic fetch
20
+ docpull https://docs.example.com
21
+
22
+ # With options
23
+ docpull https://aptos.dev --max-pages 100 --output-dir ./docs
24
+
25
+ # Filter paths
26
+ docpull https://docs.example.com --include-paths "/api/*" --exclude-paths "/changelog/*"
27
+
28
+ # Enable caching for incremental updates
29
+ docpull https://docs.example.com --cache
30
+
31
+ # JavaScript-heavy sites
32
+ pip install docpull[js]
33
+ docpull https://spa-site.com --js
34
+ ```
35
+
36
+ ## Profiles
37
+
38
+ ```bash
39
+ docpull https://site.com --profile rag # Optimized for RAG/LLM (default)
40
+ docpull https://site.com --profile mirror # Full site archive with caching
41
+ docpull https://site.com --profile quick # Fast sampling (50 pages, depth 2)
42
+ ```
43
+
44
+ ## Options
45
+
46
+ ```
47
+ Crawl:
48
+ --max-pages N Maximum pages to fetch
49
+ --max-depth N Maximum crawl depth
50
+ --include-paths P Only crawl matching URL patterns
51
+ --exclude-paths P Skip matching URL patterns
52
+ --js Enable JavaScript rendering
53
+
54
+ Cache:
55
+ --cache Enable caching for incremental updates
56
+ --cache-dir DIR Cache directory (default: .docpull-cache)
57
+ --cache-ttl DAYS Days before cache expires (default: 30)
58
+
59
+ Content:
60
+ --streaming-dedup Real-time duplicate detection
61
+ --language CODE Filter by language (e.g., en)
62
+
63
+ Output:
64
+ --output-dir, -o DIR Output directory (default: ./docs)
65
+ --dry-run Show what would be fetched
66
+ --verbose, -v Verbose output
67
+ ```
68
+
69
+ See `docpull --help` for all options.
70
+
71
+ ## Python API
72
+
73
+ ```python
74
+ import asyncio
75
+ from docpull import Fetcher, DocpullConfig, ProfileName, EventType
76
+
77
+ async def main():
78
+     config = DocpullConfig(
79
+         url="https://docs.example.com",
80
+         profile=ProfileName.RAG,
81
+         crawl={"max_pages": 100},
82
+         cache={"enabled": True},
83
+     )
84
+
85
+     async with Fetcher(config) as fetcher:
86
+         async for event in fetcher.run():
87
+             if event.type == EventType.FETCH_PROGRESS:
88
+                 print(f"{event.current}/{event.total}: {event.url}")
89
+
90
+     print(f"Done: {fetcher.stats.pages_fetched} pages")
91
+
92
+ asyncio.run(main())
93
+ ```
94
+
95
+ ## Output
96
+
97
+ Each page becomes a Markdown file with YAML frontmatter:
98
+
99
+ ```markdown
100
+ ---
101
+ title: "Getting Started"
102
+ source: https://docs.example.com/guide
103
+ ---
104
+
105
+ # Getting Started
106
+ ...
107
+ ```
108
+
109
+ ## Security
110
+
111
+ - HTTPS-only, mandatory robots.txt compliance
112
+ - Blocks private/internal network IPs
113
+ - Path traversal and XXE protection
114
+
115
+ ## Troubleshooting
116
+
117
+ ```bash
118
+ docpull --doctor # Check installation
119
+ docpull URL --verbose # Verbose output
120
+ docpull URL --dry-run # Test without downloading
121
+ ```
122
+
123
+ ## Links
124
+
125
+ - [PyPI](https://pypi.org/project/docpull/)
126
+ - [GitHub](https://github.com/raintree-technology/docpull)
127
+ - [Changelog](https://github.com/raintree-technology/docpull/blob/main/docs/CHANGELOG.md)
128
+
129
+ ## License
130
+
131
+ MIT
@@ -4,11 +4,11 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "docpull"
7
- version = "1.5.0"
7
+ version = "2.2.0"
8
8
  dynamic = []
9
9
  description = "Pull documentation from the web and convert to clean markdown"
10
10
  readme = {file = "README.md", content-type = "text/markdown"}
11
- requires-python = ">=3.9"
11
+ requires-python = ">=3.10"
12
12
  license = "MIT"
13
13
  license-files = ["LICENSE"]
14
14
  authors = [
@@ -72,6 +72,7 @@ dependencies = [
72
72
  "rich>=13.0.0",
73
73
  "pyyaml>=6.0",
74
74
  "gitpython>=3.1.40",
75
+ "pydantic>=2.0",
75
76
  ]
76
77
 
77
78
  [project.optional-dependencies]
@@ -117,7 +118,7 @@ Repository = "https://github.com/raintree-technology/docpull"
117
118
  "Releases" = "https://github.com/raintree-technology/docpull/releases"
118
119
 
119
120
  [tool.setuptools.packages.find]
120
- where = ["."]
121
+ where = ["src"]
121
122
  include = ["docpull*"]
122
123
 
123
124
  [tool.setuptools.package-data]
@@ -133,10 +134,10 @@ target-version = "py39"
133
134
 
134
135
  [tool.ruff.lint]
135
136
  select = ["E", "F", "W", "I", "N", "UP", "B", "A", "C4", "SIM"]
136
- ignore = []
137
+ ignore = ["A003"] # Allow 'type' and 'format' as field names in data models
137
138
 
138
139
  [tool.mypy]
139
- python_version = "3.9"
140
+ python_version = "3.10"
140
141
  warn_return_any = true
141
142
  warn_unused_configs = true
142
143
  disallow_untyped_defs = true
@@ -144,7 +145,14 @@ disallow_any_unimported = true
144
145
  no_implicit_optional = true
145
146
  strict_equality = true
146
147
  warn_redundant_casts = true
148
+ ignore_missing_imports = true
147
149
  exclude = ["tests/"]
150
+ plugins = ["pydantic.mypy"]
151
+
152
+ [tool.pydantic-mypy]
153
+ init_forbid_extra = true
154
+ init_typed = true
155
+ warn_required_dynamic_aliases = true
148
156
 
149
157
  [[tool.mypy.overrides]]
150
158
  module = "playwright.*"
@@ -154,6 +162,20 @@ ignore_missing_imports = true
154
162
  module = "extruct.*"
155
163
  ignore_missing_imports = true
156
164
 
165
+ [[tool.mypy.overrides]]
166
+ module = "url_normalize"
167
+ ignore_missing_imports = true
168
+
169
+ [[tool.mypy.overrides]]
170
+ module = "docpull.models.*"
171
+ disallow_any_unimported = false
172
+ warn_return_any = false
173
+
174
+ [[tool.mypy.overrides]]
175
+ module = "docpull.concurrency.browser_pool"
176
+ disallow_any_unimported = false
177
+ warn_return_any = false
178
+
157
179
  [[tool.mypy.overrides]]
158
180
  module = "tests.*"
159
181
  disallow_untyped_defs = false
@@ -175,7 +197,7 @@ markers = [
175
197
  ]
176
198
 
177
199
  [tool.coverage.run]
178
- source = ["docpull"]
200
+ source = ["src/docpull"]
179
201
  omit = ["tests/*", "*/test_*.py"]
180
202
 
181
203
  [tool.coverage.report]
@@ -0,0 +1,56 @@
1
+ """
2
+ docpull - Fetch and convert documentation from any URL to markdown.
3
+
4
+ Usage:
5
+ from docpull import Fetcher, DocpullConfig, ProfileName
6
+
7
+ config = DocpullConfig(
8
+ url="https://docs.example.com",
9
+ profile=ProfileName.RAG,
10
+ )
11
+
12
+ async with Fetcher(config) as fetcher:
13
+ async for event in fetcher.run():
14
+ print(event)
15
+ """
16
+
17
+ __version__ = "2.2.0"
18
+
19
+ from .cache import CacheManager, StreamingDeduplicator
20
+ from .core.fetcher import Fetcher, fetch_blocking
21
+ from .models.config import (
22
+ CacheConfig,
23
+ ContentFilterConfig,
24
+ CrawlConfig,
25
+ DocpullConfig,
26
+ IntegrationConfig,
27
+ NetworkConfig,
28
+ OutputConfig,
29
+ PerformanceConfig,
30
+ ProfileName,
31
+ )
32
+ from .models.events import EventType, FetchEvent, FetchStats
33
+
34
+ __all__ = [
35
+ "__version__",
36
+ # Core
37
+ "Fetcher",
38
+ "fetch_blocking",
39
+ # Config
40
+ "DocpullConfig",
41
+ "ProfileName",
42
+ "CrawlConfig",
43
+ "ContentFilterConfig",
44
+ "OutputConfig",
45
+ "NetworkConfig",
46
+ "PerformanceConfig",
47
+ "IntegrationConfig",
48
+ "CacheConfig",
49
+ # Events
50
+ "EventType",
51
+ "FetchEvent",
52
+ "FetchStats",
53
+ # Cache
54
+ "CacheManager",
55
+ "StreamingDeduplicator",
56
+ ]
@@ -0,0 +1,12 @@
1
+ """Caching and deduplication for docpull."""
2
+
3
+ from .manager import DEFAULT_TTL_DAYS, CacheManager, CacheState, ManifestEntry
4
+ from .streaming_dedup import StreamingDeduplicator
5
+
6
+ __all__ = [
7
+ "CacheManager",
8
+ "CacheState",
9
+ "ManifestEntry",
10
+ "StreamingDeduplicator",
11
+ "DEFAULT_TTL_DAYS",
12
+ ]