docpull 1.3.0__tar.gz → 2.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120)
  1. docpull-2.0.0/PKG-INFO +207 -0
  2. docpull-2.0.0/README.md +130 -0
  3. {docpull-1.3.0 → docpull-2.0.0}/pyproject.toml +35 -5
  4. docpull-2.0.0/src/docpull/__init__.py +56 -0
  5. docpull-2.0.0/src/docpull/cache/__init__.py +12 -0
  6. docpull-2.0.0/src/docpull/cache/manager.py +388 -0
  7. docpull-2.0.0/src/docpull/cache/streaming_dedup.py +135 -0
  8. docpull-2.0.0/src/docpull/cli.py +408 -0
  9. docpull-2.0.0/src/docpull/concurrency/__init__.py +15 -0
  10. docpull-2.0.0/src/docpull/concurrency/browser_pool.py +337 -0
  11. docpull-2.0.0/src/docpull/concurrency/manager.py +111 -0
  12. docpull-2.0.0/src/docpull/conversion/__init__.py +15 -0
  13. docpull-2.0.0/src/docpull/conversion/extractor.py +246 -0
  14. docpull-2.0.0/src/docpull/conversion/markdown.py +201 -0
  15. docpull-2.0.0/src/docpull/conversion/protocols.py +46 -0
  16. docpull-2.0.0/src/docpull/core/__init__.py +5 -0
  17. docpull-2.0.0/src/docpull/core/fetcher.py +501 -0
  18. docpull-2.0.0/src/docpull/discovery/__init__.py +29 -0
  19. docpull-2.0.0/src/docpull/discovery/composite.py +127 -0
  20. docpull-2.0.0/src/docpull/discovery/crawler.py +242 -0
  21. docpull-2.0.0/src/docpull/discovery/filters.py +230 -0
  22. docpull-2.0.0/src/docpull/discovery/protocols.py +52 -0
  23. docpull-2.0.0/src/docpull/discovery/sitemap.py +258 -0
  24. docpull-2.0.0/src/docpull/http/__init__.py +12 -0
  25. docpull-2.0.0/src/docpull/http/client.py +321 -0
  26. docpull-2.0.0/src/docpull/http/protocols.py +76 -0
  27. docpull-2.0.0/src/docpull/http/rate_limiter.py +148 -0
  28. {docpull-1.3.0 → docpull-2.0.0/src}/docpull/metadata_extractor.py +3 -3
  29. docpull-2.0.0/src/docpull/models/__init__.py +37 -0
  30. docpull-2.0.0/src/docpull/models/config.py +265 -0
  31. docpull-2.0.0/src/docpull/models/events.py +145 -0
  32. docpull-2.0.0/src/docpull/models/profiles.py +101 -0
  33. docpull-2.0.0/src/docpull/pipeline/__init__.py +5 -0
  34. docpull-2.0.0/src/docpull/pipeline/base.py +187 -0
  35. docpull-2.0.0/src/docpull/pipeline/steps/__init__.py +17 -0
  36. docpull-2.0.0/src/docpull/pipeline/steps/browser_fetch.py +141 -0
  37. docpull-2.0.0/src/docpull/pipeline/steps/convert.py +134 -0
  38. docpull-2.0.0/src/docpull/pipeline/steps/dedup.py +96 -0
  39. docpull-2.0.0/src/docpull/pipeline/steps/fetch.py +192 -0
  40. docpull-2.0.0/src/docpull/pipeline/steps/metadata.py +139 -0
  41. docpull-2.0.0/src/docpull/pipeline/steps/save.py +167 -0
  42. docpull-2.0.0/src/docpull/pipeline/steps/validate.py +140 -0
  43. docpull-2.0.0/src/docpull/security/__init__.py +6 -0
  44. docpull-2.0.0/src/docpull/security/robots.py +192 -0
  45. docpull-2.0.0/src/docpull/security/url_validator.py +174 -0
  46. docpull-2.0.0/src/docpull.egg-info/PKG-INFO +207 -0
  47. docpull-2.0.0/src/docpull.egg-info/SOURCES.txt +59 -0
  48. docpull-2.0.0/src/docpull.egg-info/dependency_links.txt +1 -0
  49. docpull-2.0.0/src/docpull.egg-info/entry_points.txt +2 -0
  50. docpull-2.0.0/src/docpull.egg-info/requires.txt +39 -0
  51. docpull-2.0.0/src/docpull.egg-info/top_level.txt +1 -0
  52. docpull-2.0.0/tests/test_v2_conversion.py +294 -0
  53. docpull-2.0.0/tests/test_v2_discovery.py +355 -0
  54. docpull-2.0.0/tests/test_v2_integration.py +359 -0
  55. docpull-2.0.0/tests/test_v2_pipeline.py +369 -0
  56. docpull-1.3.0/.editorconfig +0 -30
  57. docpull-1.3.0/.pre-commit-config.yaml +0 -30
  58. docpull-1.3.0/CHANGELOG.md +0 -403
  59. docpull-1.3.0/CONTRIBUTING.md +0 -189
  60. docpull-1.3.0/MANIFEST.in +0 -49
  61. docpull-1.3.0/Makefile +0 -44
  62. docpull-1.3.0/PKG-INFO +0 -459
  63. docpull-1.3.0/README.md +0 -389
  64. docpull-1.3.0/SECURITY.md +0 -206
  65. docpull-1.3.0/TROUBLESHOOTING.md +0 -348
  66. docpull-1.3.0/docpull/__init__.py +0 -15
  67. docpull-1.3.0/docpull/archive.py +0 -186
  68. docpull-1.3.0/docpull/cache.py +0 -256
  69. docpull-1.3.0/docpull/cli.py +0 -851
  70. docpull-1.3.0/docpull/config.py +0 -316
  71. docpull-1.3.0/docpull/fetchers/__init__.py +0 -9
  72. docpull-1.3.0/docpull/fetchers/async_fetcher.py +0 -322
  73. docpull-1.3.0/docpull/fetchers/base.py +0 -502
  74. docpull-1.3.0/docpull/fetchers/generic.py +0 -255
  75. docpull-1.3.0/docpull/fetchers/generic_async.py +0 -290
  76. docpull-1.3.0/docpull/fetchers/parallel_base.py +0 -93
  77. docpull-1.3.0/docpull/fetchers/stripe.py +0 -49
  78. docpull-1.3.0/docpull/formatters/__init__.py +0 -50
  79. docpull-1.3.0/docpull/formatters/base.py +0 -102
  80. docpull-1.3.0/docpull/formatters/json.py +0 -100
  81. docpull-1.3.0/docpull/formatters/markdown.py +0 -49
  82. docpull-1.3.0/docpull/formatters/sqlite.py +0 -266
  83. docpull-1.3.0/docpull/formatters/toon.py +0 -90
  84. docpull-1.3.0/docpull/hooks.py +0 -222
  85. docpull-1.3.0/docpull/indexer.py +0 -410
  86. docpull-1.3.0/docpull/metadata.py +0 -224
  87. docpull-1.3.0/docpull/naming.py +0 -259
  88. docpull-1.3.0/docpull/orchestrator.py +0 -254
  89. docpull-1.3.0/docpull/processors/__init__.py +0 -18
  90. docpull-1.3.0/docpull/processors/base.py +0 -151
  91. docpull-1.3.0/docpull/processors/content_filter.py +0 -292
  92. docpull-1.3.0/docpull/processors/deduplicator.py +0 -233
  93. docpull-1.3.0/docpull/processors/language_filter.py +0 -181
  94. docpull-1.3.0/docpull/processors/size_limiter.py +0 -221
  95. docpull-1.3.0/docpull/profiles/__init__.py +0 -53
  96. docpull-1.3.0/docpull/profiles/base.py +0 -64
  97. docpull-1.3.0/docpull/profiles/stripe.py +0 -14
  98. docpull-1.3.0/docpull/sources_config.py +0 -446
  99. docpull-1.3.0/docpull/utils/__init__.py +0 -6
  100. docpull-1.3.0/docpull/utils/file_utils.py +0 -97
  101. docpull-1.3.0/docpull/vcs.py +0 -224
  102. docpull-1.3.0/docpull.egg-info/SOURCES.txt +0 -64
  103. docpull-1.3.0/examples/README.md +0 -280
  104. docpull-1.3.0/examples/deduplication-strategies.yaml +0 -29
  105. docpull-1.3.0/examples/format-conversion.yaml +0 -25
  106. docpull-1.3.0/examples/incremental-updates.yaml +0 -26
  107. docpull-1.3.0/examples/multi-source-optimized.yaml +0 -45
  108. docpull-1.3.0/examples/selective-crawling.yaml +0 -26
  109. docpull-1.3.0/examples/simple-optimization.yaml +0 -14
  110. docpull-1.3.0/requirements.txt +0 -34
  111. docpull-1.3.0/tests/test_config.py +0 -43
  112. docpull-1.3.0/tests/test_metadata_extractor.py +0 -233
  113. docpull-1.3.0/tests/test_orchestrator.py +0 -331
  114. docpull-1.3.0/tests/test_sources_config.py +0 -348
  115. {docpull-1.3.0 → docpull-2.0.0}/LICENSE +0 -0
  116. {docpull-1.3.0 → docpull-2.0.0}/setup.cfg +0 -0
  117. {docpull-1.3.0 → docpull-2.0.0/src}/docpull/__main__.py +0 -0
  118. {docpull-1.3.0 → docpull-2.0.0/src}/docpull/doctor.py +0 -0
  119. {docpull-1.3.0/docpull/utils → docpull-2.0.0/src/docpull}/logging_config.py +0 -0
  120. {docpull-1.3.0 → docpull-2.0.0/src}/docpull/py.typed +0 -0
docpull-2.0.0/PKG-INFO ADDED
@@ -0,0 +1,207 @@
1
+ Metadata-Version: 2.4
2
+ Name: docpull
3
+ Version: 2.0.0
4
+ Summary: Pull documentation from the web and convert to clean markdown
5
+ Author-email: Zachary Roth <support@raintree.technology>
6
+ Maintainer-email: Raintree Technology <support@raintree.technology>
7
+ License-Expression: MIT
8
+ Project-URL: Homepage, https://github.com/raintree-technology/docpull
9
+ Project-URL: Documentation, https://github.com/raintree-technology/docpull#readme
10
+ Project-URL: Repository, https://github.com/raintree-technology/docpull
11
+ Project-URL: Source Code, https://github.com/raintree-technology/docpull
12
+ Project-URL: Bug Tracker, https://github.com/raintree-technology/docpull/issues
13
+ Project-URL: Releases, https://github.com/raintree-technology/docpull/releases
14
+ Keywords: python,markdown,documentation,web-scraping,developer-tools,claude,ai-training-data
15
+ Classifier: Development Status :: 5 - Production/Stable
16
+ Classifier: Intended Audience :: Developers
17
+ Classifier: Intended Audience :: Information Technology
18
+ Classifier: Intended Audience :: Science/Research
19
+ Classifier: Intended Audience :: Education
20
+ Classifier: Environment :: Console
21
+ Classifier: Topic :: Documentation
22
+ Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
23
+ Classifier: Topic :: Software Development :: Documentation
24
+ Classifier: Topic :: Text Processing :: Markup :: HTML
25
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
26
+ Classifier: Topic :: Utilities
27
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
28
+ Classifier: Natural Language :: English
29
+ Classifier: Operating System :: OS Independent
30
+ Classifier: Programming Language :: Python :: 3
31
+ Classifier: Programming Language :: Python :: 3.9
32
+ Classifier: Programming Language :: Python :: 3.10
33
+ Classifier: Programming Language :: Python :: 3.11
34
+ Classifier: Programming Language :: Python :: 3.12
35
+ Classifier: Programming Language :: Python :: 3.13
36
+ Classifier: Programming Language :: Python :: 3.14
37
+ Classifier: Programming Language :: Python :: 3 :: Only
38
+ Classifier: Typing :: Typed
39
+ Requires-Python: >=3.9
40
+ Description-Content-Type: text/markdown
41
+ License-File: LICENSE
42
+ Requires-Dist: requests>=2.31.0
43
+ Requires-Dist: beautifulsoup4>=4.12.0
44
+ Requires-Dist: html2text>=2020.1.16
45
+ Requires-Dist: defusedxml>=0.7.1
46
+ Requires-Dist: extruct>=0.15.0
47
+ Requires-Dist: aiohttp>=3.9.0
48
+ Requires-Dist: rich>=13.0.0
49
+ Requires-Dist: pyyaml>=6.0
50
+ Requires-Dist: gitpython>=3.1.40
51
+ Requires-Dist: pydantic>=2.0
52
+ Provides-Extra: js
53
+ Requires-Dist: playwright>=1.40.0; extra == "js"
54
+ Provides-Extra: proxy
55
+ Requires-Dist: aiohttp-socks>=0.8.0; extra == "proxy"
56
+ Provides-Extra: normalize
57
+ Requires-Dist: url-normalize>=1.4.0; extra == "normalize"
58
+ Provides-Extra: all
59
+ Requires-Dist: playwright>=1.40.0; extra == "all"
60
+ Requires-Dist: aiohttp-socks>=0.8.0; extra == "all"
61
+ Requires-Dist: url-normalize>=1.4.0; extra == "all"
62
+ Provides-Extra: dev
63
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
64
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
65
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
66
+ Requires-Dist: black>=23.0.0; extra == "dev"
67
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
68
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
69
+ Requires-Dist: bandit>=1.7.0; extra == "dev"
70
+ Requires-Dist: pip-audit>=2.0.0; extra == "dev"
71
+ Requires-Dist: pre-commit>=3.0.0; extra == "dev"
72
+ Requires-Dist: types-requests>=2.31.0; extra == "dev"
73
+ Requires-Dist: types-beautifulsoup4>=4.12.0; extra == "dev"
74
+ Requires-Dist: types-defusedxml>=0.7.0; extra == "dev"
75
+ Requires-Dist: types-pyyaml>=6.0.0; extra == "dev"
76
+ Dynamic: license-file
77
+
78
+ # docpull
79
+
80
+ **Pull documentation from any website and convert it to clean, AI-ready Markdown.**
81
+
82
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
83
+ [![PyPI version](https://badge.fury.io/py/docpull.svg)](https://badge.fury.io/py/docpull)
84
+ [![License: MIT](https://img.shields.io/github/license/raintree-technology/docpull)](https://github.com/raintree-technology/docpull/blob/main/LICENSE)
85
+
86
+ ## Install
87
+
88
+ ```bash
89
+ pip install docpull
90
+ ```
91
+
92
+ ## Usage
93
+
94
+ ```bash
95
+ # Basic fetch
96
+ docpull https://docs.example.com
97
+
98
+ # With options
99
+ docpull https://aptos.dev --max-pages 100 --output-dir ./docs
100
+
101
+ # Filter paths
102
+ docpull https://docs.example.com --include-paths "/api/*" --exclude-paths "/changelog/*"
103
+
104
+ # Enable caching for incremental updates
105
+ docpull https://docs.example.com --cache
106
+
107
+ # JavaScript-heavy sites
108
+ pip install docpull[js]
109
+ docpull https://spa-site.com --js
110
+ ```
111
+
112
+ ## Profiles
113
+
114
+ ```bash
115
+ docpull https://site.com --profile rag # Optimized for RAG/LLM (default)
116
+ docpull https://site.com --profile mirror # Full site archive with caching
117
+ docpull https://site.com --profile quick # Fast sampling (50 pages, depth 2)
118
+ ```
119
+
120
+ ## Options
121
+
122
+ ```
123
+ Crawl:
124
+ --max-pages N Maximum pages to fetch
125
+ --max-depth N Maximum crawl depth
126
+ --include-paths P Only crawl matching URL patterns
127
+ --exclude-paths P Skip matching URL patterns
128
+ --js Enable JavaScript rendering
129
+
130
+ Cache:
131
+ --cache Enable caching for incremental updates
132
+ --cache-dir DIR Cache directory (default: .docpull-cache)
133
+ --cache-ttl DAYS Days before cache expires (default: 30)
134
+
135
+ Content:
136
+ --streaming-dedup Real-time duplicate detection
137
+ --language CODE Filter by language (e.g., en)
138
+
139
+ Output:
140
+ --output-dir, -o DIR Output directory (default: ./docs)
141
+ --dry-run Show what would be fetched
142
+ --verbose, -v Verbose output
143
+ ```
144
+
145
+ See `docpull --help` for all options.
146
+
147
+ ## Python API
148
+
149
+ ```python
150
+ import asyncio
151
+ from docpull import Fetcher, DocpullConfig, ProfileName, EventType
152
+
153
+ async def main():
154
+     config = DocpullConfig(
155
+         url="https://docs.example.com",
156
+         profile=ProfileName.RAG,
157
+         crawl={"max_pages": 100},
158
+         cache={"enabled": True},
159
+     )
160
+
161
+     async with Fetcher(config) as fetcher:
162
+         async for event in fetcher.run():
163
+             if event.type == EventType.FETCH_PROGRESS:
164
+                 print(f"{event.current}/{event.total}: {event.url}")
165
+
166
+         print(f"Done: {fetcher.stats.pages_fetched} pages")
167
+
168
+ asyncio.run(main())
169
+ ```
170
+
171
+ ## Output
172
+
173
+ Each page becomes a Markdown file with YAML frontmatter:
174
+
175
+ ```markdown
176
+ ---
177
+ title: "Getting Started"
178
+ source: https://docs.example.com/guide
179
+ ---
180
+
181
+ # Getting Started
182
+ ...
183
+ ```
184
+
185
+ ## Security
186
+
187
+ - HTTPS-only, mandatory robots.txt compliance
188
+ - Blocks private/internal network IPs
189
+ - Path traversal and XXE protection
190
+
191
+ ## Troubleshooting
192
+
193
+ ```bash
194
+ docpull --doctor # Check installation
195
+ docpull URL --verbose # Verbose output
196
+ docpull URL --dry-run # Test without downloading
197
+ ```
198
+
199
+ ## Links
200
+
201
+ - [PyPI](https://pypi.org/project/docpull/)
202
+ - [GitHub](https://github.com/raintree-technology/docpull)
203
+ - [Changelog](https://github.com/raintree-technology/docpull/blob/main/docs/CHANGELOG.md)
204
+
205
+ ## License
206
+
207
+ MIT
docpull-2.0.0/README.md ADDED
@@ -0,0 +1,130 @@
1
+ # docpull
2
+
3
+ **Pull documentation from any website and convert it to clean, AI-ready Markdown.**
4
+
5
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
6
+ [![PyPI version](https://badge.fury.io/py/docpull.svg)](https://badge.fury.io/py/docpull)
7
+ [![License: MIT](https://img.shields.io/github/license/raintree-technology/docpull)](https://github.com/raintree-technology/docpull/blob/main/LICENSE)
8
+
9
+ ## Install
10
+
11
+ ```bash
12
+ pip install docpull
13
+ ```
14
+
15
+ ## Usage
16
+
17
+ ```bash
18
+ # Basic fetch
19
+ docpull https://docs.example.com
20
+
21
+ # With options
22
+ docpull https://aptos.dev --max-pages 100 --output-dir ./docs
23
+
24
+ # Filter paths
25
+ docpull https://docs.example.com --include-paths "/api/*" --exclude-paths "/changelog/*"
26
+
27
+ # Enable caching for incremental updates
28
+ docpull https://docs.example.com --cache
29
+
30
+ # JavaScript-heavy sites
31
+ pip install docpull[js]
32
+ docpull https://spa-site.com --js
33
+ ```
34
+
35
+ ## Profiles
36
+
37
+ ```bash
38
+ docpull https://site.com --profile rag # Optimized for RAG/LLM (default)
39
+ docpull https://site.com --profile mirror # Full site archive with caching
40
+ docpull https://site.com --profile quick # Fast sampling (50 pages, depth 2)
41
+ ```
42
+
43
+ ## Options
44
+
45
+ ```
46
+ Crawl:
47
+ --max-pages N Maximum pages to fetch
48
+ --max-depth N Maximum crawl depth
49
+ --include-paths P Only crawl matching URL patterns
50
+ --exclude-paths P Skip matching URL patterns
51
+ --js Enable JavaScript rendering
52
+
53
+ Cache:
54
+ --cache Enable caching for incremental updates
55
+ --cache-dir DIR Cache directory (default: .docpull-cache)
56
+ --cache-ttl DAYS Days before cache expires (default: 30)
57
+
58
+ Content:
59
+ --streaming-dedup Real-time duplicate detection
60
+ --language CODE Filter by language (e.g., en)
61
+
62
+ Output:
63
+ --output-dir, -o DIR Output directory (default: ./docs)
64
+ --dry-run Show what would be fetched
65
+ --verbose, -v Verbose output
66
+ ```
67
+
68
+ See `docpull --help` for all options.
69
+
70
+ ## Python API
71
+
72
+ ```python
73
+ import asyncio
74
+ from docpull import Fetcher, DocpullConfig, ProfileName, EventType
75
+
76
+ async def main():
77
+     config = DocpullConfig(
78
+         url="https://docs.example.com",
79
+         profile=ProfileName.RAG,
80
+         crawl={"max_pages": 100},
81
+         cache={"enabled": True},
82
+     )
83
+
84
+     async with Fetcher(config) as fetcher:
85
+         async for event in fetcher.run():
86
+             if event.type == EventType.FETCH_PROGRESS:
87
+                 print(f"{event.current}/{event.total}: {event.url}")
88
+
89
+         print(f"Done: {fetcher.stats.pages_fetched} pages")
90
+
91
+ asyncio.run(main())
92
+ ```
93
+
94
+ ## Output
95
+
96
+ Each page becomes a Markdown file with YAML frontmatter:
97
+
98
+ ```markdown
99
+ ---
100
+ title: "Getting Started"
101
+ source: https://docs.example.com/guide
102
+ ---
103
+
104
+ # Getting Started
105
+ ...
106
+ ```
107
+
108
+ ## Security
109
+
110
+ - HTTPS-only, mandatory robots.txt compliance
111
+ - Blocks private/internal network IPs
112
+ - Path traversal and XXE protection
113
+
114
+ ## Troubleshooting
115
+
116
+ ```bash
117
+ docpull --doctor # Check installation
118
+ docpull URL --verbose # Verbose output
119
+ docpull URL --dry-run # Test without downloading
120
+ ```
121
+
122
+ ## Links
123
+
124
+ - [PyPI](https://pypi.org/project/docpull/)
125
+ - [GitHub](https://github.com/raintree-technology/docpull)
126
+ - [Changelog](https://github.com/raintree-technology/docpull/blob/main/docs/CHANGELOG.md)
127
+
128
+ ## License
129
+
130
+ MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "docpull"
7
- version = "1.3.0"
7
+ version = "2.0.0"
8
8
  dynamic = []
9
9
  description = "Pull documentation from the web and convert to clean markdown"
10
10
  readme = {file = "README.md", content-type = "text/markdown"}
@@ -72,14 +72,23 @@ dependencies = [
72
72
  "rich>=13.0.0",
73
73
  "pyyaml>=6.0",
74
74
  "gitpython>=3.1.40",
75
+ "pydantic>=2.0",
75
76
  ]
76
77
 
77
78
  [project.optional-dependencies]
78
79
  js = [
79
80
  "playwright>=1.40.0",
80
81
  ]
82
+ proxy = [
83
+ "aiohttp-socks>=0.8.0",
84
+ ]
85
+ normalize = [
86
+ "url-normalize>=1.4.0",
87
+ ]
81
88
  all = [
82
89
  "playwright>=1.40.0",
90
+ "aiohttp-socks>=0.8.0",
91
+ "url-normalize>=1.4.0",
83
92
  ]
84
93
  dev = [
85
94
  "pytest>=7.0.0",
@@ -106,10 +115,10 @@ Documentation = "https://github.com/raintree-technology/docpull#readme"
106
115
  Repository = "https://github.com/raintree-technology/docpull"
107
116
  "Source Code" = "https://github.com/raintree-technology/docpull"
108
117
  "Bug Tracker" = "https://github.com/raintree-technology/docpull/issues"
109
- "Changelog" = "https://github.com/raintree-technology/docpull/blob/main/CHANGELOG.md"
118
+ "Releases" = "https://github.com/raintree-technology/docpull/releases"
110
119
 
111
120
  [tool.setuptools.packages.find]
112
- where = ["."]
121
+ where = ["src"]
113
122
  include = ["docpull*"]
114
123
 
115
124
  [tool.setuptools.package-data]
@@ -125,7 +134,7 @@ target-version = "py39"
125
134
 
126
135
  [tool.ruff.lint]
127
136
  select = ["E", "F", "W", "I", "N", "UP", "B", "A", "C4", "SIM"]
128
- ignore = []
137
+ ignore = ["A003"] # Allow 'type' and 'format' as field names in data models
129
138
 
130
139
  [tool.mypy]
131
140
  python_version = "3.9"
@@ -136,7 +145,14 @@ disallow_any_unimported = true
136
145
  no_implicit_optional = true
137
146
  strict_equality = true
138
147
  warn_redundant_casts = true
148
+ ignore_missing_imports = true
139
149
  exclude = ["tests/"]
150
+ plugins = ["pydantic.mypy"]
151
+
152
+ [tool.pydantic-mypy]
153
+ init_forbid_extra = true
154
+ init_typed = true
155
+ warn_required_dynamic_aliases = true
140
156
 
141
157
  [[tool.mypy.overrides]]
142
158
  module = "playwright.*"
@@ -146,6 +162,20 @@ ignore_missing_imports = true
146
162
  module = "extruct.*"
147
163
  ignore_missing_imports = true
148
164
 
165
+ [[tool.mypy.overrides]]
166
+ module = "url_normalize"
167
+ ignore_missing_imports = true
168
+
169
+ [[tool.mypy.overrides]]
170
+ module = "docpull.models.*"
171
+ disallow_any_unimported = false
172
+ warn_return_any = false
173
+
174
+ [[tool.mypy.overrides]]
175
+ module = "docpull.concurrency.browser_pool"
176
+ disallow_any_unimported = false
177
+ warn_return_any = false
178
+
149
179
  [[tool.mypy.overrides]]
150
180
  module = "tests.*"
151
181
  disallow_untyped_defs = false
@@ -167,7 +197,7 @@ markers = [
167
197
  ]
168
198
 
169
199
  [tool.coverage.run]
170
- source = ["docpull"]
200
+ source = ["src/docpull"]
171
201
  omit = ["tests/*", "*/test_*.py"]
172
202
 
173
203
  [tool.coverage.report]
docpull-2.0.0/src/docpull/__init__.py ADDED
@@ -0,0 +1,56 @@
1
+ """
2
+ docpull - Fetch and convert documentation from any URL to markdown.
3
+
4
+ Usage:
5
+     from docpull import Fetcher, DocpullConfig, ProfileName
6
+
7
+     config = DocpullConfig(
8
+         url="https://docs.example.com",
9
+         profile=ProfileName.RAG,
10
+     )
11
+
12
+     async with Fetcher(config) as fetcher:
13
+         async for event in fetcher.run():
14
+             print(event)
15
+ """
16
+
17
+ __version__ = "2.0.0"
18
+
19
+ from .cache import CacheManager, StreamingDeduplicator
20
+ from .core.fetcher import Fetcher, fetch_blocking
21
+ from .models.config import (
22
+ CacheConfig,
23
+ ContentFilterConfig,
24
+ CrawlConfig,
25
+ DocpullConfig,
26
+ IntegrationConfig,
27
+ NetworkConfig,
28
+ OutputConfig,
29
+ PerformanceConfig,
30
+ ProfileName,
31
+ )
32
+ from .models.events import EventType, FetchEvent, FetchStats
33
+
34
+ __all__ = [
35
+ "__version__",
36
+ # Core
37
+ "Fetcher",
38
+ "fetch_blocking",
39
+ # Config
40
+ "DocpullConfig",
41
+ "ProfileName",
42
+ "CrawlConfig",
43
+ "ContentFilterConfig",
44
+ "OutputConfig",
45
+ "NetworkConfig",
46
+ "PerformanceConfig",
47
+ "IntegrationConfig",
48
+ "CacheConfig",
49
+ # Events
50
+ "EventType",
51
+ "FetchEvent",
52
+ "FetchStats",
53
+ # Cache
54
+ "CacheManager",
55
+ "StreamingDeduplicator",
56
+ ]
docpull-2.0.0/src/docpull/cache/__init__.py ADDED
@@ -0,0 +1,12 @@
1
+ """Caching and deduplication for docpull."""
2
+
3
+ from .manager import DEFAULT_TTL_DAYS, CacheManager, CacheState, ManifestEntry
4
+ from .streaming_dedup import StreamingDeduplicator
5
+
6
+ __all__ = [
7
+ "CacheManager",
8
+ "CacheState",
9
+ "ManifestEntry",
10
+ "StreamingDeduplicator",
11
+ "DEFAULT_TTL_DAYS",
12
+ ]