docpull 1.5.0__tar.gz → 2.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (100)
  1. docpull-2.0.0/PKG-INFO +207 -0
  2. docpull-2.0.0/README.md +130 -0
  3. {docpull-1.5.0 → docpull-2.0.0}/pyproject.toml +26 -4
  4. docpull-2.0.0/src/docpull/__init__.py +56 -0
  5. docpull-2.0.0/src/docpull/cache/__init__.py +12 -0
  6. docpull-2.0.0/src/docpull/cache/manager.py +388 -0
  7. docpull-2.0.0/src/docpull/cache/streaming_dedup.py +135 -0
  8. docpull-2.0.0/src/docpull/cli.py +408 -0
  9. docpull-2.0.0/src/docpull/concurrency/__init__.py +15 -0
  10. docpull-2.0.0/src/docpull/concurrency/browser_pool.py +337 -0
  11. docpull-2.0.0/src/docpull/concurrency/manager.py +111 -0
  12. docpull-2.0.0/src/docpull/conversion/__init__.py +15 -0
  13. docpull-2.0.0/src/docpull/conversion/extractor.py +246 -0
  14. docpull-2.0.0/src/docpull/conversion/markdown.py +201 -0
  15. docpull-2.0.0/src/docpull/conversion/protocols.py +46 -0
  16. docpull-2.0.0/src/docpull/core/__init__.py +5 -0
  17. docpull-2.0.0/src/docpull/core/fetcher.py +501 -0
  18. docpull-2.0.0/src/docpull/discovery/__init__.py +29 -0
  19. docpull-2.0.0/src/docpull/discovery/composite.py +127 -0
  20. docpull-2.0.0/src/docpull/discovery/crawler.py +242 -0
  21. docpull-2.0.0/src/docpull/discovery/filters.py +230 -0
  22. docpull-2.0.0/src/docpull/discovery/protocols.py +52 -0
  23. docpull-2.0.0/src/docpull/discovery/sitemap.py +258 -0
  24. docpull-2.0.0/src/docpull/http/__init__.py +12 -0
  25. docpull-2.0.0/src/docpull/http/client.py +321 -0
  26. docpull-2.0.0/src/docpull/http/protocols.py +76 -0
  27. docpull-2.0.0/src/docpull/http/rate_limiter.py +148 -0
  28. docpull-2.0.0/src/docpull/models/__init__.py +37 -0
  29. docpull-2.0.0/src/docpull/models/config.py +265 -0
  30. docpull-2.0.0/src/docpull/models/events.py +145 -0
  31. docpull-2.0.0/src/docpull/models/profiles.py +101 -0
  32. docpull-2.0.0/src/docpull/pipeline/__init__.py +5 -0
  33. docpull-2.0.0/src/docpull/pipeline/base.py +187 -0
  34. docpull-2.0.0/src/docpull/pipeline/steps/__init__.py +17 -0
  35. docpull-2.0.0/src/docpull/pipeline/steps/browser_fetch.py +141 -0
  36. docpull-2.0.0/src/docpull/pipeline/steps/convert.py +134 -0
  37. docpull-2.0.0/src/docpull/pipeline/steps/dedup.py +96 -0
  38. docpull-2.0.0/src/docpull/pipeline/steps/fetch.py +192 -0
  39. docpull-2.0.0/src/docpull/pipeline/steps/metadata.py +139 -0
  40. docpull-2.0.0/src/docpull/pipeline/steps/save.py +167 -0
  41. docpull-2.0.0/src/docpull/pipeline/steps/validate.py +140 -0
  42. docpull-2.0.0/src/docpull/security/__init__.py +6 -0
  43. docpull-2.0.0/src/docpull/security/robots.py +192 -0
  44. docpull-2.0.0/src/docpull/security/url_validator.py +174 -0
  45. docpull-2.0.0/src/docpull.egg-info/PKG-INFO +207 -0
  46. docpull-2.0.0/src/docpull.egg-info/SOURCES.txt +59 -0
  47. {docpull-1.5.0 → docpull-2.0.0/src}/docpull.egg-info/requires.txt +1 -0
  48. docpull-2.0.0/tests/test_v2_conversion.py +294 -0
  49. docpull-2.0.0/tests/test_v2_discovery.py +355 -0
  50. docpull-2.0.0/tests/test_v2_integration.py +359 -0
  51. docpull-2.0.0/tests/test_v2_pipeline.py +369 -0
  52. docpull-1.5.0/PKG-INFO +0 -478
  53. docpull-1.5.0/README.md +0 -402
  54. docpull-1.5.0/docpull/__init__.py +0 -13
  55. docpull-1.5.0/docpull/archive.py +0 -186
  56. docpull-1.5.0/docpull/cache.py +0 -256
  57. docpull-1.5.0/docpull/cli.py +0 -782
  58. docpull-1.5.0/docpull/config.py +0 -332
  59. docpull-1.5.0/docpull/fetchers/__init__.py +0 -11
  60. docpull-1.5.0/docpull/fetchers/async_fetcher.py +0 -463
  61. docpull-1.5.0/docpull/fetchers/base.py +0 -686
  62. docpull-1.5.0/docpull/fetchers/generic.py +0 -215
  63. docpull-1.5.0/docpull/fetchers/generic_async.py +0 -324
  64. docpull-1.5.0/docpull/fetchers/parallel_base.py +0 -93
  65. docpull-1.5.0/docpull/file_utils.py +0 -97
  66. docpull-1.5.0/docpull/formatters/__init__.py +0 -50
  67. docpull-1.5.0/docpull/formatters/base.py +0 -102
  68. docpull-1.5.0/docpull/formatters/json.py +0 -100
  69. docpull-1.5.0/docpull/formatters/markdown.py +0 -49
  70. docpull-1.5.0/docpull/formatters/sqlite.py +0 -266
  71. docpull-1.5.0/docpull/formatters/toon.py +0 -90
  72. docpull-1.5.0/docpull/hooks.py +0 -222
  73. docpull-1.5.0/docpull/indexer.py +0 -410
  74. docpull-1.5.0/docpull/metadata.py +0 -224
  75. docpull-1.5.0/docpull/naming.py +0 -259
  76. docpull-1.5.0/docpull/orchestrator.py +0 -254
  77. docpull-1.5.0/docpull/processors/__init__.py +0 -18
  78. docpull-1.5.0/docpull/processors/base.py +0 -151
  79. docpull-1.5.0/docpull/processors/content_filter.py +0 -292
  80. docpull-1.5.0/docpull/processors/deduplicator.py +0 -233
  81. docpull-1.5.0/docpull/processors/language_filter.py +0 -181
  82. docpull-1.5.0/docpull/processors/size_limiter.py +0 -221
  83. docpull-1.5.0/docpull/sources_config.py +0 -446
  84. docpull-1.5.0/docpull/vcs.py +0 -224
  85. docpull-1.5.0/docpull.egg-info/PKG-INFO +0 -478
  86. docpull-1.5.0/docpull.egg-info/SOURCES.txt +0 -49
  87. docpull-1.5.0/tests/test_config.py +0 -39
  88. docpull-1.5.0/tests/test_metadata_extractor.py +0 -233
  89. docpull-1.5.0/tests/test_orchestrator.py +0 -331
  90. docpull-1.5.0/tests/test_sources_config.py +0 -348
  91. {docpull-1.5.0 → docpull-2.0.0}/LICENSE +0 -0
  92. {docpull-1.5.0 → docpull-2.0.0}/setup.cfg +0 -0
  93. {docpull-1.5.0 → docpull-2.0.0/src}/docpull/__main__.py +0 -0
  94. {docpull-1.5.0 → docpull-2.0.0/src}/docpull/doctor.py +0 -0
  95. {docpull-1.5.0 → docpull-2.0.0/src}/docpull/logging_config.py +0 -0
  96. {docpull-1.5.0 → docpull-2.0.0/src}/docpull/metadata_extractor.py +0 -0
  97. {docpull-1.5.0 → docpull-2.0.0/src}/docpull/py.typed +0 -0
  98. {docpull-1.5.0 → docpull-2.0.0/src}/docpull.egg-info/dependency_links.txt +0 -0
  99. {docpull-1.5.0 → docpull-2.0.0/src}/docpull.egg-info/entry_points.txt +0 -0
  100. {docpull-1.5.0 → docpull-2.0.0/src}/docpull.egg-info/top_level.txt +0 -0
docpull-2.0.0/PKG-INFO ADDED
@@ -0,0 +1,207 @@
1
+ Metadata-Version: 2.4
2
+ Name: docpull
3
+ Version: 2.0.0
4
+ Summary: Pull documentation from the web and convert to clean markdown
5
+ Author-email: Zachary Roth <support@raintree.technology>
6
+ Maintainer-email: Raintree Technology <support@raintree.technology>
7
+ License-Expression: MIT
8
+ Project-URL: Homepage, https://github.com/raintree-technology/docpull
9
+ Project-URL: Documentation, https://github.com/raintree-technology/docpull#readme
10
+ Project-URL: Repository, https://github.com/raintree-technology/docpull
11
+ Project-URL: Source Code, https://github.com/raintree-technology/docpull
12
+ Project-URL: Bug Tracker, https://github.com/raintree-technology/docpull/issues
13
+ Project-URL: Releases, https://github.com/raintree-technology/docpull/releases
14
+ Keywords: python,markdown,documentation,web-scraping,developer-tools,claude,ai-training-data
15
+ Classifier: Development Status :: 5 - Production/Stable
16
+ Classifier: Intended Audience :: Developers
17
+ Classifier: Intended Audience :: Information Technology
18
+ Classifier: Intended Audience :: Science/Research
19
+ Classifier: Intended Audience :: Education
20
+ Classifier: Environment :: Console
21
+ Classifier: Topic :: Documentation
22
+ Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
23
+ Classifier: Topic :: Software Development :: Documentation
24
+ Classifier: Topic :: Text Processing :: Markup :: HTML
25
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
26
+ Classifier: Topic :: Utilities
27
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
28
+ Classifier: Natural Language :: English
29
+ Classifier: Operating System :: OS Independent
30
+ Classifier: Programming Language :: Python :: 3
31
+ Classifier: Programming Language :: Python :: 3.9
32
+ Classifier: Programming Language :: Python :: 3.10
33
+ Classifier: Programming Language :: Python :: 3.11
34
+ Classifier: Programming Language :: Python :: 3.12
35
+ Classifier: Programming Language :: Python :: 3.13
36
+ Classifier: Programming Language :: Python :: 3.14
37
+ Classifier: Programming Language :: Python :: 3 :: Only
38
+ Classifier: Typing :: Typed
39
+ Requires-Python: >=3.9
40
+ Description-Content-Type: text/markdown
41
+ License-File: LICENSE
42
+ Requires-Dist: requests>=2.31.0
43
+ Requires-Dist: beautifulsoup4>=4.12.0
44
+ Requires-Dist: html2text>=2020.1.16
45
+ Requires-Dist: defusedxml>=0.7.1
46
+ Requires-Dist: extruct>=0.15.0
47
+ Requires-Dist: aiohttp>=3.9.0
48
+ Requires-Dist: rich>=13.0.0
49
+ Requires-Dist: pyyaml>=6.0
50
+ Requires-Dist: gitpython>=3.1.40
51
+ Requires-Dist: pydantic>=2.0
52
+ Provides-Extra: js
53
+ Requires-Dist: playwright>=1.40.0; extra == "js"
54
+ Provides-Extra: proxy
55
+ Requires-Dist: aiohttp-socks>=0.8.0; extra == "proxy"
56
+ Provides-Extra: normalize
57
+ Requires-Dist: url-normalize>=1.4.0; extra == "normalize"
58
+ Provides-Extra: all
59
+ Requires-Dist: playwright>=1.40.0; extra == "all"
60
+ Requires-Dist: aiohttp-socks>=0.8.0; extra == "all"
61
+ Requires-Dist: url-normalize>=1.4.0; extra == "all"
62
+ Provides-Extra: dev
63
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
64
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
65
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
66
+ Requires-Dist: black>=23.0.0; extra == "dev"
67
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
68
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
69
+ Requires-Dist: bandit>=1.7.0; extra == "dev"
70
+ Requires-Dist: pip-audit>=2.0.0; extra == "dev"
71
+ Requires-Dist: pre-commit>=3.0.0; extra == "dev"
72
+ Requires-Dist: types-requests>=2.31.0; extra == "dev"
73
+ Requires-Dist: types-beautifulsoup4>=4.12.0; extra == "dev"
74
+ Requires-Dist: types-defusedxml>=0.7.0; extra == "dev"
75
+ Requires-Dist: types-pyyaml>=6.0.0; extra == "dev"
76
+ Dynamic: license-file
77
+
78
+ # docpull
79
+
80
+ **Pull documentation from any website and convert it to clean, AI-ready Markdown.**
81
+
82
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
83
+ [![PyPI version](https://badge.fury.io/py/docpull.svg)](https://badge.fury.io/py/docpull)
84
+ [![License: MIT](https://img.shields.io/github/license/raintree-technology/docpull)](https://github.com/raintree-technology/docpull/blob/main/LICENSE)
85
+
86
+ ## Install
87
+
88
+ ```bash
89
+ pip install docpull
90
+ ```
91
+
92
+ ## Usage
93
+
94
+ ```bash
95
+ # Basic fetch
96
+ docpull https://docs.example.com
97
+
98
+ # With options
99
+ docpull https://aptos.dev --max-pages 100 --output-dir ./docs
100
+
101
+ # Filter paths
102
+ docpull https://docs.example.com --include-paths "/api/*" --exclude-paths "/changelog/*"
103
+
104
+ # Enable caching for incremental updates
105
+ docpull https://docs.example.com --cache
106
+
107
+ # JavaScript-heavy sites
108
+ pip install docpull[js]
109
+ docpull https://spa-site.com --js
110
+ ```
111
+
112
+ ## Profiles
113
+
114
+ ```bash
115
+ docpull https://site.com --profile rag # Optimized for RAG/LLM (default)
116
+ docpull https://site.com --profile mirror # Full site archive with caching
117
+ docpull https://site.com --profile quick # Fast sampling (50 pages, depth 2)
118
+ ```
119
+
120
+ ## Options
121
+
122
+ ```
123
+ Crawl:
124
+ --max-pages N Maximum pages to fetch
125
+ --max-depth N Maximum crawl depth
126
+ --include-paths P Only crawl matching URL patterns
127
+ --exclude-paths P Skip matching URL patterns
128
+ --js Enable JavaScript rendering
129
+
130
+ Cache:
131
+ --cache Enable caching for incremental updates
132
+ --cache-dir DIR Cache directory (default: .docpull-cache)
133
+ --cache-ttl DAYS Days before cache expires (default: 30)
134
+
135
+ Content:
136
+ --streaming-dedup Real-time duplicate detection
137
+ --language CODE Filter by language (e.g., en)
138
+
139
+ Output:
140
+ --output-dir, -o DIR Output directory (default: ./docs)
141
+ --dry-run Show what would be fetched
142
+ --verbose, -v Verbose output
143
+ ```
144
+
145
+ See `docpull --help` for all options.
146
+
147
+ ## Python API
148
+
149
+ ```python
150
+ import asyncio
151
+ from docpull import Fetcher, DocpullConfig, ProfileName, EventType
152
+
153
+ async def main():
154
+ config = DocpullConfig(
155
+ url="https://docs.example.com",
156
+ profile=ProfileName.RAG,
157
+ crawl={"max_pages": 100},
158
+ cache={"enabled": True},
159
+ )
160
+
161
+ async with Fetcher(config) as fetcher:
162
+ async for event in fetcher.run():
163
+ if event.type == EventType.FETCH_PROGRESS:
164
+ print(f"{event.current}/{event.total}: {event.url}")
165
+
166
+ print(f"Done: {fetcher.stats.pages_fetched} pages")
167
+
168
+ asyncio.run(main())
169
+ ```
170
+
171
+ ## Output
172
+
173
+ Each page becomes a Markdown file with YAML frontmatter:
174
+
175
+ ```markdown
176
+ ---
177
+ title: "Getting Started"
178
+ source: https://docs.example.com/guide
179
+ ---
180
+
181
+ # Getting Started
182
+ ...
183
+ ```
184
+
185
+ ## Security
186
+
187
+ - HTTPS-only, mandatory robots.txt compliance
188
+ - Blocks private/internal network IPs
189
+ - Path traversal and XXE protection
190
+
191
+ ## Troubleshooting
192
+
193
+ ```bash
194
+ docpull --doctor # Check installation
195
+ docpull URL --verbose # Verbose output
196
+ docpull URL --dry-run # Test without downloading
197
+ ```
198
+
199
+ ## Links
200
+
201
+ - [PyPI](https://pypi.org/project/docpull/)
202
+ - [GitHub](https://github.com/raintree-technology/docpull)
203
+ - [Changelog](https://github.com/raintree-technology/docpull/blob/main/docs/CHANGELOG.md)
204
+
205
+ ## License
206
+
207
+ MIT
docpull-2.0.0/README.md ADDED
@@ -0,0 +1,130 @@
1
+ # docpull
2
+
3
+ **Pull documentation from any website and convert it to clean, AI-ready Markdown.**
4
+
5
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
6
+ [![PyPI version](https://badge.fury.io/py/docpull.svg)](https://badge.fury.io/py/docpull)
7
+ [![License: MIT](https://img.shields.io/github/license/raintree-technology/docpull)](https://github.com/raintree-technology/docpull/blob/main/LICENSE)
8
+
9
+ ## Install
10
+
11
+ ```bash
12
+ pip install docpull
13
+ ```
14
+
15
+ ## Usage
16
+
17
+ ```bash
18
+ # Basic fetch
19
+ docpull https://docs.example.com
20
+
21
+ # With options
22
+ docpull https://aptos.dev --max-pages 100 --output-dir ./docs
23
+
24
+ # Filter paths
25
+ docpull https://docs.example.com --include-paths "/api/*" --exclude-paths "/changelog/*"
26
+
27
+ # Enable caching for incremental updates
28
+ docpull https://docs.example.com --cache
29
+
30
+ # JavaScript-heavy sites
31
+ pip install docpull[js]
32
+ docpull https://spa-site.com --js
33
+ ```
34
+
35
+ ## Profiles
36
+
37
+ ```bash
38
+ docpull https://site.com --profile rag # Optimized for RAG/LLM (default)
39
+ docpull https://site.com --profile mirror # Full site archive with caching
40
+ docpull https://site.com --profile quick # Fast sampling (50 pages, depth 2)
41
+ ```
42
+
43
+ ## Options
44
+
45
+ ```
46
+ Crawl:
47
+ --max-pages N Maximum pages to fetch
48
+ --max-depth N Maximum crawl depth
49
+ --include-paths P Only crawl matching URL patterns
50
+ --exclude-paths P Skip matching URL patterns
51
+ --js Enable JavaScript rendering
52
+
53
+ Cache:
54
+ --cache Enable caching for incremental updates
55
+ --cache-dir DIR Cache directory (default: .docpull-cache)
56
+ --cache-ttl DAYS Days before cache expires (default: 30)
57
+
58
+ Content:
59
+ --streaming-dedup Real-time duplicate detection
60
+ --language CODE Filter by language (e.g., en)
61
+
62
+ Output:
63
+ --output-dir, -o DIR Output directory (default: ./docs)
64
+ --dry-run Show what would be fetched
65
+ --verbose, -v Verbose output
66
+ ```
67
+
68
+ See `docpull --help` for all options.
69
+
70
+ ## Python API
71
+
72
+ ```python
73
+ import asyncio
74
+ from docpull import Fetcher, DocpullConfig, ProfileName, EventType
75
+
76
+ async def main():
77
+ config = DocpullConfig(
78
+ url="https://docs.example.com",
79
+ profile=ProfileName.RAG,
80
+ crawl={"max_pages": 100},
81
+ cache={"enabled": True},
82
+ )
83
+
84
+ async with Fetcher(config) as fetcher:
85
+ async for event in fetcher.run():
86
+ if event.type == EventType.FETCH_PROGRESS:
87
+ print(f"{event.current}/{event.total}: {event.url}")
88
+
89
+ print(f"Done: {fetcher.stats.pages_fetched} pages")
90
+
91
+ asyncio.run(main())
92
+ ```
93
+
94
+ ## Output
95
+
96
+ Each page becomes a Markdown file with YAML frontmatter:
97
+
98
+ ```markdown
99
+ ---
100
+ title: "Getting Started"
101
+ source: https://docs.example.com/guide
102
+ ---
103
+
104
+ # Getting Started
105
+ ...
106
+ ```
107
+
108
+ ## Security
109
+
110
+ - HTTPS-only, mandatory robots.txt compliance
111
+ - Blocks private/internal network IPs
112
+ - Path traversal and XXE protection
113
+
114
+ ## Troubleshooting
115
+
116
+ ```bash
117
+ docpull --doctor # Check installation
118
+ docpull URL --verbose # Verbose output
119
+ docpull URL --dry-run # Test without downloading
120
+ ```
121
+
122
+ ## Links
123
+
124
+ - [PyPI](https://pypi.org/project/docpull/)
125
+ - [GitHub](https://github.com/raintree-technology/docpull)
126
+ - [Changelog](https://github.com/raintree-technology/docpull/blob/main/docs/CHANGELOG.md)
127
+
128
+ ## License
129
+
130
+ MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "docpull"
7
- version = "1.5.0"
7
+ version = "2.0.0"
8
8
  dynamic = []
9
9
  description = "Pull documentation from the web and convert to clean markdown"
10
10
  readme = {file = "README.md", content-type = "text/markdown"}
@@ -72,6 +72,7 @@ dependencies = [
72
72
  "rich>=13.0.0",
73
73
  "pyyaml>=6.0",
74
74
  "gitpython>=3.1.40",
75
+ "pydantic>=2.0",
75
76
  ]
76
77
 
77
78
  [project.optional-dependencies]
@@ -117,7 +118,7 @@ Repository = "https://github.com/raintree-technology/docpull"
117
118
  "Releases" = "https://github.com/raintree-technology/docpull/releases"
118
119
 
119
120
  [tool.setuptools.packages.find]
120
- where = ["."]
121
+ where = ["src"]
121
122
  include = ["docpull*"]
122
123
 
123
124
  [tool.setuptools.package-data]
@@ -133,7 +134,7 @@ target-version = "py39"
133
134
 
134
135
  [tool.ruff.lint]
135
136
  select = ["E", "F", "W", "I", "N", "UP", "B", "A", "C4", "SIM"]
136
- ignore = []
137
+ ignore = ["A003"] # Allow 'type' and 'format' as field names in data models
137
138
 
138
139
  [tool.mypy]
139
140
  python_version = "3.9"
@@ -144,7 +145,14 @@ disallow_any_unimported = true
144
145
  no_implicit_optional = true
145
146
  strict_equality = true
146
147
  warn_redundant_casts = true
148
+ ignore_missing_imports = true
147
149
  exclude = ["tests/"]
150
+ plugins = ["pydantic.mypy"]
151
+
152
+ [tool.pydantic-mypy]
153
+ init_forbid_extra = true
154
+ init_typed = true
155
+ warn_required_dynamic_aliases = true
148
156
 
149
157
  [[tool.mypy.overrides]]
150
158
  module = "playwright.*"
@@ -154,6 +162,20 @@ ignore_missing_imports = true
154
162
  module = "extruct.*"
155
163
  ignore_missing_imports = true
156
164
 
165
+ [[tool.mypy.overrides]]
166
+ module = "url_normalize"
167
+ ignore_missing_imports = true
168
+
169
+ [[tool.mypy.overrides]]
170
+ module = "docpull.models.*"
171
+ disallow_any_unimported = false
172
+ warn_return_any = false
173
+
174
+ [[tool.mypy.overrides]]
175
+ module = "docpull.concurrency.browser_pool"
176
+ disallow_any_unimported = false
177
+ warn_return_any = false
178
+
157
179
  [[tool.mypy.overrides]]
158
180
  module = "tests.*"
159
181
  disallow_untyped_defs = false
@@ -175,7 +197,7 @@ markers = [
175
197
  ]
176
198
 
177
199
  [tool.coverage.run]
178
- source = ["docpull"]
200
+ source = ["src/docpull"]
179
201
  omit = ["tests/*", "*/test_*.py"]
180
202
 
181
203
  [tool.coverage.report]
docpull-2.0.0/src/docpull/__init__.py ADDED
@@ -0,0 +1,56 @@
1
+ """
2
+ docpull - Fetch and convert documentation from any URL to markdown.
3
+
4
+ Usage:
5
+ from docpull import Fetcher, DocpullConfig, ProfileName
6
+
7
+ config = DocpullConfig(
8
+ url="https://docs.example.com",
9
+ profile=ProfileName.RAG,
10
+ )
11
+
12
+ async with Fetcher(config) as fetcher:
13
+ async for event in fetcher.run():
14
+ print(event)
15
+ """
16
+
17
+ __version__ = "2.0.0"
18
+
19
+ from .cache import CacheManager, StreamingDeduplicator
20
+ from .core.fetcher import Fetcher, fetch_blocking
21
+ from .models.config import (
22
+ CacheConfig,
23
+ ContentFilterConfig,
24
+ CrawlConfig,
25
+ DocpullConfig,
26
+ IntegrationConfig,
27
+ NetworkConfig,
28
+ OutputConfig,
29
+ PerformanceConfig,
30
+ ProfileName,
31
+ )
32
+ from .models.events import EventType, FetchEvent, FetchStats
33
+
34
+ __all__ = [
35
+ "__version__",
36
+ # Core
37
+ "Fetcher",
38
+ "fetch_blocking",
39
+ # Config
40
+ "DocpullConfig",
41
+ "ProfileName",
42
+ "CrawlConfig",
43
+ "ContentFilterConfig",
44
+ "OutputConfig",
45
+ "NetworkConfig",
46
+ "PerformanceConfig",
47
+ "IntegrationConfig",
48
+ "CacheConfig",
49
+ # Events
50
+ "EventType",
51
+ "FetchEvent",
52
+ "FetchStats",
53
+ # Cache
54
+ "CacheManager",
55
+ "StreamingDeduplicator",
56
+ ]
docpull-2.0.0/src/docpull/cache/__init__.py ADDED
@@ -0,0 +1,12 @@
1
+ """Caching and deduplication for docpull."""
2
+
3
+ from .manager import DEFAULT_TTL_DAYS, CacheManager, CacheState, ManifestEntry
4
+ from .streaming_dedup import StreamingDeduplicator
5
+
6
+ __all__ = [
7
+ "CacheManager",
8
+ "CacheState",
9
+ "ManifestEntry",
10
+ "StreamingDeduplicator",
11
+ "DEFAULT_TTL_DAYS",
12
+ ]