docpull 2.5.1__tar.gz → 3.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docpull-2.5.1/src/docpull.egg-info → docpull-3.0.0}/PKG-INFO +13 -2
- {docpull-2.5.1 → docpull-3.0.0}/README.md +12 -0
- {docpull-2.5.1 → docpull-3.0.0}/pyproject.toml +1 -6
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/__init__.py +1 -1
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/cli.py +1 -2
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/conversion/special_cases.py +7 -18
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/core/fetcher.py +10 -26
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/discovery/filters.py +8 -9
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/http/client.py +20 -19
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/mcp/server.py +5 -3
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/mcp/tools.py +16 -45
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/models/config.py +6 -73
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/models/profiles.py +1 -5
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/pipeline/steps/convert.py +12 -12
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/pipeline/steps/fetch.py +2 -3
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/pipeline/steps/save.py +1 -1
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/security/robots.py +13 -5
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/security/url_validator.py +2 -2
- {docpull-2.5.1 → docpull-3.0.0/src/docpull.egg-info}/PKG-INFO +13 -2
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull.egg-info/SOURCES.txt +6 -6
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull.egg-info/requires.txt +0 -1
- {docpull-2.5.1 → docpull-3.0.0}/tests/test_cache_conditional_get.py +5 -12
- docpull-2.5.1/tests/test_v2_conversion.py → docpull-3.0.0/tests/test_conversion.py +26 -42
- {docpull-2.5.1 → docpull-3.0.0}/tests/test_convert_step_new.py +2 -6
- docpull-2.5.1/tests/test_v2_discovery.py → docpull-3.0.0/tests/test_discovery.py +1 -1
- docpull-2.5.1/tests/test_v2_integration.py → docpull-3.0.0/tests/test_integration.py +4 -5
- {docpull-2.5.1 → docpull-3.0.0}/tests/test_mcp_tools.py +9 -25
- {docpull-2.5.1 → docpull-3.0.0}/tests/test_naming.py +4 -23
- docpull-2.5.1/tests/test_v2_pipeline.py → docpull-3.0.0/tests/test_pipeline.py +2 -4
- {docpull-2.5.1 → docpull-3.0.0}/tests/test_special_cases.py +5 -17
- {docpull-2.5.1 → docpull-3.0.0}/LICENSE +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/setup.cfg +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/__main__.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/cache/__init__.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/cache/manager.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/cache/streaming_dedup.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/concurrency/__init__.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/concurrency/manager.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/conversion/__init__.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/conversion/chunking.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/conversion/extractor.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/conversion/markdown.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/conversion/protocols.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/conversion/trafilatura_extractor.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/core/__init__.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/discovery/__init__.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/discovery/composite.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/discovery/crawler.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/discovery/link_extractors/__init__.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/discovery/link_extractors/enhanced.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/discovery/link_extractors/protocols.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/discovery/link_extractors/static.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/discovery/protocols.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/discovery/sitemap.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/doctor.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/http/__init__.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/http/protocols.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/http/rate_limiter.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/logging_config.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/mcp/__init__.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/mcp/sources.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/metadata_extractor.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/models/__init__.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/models/events.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/pipeline/__init__.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/pipeline/base.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/pipeline/steps/__init__.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/pipeline/steps/chunk.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/pipeline/steps/dedup.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/pipeline/steps/metadata.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/pipeline/steps/save_json.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/pipeline/steps/save_ndjson.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/pipeline/steps/save_sqlite.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/pipeline/steps/validate.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/py.typed +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull/security/__init__.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull.egg-info/dependency_links.txt +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull.egg-info/entry_points.txt +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/src/docpull.egg-info/top_level.txt +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/tests/test_chunking.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/tests/test_cli.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/tests/test_link_extractors.py +0 -0
- /docpull-2.5.1/tests/test_fixes_v2_3_0.py → /docpull-3.0.0/tests/test_real_site_regressions.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/tests/test_save_ndjson.py +0 -0
- {docpull-2.5.1 → docpull-3.0.0}/tests/test_security_hardening.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docpull
|
|
3
|
-
Version: 2.5.1
|
|
3
|
+
Version: 3.0.0
|
|
4
4
|
Summary: Pull documentation from the web and convert to clean markdown
|
|
5
5
|
Author-email: Zachary Roth <support@raintree.technology>
|
|
6
6
|
Maintainer-email: Raintree Technology <support@raintree.technology>
|
|
@@ -68,7 +68,6 @@ Provides-Extra: dev
|
|
|
68
68
|
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
69
69
|
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
70
70
|
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
|
|
71
|
-
Requires-Dist: black>=23.0.0; extra == "dev"
|
|
72
71
|
Requires-Dist: mypy>=1.0.0; extra == "dev"
|
|
73
72
|
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
74
73
|
Requires-Dist: bandit>=1.7.0; extra == "dev"
|
|
@@ -280,6 +279,17 @@ sources:
|
|
|
280
279
|
maxPages: 200
|
|
281
280
|
```
|
|
282
281
|
|
|
282
|
+
### About the `mcp/` directory in this repo
|
|
283
|
+
|
|
284
|
+
The `mcp/` directory at the repo root is a separate TypeScript + Bun MCP
|
|
285
|
+
server backed by PostgreSQL with pgvector for semantic search. It is not
|
|
286
|
+
the Python MCP server shipped in the `docpull` package described above
|
|
287
|
+
— that one is the right choice for almost every user and is installed
|
|
288
|
+
with `pip install 'docpull[mcp]'`. The `mcp/` tree is mirrored to its
|
|
289
|
+
own repo at [`raintree-technology/docpull-mcp`](https://github.com/raintree-technology/docpull-mcp);
|
|
290
|
+
unless you specifically need pgvector-backed semantic search, ignore it
|
|
291
|
+
and use `docpull mcp`.
|
|
292
|
+
|
|
283
293
|
## Output
|
|
284
294
|
|
|
285
295
|
Markdown files with YAML frontmatter:
|
|
@@ -376,6 +386,7 @@ docpull URL --preview-urls # List URLs without fetching
|
|
|
376
386
|
- [PyPI](https://pypi.org/project/docpull/)
|
|
377
387
|
- [GitHub](https://github.com/raintree-technology/docpull)
|
|
378
388
|
- [Changelog](https://github.com/raintree-technology/docpull/blob/main/docs/CHANGELOG.md)
|
|
389
|
+
- [Metrics](https://github.com/raintree-technology/docpull/blob/main/METRICS.md) — auto-refreshed daily (PyPI downloads, plugin installs via clone count, traffic)
|
|
379
390
|
|
|
380
391
|
## License
|
|
381
392
|
|
|
@@ -198,6 +198,17 @@ sources:
|
|
|
198
198
|
maxPages: 200
|
|
199
199
|
```
|
|
200
200
|
|
|
201
|
+
### About the `mcp/` directory in this repo
|
|
202
|
+
|
|
203
|
+
The `mcp/` directory at the repo root is a separate TypeScript + Bun MCP
|
|
204
|
+
server backed by PostgreSQL with pgvector for semantic search. It is not
|
|
205
|
+
the Python MCP server shipped in the `docpull` package described above
|
|
206
|
+
— that one is the right choice for almost every user and is installed
|
|
207
|
+
with `pip install 'docpull[mcp]'`. The `mcp/` tree is mirrored to its
|
|
208
|
+
own repo at [`raintree-technology/docpull-mcp`](https://github.com/raintree-technology/docpull-mcp);
|
|
209
|
+
unless you specifically need pgvector-backed semantic search, ignore it
|
|
210
|
+
and use `docpull mcp`.
|
|
211
|
+
|
|
201
212
|
## Output
|
|
202
213
|
|
|
203
214
|
Markdown files with YAML frontmatter:
|
|
@@ -294,6 +305,7 @@ docpull URL --preview-urls # List URLs without fetching
|
|
|
294
305
|
- [PyPI](https://pypi.org/project/docpull/)
|
|
295
306
|
- [GitHub](https://github.com/raintree-technology/docpull)
|
|
296
307
|
- [Changelog](https://github.com/raintree-technology/docpull/blob/main/docs/CHANGELOG.md)
|
|
308
|
+
- [Metrics](https://github.com/raintree-technology/docpull/blob/main/METRICS.md) — auto-refreshed daily (PyPI downloads, plugin installs via clone count, traffic)
|
|
297
309
|
|
|
298
310
|
## License
|
|
299
311
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "docpull"
|
|
7
|
-
version = "2.5.1"
|
|
7
|
+
version = "3.0.0"
|
|
8
8
|
dynamic = []
|
|
9
9
|
description = "Pull documentation from the web and convert to clean markdown"
|
|
10
10
|
readme = {file = "README.md", content-type = "text/markdown"}
|
|
@@ -102,7 +102,6 @@ dev = [
|
|
|
102
102
|
"pytest>=7.0.0",
|
|
103
103
|
"pytest-cov>=4.0.0",
|
|
104
104
|
"pytest-asyncio>=0.21.0",
|
|
105
|
-
"black>=23.0.0",
|
|
106
105
|
"mypy>=1.0.0",
|
|
107
106
|
"ruff>=0.1.0",
|
|
108
107
|
"bandit>=1.7.0",
|
|
@@ -132,10 +131,6 @@ include = ["docpull*"]
|
|
|
132
131
|
[tool.setuptools.package-data]
|
|
133
132
|
docpull = ["py.typed"]
|
|
134
133
|
|
|
135
|
-
[tool.black]
|
|
136
|
-
line-length = 110
|
|
137
|
-
target-version = ["py310", "py311", "py312", "py313", "py314"]
|
|
138
|
-
|
|
139
134
|
[tool.ruff]
|
|
140
135
|
line-length = 110
|
|
141
136
|
target-version = "py310"
|
|
@@ -562,8 +562,7 @@ def run_fetcher(args: argparse.Namespace) -> int:
|
|
|
562
562
|
n_chunks = len(ctx.chunks) if ctx.chunks else 0
|
|
563
563
|
extra = f" ({n_chunks} chunks)" if n_chunks else ""
|
|
564
564
|
console.print(
|
|
565
|
-
f"[green]Saved:[/green] {ctx.output_path} "
|
|
566
|
-
f"[{ctx.source_type or 'generic'}]{extra}"
|
|
565
|
+
f"[green]Saved:[/green] {ctx.output_path} [{ctx.source_type or 'generic'}]{extra}"
|
|
567
566
|
)
|
|
568
567
|
return 0
|
|
569
568
|
|
|
@@ -246,7 +246,8 @@ def _describe_type(schema: Any, spec: dict[str, Any]) -> str:
|
|
|
246
246
|
if not isinstance(schema, dict):
|
|
247
247
|
return "?"
|
|
248
248
|
if "$ref" in schema:
|
|
249
|
-
|
|
249
|
+
ref: str = schema["$ref"]
|
|
250
|
+
return ref.rsplit("/", 1)[-1]
|
|
250
251
|
for key in ("oneOf", "anyOf", "allOf"):
|
|
251
252
|
if isinstance(schema.get(key), list) and schema[key]:
|
|
252
253
|
seen: list[str] = []
|
|
@@ -349,9 +350,7 @@ class OpenApiExtractor:
|
|
|
349
350
|
for method, op in ops.items():
|
|
350
351
|
if method.lower() not in _HTTP_METHODS or not isinstance(op, dict):
|
|
351
352
|
continue
|
|
352
|
-
self._render_operation(
|
|
353
|
-
lines, path, method, op, shared_params, data
|
|
354
|
-
)
|
|
353
|
+
self._render_operation(lines, path, method, op, shared_params, data)
|
|
355
354
|
|
|
356
355
|
return SpecialCaseResult(
|
|
357
356
|
markdown="\n".join(lines).strip() + "\n",
|
|
@@ -410,9 +409,7 @@ class OpenApiExtractor:
|
|
|
410
409
|
lines.append(bullet)
|
|
411
410
|
lines.append("")
|
|
412
411
|
|
|
413
|
-
def _render_request_body(
|
|
414
|
-
self, lines: list[str], body: Any, spec: dict[str, Any]
|
|
415
|
-
) -> None:
|
|
412
|
+
def _render_request_body(self, lines: list[str], body: Any, spec: dict[str, Any]) -> None:
|
|
416
413
|
if not isinstance(body, dict):
|
|
417
414
|
return
|
|
418
415
|
if "$ref" in body:
|
|
@@ -455,9 +452,7 @@ class OpenApiExtractor:
|
|
|
455
452
|
lines.append(f"- body: {_describe_type(schema, spec)}")
|
|
456
453
|
lines.append("")
|
|
457
454
|
|
|
458
|
-
def _render_responses(
|
|
459
|
-
self, lines: list[str], responses: Any, spec: dict[str, Any]
|
|
460
|
-
) -> None:
|
|
455
|
+
def _render_responses(self, lines: list[str], responses: Any, spec: dict[str, Any]) -> None:
|
|
461
456
|
if not isinstance(responses, dict) or not responses:
|
|
462
457
|
return
|
|
463
458
|
lines.append("**Responses:**")
|
|
@@ -535,11 +530,7 @@ class MdxSourceExtractor:
|
|
|
535
530
|
for pattern in self._EDIT_PATTERNS:
|
|
536
531
|
match = pattern.search(text)
|
|
537
532
|
if match:
|
|
538
|
-
raw_url = (
|
|
539
|
-
match.group(1)
|
|
540
|
-
.replace("/blob/", "/raw/")
|
|
541
|
-
.replace("/edit/", "/raw/")
|
|
542
|
-
)
|
|
533
|
+
raw_url = match.group(1).replace("/blob/", "/raw/").replace("/edit/", "/raw/")
|
|
543
534
|
# Return None so downstream runs, but attach hint via a cache
|
|
544
535
|
# mechanism. Simpler: return None always; step reads the URL
|
|
545
536
|
# if needed by re-running the regex.
|
|
@@ -567,9 +558,7 @@ def find_mdx_source_url(html: bytes) -> str | None:
|
|
|
567
558
|
for pattern in MdxSourceExtractor._EDIT_PATTERNS:
|
|
568
559
|
match = pattern.search(text)
|
|
569
560
|
if match:
|
|
570
|
-
return (
|
|
571
|
-
match.group(1).replace("/blob/", "/raw/").replace("/edit/", "/raw/")
|
|
572
|
-
)
|
|
561
|
+
return match.group(1).replace("/blob/", "/raw/").replace("/edit/", "/raw/")
|
|
573
562
|
return None
|
|
574
563
|
|
|
575
564
|
|
|
@@ -265,9 +265,7 @@ class Fetcher:
|
|
|
265
265
|
# built-in 50 MB ceiling.
|
|
266
266
|
max_content_size_kw: dict[str, int] = {}
|
|
267
267
|
if self.config.content_filter.max_file_size is not None:
|
|
268
|
-
max_content_size_kw["max_content_size"] = int(
|
|
269
|
-
self.config.content_filter.max_file_size
|
|
270
|
-
)
|
|
268
|
+
max_content_size_kw["max_content_size"] = int(self.config.content_filter.max_file_size)
|
|
271
269
|
self._http_client = AsyncHttpClient(
|
|
272
270
|
rate_limiter=self._rate_limiter,
|
|
273
271
|
max_retries=self.config.network.max_retries,
|
|
@@ -509,11 +507,7 @@ class Fetcher:
|
|
|
509
507
|
|
|
510
508
|
steps = self._pipeline.steps
|
|
511
509
|
if not save:
|
|
512
|
-
steps = [
|
|
513
|
-
s
|
|
514
|
-
for s in steps
|
|
515
|
-
if s.name not in {"save", "save_json", "save_ndjson", "save_sqlite"}
|
|
516
|
-
]
|
|
510
|
+
steps = [s for s in steps if s.name not in {"save", "save_json", "save_ndjson", "save_sqlite"}]
|
|
517
511
|
pipeline = type(self._pipeline)(steps=steps)
|
|
518
512
|
ctx = await pipeline.execute(url, output_path)
|
|
519
513
|
if ctx.error:
|
|
@@ -531,8 +525,8 @@ class Fetcher:
|
|
|
531
525
|
"""
|
|
532
526
|
Compute output path for a URL using the configured naming strategy.
|
|
533
527
|
|
|
534
|
-
- ``full
|
|
535
|
-
|
|
528
|
+
- ``full``: a single flattened filename (URL path joined with
|
|
529
|
+
underscores).
|
|
536
530
|
- ``hierarchical``: URL path preserved as nested directories,
|
|
537
531
|
terminating in ``<segment>.md`` or ``index.md`` for trailing
|
|
538
532
|
slashes. The leaf is `_validate_output_path`-safe — every segment
|
|
@@ -545,7 +539,6 @@ class Fetcher:
|
|
|
545
539
|
parts = _url_to_path_parts(url, self.config.url)
|
|
546
540
|
return output_dir.joinpath(*parts)
|
|
547
541
|
|
|
548
|
-
# full / flat / short: aliased to full until 3.0
|
|
549
542
|
filename = _url_to_filename(url, self.config.url)
|
|
550
543
|
return output_dir / filename
|
|
551
544
|
|
|
@@ -638,9 +631,7 @@ class Fetcher:
|
|
|
638
631
|
)
|
|
639
632
|
|
|
640
633
|
discovered: list[str] = []
|
|
641
|
-
async for url in self._discoverer.discover(
|
|
642
|
-
start_url, max_urls=self.config.crawl.max_pages
|
|
643
|
-
):
|
|
634
|
+
async for url in self._discoverer.discover(start_url, max_urls=self.config.crawl.max_pages):
|
|
644
635
|
discovered.append(url)
|
|
645
636
|
if self._cancelled:
|
|
646
637
|
yield FetchEvent(
|
|
@@ -756,9 +747,7 @@ class Fetcher:
|
|
|
756
747
|
)
|
|
757
748
|
)
|
|
758
749
|
try:
|
|
759
|
-
async for url in discoverer.discover(
|
|
760
|
-
start_url, max_urls=self.config.crawl.max_pages
|
|
761
|
-
):
|
|
750
|
+
async for url in discoverer.discover(start_url, max_urls=self.config.crawl.max_pages):
|
|
762
751
|
if self._cancelled:
|
|
763
752
|
break
|
|
764
753
|
await url_queue.put(url)
|
|
@@ -770,14 +759,10 @@ class Fetcher:
|
|
|
770
759
|
and self._cache_manager
|
|
771
760
|
and len(discovered_for_resume) % 200 == 0
|
|
772
761
|
):
|
|
773
|
-
self._cache_manager.save_discovered_urls(
|
|
774
|
-
list(discovered_for_resume), start_url
|
|
775
|
-
)
|
|
762
|
+
self._cache_manager.save_discovered_urls(list(discovered_for_resume), start_url)
|
|
776
763
|
finally:
|
|
777
764
|
if self.config.cache.enabled and self._cache_manager:
|
|
778
|
-
self._cache_manager.save_discovered_urls(
|
|
779
|
-
discovered_for_resume, start_url
|
|
780
|
-
)
|
|
765
|
+
self._cache_manager.save_discovered_urls(discovered_for_resume, start_url)
|
|
781
766
|
self._stats.urls_discovered = len(discovered_for_resume)
|
|
782
767
|
await event_queue.put(
|
|
783
768
|
FetchEvent(
|
|
@@ -810,6 +795,7 @@ class Fetcher:
|
|
|
810
795
|
continue
|
|
811
796
|
|
|
812
797
|
local_events: list[FetchEvent] = []
|
|
798
|
+
|
|
813
799
|
# Bind the per-iteration list as a default arg so ruff B023
|
|
814
800
|
# is happy. Closure is consumed synchronously by execute()
|
|
815
801
|
# before the next iteration anyway, so capture order is safe.
|
|
@@ -936,9 +922,7 @@ def fetch_one(url: str, **kwargs: object) -> PageContext:
|
|
|
936
922
|
"""
|
|
937
923
|
try:
|
|
938
924
|
asyncio.get_running_loop()
|
|
939
|
-
raise RuntimeError(
|
|
940
|
-
"fetch_one() called from async context. Use Fetcher.fetch_one() instead."
|
|
941
|
-
)
|
|
925
|
+
raise RuntimeError("fetch_one() called from async context. Use Fetcher.fetch_one() instead.")
|
|
942
926
|
except RuntimeError as exc:
|
|
943
927
|
if "no running event loop" not in str(exc).lower():
|
|
944
928
|
raise
|
|
@@ -29,19 +29,20 @@ def normalize_url(url: str) -> str:
|
|
|
29
29
|
Returns:
|
|
30
30
|
Normalized URL string
|
|
31
31
|
"""
|
|
32
|
-
# Use url_normalize library if available
|
|
32
|
+
# Use url_normalize library if available for case / percent-encoding
|
|
33
|
+
# cleanup. It does NOT strip fragments, so we always do that ourselves
|
|
34
|
+
# below — keeping behavior consistent whether the optional dep is
|
|
35
|
+
# installed or not.
|
|
33
36
|
if URL_NORMALIZE_AVAILABLE:
|
|
34
37
|
try:
|
|
35
|
-
|
|
36
|
-
|
|
38
|
+
normalized = url_normalize(url)
|
|
39
|
+
if normalized:
|
|
40
|
+
url = normalized
|
|
37
41
|
except ValueError:
|
|
38
42
|
logger.debug("url_normalize rejected URL during normalization", exc_info=True)
|
|
39
43
|
|
|
40
|
-
# Basic normalization
|
|
41
44
|
parsed = urlparse(url)
|
|
42
|
-
|
|
43
|
-
# Remove fragment
|
|
44
|
-
normalized = urlunparse(
|
|
45
|
+
return urlunparse(
|
|
45
46
|
(
|
|
46
47
|
parsed.scheme.lower(),
|
|
47
48
|
parsed.netloc.lower(),
|
|
@@ -52,8 +53,6 @@ def normalize_url(url: str) -> str:
|
|
|
52
53
|
)
|
|
53
54
|
)
|
|
54
55
|
|
|
55
|
-
return normalized
|
|
56
|
-
|
|
57
56
|
|
|
58
57
|
class PatternFilter:
|
|
59
58
|
"""
|
|
@@ -12,7 +12,7 @@ from types import TracebackType
|
|
|
12
12
|
from urllib.parse import urljoin, urlparse
|
|
13
13
|
|
|
14
14
|
import aiohttp
|
|
15
|
-
from aiohttp.abc import AbstractResolver
|
|
15
|
+
from aiohttp.abc import AbstractResolver, ResolveResult
|
|
16
16
|
|
|
17
17
|
from ..security.url_validator import UrlValidator
|
|
18
18
|
from .protocols import HttpResponse
|
|
@@ -45,14 +45,14 @@ class _ValidatedResolver(AbstractResolver):
|
|
|
45
45
|
self,
|
|
46
46
|
host: str,
|
|
47
47
|
port: int = 0,
|
|
48
|
-
family:
|
|
49
|
-
) -> list[
|
|
48
|
+
family: socket.AddressFamily = socket.AF_UNSPEC,
|
|
49
|
+
) -> list[ResolveResult]:
|
|
50
50
|
try:
|
|
51
51
|
addresses = self._url_validator.resolve_allowed_addresses(host)
|
|
52
52
|
except ValueError as err:
|
|
53
53
|
raise OSError(str(err)) from err
|
|
54
54
|
|
|
55
|
-
results: list[
|
|
55
|
+
results: list[ResolveResult] = []
|
|
56
56
|
for address in addresses:
|
|
57
57
|
ip = ipaddress.ip_address(address)
|
|
58
58
|
entry_family = socket.AF_INET6 if ip.version == 6 else socket.AF_INET
|
|
@@ -60,14 +60,14 @@ class _ValidatedResolver(AbstractResolver):
|
|
|
60
60
|
continue
|
|
61
61
|
|
|
62
62
|
results.append(
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
63
|
+
ResolveResult(
|
|
64
|
+
hostname=host,
|
|
65
|
+
host=address,
|
|
66
|
+
port=port,
|
|
67
|
+
family=entry_family,
|
|
68
|
+
proto=socket.IPPROTO_TCP,
|
|
69
|
+
flags=socket.AI_NUMERICHOST,
|
|
70
|
+
)
|
|
71
71
|
)
|
|
72
72
|
|
|
73
73
|
if not results:
|
|
@@ -236,20 +236,21 @@ class AsyncHttpClient:
|
|
|
236
236
|
|
|
237
237
|
async def __aenter__(self) -> AsyncHttpClient:
|
|
238
238
|
"""Enter async context and create session."""
|
|
239
|
-
|
|
240
|
-
"limit": 100, # Total connection limit
|
|
241
|
-
"limit_per_host": 10, # Per-host connection limit
|
|
242
|
-
"ttl_dns_cache": 300, # DNS cache TTL
|
|
243
|
-
}
|
|
239
|
+
resolver: AbstractResolver | None = None
|
|
244
240
|
if self._url_validator is not None and self._proxy is None:
|
|
245
|
-
|
|
241
|
+
resolver = _ValidatedResolver(self._url_validator)
|
|
246
242
|
elif self._proxy is not None and self._url_validator is not None:
|
|
247
243
|
logger.warning(
|
|
248
244
|
"Proxy mode: DNS-pinning resolver is not active. "
|
|
249
245
|
"URL validation still runs pre-flight, but the proxy resolves DNS independently."
|
|
250
246
|
)
|
|
251
247
|
|
|
252
|
-
connector = aiohttp.TCPConnector(
|
|
248
|
+
connector = aiohttp.TCPConnector(
|
|
249
|
+
limit=100,
|
|
250
|
+
limit_per_host=10,
|
|
251
|
+
ttl_dns_cache=300,
|
|
252
|
+
resolver=resolver,
|
|
253
|
+
)
|
|
253
254
|
self._session = aiohttp.ClientSession(
|
|
254
255
|
connector=connector,
|
|
255
256
|
headers={"User-Agent": self._user_agent},
|
|
@@ -215,8 +215,7 @@ async def _run_stdio() -> int:
|
|
|
215
215
|
from mcp.types import CallToolResult, TextContent, Tool, ToolAnnotations
|
|
216
216
|
except ImportError:
|
|
217
217
|
print(
|
|
218
|
-
"docpull mcp requires the 'mcp' package. Install with: "
|
|
219
|
-
"pip install docpull[mcp]",
|
|
218
|
+
"docpull mcp requires the 'mcp' package. Install with: pip install docpull[mcp]",
|
|
220
219
|
file=sys.stderr,
|
|
221
220
|
)
|
|
222
221
|
return 1
|
|
@@ -590,7 +589,10 @@ async def _run_stdio() -> int:
|
|
|
590
589
|
# isError=False), and
|
|
591
590
|
# (b) errors on tools with an outputSchema don't fail the validator
|
|
592
591
|
# for "missing structured content."
|
|
593
|
-
content
|
|
592
|
+
# `content` is typed `list[TextContent | ImageContent | ...]` on the SDK
|
|
593
|
+
# side; list invariance means we have to widen the local annotation
|
|
594
|
+
# explicitly even though TextContent is one of the valid variants.
|
|
595
|
+
content: list[Any] = [TextContent(type="text", text=result.text)]
|
|
594
596
|
return CallToolResult(
|
|
595
597
|
content=content,
|
|
596
598
|
structuredContent=result.data if not result.is_error else None,
|
|
@@ -26,7 +26,7 @@ from typing import Any
|
|
|
26
26
|
import yaml
|
|
27
27
|
|
|
28
28
|
from ..core.fetcher import Fetcher
|
|
29
|
-
from ..models.config import DocpullConfig, ProfileName
|
|
29
|
+
from ..models.config import CrawlConfig, DocpullConfig, OutputConfig, ProfileName
|
|
30
30
|
from ..security.url_validator import UrlValidator
|
|
31
31
|
from .sources import (
|
|
32
32
|
_URL_SCHEME_RE,
|
|
@@ -195,16 +195,10 @@ async def ensure_docs(
|
|
|
195
195
|
target_dir = _source_dir(docs_dir, source)
|
|
196
196
|
meta_path = _meta_path(docs_dir, source)
|
|
197
197
|
|
|
198
|
-
if (
|
|
199
|
-
not force
|
|
200
|
-
and _cache_fresh(meta_path)
|
|
201
|
-
and target_dir.exists()
|
|
202
|
-
and any(target_dir.rglob("*.md"))
|
|
203
|
-
):
|
|
198
|
+
if not force and _cache_fresh(meta_path) and target_dir.exists() and any(target_dir.rglob("*.md")):
|
|
204
199
|
files = list(target_dir.rglob("*.md"))
|
|
205
200
|
return ToolResult(
|
|
206
|
-
f"Cached: {source} ({len(files)} files at {target_dir}). "
|
|
207
|
-
"Call with force=true to refresh.",
|
|
201
|
+
f"Cached: {source} ({len(files)} files at {target_dir}). Call with force=true to refresh.",
|
|
208
202
|
data={
|
|
209
203
|
"source": source,
|
|
210
204
|
"cached": True,
|
|
@@ -216,8 +210,8 @@ async def ensure_docs(
|
|
|
216
210
|
config = DocpullConfig(
|
|
217
211
|
url=resolved.url,
|
|
218
212
|
profile=profile_enum,
|
|
219
|
-
crawl=
|
|
220
|
-
output=
|
|
213
|
+
crawl=CrawlConfig(max_pages=resolved.max_pages) if resolved.max_pages else CrawlConfig(),
|
|
214
|
+
output=OutputConfig(directory=target_dir),
|
|
221
215
|
)
|
|
222
216
|
fetched = 0
|
|
223
217
|
crashed = False
|
|
@@ -264,13 +258,11 @@ async def fetch_url(url: str, *, max_tokens: int | None = None) -> ToolResult:
|
|
|
264
258
|
if not validation.is_valid:
|
|
265
259
|
return ToolResult(f"URL rejected: {validation.rejection_reason}", is_error=True)
|
|
266
260
|
|
|
267
|
-
|
|
268
|
-
if max_tokens:
|
|
269
|
-
output_kwargs["max_tokens_per_file"] = max_tokens
|
|
261
|
+
output_cfg = OutputConfig(max_tokens_per_file=max_tokens) if max_tokens else OutputConfig()
|
|
270
262
|
config = DocpullConfig(
|
|
271
263
|
url=url,
|
|
272
264
|
profile=ProfileName.CUSTOM,
|
|
273
|
-
output=
|
|
265
|
+
output=output_cfg,
|
|
274
266
|
)
|
|
275
267
|
async with Fetcher(config) as fetcher:
|
|
276
268
|
ctx = await fetcher.fetch_one(url, save=False)
|
|
@@ -288,10 +280,7 @@ async def fetch_url(url: str, *, max_tokens: int | None = None) -> ToolResult:
|
|
|
288
280
|
]
|
|
289
281
|
body = "\n\n".join(parts)
|
|
290
282
|
chunks_meta = f" _chunks: {len(ctx.chunks)}_" if ctx.chunks else ""
|
|
291
|
-
header =
|
|
292
|
-
f"# {ctx.title or url}\n"
|
|
293
|
-
f"_source: {url}_ _type: {ctx.source_type or 'generic'}_{chunks_meta}\n\n"
|
|
294
|
-
)
|
|
283
|
+
header = f"# {ctx.title or url}\n_source: {url}_ _type: {ctx.source_type or 'generic'}_{chunks_meta}\n\n"
|
|
295
284
|
return ToolResult(header + body)
|
|
296
285
|
|
|
297
286
|
|
|
@@ -471,16 +460,9 @@ def grep_docs(
|
|
|
471
460
|
matches: list[tuple[int, list[str], str, list[str]]] = []
|
|
472
461
|
for idx, line in enumerate(lines):
|
|
473
462
|
if regex.search(line):
|
|
474
|
-
before = (
|
|
475
|
-
[lines[i].rstrip() for i in range(max(0, idx - context), idx)]
|
|
476
|
-
if context
|
|
477
|
-
else []
|
|
478
|
-
)
|
|
463
|
+
before = [lines[i].rstrip() for i in range(max(0, idx - context), idx)] if context else []
|
|
479
464
|
after = (
|
|
480
|
-
[
|
|
481
|
-
lines[i].rstrip()
|
|
482
|
-
for i in range(idx + 1, min(len(lines), idx + 1 + context))
|
|
483
|
-
]
|
|
465
|
+
[lines[i].rstrip() for i in range(idx + 1, min(len(lines), idx + 1 + context))]
|
|
484
466
|
if context
|
|
485
467
|
else []
|
|
486
468
|
)
|
|
@@ -532,9 +514,7 @@ def grep_docs(
|
|
|
532
514
|
for off, line in enumerate(after, start=1):
|
|
533
515
|
chunk.append(f" {lineno + off:>4}- {line}")
|
|
534
516
|
block_lines.append("\n".join(chunk))
|
|
535
|
-
rendered_matches.append(
|
|
536
|
-
{"lineno": lineno, "before": before, "line": hit, "after": after}
|
|
537
|
-
)
|
|
517
|
+
rendered_matches.append({"lineno": lineno, "before": before, "line": hit, "after": after})
|
|
538
518
|
rendered += 1
|
|
539
519
|
blocks.append("\n\n".join(block_lines))
|
|
540
520
|
files_payload.append(
|
|
@@ -710,28 +690,19 @@ def add_source(
|
|
|
710
690
|
)
|
|
711
691
|
validation = _ADD_SOURCE_VALIDATOR.validate(url)
|
|
712
692
|
if not validation.is_valid:
|
|
713
|
-
return ToolResult(
|
|
714
|
-
f"URL rejected: {validation.rejection_reason}", is_error=True
|
|
715
|
-
)
|
|
693
|
+
return ToolResult(f"URL rejected: {validation.rejection_reason}", is_error=True)
|
|
716
694
|
if description is not None and len(description) > MAX_DESCRIPTION_LEN:
|
|
717
|
-
return ToolResult(
|
|
718
|
-
f"Description too long (>{MAX_DESCRIPTION_LEN} chars).", is_error=True
|
|
719
|
-
)
|
|
695
|
+
return ToolResult(f"Description too long (>{MAX_DESCRIPTION_LEN} chars).", is_error=True)
|
|
720
696
|
if category is not None and category not in ALLOWED_USER_CATEGORIES:
|
|
721
697
|
valid = ", ".join(sorted(ALLOWED_USER_CATEGORIES))
|
|
722
|
-
return ToolResult(
|
|
723
|
-
f"Unknown category '{category}'. Valid: {valid}", is_error=True
|
|
724
|
-
)
|
|
698
|
+
return ToolResult(f"Unknown category '{category}'. Valid: {valid}", is_error=True)
|
|
725
699
|
if max_pages is not None and (max_pages < 1 or max_pages > 100_000):
|
|
726
|
-
return ToolResult(
|
|
727
|
-
"max_pages must be between 1 and 100000.", is_error=True
|
|
728
|
-
)
|
|
700
|
+
return ToolResult("max_pages must be between 1 and 100000.", is_error=True)
|
|
729
701
|
|
|
730
702
|
is_builtin = name in BUILTIN_SOURCES
|
|
731
703
|
if is_builtin and not force:
|
|
732
704
|
return ToolResult(
|
|
733
|
-
f"'{name}' is a builtin source. Pass force=true to shadow it with a "
|
|
734
|
-
"user override.",
|
|
705
|
+
f"'{name}' is a builtin source. Pass force=true to shadow it with a user override.",
|
|
735
706
|
is_error=True,
|
|
736
707
|
)
|
|
737
708
|
|