docpull 3.0.0__tar.gz → 3.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docpull-3.0.0/src/docpull.egg-info → docpull-3.0.1}/PKG-INFO +8 -1
- {docpull-3.0.0 → docpull-3.0.1}/pyproject.toml +24 -5
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/cache/manager.py +10 -8
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/cli.py +4 -6
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/conversion/special_cases.py +6 -4
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/discovery/link_extractors/enhanced.py +2 -1
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/discovery/link_extractors/static.py +2 -1
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/mcp/sources.py +63 -6
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/mcp/tools.py +22 -11
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/models/config.py +4 -2
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/pipeline/steps/save_json.py +4 -4
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/pipeline/steps/save_ndjson.py +2 -2
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/pipeline/steps/save_sqlite.py +2 -2
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/security/robots.py +4 -2
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/security/url_validator.py +5 -5
- docpull-3.0.1/src/docpull/time_utils.py +29 -0
- {docpull-3.0.0 → docpull-3.0.1/src/docpull.egg-info}/PKG-INFO +8 -1
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull.egg-info/SOURCES.txt +3 -1
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull.egg-info/requires.txt +7 -0
- {docpull-3.0.0 → docpull-3.0.1}/tests/test_mcp_tools.py +51 -0
- docpull-3.0.1/tests/test_time_utils.py +23 -0
- {docpull-3.0.0 → docpull-3.0.1}/LICENSE +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/README.md +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/setup.cfg +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/__init__.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/__main__.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/cache/__init__.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/cache/streaming_dedup.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/concurrency/__init__.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/concurrency/manager.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/conversion/__init__.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/conversion/chunking.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/conversion/extractor.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/conversion/markdown.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/conversion/protocols.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/conversion/trafilatura_extractor.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/core/__init__.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/core/fetcher.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/discovery/__init__.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/discovery/composite.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/discovery/crawler.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/discovery/filters.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/discovery/link_extractors/__init__.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/discovery/link_extractors/protocols.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/discovery/protocols.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/discovery/sitemap.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/doctor.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/http/__init__.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/http/client.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/http/protocols.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/http/rate_limiter.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/logging_config.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/mcp/__init__.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/mcp/server.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/metadata_extractor.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/models/__init__.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/models/events.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/models/profiles.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/pipeline/__init__.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/pipeline/base.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/pipeline/steps/__init__.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/pipeline/steps/chunk.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/pipeline/steps/convert.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/pipeline/steps/dedup.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/pipeline/steps/fetch.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/pipeline/steps/metadata.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/pipeline/steps/save.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/pipeline/steps/validate.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/py.typed +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull/security/__init__.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull.egg-info/dependency_links.txt +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull.egg-info/entry_points.txt +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/src/docpull.egg-info/top_level.txt +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/tests/test_cache_conditional_get.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/tests/test_chunking.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/tests/test_cli.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/tests/test_conversion.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/tests/test_convert_step_new.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/tests/test_discovery.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/tests/test_integration.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/tests/test_link_extractors.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/tests/test_naming.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/tests/test_pipeline.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/tests/test_real_site_regressions.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/tests/test_save_ndjson.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/tests/test_security_hardening.py +0 -0
- {docpull-3.0.0 → docpull-3.0.1}/tests/test_special_cases.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docpull
|
|
3
|
-
Version: 3.0.0
|
|
3
|
+
Version: 3.0.1
|
|
4
4
|
Summary: Pull documentation from the web and convert to clean markdown
|
|
5
5
|
Author-email: Zachary Roth <support@raintree.technology>
|
|
6
6
|
Maintainer-email: Raintree Technology <support@raintree.technology>
|
|
@@ -43,9 +43,12 @@ Requires-Dist: html2text>=2020.1.16
|
|
|
43
43
|
Requires-Dist: defusedxml>=0.7.1
|
|
44
44
|
Requires-Dist: extruct>=0.15.0
|
|
45
45
|
Requires-Dist: aiohttp>=3.9.0
|
|
46
|
+
Requires-Dist: idna>=3.15
|
|
47
|
+
Requires-Dist: regex>=2024.11.6
|
|
46
48
|
Requires-Dist: rich>=13.0.0
|
|
47
49
|
Requires-Dist: pyyaml>=6.0
|
|
48
50
|
Requires-Dist: pydantic>=2.0
|
|
51
|
+
Requires-Dist: urllib3>=2.7.0
|
|
49
52
|
Provides-Extra: proxy
|
|
50
53
|
Requires-Dist: aiohttp-socks>=0.8.0; extra == "proxy"
|
|
51
54
|
Provides-Extra: normalize
|
|
@@ -56,6 +59,8 @@ Provides-Extra: tokens
|
|
|
56
59
|
Requires-Dist: tiktoken>=0.7.0; extra == "tokens"
|
|
57
60
|
Provides-Extra: mcp
|
|
58
61
|
Requires-Dist: mcp>=1.0.0; extra == "mcp"
|
|
62
|
+
Requires-Dist: python-multipart>=0.0.27; extra == "mcp"
|
|
63
|
+
Requires-Dist: starlette>=1.0.1; extra == "mcp"
|
|
59
64
|
Provides-Extra: llm
|
|
60
65
|
Requires-Dist: tiktoken>=0.7.0; extra == "llm"
|
|
61
66
|
Provides-Extra: all
|
|
@@ -64,6 +69,8 @@ Requires-Dist: url-normalize>=1.4.0; extra == "all"
|
|
|
64
69
|
Requires-Dist: trafilatura>=1.12.0; extra == "all"
|
|
65
70
|
Requires-Dist: tiktoken>=0.7.0; extra == "all"
|
|
66
71
|
Requires-Dist: mcp>=1.0.0; extra == "all"
|
|
72
|
+
Requires-Dist: python-multipart>=0.0.27; extra == "all"
|
|
73
|
+
Requires-Dist: starlette>=1.0.1; extra == "all"
|
|
67
74
|
Provides-Extra: dev
|
|
68
75
|
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
69
76
|
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "docpull"
|
|
7
|
-
version = "3.0.0"
|
|
7
|
+
version = "3.0.1"
|
|
8
8
|
dynamic = []
|
|
9
9
|
description = "Pull documentation from the web and convert to clean markdown"
|
|
10
10
|
readme = {file = "README.md", content-type = "text/markdown"}
|
|
@@ -67,9 +67,12 @@ dependencies = [
|
|
|
67
67
|
"defusedxml>=0.7.1",
|
|
68
68
|
"extruct>=0.15.0",
|
|
69
69
|
"aiohttp>=3.9.0",
|
|
70
|
+
"idna>=3.15",
|
|
71
|
+
"regex>=2024.11.6",
|
|
70
72
|
"rich>=13.0.0",
|
|
71
73
|
"pyyaml>=6.0",
|
|
72
74
|
"pydantic>=2.0",
|
|
75
|
+
"urllib3>=2.7.0",
|
|
73
76
|
]
|
|
74
77
|
|
|
75
78
|
[project.optional-dependencies]
|
|
@@ -87,6 +90,8 @@ tokens = [
|
|
|
87
90
|
]
|
|
88
91
|
mcp = [
|
|
89
92
|
"mcp>=1.0.0",
|
|
93
|
+
"python-multipart>=0.0.27",
|
|
94
|
+
"starlette>=1.0.1",
|
|
90
95
|
]
|
|
91
96
|
llm = [
|
|
92
97
|
"tiktoken>=0.7.0",
|
|
@@ -97,6 +102,8 @@ all = [
|
|
|
97
102
|
"trafilatura>=1.12.0",
|
|
98
103
|
"tiktoken>=0.7.0",
|
|
99
104
|
"mcp>=1.0.0",
|
|
105
|
+
"python-multipart>=0.0.27",
|
|
106
|
+
"starlette>=1.0.1",
|
|
100
107
|
]
|
|
101
108
|
dev = [
|
|
102
109
|
"pytest>=7.0.0",
|
|
@@ -170,10 +177,22 @@ module = "docpull.models.*"
|
|
|
170
177
|
disallow_any_unimported = false
|
|
171
178
|
warn_return_any = false
|
|
172
179
|
|
|
173
|
-
[
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
180
|
+
[tool.bandit]
|
|
181
|
+
# Policy: every entry in `skips` MUST have a one-line justification
|
|
182
|
+
# above it explaining what bandit found, why it's a false positive
|
|
183
|
+
# *for this codebase*, and (if narrow) why a `# nosec BXXX # reason`
|
|
184
|
+
# annotation at the call site would have been worse. Bandit skips
|
|
185
|
+
# silence findings repo-wide, so the bar to add one is higher than
|
|
186
|
+
# silencing a single line. If a new skip is unavoidable, add it here
|
|
187
|
+
# in PR review, not as a drive-by.
|
|
188
|
+
#
|
|
189
|
+
# B101 (assert_used) — flags every `assert x is not None` we use for
|
|
190
|
+
# type narrowing. Bandit's concern is that assertions vanish under
|
|
191
|
+
# `python -O`. docpull is a CLI / SDK, never invoked with -O, and the
|
|
192
|
+
# narrowing asserts are not load-bearing safety checks. Skipping the
|
|
193
|
+
# rule globally keeps the existing idiom without 8+ inline `# nosec`
|
|
194
|
+
# annotations in fetcher.py / pipeline/steps/convert.py.
|
|
195
|
+
skips = ["B101"]
|
|
177
196
|
|
|
178
197
|
[tool.pytest.ini_options]
|
|
179
198
|
minversion = "7.0"
|
|
@@ -5,10 +5,12 @@ from __future__ import annotations
|
|
|
5
5
|
import hashlib
|
|
6
6
|
import json
|
|
7
7
|
import logging
|
|
8
|
-
from datetime import
|
|
8
|
+
from datetime import timedelta
|
|
9
9
|
from pathlib import Path
|
|
10
10
|
from typing import TypedDict
|
|
11
11
|
|
|
12
|
+
from ..time_utils import parse_persisted_datetime, utc_now, utc_now_iso
|
|
13
|
+
|
|
12
14
|
logger = logging.getLogger(__name__)
|
|
13
15
|
|
|
14
16
|
# Default TTL for cache entries (30 days)
|
|
@@ -257,7 +259,7 @@ class CacheManager:
|
|
|
257
259
|
self.manifest[url] = {
|
|
258
260
|
"checksum": self.compute_checksum(content),
|
|
259
261
|
"file_path": str(file_path),
|
|
260
|
-
"fetched_at":
|
|
262
|
+
"fetched_at": utc_now_iso(),
|
|
261
263
|
"size": len(content),
|
|
262
264
|
}
|
|
263
265
|
|
|
@@ -314,7 +316,7 @@ class CacheManager:
|
|
|
314
316
|
Note:
|
|
315
317
|
Changes are batched. Call flush() to persist to disk.
|
|
316
318
|
"""
|
|
317
|
-
self._state.last_run =
|
|
319
|
+
self._state.last_run = utc_now_iso()
|
|
318
320
|
self._state_dirty = True
|
|
319
321
|
|
|
320
322
|
def clear_state(self) -> None:
|
|
@@ -354,18 +356,18 @@ class CacheManager:
|
|
|
354
356
|
if ttl is None:
|
|
355
357
|
return 0
|
|
356
358
|
|
|
357
|
-
cutoff =
|
|
359
|
+
cutoff = utc_now() - timedelta(days=ttl)
|
|
358
360
|
to_remove = []
|
|
359
361
|
|
|
360
362
|
for url, entry in self.manifest.items():
|
|
361
363
|
fetched_at = entry.get("fetched_at")
|
|
362
364
|
if fetched_at:
|
|
363
365
|
try:
|
|
364
|
-
entry_time =
|
|
366
|
+
entry_time = parse_persisted_datetime(fetched_at)
|
|
365
367
|
if entry_time < cutoff:
|
|
366
368
|
to_remove.append(url)
|
|
367
|
-
except ValueError:
|
|
368
|
-
|
|
369
|
+
except ValueError as err:
|
|
370
|
+
logger.warning("Invalid cache timestamp for %s: %s", url, err)
|
|
369
371
|
|
|
370
372
|
for url in to_remove:
|
|
371
373
|
del self.manifest[url]
|
|
@@ -413,7 +415,7 @@ class CacheManager:
|
|
|
413
415
|
"""
|
|
414
416
|
data: DiscoveredUrlsState = {
|
|
415
417
|
"start_url": start_url,
|
|
416
|
-
"discovered_at":
|
|
418
|
+
"discovered_at": utc_now_iso(),
|
|
417
419
|
"urls": urls,
|
|
418
420
|
}
|
|
419
421
|
try:
|
|
@@ -13,12 +13,10 @@ if "--doctor" in sys.argv:
|
|
|
13
13
|
|
|
14
14
|
output_dir = None
|
|
15
15
|
if "--output-dir" in sys.argv or "-o" in sys.argv:
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
except (ValueError, IndexError):
|
|
21
|
-
pass
|
|
16
|
+
flag = "--output-dir" if "--output-dir" in sys.argv else "-o"
|
|
17
|
+
flag_idx = sys.argv.index(flag)
|
|
18
|
+
if flag_idx + 1 < len(sys.argv):
|
|
19
|
+
output_dir = Path(sys.argv[flag_idx + 1])
|
|
22
20
|
sys.exit(run_doctor(output_dir=output_dir))
|
|
23
21
|
|
|
24
22
|
# Verify core dependencies
|
|
@@ -320,7 +320,8 @@ class OpenApiExtractor:
|
|
|
320
320
|
return None
|
|
321
321
|
try:
|
|
322
322
|
data = json.loads(text)
|
|
323
|
-
except json.JSONDecodeError:
|
|
323
|
+
except json.JSONDecodeError as err:
|
|
324
|
+
logger.debug("OpenAPI extractor skipped %s: JSON parse failed: %s", url, err)
|
|
324
325
|
return None
|
|
325
326
|
if not isinstance(data, dict):
|
|
326
327
|
return None
|
|
@@ -569,8 +570,8 @@ def looks_like_spa(html: bytes, min_body_ratio: float = 0.05) -> bool:
|
|
|
569
570
|
"""Heuristic: does this HTML appear to be a JS-only SPA?
|
|
570
571
|
|
|
571
572
|
True when the non-script body text is very small relative to the overall
|
|
572
|
-
page size and the page contains script tags.
|
|
573
|
-
|
|
573
|
+
page size and the page contains script tags. This is a conservative signal
|
|
574
|
+
for warning an agent before it consumes empty Markdown.
|
|
574
575
|
"""
|
|
575
576
|
if len(html) < 500:
|
|
576
577
|
return False
|
|
@@ -578,7 +579,8 @@ def looks_like_spa(html: bytes, min_body_ratio: float = 0.05) -> bool:
|
|
|
578
579
|
return False
|
|
579
580
|
try:
|
|
580
581
|
soup = _soup(html)
|
|
581
|
-
except Exception: # noqa: BLE001
|
|
582
|
+
except Exception as err: # noqa: BLE001
|
|
583
|
+
logger.debug("SPA heuristic skipped malformed HTML: %s", err)
|
|
582
584
|
return False
|
|
583
585
|
# Remove scripts/styles before measuring.
|
|
584
586
|
for tag in soup(["script", "style", "noscript"]):
|
|
@@ -295,7 +295,8 @@ class EnhancedLinkExtractor:
|
|
|
295
295
|
|
|
296
296
|
try:
|
|
297
297
|
absolute_url = urljoin(base_url, href)
|
|
298
|
-
except Exception:
|
|
298
|
+
except Exception as err:
|
|
299
|
+
logger.debug("Could not resolve href %r against %s: %s", href, base_url, err)
|
|
299
300
|
return None
|
|
300
301
|
|
|
301
302
|
# Validate it's a proper URL
|
|
@@ -148,7 +148,8 @@ class StaticLinkExtractor:
|
|
|
148
148
|
"""
|
|
149
149
|
try:
|
|
150
150
|
absolute_url = urljoin(base_url, href)
|
|
151
|
-
except Exception:
|
|
151
|
+
except Exception as err:
|
|
152
|
+
logger.debug("Could not resolve href %r against %s: %s", href, base_url, err)
|
|
152
153
|
return None
|
|
153
154
|
|
|
154
155
|
# Remove fragment
|
|
@@ -7,9 +7,12 @@ import os
|
|
|
7
7
|
import re
|
|
8
8
|
from dataclasses import dataclass
|
|
9
9
|
from pathlib import Path
|
|
10
|
+
from urllib.parse import urlparse
|
|
10
11
|
|
|
11
12
|
import yaml
|
|
12
13
|
|
|
14
|
+
from ..security.url_validator import UrlValidator
|
|
15
|
+
|
|
13
16
|
logger = logging.getLogger(__name__)
|
|
14
17
|
|
|
15
18
|
|
|
@@ -54,6 +57,9 @@ BUILTIN_SOURCES: dict[str, SourceConfig] = {
|
|
|
54
57
|
|
|
55
58
|
_URL_SCHEME_RE = re.compile(r"^[a-z][a-z0-9+.-]*://", re.IGNORECASE)
|
|
56
59
|
_LIBRARY_NAME_RE = re.compile(r"^[a-zA-Z0-9_.-]+$")
|
|
60
|
+
MAX_LIBRARY_NAME_LENGTH = 128
|
|
61
|
+
MAX_USER_SOURCE_PAGES = 100_000
|
|
62
|
+
_USER_SOURCE_URL_VALIDATOR = UrlValidator(allowed_schemes={"https"})
|
|
57
63
|
|
|
58
64
|
|
|
59
65
|
def is_safe_library_name(name: str) -> bool:
|
|
@@ -61,11 +67,44 @@ def is_safe_library_name(name: str) -> bool:
|
|
|
61
67
|
|
|
62
68
|
Allows alnum + ``_ . -``; rejects separators, ``..``, leading dot.
|
|
63
69
|
"""
|
|
64
|
-
if not name or name.startswith(".") or name == ".." or len(name) >
|
|
70
|
+
if not name or name.startswith(".") or name == ".." or len(name) > MAX_LIBRARY_NAME_LENGTH:
|
|
65
71
|
return False
|
|
66
72
|
return bool(_LIBRARY_NAME_RE.fullmatch(name))
|
|
67
73
|
|
|
68
74
|
|
|
75
|
+
def _is_https_url(url: str) -> bool:
|
|
76
|
+
parsed = urlparse(url)
|
|
77
|
+
return parsed.scheme.lower() == "https" and parsed.hostname is not None
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _is_allowed_source_url(url: str) -> tuple[bool, str | None]:
|
|
81
|
+
if not _is_https_url(url):
|
|
82
|
+
return (False, "url must be an HTTPS URL")
|
|
83
|
+
validation = _USER_SOURCE_URL_VALIDATOR.validate(url)
|
|
84
|
+
if not validation.is_valid:
|
|
85
|
+
return (False, validation.rejection_reason or "url rejected by validator")
|
|
86
|
+
return (True, None)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _coerce_max_pages(value: object, source_name: str) -> int | None:
|
|
90
|
+
if value is None:
|
|
91
|
+
return None
|
|
92
|
+
if isinstance(value, bool):
|
|
93
|
+
raise ValueError(f"source '{source_name}' max_pages must be an integer")
|
|
94
|
+
if isinstance(value, int):
|
|
95
|
+
parsed = value
|
|
96
|
+
elif isinstance(value, str):
|
|
97
|
+
try:
|
|
98
|
+
parsed = int(value)
|
|
99
|
+
except ValueError as err:
|
|
100
|
+
raise ValueError(f"source '{source_name}' max_pages must be an integer") from err
|
|
101
|
+
else:
|
|
102
|
+
raise ValueError(f"source '{source_name}' max_pages must be an integer")
|
|
103
|
+
if parsed < 1 or parsed > MAX_USER_SOURCE_PAGES:
|
|
104
|
+
raise ValueError(f"source '{source_name}' max_pages must be between 1 and {MAX_USER_SOURCE_PAGES}")
|
|
105
|
+
return parsed
|
|
106
|
+
|
|
107
|
+
|
|
69
108
|
def default_config_dir() -> Path:
|
|
70
109
|
env = os.environ.get("XDG_CONFIG_HOME")
|
|
71
110
|
base = Path(env) if env else Path.home() / ".config"
|
|
@@ -98,13 +137,31 @@ def load_user_sources(path: Path | None = None) -> dict[str, SourceConfig]:
|
|
|
98
137
|
entries = raw.get("sources") or {}
|
|
99
138
|
result: dict[str, SourceConfig] = {}
|
|
100
139
|
for name, cfg in entries.items():
|
|
101
|
-
|
|
140
|
+
source_name = str(name)
|
|
141
|
+
if not is_safe_library_name(source_name):
|
|
142
|
+
logger.warning("Ignoring unsafe source name in %s: %r", path, source_name)
|
|
143
|
+
continue
|
|
144
|
+
if not isinstance(cfg, dict):
|
|
145
|
+
logger.warning("Ignoring source %s in %s: entry must be a mapping", source_name, path)
|
|
146
|
+
continue
|
|
147
|
+
url = cfg.get("url")
|
|
148
|
+
if not isinstance(url, str):
|
|
149
|
+
logger.warning("Ignoring source %s in %s: url must be an HTTPS URL", source_name, path)
|
|
150
|
+
continue
|
|
151
|
+
url_allowed, url_reason = _is_allowed_source_url(url)
|
|
152
|
+
if not url_allowed:
|
|
153
|
+
logger.warning("Ignoring source %s in %s: %s", source_name, path, url_reason)
|
|
154
|
+
continue
|
|
155
|
+
try:
|
|
156
|
+
max_pages = _coerce_max_pages(cfg.get("maxPages") or cfg.get("max_pages"), source_name)
|
|
157
|
+
except ValueError as err:
|
|
158
|
+
logger.warning("Ignoring source %s in %s: %s", source_name, path, err)
|
|
102
159
|
continue
|
|
103
|
-
result[
|
|
104
|
-
url=
|
|
160
|
+
result[source_name] = SourceConfig(
|
|
161
|
+
url=url,
|
|
105
162
|
description=str(cfg.get("description", "")),
|
|
106
163
|
category=str(cfg.get("category", "user")),
|
|
107
|
-
max_pages=
|
|
164
|
+
max_pages=max_pages,
|
|
108
165
|
)
|
|
109
166
|
return result
|
|
110
167
|
|
|
@@ -122,7 +179,7 @@ def resolve_source(name: str) -> SourceConfig | None:
|
|
|
122
179
|
be routed through configured aliases so that policy (max_pages, category)
|
|
123
180
|
lives in one place.
|
|
124
181
|
"""
|
|
125
|
-
if _URL_SCHEME_RE.match(name):
|
|
182
|
+
if _URL_SCHEME_RE.match(name) or not is_safe_library_name(name):
|
|
126
183
|
return None
|
|
127
184
|
return all_sources().get(name)
|
|
128
185
|
|
|
@@ -14,20 +14,20 @@ from __future__ import annotations
|
|
|
14
14
|
import json
|
|
15
15
|
import logging
|
|
16
16
|
import os
|
|
17
|
-
import re
|
|
18
17
|
import shutil
|
|
19
18
|
import time
|
|
20
19
|
from collections.abc import Awaitable, Callable
|
|
21
20
|
from dataclasses import dataclass
|
|
22
|
-
from datetime import datetime
|
|
23
21
|
from pathlib import Path
|
|
24
22
|
from typing import Any
|
|
25
23
|
|
|
24
|
+
import regex
|
|
26
25
|
import yaml
|
|
27
26
|
|
|
28
27
|
from ..core.fetcher import Fetcher
|
|
29
28
|
from ..models.config import CrawlConfig, DocpullConfig, OutputConfig, ProfileName
|
|
30
29
|
from ..security.url_validator import UrlValidator
|
|
30
|
+
from ..time_utils import utc_now_iso
|
|
31
31
|
from .sources import (
|
|
32
32
|
_URL_SCHEME_RE,
|
|
33
33
|
BUILTIN_SOURCES,
|
|
@@ -44,6 +44,7 @@ logger = logging.getLogger(__name__)
|
|
|
44
44
|
CACHE_TTL_SECONDS = 7 * 24 * 60 * 60 # 7 days
|
|
45
45
|
MAX_GREP_PATTERN_LEN = 1000
|
|
46
46
|
GREP_TIMEOUT_SECONDS = 10.0
|
|
47
|
+
GREP_LINE_TIMEOUT_SECONDS = 0.05
|
|
47
48
|
MAX_READ_DOC_BYTES = 1_000_000
|
|
48
49
|
|
|
49
50
|
_FETCH_URL_VALIDATOR = UrlValidator(allowed_schemes={"https"})
|
|
@@ -99,7 +100,7 @@ def _write_meta(meta_path: Path, source: str, url: str, pages: int) -> None:
|
|
|
99
100
|
"source": source,
|
|
100
101
|
"url": url,
|
|
101
102
|
"fetched_at_epoch": time.time(),
|
|
102
|
-
"fetched_at":
|
|
103
|
+
"fetched_at": utc_now_iso(),
|
|
103
104
|
"page_count": pages,
|
|
104
105
|
},
|
|
105
106
|
indent=2,
|
|
@@ -118,7 +119,7 @@ def _write_partial_meta(meta_path: Path, source: str, url: str, pages: int) -> N
|
|
|
118
119
|
"source": source,
|
|
119
120
|
"url": url,
|
|
120
121
|
"fetched_at_epoch": time.time(),
|
|
121
|
-
"fetched_at":
|
|
122
|
+
"fetched_at": utc_now_iso(),
|
|
122
123
|
"page_count": pages,
|
|
123
124
|
"partial": True,
|
|
124
125
|
},
|
|
@@ -182,6 +183,11 @@ async def ensure_docs(
|
|
|
182
183
|
"and call ensure_docs with that name.",
|
|
183
184
|
is_error=True,
|
|
184
185
|
)
|
|
186
|
+
if not is_safe_library_name(source):
|
|
187
|
+
return ToolResult(
|
|
188
|
+
f"Invalid source name '{source}'. Use names from list_sources.",
|
|
189
|
+
is_error=True,
|
|
190
|
+
)
|
|
185
191
|
available = ", ".join(sorted(all_sources().keys()))
|
|
186
192
|
return ToolResult(
|
|
187
193
|
f"Unknown source '{source}'. Available: {available}",
|
|
@@ -409,9 +415,7 @@ def grep_docs(
|
|
|
409
415
|
|
|
410
416
|
Hardened against (a) path traversal via ``library`` (rejected by
|
|
411
417
|
``is_safe_library_name``) and (b) catastrophic regex via a pattern
|
|
412
|
-
length cap
|
|
413
|
-
timeout, so the budget is checked between files; a single pathological
|
|
414
|
-
pattern+line combination can still wedge for one file's worth of work.
|
|
418
|
+
length cap, a total wall-clock budget, and a per-line regex timeout.
|
|
415
419
|
"""
|
|
416
420
|
docs_dir = docs_dir or default_docs_dir()
|
|
417
421
|
if not docs_dir.exists():
|
|
@@ -430,9 +434,9 @@ def grep_docs(
|
|
|
430
434
|
context = max(0, min(context, 3))
|
|
431
435
|
|
|
432
436
|
try:
|
|
433
|
-
flags = 0 if case_sensitive else
|
|
434
|
-
|
|
435
|
-
except
|
|
437
|
+
flags = 0 if case_sensitive else regex.IGNORECASE
|
|
438
|
+
compiled = regex.compile(pattern, flags)
|
|
439
|
+
except regex.error as err:
|
|
436
440
|
return ToolResult(f"Invalid pattern: {err}", is_error=True)
|
|
437
441
|
|
|
438
442
|
roots = (
|
|
@@ -459,7 +463,12 @@ def grep_docs(
|
|
|
459
463
|
continue
|
|
460
464
|
matches: list[tuple[int, list[str], str, list[str]]] = []
|
|
461
465
|
for idx, line in enumerate(lines):
|
|
462
|
-
|
|
466
|
+
try:
|
|
467
|
+
matched = compiled.search(line, timeout=GREP_LINE_TIMEOUT_SECONDS) is not None
|
|
468
|
+
except TimeoutError:
|
|
469
|
+
timed_out = True
|
|
470
|
+
break
|
|
471
|
+
if matched:
|
|
463
472
|
before = [lines[i].rstrip() for i in range(max(0, idx - context), idx)] if context else []
|
|
464
473
|
after = (
|
|
465
474
|
[lines[i].rstrip() for i in range(idx + 1, min(len(lines), idx + 1 + context))]
|
|
@@ -476,6 +485,8 @@ def grep_docs(
|
|
|
476
485
|
)
|
|
477
486
|
)
|
|
478
487
|
total += len(matches)
|
|
488
|
+
if timed_out:
|
|
489
|
+
break
|
|
479
490
|
if timed_out:
|
|
480
491
|
break
|
|
481
492
|
|
|
@@ -71,8 +71,10 @@ class ByteSize(int):
|
|
|
71
71
|
# Try parsing as plain number
|
|
72
72
|
try:
|
|
73
73
|
return int(v)
|
|
74
|
-
except ValueError:
|
|
75
|
-
|
|
74
|
+
except ValueError as err:
|
|
75
|
+
raise ValueError(
|
|
76
|
+
f"Invalid byte size: {v}. Use format like '200kb', '1mb', or integer bytes."
|
|
77
|
+
) from err
|
|
76
78
|
raise ValueError(f"Invalid byte size: {v}. Use format like '200kb', '1mb', or integer bytes.")
|
|
77
79
|
|
|
78
80
|
|
|
@@ -7,11 +7,11 @@ import json
|
|
|
7
7
|
import logging
|
|
8
8
|
import os
|
|
9
9
|
import tempfile
|
|
10
|
-
from datetime import datetime
|
|
11
10
|
from pathlib import Path
|
|
12
11
|
from typing import TextIO
|
|
13
12
|
|
|
14
13
|
from ...models.events import EventType, FetchEvent
|
|
14
|
+
from ...time_utils import utc_now_iso
|
|
15
15
|
from ..base import EventEmitter, PageContext
|
|
16
16
|
|
|
17
17
|
logger = logging.getLogger(__name__)
|
|
@@ -102,7 +102,7 @@ class JsonSaveStep:
|
|
|
102
102
|
"title": ctx.title,
|
|
103
103
|
"content": ctx.markdown,
|
|
104
104
|
"metadata": ctx.metadata,
|
|
105
|
-
"fetched_at":
|
|
105
|
+
"fetched_at": utc_now_iso(),
|
|
106
106
|
}
|
|
107
107
|
|
|
108
108
|
f = self._ensure_temp_file()
|
|
@@ -142,7 +142,7 @@ class JsonSaveStep:
|
|
|
142
142
|
# No documents written - create empty structure
|
|
143
143
|
self._base_dir.mkdir(parents=True, exist_ok=True)
|
|
144
144
|
output = {
|
|
145
|
-
"generated_at":
|
|
145
|
+
"generated_at": utc_now_iso(),
|
|
146
146
|
"document_count": 0,
|
|
147
147
|
"documents": [],
|
|
148
148
|
}
|
|
@@ -154,7 +154,7 @@ class JsonSaveStep:
|
|
|
154
154
|
try:
|
|
155
155
|
# Close the documents array and add metadata
|
|
156
156
|
self._temp_file.write("\n ],\n")
|
|
157
|
-
self._temp_file.write(f' "generated_at": "{
|
|
157
|
+
self._temp_file.write(f' "generated_at": "{utc_now_iso()}",\n')
|
|
158
158
|
self._temp_file.write(f' "document_count": {self._document_count}\n')
|
|
159
159
|
self._temp_file.write("}\n")
|
|
160
160
|
self._temp_file.close()
|
|
@@ -12,11 +12,11 @@ import hashlib
|
|
|
12
12
|
import json
|
|
13
13
|
import logging
|
|
14
14
|
import sys
|
|
15
|
-
from datetime import datetime
|
|
16
15
|
from pathlib import Path
|
|
17
16
|
from typing import IO
|
|
18
17
|
|
|
19
18
|
from ...models.events import EventType, FetchEvent
|
|
19
|
+
from ...time_utils import utc_now_iso
|
|
20
20
|
from ..base import EventEmitter, PageContext
|
|
21
21
|
|
|
22
22
|
logger = logging.getLogger(__name__)
|
|
@@ -77,7 +77,7 @@ class NdjsonSaveStep:
|
|
|
77
77
|
"title": ctx.title,
|
|
78
78
|
"source_type": ctx.source_type,
|
|
79
79
|
"metadata": ctx.metadata,
|
|
80
|
-
"fetched_at":
|
|
80
|
+
"fetched_at": utc_now_iso(),
|
|
81
81
|
}
|
|
82
82
|
|
|
83
83
|
async with self._lock:
|
|
@@ -5,10 +5,10 @@ from __future__ import annotations
|
|
|
5
5
|
import json
|
|
6
6
|
import logging
|
|
7
7
|
import sqlite3
|
|
8
|
-
from datetime import datetime
|
|
9
8
|
from pathlib import Path
|
|
10
9
|
|
|
11
10
|
from ...models.events import EventType, FetchEvent
|
|
11
|
+
from ...time_utils import utc_now_iso
|
|
12
12
|
from ..base import EventEmitter, PageContext
|
|
13
13
|
|
|
14
14
|
logger = logging.getLogger(__name__)
|
|
@@ -109,7 +109,7 @@ class SqliteSaveStep:
|
|
|
109
109
|
ctx.title,
|
|
110
110
|
ctx.markdown,
|
|
111
111
|
json.dumps(ctx.metadata, ensure_ascii=False),
|
|
112
|
-
|
|
112
|
+
utc_now_iso(),
|
|
113
113
|
),
|
|
114
114
|
)
|
|
115
115
|
# Only count if a row was actually inserted (not ignored)
|
|
@@ -348,7 +348,8 @@ class RobotsChecker:
|
|
|
348
348
|
delay = parser.crawl_delay(self.user_agent)
|
|
349
349
|
if delay is not None:
|
|
350
350
|
return float(delay)
|
|
351
|
-
except (TypeError, ValueError):
|
|
351
|
+
except (TypeError, ValueError) as err:
|
|
352
|
+
self.logger.debug("Ignoring invalid Crawl-delay for %s: %s", url, err)
|
|
352
353
|
return None
|
|
353
354
|
|
|
354
355
|
return None
|
|
@@ -372,7 +373,8 @@ class RobotsChecker:
|
|
|
372
373
|
try:
|
|
373
374
|
sitemaps = parser.site_maps()
|
|
374
375
|
return list(sitemaps) if sitemaps else []
|
|
375
|
-
except Exception:
|
|
376
|
+
except Exception as err:
|
|
377
|
+
self.logger.debug("Could not read Sitemap entries for %s: %s", url, err)
|
|
376
378
|
return []
|
|
377
379
|
|
|
378
380
|
def clear_cache(self) -> None:
|
|
@@ -150,9 +150,7 @@ class UrlValidator:
|
|
|
150
150
|
ipaddress.ip_address(normalized)
|
|
151
151
|
return [normalized]
|
|
152
152
|
except ValueError:
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
return self._resolver(normalized)
|
|
153
|
+
return self._resolver(normalized)
|
|
156
154
|
|
|
157
155
|
def _resolve_hostname(self, hostname: str) -> list[str]:
|
|
158
156
|
"""Resolve hostname to a deduplicated list of IP addresses."""
|
|
@@ -212,11 +210,13 @@ class UrlValidator:
|
|
|
212
210
|
This closes the gap where attacker-controlled DNS maps a public-looking
|
|
213
211
|
hostname to a private or loopback address.
|
|
214
212
|
"""
|
|
213
|
+
is_hostname_ip = True
|
|
215
214
|
try:
|
|
216
215
|
ipaddress.ip_address(hostname)
|
|
217
|
-
return None
|
|
218
216
|
except ValueError:
|
|
219
|
-
|
|
217
|
+
is_hostname_ip = False
|
|
218
|
+
if is_hostname_ip:
|
|
219
|
+
return None
|
|
220
220
|
|
|
221
221
|
try:
|
|
222
222
|
addresses = self._resolver(hostname)
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""UTC time helpers for persisted docpull data."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from datetime import datetime, timezone
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def utc_now() -> datetime:
|
|
9
|
+
"""Return the current instant as a timezone-aware UTC datetime."""
|
|
10
|
+
return datetime.now(timezone.utc)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def utc_now_iso() -> str:
|
|
14
|
+
"""Return the current instant as an ISO-8601 UTC timestamp."""
|
|
15
|
+
return utc_now().isoformat()
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def parse_persisted_datetime(value: str) -> datetime:
|
|
19
|
+
"""Parse a stored timestamp and normalize it to timezone-aware UTC.
|
|
20
|
+
|
|
21
|
+
Older cache files used naive local timestamps. Treat those legacy values
|
|
22
|
+
as UTC so comparisons stay deterministic after newer writes include an
|
|
23
|
+
explicit ``+00:00`` offset.
|
|
24
|
+
"""
|
|
25
|
+
normalized = value[:-1] + "+00:00" if value.endswith("Z") else value
|
|
26
|
+
parsed = datetime.fromisoformat(normalized)
|
|
27
|
+
if parsed.tzinfo is None:
|
|
28
|
+
return parsed.replace(tzinfo=timezone.utc)
|
|
29
|
+
return parsed.astimezone(timezone.utc)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docpull
|
|
3
|
-
Version: 3.0.0
|
|
3
|
+
Version: 3.0.1
|
|
4
4
|
Summary: Pull documentation from the web and convert to clean markdown
|
|
5
5
|
Author-email: Zachary Roth <support@raintree.technology>
|
|
6
6
|
Maintainer-email: Raintree Technology <support@raintree.technology>
|
|
@@ -43,9 +43,12 @@ Requires-Dist: html2text>=2020.1.16
|
|
|
43
43
|
Requires-Dist: defusedxml>=0.7.1
|
|
44
44
|
Requires-Dist: extruct>=0.15.0
|
|
45
45
|
Requires-Dist: aiohttp>=3.9.0
|
|
46
|
+
Requires-Dist: idna>=3.15
|
|
47
|
+
Requires-Dist: regex>=2024.11.6
|
|
46
48
|
Requires-Dist: rich>=13.0.0
|
|
47
49
|
Requires-Dist: pyyaml>=6.0
|
|
48
50
|
Requires-Dist: pydantic>=2.0
|
|
51
|
+
Requires-Dist: urllib3>=2.7.0
|
|
49
52
|
Provides-Extra: proxy
|
|
50
53
|
Requires-Dist: aiohttp-socks>=0.8.0; extra == "proxy"
|
|
51
54
|
Provides-Extra: normalize
|
|
@@ -56,6 +59,8 @@ Provides-Extra: tokens
|
|
|
56
59
|
Requires-Dist: tiktoken>=0.7.0; extra == "tokens"
|
|
57
60
|
Provides-Extra: mcp
|
|
58
61
|
Requires-Dist: mcp>=1.0.0; extra == "mcp"
|
|
62
|
+
Requires-Dist: python-multipart>=0.0.27; extra == "mcp"
|
|
63
|
+
Requires-Dist: starlette>=1.0.1; extra == "mcp"
|
|
59
64
|
Provides-Extra: llm
|
|
60
65
|
Requires-Dist: tiktoken>=0.7.0; extra == "llm"
|
|
61
66
|
Provides-Extra: all
|
|
@@ -64,6 +69,8 @@ Requires-Dist: url-normalize>=1.4.0; extra == "all"
|
|
|
64
69
|
Requires-Dist: trafilatura>=1.12.0; extra == "all"
|
|
65
70
|
Requires-Dist: tiktoken>=0.7.0; extra == "all"
|
|
66
71
|
Requires-Dist: mcp>=1.0.0; extra == "all"
|
|
72
|
+
Requires-Dist: python-multipart>=0.0.27; extra == "all"
|
|
73
|
+
Requires-Dist: starlette>=1.0.1; extra == "all"
|
|
67
74
|
Provides-Extra: dev
|
|
68
75
|
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
69
76
|
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
@@ -8,6 +8,7 @@ src/docpull/doctor.py
|
|
|
8
8
|
src/docpull/logging_config.py
|
|
9
9
|
src/docpull/metadata_extractor.py
|
|
10
10
|
src/docpull/py.typed
|
|
11
|
+
src/docpull/time_utils.py
|
|
11
12
|
src/docpull.egg-info/PKG-INFO
|
|
12
13
|
src/docpull.egg-info/SOURCES.txt
|
|
13
14
|
src/docpull.egg-info/dependency_links.txt
|
|
@@ -80,4 +81,5 @@ tests/test_pipeline.py
|
|
|
80
81
|
tests/test_real_site_regressions.py
|
|
81
82
|
tests/test_save_ndjson.py
|
|
82
83
|
tests/test_security_hardening.py
|
|
83
|
-
tests/test_special_cases.py
|
|
84
|
+
tests/test_special_cases.py
|
|
85
|
+
tests/test_time_utils.py
|
|
@@ -3,9 +3,12 @@ html2text>=2020.1.16
|
|
|
3
3
|
defusedxml>=0.7.1
|
|
4
4
|
extruct>=0.15.0
|
|
5
5
|
aiohttp>=3.9.0
|
|
6
|
+
idna>=3.15
|
|
7
|
+
regex>=2024.11.6
|
|
6
8
|
rich>=13.0.0
|
|
7
9
|
pyyaml>=6.0
|
|
8
10
|
pydantic>=2.0
|
|
11
|
+
urllib3>=2.7.0
|
|
9
12
|
|
|
10
13
|
[all]
|
|
11
14
|
aiohttp-socks>=0.8.0
|
|
@@ -13,6 +16,8 @@ url-normalize>=1.4.0
|
|
|
13
16
|
trafilatura>=1.12.0
|
|
14
17
|
tiktoken>=0.7.0
|
|
15
18
|
mcp>=1.0.0
|
|
19
|
+
python-multipart>=0.0.27
|
|
20
|
+
starlette>=1.0.1
|
|
16
21
|
|
|
17
22
|
[dev]
|
|
18
23
|
pytest>=7.0.0
|
|
@@ -33,6 +38,8 @@ tiktoken>=0.7.0
|
|
|
33
38
|
|
|
34
39
|
[mcp]
|
|
35
40
|
mcp>=1.0.0
|
|
41
|
+
python-multipart>=0.0.27
|
|
42
|
+
starlette>=1.0.1
|
|
36
43
|
|
|
37
44
|
[normalize]
|
|
38
45
|
url-normalize>=1.4.0
|
|
@@ -22,6 +22,7 @@ from docpull.mcp.tools import (
|
|
|
22
22
|
read_doc,
|
|
23
23
|
remove_source,
|
|
24
24
|
)
|
|
25
|
+
from docpull.security.url_validator import UrlValidationResult
|
|
25
26
|
|
|
26
27
|
|
|
27
28
|
def test_builtin_sources_include_common_libraries():
|
|
@@ -329,6 +330,18 @@ def test_grep_docs_rejects_invalid_regex(tmp_path):
|
|
|
329
330
|
assert "Invalid pattern" in result.text
|
|
330
331
|
|
|
331
332
|
|
|
333
|
+
def test_grep_docs_times_out_pathological_regex(tmp_path, monkeypatch):
|
|
334
|
+
lib = tmp_path / "lib"
|
|
335
|
+
lib.mkdir()
|
|
336
|
+
(lib / "a.md").write_text("a" * 20_000 + "!")
|
|
337
|
+
monkeypatch.setattr("docpull.mcp.tools.GREP_LINE_TIMEOUT_SECONDS", 0.001)
|
|
338
|
+
|
|
339
|
+
result = grep_docs(r"(a+)+$", docs_dir=tmp_path)
|
|
340
|
+
|
|
341
|
+
assert result.is_error is False
|
|
342
|
+
assert result.data["timed_out"] is True
|
|
343
|
+
|
|
344
|
+
|
|
332
345
|
# --- Robustness -------------------------------------------------------
|
|
333
346
|
|
|
334
347
|
|
|
@@ -341,6 +354,44 @@ def test_load_user_sources_logs_yaml_error(tmp_path, caplog):
|
|
|
341
354
|
assert any("Failed to parse" in rec.message for rec in caplog.records)
|
|
342
355
|
|
|
343
356
|
|
|
357
|
+
def test_load_user_sources_rejects_unsafe_manual_entries(tmp_path, caplog, monkeypatch):
|
|
358
|
+
class FakeValidator:
|
|
359
|
+
def validate(self, url: str) -> UrlValidationResult:
|
|
360
|
+
if "blocked.example" in url:
|
|
361
|
+
return UrlValidationResult.invalid("blocked test host")
|
|
362
|
+
return UrlValidationResult.valid()
|
|
363
|
+
|
|
364
|
+
monkeypatch.setattr("docpull.mcp.sources._USER_SOURCE_URL_VALIDATOR", FakeValidator())
|
|
365
|
+
path = tmp_path / "sources.yaml"
|
|
366
|
+
path.write_text(
|
|
367
|
+
"""
|
|
368
|
+
sources:
|
|
369
|
+
good:
|
|
370
|
+
url: https://example.com/docs
|
|
371
|
+
max_pages: "5"
|
|
372
|
+
../bad:
|
|
373
|
+
url: https://example.com/docs
|
|
374
|
+
plain_http:
|
|
375
|
+
url: http://example.com/docs
|
|
376
|
+
blocked:
|
|
377
|
+
url: https://blocked.example/docs
|
|
378
|
+
too_many:
|
|
379
|
+
url: https://example.com/docs
|
|
380
|
+
max_pages: 100001
|
|
381
|
+
"""
|
|
382
|
+
)
|
|
383
|
+
|
|
384
|
+
with caplog.at_level(logging.WARNING, logger="docpull.mcp.sources"):
|
|
385
|
+
sources = load_user_sources(path=path)
|
|
386
|
+
|
|
387
|
+
assert list(sources) == ["good"]
|
|
388
|
+
assert sources["good"].max_pages == 5
|
|
389
|
+
assert any("unsafe source name" in rec.message for rec in caplog.records)
|
|
390
|
+
assert any("url must be an HTTPS URL" in rec.message for rec in caplog.records)
|
|
391
|
+
assert any("blocked test host" in rec.message for rec in caplog.records)
|
|
392
|
+
assert any("max_pages must be between" in rec.message for rec in caplog.records)
|
|
393
|
+
|
|
394
|
+
|
|
344
395
|
def test_partial_meta_treats_cache_as_stale(tmp_path):
|
|
345
396
|
"""A meta file marked partial=true should not be considered fresh."""
|
|
346
397
|
import json
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from datetime import timezone
|
|
4
|
+
|
|
5
|
+
from docpull.time_utils import parse_persisted_datetime, utc_now_iso
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def test_utc_now_iso_is_timezone_explicit() -> None:
|
|
9
|
+
assert utc_now_iso().endswith("+00:00")
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def test_parse_persisted_datetime_normalizes_legacy_naive_values() -> None:
|
|
13
|
+
parsed = parse_persisted_datetime("2026-04-26T00:00:00")
|
|
14
|
+
|
|
15
|
+
assert parsed.tzinfo == timezone.utc
|
|
16
|
+
assert parsed.isoformat() == "2026-04-26T00:00:00+00:00"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def test_parse_persisted_datetime_accepts_z_suffix() -> None:
|
|
20
|
+
parsed = parse_persisted_datetime("2026-04-26T00:00:00Z")
|
|
21
|
+
|
|
22
|
+
assert parsed.tzinfo == timezone.utc
|
|
23
|
+
assert parsed.isoformat() == "2026-04-26T00:00:00+00:00"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|