mcp-data-core 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mcp_data_core/__init__.py +78 -0
- mcp_data_core/base_client.py +348 -0
- mcp_data_core/cache.py +367 -0
- mcp_data_core/corpus_compression.py +160 -0
- mcp_data_core/corpus_db.py +371 -0
- mcp_data_core/envelope.py +211 -0
- mcp_data_core/exceptions.py +110 -0
- mcp_data_core/filenames.py +233 -0
- mcp_data_core/logging.py +94 -0
- mcp_data_core/mcp/__init__.py +74 -0
- mcp_data_core/mcp/_env.py +24 -0
- mcp_data_core/mcp/annotations.py +18 -0
- mcp_data_core/mcp/auth.py +238 -0
- mcp_data_core/mcp/conditional.py +169 -0
- mcp_data_core/mcp/downloads.py +972 -0
- mcp_data_core/mcp/middleware.py +201 -0
- mcp_data_core/mcp/server_factory.py +113 -0
- mcp_data_core/oauth2.py +171 -0
- mcp_data_core/py.typed +0 -0
- mcp_data_core/resilience.py +99 -0
- mcp_data_core-0.1.0.dist-info/METADATA +180 -0
- mcp_data_core-0.1.0.dist-info/RECORD +24 -0
- mcp_data_core-0.1.0.dist-info/WHEEL +4 -0
- mcp_data_core-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""Shared HTTP and MCP scaffolding for consumer libraries.
|
|
2
|
+
|
|
3
|
+
Provides the infrastructure that consumers build on:
|
|
4
|
+
|
|
5
|
+
- Exception hierarchy for API errors (``McpDataCoreError`` and subclasses)
|
|
6
|
+
- ``BaseAsyncClient`` with caching and retry support
|
|
7
|
+
- HTTP caching utilities (``CacheManager``, ``build_cached_http_client``)
|
|
8
|
+
- Resilience utilities (``default_retryer``, ``with_retry``)
|
|
9
|
+
- File-based logging configured per consumer app (``configure``)
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from .base_client import BaseAsyncClient
|
|
13
|
+
from .cache import CacheManager, CacheStats, build_cached_http_client
|
|
14
|
+
from .envelope import (
|
|
15
|
+
ListEnvelope,
|
|
16
|
+
Provenance,
|
|
17
|
+
ResponseEnvelope,
|
|
18
|
+
decode_cursor,
|
|
19
|
+
encode_cursor,
|
|
20
|
+
make_provenance,
|
|
21
|
+
)
|
|
22
|
+
from .envelope import configure as configure_envelope
|
|
23
|
+
from .exceptions import (
|
|
24
|
+
ApiError,
|
|
25
|
+
AuthenticationError,
|
|
26
|
+
ConfigurationError,
|
|
27
|
+
McpDataCoreError,
|
|
28
|
+
NotFoundError,
|
|
29
|
+
ParseError,
|
|
30
|
+
RateLimitError,
|
|
31
|
+
ServerError,
|
|
32
|
+
ValidationError,
|
|
33
|
+
)
|
|
34
|
+
from .logging import configure, log_file_hint
|
|
35
|
+
from .oauth2 import OAuth2ClientCredentialsAuth
|
|
36
|
+
from .resilience import (
|
|
37
|
+
RETRYABLE_STATUS_CODES,
|
|
38
|
+
default_retryer,
|
|
39
|
+
is_retryable_error,
|
|
40
|
+
with_retry,
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
# Public names re-exported at the package root. Every entry corresponds to a
# name imported above from one of the sibling submodules; the inline comments
# group the entries by the submodule they come from.
__all__ = [
    # Base client
    "BaseAsyncClient",
    # Caching
    "build_cached_http_client",
    "CacheManager",
    "CacheStats",
    # Envelope
    "Provenance",
    "ResponseEnvelope",
    "ListEnvelope",
    "configure_envelope",
    "make_provenance",
    "encode_cursor",
    "decode_cursor",
    # Exceptions
    "McpDataCoreError",
    "ApiError",
    "NotFoundError",
    "RateLimitError",
    "AuthenticationError",
    "ServerError",
    "ValidationError",
    "ConfigurationError",
    "ParseError",
    # Logging
    "configure",
    "log_file_hint",
    # OAuth2
    "OAuth2ClientCredentialsAuth",
    # Resilience
    "RETRYABLE_STATUS_CODES",
    "is_retryable_error",
    "default_retryer",
    "with_retry",
]
|
|
@@ -0,0 +1,348 @@
|
|
|
1
|
+
"""Base async HTTP client with standardized patterns."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import contextlib
|
|
6
|
+
import logging
|
|
7
|
+
from datetime import timedelta
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any, Self
|
|
10
|
+
|
|
11
|
+
import httpx
|
|
12
|
+
|
|
13
|
+
from .cache import CacheManager, CacheStats, build_cached_http_client, get_default_cache_dir
|
|
14
|
+
from .exceptions import (
|
|
15
|
+
ApiError,
|
|
16
|
+
AuthenticationError,
|
|
17
|
+
NotFoundError,
|
|
18
|
+
RateLimitError,
|
|
19
|
+
ServerError,
|
|
20
|
+
)
|
|
21
|
+
from .resilience import default_retryer
|
|
22
|
+
|
|
23
|
+
# Module-level logger named after this module; used by
# BaseAsyncClient._raise_for_status to record failed responses.
logger = logging.getLogger(__name__)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class BaseAsyncClient:
|
|
27
|
+
"""Base class for async API clients with caching and retry support.
|
|
28
|
+
|
|
29
|
+
Subclasses should override:
|
|
30
|
+
- DEFAULT_BASE_URL: The default API base URL
|
|
31
|
+
- CACHE_NAME: Name for the cache database file
|
|
32
|
+
|
|
33
|
+
Example:
|
|
34
|
+
class MyApiClient(BaseAsyncClient):
|
|
35
|
+
DEFAULT_BASE_URL = "https://api.example.com"
|
|
36
|
+
CACHE_NAME = "my_api"
|
|
37
|
+
|
|
38
|
+
async def get_resource(self, id: str) -> dict:
|
|
39
|
+
return await self._request_json("GET", f"/resources/{id}")
|
|
40
|
+
|
|
41
|
+
Cache Management:
|
|
42
|
+
async with MyApiClient() as client:
|
|
43
|
+
# Make requests...
|
|
44
|
+
result = await client.get_resource("123")
|
|
45
|
+
|
|
46
|
+
# Get cache statistics
|
|
47
|
+
stats = await client.cache_stats()
|
|
48
|
+
print(f"Hit rate: {stats.hit_rate:.1f}%")
|
|
49
|
+
print(f"Cache size: {stats.size_mb:.2f} MB")
|
|
50
|
+
|
|
51
|
+
# Clear all cached data
|
|
52
|
+
cleared = await client.cache_clear()
|
|
53
|
+
print(f"Cleared {cleared} entries")
|
|
54
|
+
|
|
55
|
+
# Clear entries older than 1 hour
|
|
56
|
+
cleared = await client.cache_clear_expired(max_age=timedelta(hours=1))
|
|
57
|
+
|
|
58
|
+
# Invalidate specific URLs by pattern
|
|
59
|
+
cleared = await client.cache_invalidate(r"/resources/123")
|
|
60
|
+
"""
|
|
61
|
+
|
|
62
|
+
DEFAULT_BASE_URL: str = ""
|
|
63
|
+
CACHE_NAME: str = "default"
|
|
64
|
+
DEFAULT_TIMEOUT: float = 30.0
|
|
65
|
+
HTTP2: bool = False
|
|
66
|
+
|
|
67
|
+
def __init__(
|
|
68
|
+
self,
|
|
69
|
+
*,
|
|
70
|
+
base_url: str | None = None,
|
|
71
|
+
cache_path: Path | None = None,
|
|
72
|
+
client: httpx.AsyncClient | None = None,
|
|
73
|
+
use_cache: bool = True,
|
|
74
|
+
ttl_seconds: int | None = None,
|
|
75
|
+
max_retries: int = 4,
|
|
76
|
+
headers: dict[str, str] | None = None,
|
|
77
|
+
timeout: float | None = None,
|
|
78
|
+
auth: httpx.Auth | None = None,
|
|
79
|
+
http2: bool | None = None,
|
|
80
|
+
) -> None:
|
|
81
|
+
"""Initialize the client.
|
|
82
|
+
|
|
83
|
+
Args:
|
|
84
|
+
base_url: Override the default base URL.
|
|
85
|
+
cache_path: Custom path for the cache directory.
|
|
86
|
+
client: Existing httpx.AsyncClient to use (for testing).
|
|
87
|
+
use_cache: Whether to enable HTTP caching.
|
|
88
|
+
ttl_seconds: Default TTL for cache entries. None uses HTTP headers.
|
|
89
|
+
max_retries: Maximum retry attempts for transient failures.
|
|
90
|
+
headers: Additional headers to include in requests.
|
|
91
|
+
timeout: Request timeout in seconds (defaults to ``DEFAULT_TIMEOUT``).
|
|
92
|
+
auth: httpx Auth handler (e.g. for OAuth2 token refresh).
|
|
93
|
+
http2: Enable HTTP/2 for the underlying client. Falls back to
|
|
94
|
+
the subclass ``HTTP2`` class attribute. Some upstream APIs
|
|
95
|
+
(e.g. api.publicrecords.copyright.gov) reject HTTP/1.1.
|
|
96
|
+
"""
|
|
97
|
+
self.base_url = (base_url or self.DEFAULT_BASE_URL).rstrip("/")
|
|
98
|
+
self._owns_client = client is None
|
|
99
|
+
self._max_retries = max_retries
|
|
100
|
+
self._timeout = timeout or self.DEFAULT_TIMEOUT
|
|
101
|
+
self._cache_manager: CacheManager | None = None
|
|
102
|
+
resolved_http2 = self.HTTP2 if http2 is None else http2
|
|
103
|
+
|
|
104
|
+
if client is None:
|
|
105
|
+
cache_dir = cache_path or get_default_cache_dir()
|
|
106
|
+
cache_dir.mkdir(parents=True, exist_ok=True)
|
|
107
|
+
self._client, self._cache_manager = build_cached_http_client(
|
|
108
|
+
use_cache=use_cache,
|
|
109
|
+
cache_name=self.CACHE_NAME,
|
|
110
|
+
cache_dir=cache_dir,
|
|
111
|
+
ttl_seconds=ttl_seconds,
|
|
112
|
+
headers=headers or {},
|
|
113
|
+
follow_redirects=True,
|
|
114
|
+
timeout=self._timeout,
|
|
115
|
+
auth=auth,
|
|
116
|
+
http2=resolved_http2,
|
|
117
|
+
)
|
|
118
|
+
else:
|
|
119
|
+
self._client = client
|
|
120
|
+
if headers:
|
|
121
|
+
for key, value in headers.items():
|
|
122
|
+
self._client.headers.setdefault(key, value)
|
|
123
|
+
|
|
124
|
+
@property
|
|
125
|
+
def cache_enabled(self) -> bool:
|
|
126
|
+
"""Check if caching is enabled."""
|
|
127
|
+
return self._cache_manager is not None
|
|
128
|
+
|
|
129
|
+
async def cache_stats(self) -> CacheStats:
|
|
130
|
+
"""Get cache statistics.
|
|
131
|
+
|
|
132
|
+
Returns:
|
|
133
|
+
CacheStats with hits, misses, entry count, and size.
|
|
134
|
+
|
|
135
|
+
Raises:
|
|
136
|
+
RuntimeError: If caching is disabled.
|
|
137
|
+
"""
|
|
138
|
+
if self._cache_manager is None:
|
|
139
|
+
raise RuntimeError("Caching is disabled for this client")
|
|
140
|
+
return await self._cache_manager.get_stats()
|
|
141
|
+
|
|
142
|
+
async def cache_clear(self) -> int:
|
|
143
|
+
"""Clear all cache entries.
|
|
144
|
+
|
|
145
|
+
Returns:
|
|
146
|
+
Number of entries cleared.
|
|
147
|
+
|
|
148
|
+
Raises:
|
|
149
|
+
RuntimeError: If caching is disabled.
|
|
150
|
+
"""
|
|
151
|
+
if self._cache_manager is None:
|
|
152
|
+
raise RuntimeError("Caching is disabled for this client")
|
|
153
|
+
return await self._cache_manager.clear_all()
|
|
154
|
+
|
|
155
|
+
async def cache_clear_expired(self, max_age: timedelta | None = None) -> int:
|
|
156
|
+
"""Clear expired cache entries.
|
|
157
|
+
|
|
158
|
+
Args:
|
|
159
|
+
max_age: Maximum age for entries. Defaults to TTL or 24 hours.
|
|
160
|
+
|
|
161
|
+
Returns:
|
|
162
|
+
Number of entries cleared.
|
|
163
|
+
|
|
164
|
+
Raises:
|
|
165
|
+
RuntimeError: If caching is disabled.
|
|
166
|
+
"""
|
|
167
|
+
if self._cache_manager is None:
|
|
168
|
+
raise RuntimeError("Caching is disabled for this client")
|
|
169
|
+
return await self._cache_manager.clear_expired(max_age)
|
|
170
|
+
|
|
171
|
+
async def cache_invalidate(self, url_pattern: str) -> int:
|
|
172
|
+
"""Invalidate cache entries matching a URL pattern.
|
|
173
|
+
|
|
174
|
+
Args:
|
|
175
|
+
url_pattern: Regex pattern to match against cached URLs.
|
|
176
|
+
|
|
177
|
+
Returns:
|
|
178
|
+
Number of entries invalidated.
|
|
179
|
+
|
|
180
|
+
Raises:
|
|
181
|
+
RuntimeError: If caching is disabled.
|
|
182
|
+
"""
|
|
183
|
+
if self._cache_manager is None:
|
|
184
|
+
raise RuntimeError("Caching is disabled for this client")
|
|
185
|
+
return await self._cache_manager.invalidate_pattern(url_pattern)
|
|
186
|
+
|
|
187
|
+
async def close(self) -> None:
|
|
188
|
+
"""Close the underlying HTTP client if we own it."""
|
|
189
|
+
if self._owns_client:
|
|
190
|
+
await self._client.aclose()
|
|
191
|
+
if self._cache_manager is not None:
|
|
192
|
+
await self._cache_manager.close()
|
|
193
|
+
|
|
194
|
+
async def __aenter__(self) -> Self:
|
|
195
|
+
return self
|
|
196
|
+
|
|
197
|
+
async def __aexit__(self, *exc: object) -> None:
|
|
198
|
+
await self.close()
|
|
199
|
+
|
|
200
|
+
def _build_url(self, path: str) -> str:
|
|
201
|
+
"""Build a full URL from a path."""
|
|
202
|
+
return f"{self.base_url}{path}"
|
|
203
|
+
|
|
204
|
+
def _raise_for_status(self, response: httpx.Response, context: str = "") -> None:
|
|
205
|
+
"""Convert HTTP errors to typed exceptions.
|
|
206
|
+
|
|
207
|
+
Args:
|
|
208
|
+
response: The HTTP response to check.
|
|
209
|
+
context: Optional context string for error messages.
|
|
210
|
+
|
|
211
|
+
Raises:
|
|
212
|
+
NotFoundError: For 404 responses.
|
|
213
|
+
RateLimitError: For 429 responses.
|
|
214
|
+
AuthenticationError: For 401/403 responses.
|
|
215
|
+
ServerError: For 5xx responses.
|
|
216
|
+
ApiError: For other non-success responses.
|
|
217
|
+
"""
|
|
218
|
+
if response.is_success:
|
|
219
|
+
return
|
|
220
|
+
|
|
221
|
+
status = response.status_code
|
|
222
|
+
body = response.text[:500] if response.text else ""
|
|
223
|
+
msg = f"{context}: HTTP {status}" if context else f"HTTP {status}"
|
|
224
|
+
|
|
225
|
+
# Log full response details to file for debugging
|
|
226
|
+
logger.error(
|
|
227
|
+
"%s %s -> %s\nResponse body: %s",
|
|
228
|
+
response.request.method,
|
|
229
|
+
response.request.url,
|
|
230
|
+
status,
|
|
231
|
+
body,
|
|
232
|
+
)
|
|
233
|
+
|
|
234
|
+
if status == 404:
|
|
235
|
+
raise NotFoundError(msg, status, body)
|
|
236
|
+
if status == 429:
|
|
237
|
+
retry_after: float | None = None
|
|
238
|
+
raw = response.headers.get("Retry-After")
|
|
239
|
+
if raw is not None:
|
|
240
|
+
with contextlib.suppress(ValueError):
|
|
241
|
+
retry_after = float(raw)
|
|
242
|
+
raise RateLimitError(msg, status, body, retry_after=retry_after)
|
|
243
|
+
if status in (401, 403):
|
|
244
|
+
raise AuthenticationError(msg, status, body)
|
|
245
|
+
if 500 <= status < 600:
|
|
246
|
+
raise ServerError(msg, status, body)
|
|
247
|
+
raise ApiError(msg, status, body)
|
|
248
|
+
|
|
249
|
+
async def _request(
|
|
250
|
+
self,
|
|
251
|
+
method: str,
|
|
252
|
+
path: str,
|
|
253
|
+
*,
|
|
254
|
+
params: dict[str, Any] | None = None,
|
|
255
|
+
json: dict[str, Any] | None = None,
|
|
256
|
+
data: dict[str, Any] | None = None,
|
|
257
|
+
content: bytes | None = None,
|
|
258
|
+
headers: dict[str, str] | None = None,
|
|
259
|
+
context: str = "",
|
|
260
|
+
timeout: float | None = None,
|
|
261
|
+
) -> httpx.Response:
|
|
262
|
+
"""Make an HTTP request with retry logic.
|
|
263
|
+
|
|
264
|
+
Args:
|
|
265
|
+
method: HTTP method (GET, POST, etc.).
|
|
266
|
+
path: URL path (will be appended to base_url).
|
|
267
|
+
params: Query parameters.
|
|
268
|
+
json: JSON body for POST/PUT requests.
|
|
269
|
+
data: Form-encoded body.
|
|
270
|
+
content: Raw bytes body.
|
|
271
|
+
headers: Per-request header overrides (merged on top of client headers).
|
|
272
|
+
context: Context string for error messages.
|
|
273
|
+
timeout: Optional per-request timeout in seconds.
|
|
274
|
+
|
|
275
|
+
Returns:
|
|
276
|
+
The HTTP response.
|
|
277
|
+
|
|
278
|
+
Raises:
|
|
279
|
+
ApiError: On non-retryable HTTP errors.
|
|
280
|
+
"""
|
|
281
|
+
url = self._build_url(path)
|
|
282
|
+
request_kwargs: dict[str, Any] = {}
|
|
283
|
+
if params:
|
|
284
|
+
request_kwargs["params"] = params
|
|
285
|
+
if json is not None:
|
|
286
|
+
request_kwargs["json"] = json
|
|
287
|
+
if data is not None:
|
|
288
|
+
request_kwargs["data"] = data
|
|
289
|
+
if content is not None:
|
|
290
|
+
request_kwargs["content"] = content
|
|
291
|
+
if headers:
|
|
292
|
+
request_kwargs["headers"] = headers
|
|
293
|
+
if timeout:
|
|
294
|
+
request_kwargs["timeout"] = timeout
|
|
295
|
+
|
|
296
|
+
async for attempt in default_retryer(max_attempts=self._max_retries):
|
|
297
|
+
with attempt:
|
|
298
|
+
response = await self._client.request(method, url, **request_kwargs)
|
|
299
|
+
self._raise_for_status(response, context)
|
|
300
|
+
return response
|
|
301
|
+
|
|
302
|
+
# Should not reach here due to reraise=True in retryer
|
|
303
|
+
raise RuntimeError("Unexpected retry exhaustion")
|
|
304
|
+
|
|
305
|
+
async def _request_json(
|
|
306
|
+
self,
|
|
307
|
+
method: str,
|
|
308
|
+
path: str,
|
|
309
|
+
*,
|
|
310
|
+
params: dict[str, Any] | None = None,
|
|
311
|
+
json: dict[str, Any] | None = None,
|
|
312
|
+
data: dict[str, Any] | None = None,
|
|
313
|
+
content: bytes | None = None,
|
|
314
|
+
headers: dict[str, str] | None = None,
|
|
315
|
+
context: str = "",
|
|
316
|
+
timeout: float | None = None,
|
|
317
|
+
) -> dict[str, Any]:
|
|
318
|
+
"""Make an HTTP request and return JSON response.
|
|
319
|
+
|
|
320
|
+
Args:
|
|
321
|
+
method: HTTP method.
|
|
322
|
+
path: URL path.
|
|
323
|
+
params: Query parameters.
|
|
324
|
+
json: JSON body.
|
|
325
|
+
data: Form-encoded body.
|
|
326
|
+
content: Raw bytes body.
|
|
327
|
+
headers: Per-request header overrides.
|
|
328
|
+
context: Context string for error messages.
|
|
329
|
+
timeout: Optional per-request timeout.
|
|
330
|
+
|
|
331
|
+
Returns:
|
|
332
|
+
Parsed JSON response as a dictionary.
|
|
333
|
+
"""
|
|
334
|
+
response = await self._request(
|
|
335
|
+
method,
|
|
336
|
+
path,
|
|
337
|
+
params=params,
|
|
338
|
+
json=json,
|
|
339
|
+
data=data,
|
|
340
|
+
content=content,
|
|
341
|
+
headers=headers,
|
|
342
|
+
context=context,
|
|
343
|
+
timeout=timeout,
|
|
344
|
+
)
|
|
345
|
+
return response.json()
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
# Only the client class is public; helpers imported above stay internal here.
__all__ = ["BaseAsyncClient"]
|