afs-core 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- afs_core/__init__.py +9 -0
- afs_core/contracts/__init__.py +11 -0
- afs_core/contracts/catalog.py +90 -0
- afs_core/contracts/normalize.py +40 -0
- afs_core/contracts/objects.py +44 -0
- afs_core/errors.py +212 -0
- afs_core/keys.py +271 -0
- afs_core/models/__init__.py +43 -0
- afs_core/models/control.py +62 -0
- afs_core/models/core.py +60 -0
- afs_core/models/extraction.py +50 -0
- afs_core/models/objects.py +26 -0
- afs_core/py.typed +0 -0
- afs_core/testing/__init__.py +23 -0
- afs_core/testing/conformance.py +220 -0
- afs_core/testing/memory.py +285 -0
- afs_core-0.1.0.dist-info/METADATA +49 -0
- afs_core-0.1.0.dist-info/RECORD +19 -0
- afs_core-0.1.0.dist-info/WHEEL +4 -0
afs_core/__init__.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
"""agentic-fs core: contracts, DTOs, the key scheme, and the error vocabulary.
|
|
2
|
+
|
|
3
|
+
Depends on pydantic only — importable without the server. (``afs_core.testing``
|
|
4
|
+
is intentionally not imported here: it depends on pytest and is opt-in.)
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from afs_core import contracts, errors, keys, models
|
|
8
|
+
|
|
9
|
+
__all__ = ["contracts", "errors", "keys", "models"]
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""The agentic-fs contracts: async ``typing.Protocol`` interfaces (plan §5).
|
|
2
|
+
|
|
3
|
+
Structural — adopters implement without importing our hierarchy. Each is proven
|
|
4
|
+
by a conformance kit in :mod:`afs_core.testing`.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from afs_core.contracts.catalog import CatalogStore
|
|
8
|
+
from afs_core.contracts.normalize import NormalizationError, Normalizer
|
|
9
|
+
from afs_core.contracts.objects import ObjectStore
|
|
10
|
+
|
|
11
|
+
__all__ = ["CatalogStore", "NormalizationError", "Normalizer", "ObjectStore"]
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
"""The ``CatalogStore`` contract (plan §5.1).
|
|
2
|
+
|
|
3
|
+
One contract covers entries + control records + checkpoints + scratch quota, so a
|
|
4
|
+
self-hoster swaps **one** stateful dependency. Structural ``Protocol``, async.
|
|
5
|
+
Certify an impl with ``CatalogStoreConformance`` (afs_core.testing); DynamoDB and
|
|
6
|
+
Postgres are the two reference implementations.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from typing import Protocol, runtime_checkable
|
|
12
|
+
|
|
13
|
+
from afs_core.models import (
|
|
14
|
+
CatalogEntry,
|
|
15
|
+
ExtractionState,
|
|
16
|
+
NamespaceRecord,
|
|
17
|
+
Page,
|
|
18
|
+
PrincipalRecord,
|
|
19
|
+
ScratchUsage,
|
|
20
|
+
SyncCheckpoint,
|
|
21
|
+
TenantRecord,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@runtime_checkable
class CatalogStore(Protocol):
    """Async structural contract for the catalog's stateful backend.

    Groups four method families behind one interface: catalog entries,
    control records (tenants / namespaces / principals), connector
    checkpoints, and scratch-quota accounting. Every member is a coroutine
    stub; an implementation satisfies the contract by shape and does not
    need to inherit from this class.

    Because of ``@runtime_checkable``, ``isinstance(obj, CatalogStore)`` only
    checks that the method names are present — it does not verify signatures
    or that the methods are coroutines.
    """

    # -- entries (derived index of S3; healable FROM S3) --
    async def put_entry(self, entry: CatalogEntry) -> None: ...

    # Returns ``None`` rather than raising when nothing is found (per the
    # ``CatalogEntry | None`` return annotation).
    async def get_entry(self, tenant_id: str, namespace: str, path: str) -> CatalogEntry | None: ...

    async def delete_entry(
        self, tenant_id: str, namespace: str, path: str, *, hard: bool = False
    ) -> None:
        """Tombstone (soft) by default; ``hard=True`` removes the row entirely."""
        ...

    # Paginated listing: ``cursor``/``limit`` in, one ``Page`` out.
    # NOTE(review): ``include_deleted`` presumably surfaces soft-deleted
    # (tombstoned) entries — confirm against the conformance kit.
    async def list_entries(
        self,
        tenant_id: str,
        namespace: str,
        *,
        prefix: str = "",
        include_deleted: bool = False,
        cursor: str | None = None,
        limit: int = 1000,
    ) -> Page[CatalogEntry]: ...

    # Scoped to a tenant only (no namespace parameter); returns a plain list,
    # not a ``Page``.
    async def find_by_checksum(self, tenant_id: str, checksum: str) -> list[CatalogEntry]: ...

    async def set_extraction(
        self, tenant_id: str, namespace: str, path: str, state: ExtractionState
    ) -> None: ...

    # NOTE(review): takes no ``tenant_id``, unlike the other entry methods —
    # presumably a cross-tenant scan for background workers; confirm.
    async def list_by_extraction_status(
        self, status: str, *, cursor: str | None = None, limit: int = 100
    ) -> Page[CatalogEntry]: ...

    async def tree_version(self, tenant_id: str, namespace: str) -> str:
        """A token bumped on any write to the namespace — the tree-cache key."""
        ...

    # -- control records (tenants / namespaces / principals) --
    async def put_tenant(self, tenant: TenantRecord) -> None: ...
    async def get_tenant(self, tenant_id: str) -> TenantRecord | None: ...
    async def list_tenants(
        self, *, cursor: str | None = None, limit: int = 100
    ) -> Page[TenantRecord]: ...

    # Namespace and principal listings are unpaginated (plain ``list``),
    # unlike ``list_tenants``.
    async def put_namespace(self, ns: NamespaceRecord) -> None: ...
    async def get_namespace(self, tenant_id: str, name: str) -> NamespaceRecord | None: ...
    async def list_namespaces(self, tenant_id: str) -> list[NamespaceRecord]: ...
    async def delete_namespace(self, tenant_id: str, name: str) -> None: ...

    async def put_principal(self, p: PrincipalRecord) -> None: ...
    async def get_principal(self, tenant_id: str, principal_id: str) -> PrincipalRecord | None: ...
    async def list_principals(self, tenant_id: str) -> list[PrincipalRecord]: ...

    # -- connector checkpoints --
    async def get_checkpoint(self, tenant_id: str, connector_id: str) -> SyncCheckpoint | None: ...
    async def put_checkpoint(
        self, tenant_id: str, connector_id: str, cp: SyncCheckpoint
    ) -> None: ...

    # -- scratch quota (atomic; raises QuotaExceededError) --
    # Both deltas are keyword-only and required; the updated usage is returned.
    async def adjust_scratch_usage(
        self, tenant_id: str, principal_id: str, *, delta_bytes: int, delta_objects: int
    ) -> ScratchUsage: ...

    # Always returns a ``ScratchUsage`` (the annotation is not optional).
    async def get_scratch_usage(self, tenant_id: str, principal_id: str) -> ScratchUsage: ...
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""The ``Normalizer`` contract (plan §5.4) — the extractor/parser seam.
|
|
2
|
+
|
|
3
|
+
A normalizer turns one raw document into normalized per-page markdown. It is the
|
|
4
|
+
*only* place document parsing lives: text_native, docling, llamaparse, and any
|
|
5
|
+
custom parser are all just `Normalizer`s. The `ExtractionPipeline` (in
|
|
6
|
+
afs-server) orders them into a ladder, applies a quality gate, and degrades to
|
|
7
|
+
`catalog_only` — none of which a normalizer needs to know about.
|
|
8
|
+
|
|
9
|
+
Adding your own: implement this Protocol, certify it against
|
|
10
|
+
`afs_core.testing.NormalizerConformance`, register it via the `afs.normalizers`
|
|
11
|
+
entry-point group, and name it in the extraction ladder. No core changes.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from typing import Protocol, runtime_checkable
|
|
17
|
+
|
|
18
|
+
from afs_core.models import NormalizedDocument, SourceDocument
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class NormalizationError(Exception):
|
|
22
|
+
"""A normalizer couldn't parse the document. Carries a closed-vocabulary
|
|
23
|
+
``reason`` (events v1) so the pipeline can record why and try the next rung."""
|
|
24
|
+
|
|
25
|
+
def __init__(self, reason: str, message: str | None = None) -> None:
|
|
26
|
+
self.reason = reason
|
|
27
|
+
super().__init__(message or reason)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@runtime_checkable
class Normalizer(Protocol):
    """Structural contract for a document parser.

    An implementation exposes a ``name`` attribute, a synchronous
    ``accepts`` predicate, and an asynchronous ``normalize`` coroutine.
    Because of ``@runtime_checkable``, ``isinstance`` checks only that these
    three members exist, not their signatures.
    """

    # Identifier for this normalizer.
    # NOTE(review): presumably the name used to reference it in the
    # extraction ladder — confirm against the pipeline.
    name: str

    def accepts(self, doc: SourceDocument) -> bool:
        """Whether this normalizer claims the document (by MIME/extension)."""
        ...

    async def normalize(self, doc: SourceDocument) -> NormalizedDocument:
        """Parse ``doc`` into per-page markdown, or raise ``NormalizationError``."""
        ...
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""The ``ObjectStore`` contract (plan §5.2).
|
|
2
|
+
|
|
3
|
+
Structural ``Protocol`` — adopters implement it without importing our hierarchy
|
|
4
|
+
or depending on ``afs-server``. S3 is the only production impl; the protocol
|
|
5
|
+
exists so MinIO/LocalStack back local dev and an in-memory fake backs tests.
|
|
6
|
+
Certify any impl with ``ObjectStoreConformance`` (afs_core.testing).
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from typing import Protocol, runtime_checkable
|
|
12
|
+
|
|
13
|
+
from afs_core.models import ObjectStat, Page, PresignedPut
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@runtime_checkable
class ObjectStore(Protocol):
    """Async structural contract for a key-addressed blob store.

    Covers byte reads (whole object or range), writes, single and prefix
    deletes, metadata lookup, paginated listing, and presigned upload /
    download. All members are coroutine stubs; implementations conform by
    shape. ``@runtime_checkable`` makes ``isinstance`` verify member names
    only, not signatures.
    """

    async def get(self, key: str, *, start: int | None = None, end: int | None = None) -> bytes:
        """Fetch an object, optionally a byte range ``[start, end]`` (inclusive)."""
        ...

    # Returns the ``ObjectStat`` of the object that was written.
    async def put(
        self, key: str, body: bytes, *, content_type: str | None = None
    ) -> ObjectStat: ...

    async def delete(self, key: str) -> None: ...

    async def delete_prefix(self, prefix: str) -> int:
        """Delete every object under ``prefix``; returns the count removed."""
        ...

    async def stat(self, key: str) -> ObjectStat | None:
        """Metadata for ``key``, or ``None`` if it does not exist."""
        ...

    # Paginated: pass the previous page's cursor to continue.
    # NOTE(review): cursor semantics live on ``Page``, which is not visible
    # here — confirm how the next cursor is exposed.
    async def list(
        self, prefix: str, *, cursor: str | None = None, limit: int = 1000
    ) -> Page[ObjectStat]: ...

    # ``content_type`` and ``max_bytes`` are required keyword arguments;
    # ``expires_in`` defaults to 900.
    # NOTE(review): ``expires_in`` is presumably in seconds — confirm.
    async def presigned_put(
        self, key: str, *, content_type: str, max_bytes: int, expires_in: int = 900
    ) -> PresignedPut: ...

    # Returns a string (presumably the download URL — confirm); ``expires_in``
    # defaults to 300.
    async def presigned_get(self, key: str, *, expires_in: int = 300) -> str: ...
|
afs_core/errors.py
ADDED
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
"""The closed error vocabulary and the ``AfsError`` hierarchy.
|
|
2
|
+
|
|
3
|
+
Every error agentic-fs raises across the wire carries a code from the closed
|
|
4
|
+
:class:`ErrorCode` enum and serializes to an RFC 9457 ``application/problem+json``
|
|
5
|
+
envelope. The vocabulary is closed on purpose: clients (and the MCP tool layer)
|
|
6
|
+
can branch on a small, stable set of codes instead of parsing prose.
|
|
7
|
+
|
|
8
|
+
Design note — **misses are 404, never 403** (plan §4.1): a caller must not be
|
|
9
|
+
able to tell "exists but forbidden" from "does not exist", or they could
|
|
10
|
+
enumerate tenants/namespaces/documents. So the not-found errors below all map to
|
|
11
|
+
404 and there is intentionally no 403 for resource access.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from enum import StrEnum
|
|
17
|
+
from typing import Any
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class ErrorCode(StrEnum):
    """Closed vocabulary of machine-readable error codes.

    Being a ``StrEnum``, every member is also a ``str`` equal to its
    lowercase snake_case value, so members can be compared with, or
    serialized as, plain strings.
    """

    # Request / validation
    VALIDATION_ERROR = "validation_error"
    INVALID_KEY = "invalid_key"
    INVALID_NAMESPACE = "invalid_namespace"

    # Not found (also used to hide forbidden — see module docstring)
    NOT_FOUND = "not_found"
    TENANT_NOT_FOUND = "tenant_not_found"
    NAMESPACE_NOT_FOUND = "namespace_not_found"
    DOCUMENT_NOT_FOUND = "document_not_found"

    # AuthN / AuthZ
    UNAUTHENTICATED = "unauthenticated"
    INSUFFICIENT_SCOPE = "insufficient_scope"

    # Read-path / capability
    CATALOG_ONLY = "catalog_only"
    SEARCH_NOT_ENABLED = "search_not_enabled"
    BUDGET_EXCEEDED = "budget_exceeded"

    # Write-path / quota
    QUOTA_EXCEEDED = "quota_exceeded"
    PAYLOAD_TOO_LARGE = "payload_too_large"
    CONFLICT = "conflict"

    # Extraction
    EXTRACTION_FAILED = "extraction_failed"

    # Catch-all (the default code on the base error class)
    INTERNAL = "internal"
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class AfsError(Exception):
    """Base for every agentic-fs error.

    Carries a closed :class:`ErrorCode`, an HTTP status, and an optional
    ``detail`` map; serializes to an RFC 9457 problem object via
    :meth:`to_problem`.

    Subclasses customise behaviour purely by overriding the three class
    attributes ``code``, ``http_status`` and ``title``.
    """

    code: ErrorCode = ErrorCode.INTERNAL
    http_status: int = 500
    title: str = "Internal error"

    def __init__(
        self,
        message: str | None = None,
        *,
        detail: dict[str, Any] | None = None,
    ) -> None:
        """Build the error.

        Args:
            message: Human-readable explanation; when omitted (or empty) the
                class ``title`` is used instead.
            detail: Optional extension members to attach to the problem
                object. The mapping is shallow-copied so later mutation of
                the caller's dict cannot change an error already raised.
        """
        self.message = message or self.title
        self.detail = dict(detail) if detail else {}
        super().__init__(self.message)

    def to_problem(self, *, instance: str | None = None) -> dict[str, Any]:
        """Render as an RFC 9457 ``application/problem+json`` object.

        Args:
            instance: Optional URI identifying this occurrence; emitted as
                the ``instance`` member only when provided.

        Returns:
            A new dict with the standard members (``type``, ``title``,
            ``status``, ``code``, ``detail`` and, if given, ``instance``)
            plus every entry of ``self.detail`` as an extension member.
        """
        problem: dict[str, Any] = {
            "type": f"https://agentic-fs.dev/errors/{self.code.value}",
            "title": self.title,
            "status": self.http_status,
            "code": self.code.value,
            "detail": self.message,
        }
        if instance is not None:
            problem["instance"] = instance
        # Extension members may add keys but must never replace a standard
        # member: a detail entry named e.g. "status" or "code" would otherwise
        # silently corrupt the envelope that clients branch on.
        for key, value in self.detail.items():
            problem.setdefault(key, value)
        return problem
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
# --- 4xx: client ---------------------------------------------------------------
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class ValidationError(AfsError):
    """Error with code ``validation_error``, reported as HTTP 400.

    Also the base class of the more specific invalid-key and
    invalid-namespace errors.
    """

    code = ErrorCode.VALIDATION_ERROR
    http_status = 400
    title = "Validation error"
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class InvalidKeyError(ValidationError):
    """Error with code ``invalid_key``; inherits HTTP 400 from ``ValidationError``."""

    code = ErrorCode.INVALID_KEY
    title = "Invalid object key"
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
class InvalidNamespaceError(ValidationError):
    """Error with code ``invalid_namespace``; inherits HTTP 400 from ``ValidationError``."""

    code = ErrorCode.INVALID_NAMESPACE
    title = "Invalid namespace"
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
class UnauthenticatedError(AfsError):
    """Error with code ``unauthenticated``, reported as HTTP 401."""

    code = ErrorCode.UNAUTHENTICATED
    http_status = 401
    title = "Unauthenticated"
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
class InsufficientScopeError(AfsError):
    """Error with code ``insufficient_scope``, reported as HTTP 403.

    This is the only 403 in the hierarchy; per the module docstring,
    resource-access misses are deliberately reported as 404 instead.
    """

    code = ErrorCode.INSUFFICIENT_SCOPE
    http_status = 403
    title = "Insufficient scope"
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
class NotFoundError(AfsError):
    """Generic 404. Also the disguise for forbidden resource access (§4.1).

    Base class of the tenant-, namespace- and document-specific not-found
    errors, which override only ``code`` and ``title``.
    """

    code = ErrorCode.NOT_FOUND
    http_status = 404
    title = "Not found"
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
class TenantNotFoundError(NotFoundError):
    """Error with code ``tenant_not_found``; inherits HTTP 404 from ``NotFoundError``."""

    code = ErrorCode.TENANT_NOT_FOUND
    title = "Tenant not found"
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
class NamespaceNotFoundError(NotFoundError):
    """Error with code ``namespace_not_found``; inherits HTTP 404 from ``NotFoundError``."""

    code = ErrorCode.NAMESPACE_NOT_FOUND
    title = "Namespace not found"
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
class DocumentNotFoundError(NotFoundError):
    """Error with code ``document_not_found``; inherits HTTP 404 from ``NotFoundError``."""

    code = ErrorCode.DOCUMENT_NOT_FOUND
    title = "Document not found"
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
class ConflictError(AfsError):
    """Error with code ``conflict``, reported as HTTP 409."""

    code = ErrorCode.CONFLICT
    http_status = 409
    title = "Conflict"
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
class PayloadTooLargeError(AfsError):
    """Error with code ``payload_too_large``, reported as HTTP 413."""

    code = ErrorCode.PAYLOAD_TOO_LARGE
    http_status = 413
    title = "Payload too large"
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
class QuotaExceededError(AfsError):
    """Error with code ``quota_exceeded``, reported as HTTP 429."""

    code = ErrorCode.QUOTA_EXCEEDED
    http_status = 429
    title = "Quota exceeded"
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
class BudgetExceededError(AfsError):
    """Error with code ``budget_exceeded``, reported as HTTP 422."""

    code = ErrorCode.BUDGET_EXCEEDED
    http_status = 422
    title = "Budget exceeded"
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
class CatalogOnlyError(AfsError):
    """The document exists and is cite-able, but its contents aren't readable yet.

    Carries code ``catalog_only`` and is reported as HTTP 422.
    """

    code = ErrorCode.CATALOG_ONLY
    http_status = 422
    title = "Document is catalog-only"
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
class SearchNotEnabledError(AfsError):
    """Error with code ``search_not_enabled``, reported as HTTP 422."""

    code = ErrorCode.SEARCH_NOT_ENABLED
    http_status = 422
    title = "Search is not enabled"
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
# --- 5xx: server ---------------------------------------------------------------
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
class ExtractionFailedError(AfsError):
    """Error with code ``extraction_failed``, reported as HTTP 500."""

    code = ErrorCode.EXTRACTION_FAILED
    http_status = 500
    title = "Extraction failed"
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
__all__ = [
|
|
194
|
+
"AfsError",
|
|
195
|
+
"BudgetExceededError",
|
|
196
|
+
"CatalogOnlyError",
|
|
197
|
+
"ConflictError",
|
|
198
|
+
"DocumentNotFoundError",
|
|
199
|
+
"ErrorCode",
|
|
200
|
+
"ExtractionFailedError",
|
|
201
|
+
"InsufficientScopeError",
|
|
202
|
+
"InvalidKeyError",
|
|
203
|
+
"InvalidNamespaceError",
|
|
204
|
+
"NamespaceNotFoundError",
|
|
205
|
+
"NotFoundError",
|
|
206
|
+
"PayloadTooLargeError",
|
|
207
|
+
"QuotaExceededError",
|
|
208
|
+
"SearchNotEnabledError",
|
|
209
|
+
"TenantNotFoundError",
|
|
210
|
+
"UnauthenticatedError",
|
|
211
|
+
"ValidationError",
|
|
212
|
+
]
|
afs_core/keys.py
ADDED
|
@@ -0,0 +1,271 @@
|
|
|
1
|
+
"""The single definition of the S3 key scheme (plan §3.2).
|
|
2
|
+
|
|
3
|
+
Nothing else in agentic-fs concatenates an object key — every consumer builds,
|
|
4
|
+
parses, and validates through here. The scheme is **channel-first** so that one
|
|
5
|
+
EventBridge rule (``prefix: tenants/``) feeds extraction, Bedrock KB syncs from a
|
|
6
|
+
prefix containing only embeddable text, and lifecycle rules are plain prefix
|
|
7
|
+
rules (plan §3.1):
|
|
8
|
+
|
|
9
|
+
tenants/{tenant}/{namespace}/{relpath} raw canonical documents
|
|
10
|
+
scratch/{tenant}/{principal}/{relpath} agent scratch (TTL'd)
|
|
11
|
+
derived/text/{tenant}/{ns}/{doc_id}/{page:04d}.md extracted text layer
|
|
12
|
+
derived/text/{tenant}/{ns}/{doc_id}/{page:04d}.md.metadata.json KB sidecar
|
|
13
|
+
derived/meta/{tenant}/{ns}/{doc_id}/manifest.json extraction manifest
|
|
14
|
+
derived/tree/{tenant}/{ns}.json.zst path-tree artifact
|
|
15
|
+
|
|
16
|
+
``parse_key`` returns ``None`` for anything nonconforming — it never guesses.
|
|
17
|
+
``is_indexable`` is the one predicate every consumer (cataloger, extractor,
|
|
18
|
+
index-sync, tree builder, search scope) uses to exclude ``scratch/`` and
|
|
19
|
+
``derived/``.
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
import re
|
|
25
|
+
from dataclasses import dataclass
|
|
26
|
+
from enum import StrEnum
|
|
27
|
+
|
|
28
|
+
from afs_core.errors import InvalidKeyError
|
|
29
|
+
|
|
30
|
+
PAGE_DIGITS = 4
|
|
31
|
+
MAX_PAGE = 10**PAGE_DIGITS - 1
|
|
32
|
+
|
|
33
|
+
# Lowercase slugs for tenant / namespace / principal ids.
|
|
34
|
+
_SLUG_RE = re.compile(r"^[a-z0-9][a-z0-9_-]*$")
|
|
35
|
+
# doc_id is a ULID (uppercase Crockford base32) or similar opaque id.
|
|
36
|
+
_ID_RE = re.compile(r"^[A-Za-z0-9_-]+$")
|
|
37
|
+
|
|
38
|
+
# Segments a relpath may never contain (traversal + channel words + dotfiles).
|
|
39
|
+
_RESERVED_SEGMENTS = frozenset({"", ".", "..", "tenants", "scratch", "derived"})
|
|
40
|
+
|
|
41
|
+
_TREE_SUFFIX = ".json.zst"
|
|
42
|
+
_TEXT_SUFFIX = ".md"
|
|
43
|
+
_METADATA_SUFFIX = ".metadata.json"
|
|
44
|
+
_MANIFEST_NAME = "manifest.json"
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class Channel(StrEnum):
    """Top-level key channels.

    Member values are channel identifiers, not key prefixes; the prefix each
    channel corresponds to is noted beside the member.
    """

    ORIGINAL = "original"  # tenants/
    SCRATCH = "scratch"  # scratch/
    DERIVED_TEXT = "derived_text"  # derived/text/
    DERIVED_META = "derived_meta"  # derived/meta/
    DERIVED_TREE = "derived_tree"  # derived/tree/
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@dataclass(frozen=True, slots=True)
class ParsedKey:
    """The structured decomposition of a conforming key.

    Fields not relevant to a channel stay ``None`` (e.g. an ORIGINAL key has no
    ``doc_id``; a DERIVED_TREE key has no ``path``).

    Instances are immutable (``frozen=True``) and carry no per-instance
    ``__dict__`` (``slots=True``).
    """

    # Which channel the key belongs to; always set.
    channel: Channel
    # Tenant slug; always set.
    tenant_id: str
    # Namespace slug; set for every channel except SCRATCH.
    namespace: str | None = None
    # Relative path below the tenant/namespace (ORIGINAL) or
    # tenant/principal (SCRATCH) prefix.
    path: str | None = None
    # Principal slug; SCRATCH keys only.
    principal_id: str | None = None
    # Document id; DERIVED_TEXT and DERIVED_META keys only.
    doc_id: str | None = None
    # Page number parsed from the file name; DERIVED_TEXT keys only.
    page: int | None = None
    # True when a DERIVED_TEXT key is the ``.metadata.json`` sidecar rather
    # than the page text itself.
    is_metadata: bool = False
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
# --- validation helpers --------------------------------------------------------
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _is_slug(value: str) -> bool:
    """Return True when the whole of ``value`` matches the slug pattern."""
    return _SLUG_RE.fullmatch(value) is not None
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _is_id(value: str) -> bool:
    """Return True when the whole of ``value`` matches the opaque-id pattern."""
    return _ID_RE.fullmatch(value) is not None
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _relpath_ok(relpath: str) -> bool:
    """Return True when ``relpath`` is an acceptable relative path.

    A path is rejected when it is empty, starts with ``/``, or has any
    ``/``-separated segment that is reserved (this includes the empty
    segment produced by ``//`` or a trailing slash, ``.``, ``..`` and the
    channel words) or that begins with an underscore.

    NOTE(review): the prefix test is for ``_``, so segments such as
    ``.hidden`` pass while ``_private`` is rejected. Nearby comments speak
    of "dotfiles" — confirm which of the two is intended.
    """
    if not relpath:
        return False
    if relpath.startswith("/"):
        return False
    for segment in relpath.split("/"):
        if segment in _RESERVED_SEGMENTS:
            return False
        if segment.startswith("_"):
            return False
    return True
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def validate_slug(value: str, *, field: str) -> str:
    """Return ``value`` unchanged if it is a valid slug.

    Raises:
        InvalidKeyError: ``value`` does not match the slug pattern; the
            error's ``detail`` carries the offending ``field`` and ``value``.
    """
    if _is_slug(value):
        return value
    raise InvalidKeyError(
        f"{field} must be a lowercase slug",
        detail={"field": field, "value": value},
    )
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def validate_id(value: str, *, field: str) -> str:
    """Return ``value`` unchanged if it is a valid opaque id.

    Raises:
        InvalidKeyError: ``value`` does not match the id pattern; the
            error's ``detail`` carries the offending ``field`` and ``value``.
    """
    if _is_id(value):
        return value
    raise InvalidKeyError(
        f"{field} must be alphanumeric",
        detail={"field": field, "value": value},
    )
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def validate_relpath(relpath: str) -> str:
    """Return ``relpath`` unchanged if it passes the shared relpath check.

    Builders and parsers both go through the same ``_relpath_ok``
    predicate, so there is a single definition of an acceptable path.

    Raises:
        InvalidKeyError: the path is empty, absolute, or contains a
            rejected segment; ``detail`` carries the offending ``relpath``.
    """
    if _relpath_ok(relpath):
        return relpath
    raise InvalidKeyError("invalid relpath", detail={"relpath": relpath})
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def _validate_page(page: int) -> int:
    """Return ``page`` unchanged if it lies in ``[0, MAX_PAGE]``.

    Raises:
        InvalidKeyError: ``page`` is outside the range that fits in the
            fixed-width page field; ``detail`` carries the offending value.
    """
    if 0 <= page <= MAX_PAGE:
        return page
    raise InvalidKeyError(f"page must be in [0, {MAX_PAGE}]", detail={"page": page})
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
# --- builders ------------------------------------------------------------------
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def originals_key(tenant_id: str, namespace: str, path: str) -> str:
    """Build the ``tenants/{tenant}/{namespace}/{relpath}`` key.

    Raises:
        InvalidKeyError: any component fails validation (checked in the
            order tenant, namespace, path).
    """
    tenant = validate_slug(tenant_id, field="tenant_id")
    ns = validate_slug(namespace, field="namespace")
    relpath = validate_relpath(path)
    return "/".join(("tenants", tenant, ns, relpath))
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def scratch_key(tenant_id: str, principal_id: str, path: str) -> str:
    """Build the ``scratch/{tenant}/{principal}/{relpath}`` key.

    Raises:
        InvalidKeyError: any component fails validation (checked in the
            order tenant, principal, path).
    """
    tenant = validate_slug(tenant_id, field="tenant_id")
    principal = validate_slug(principal_id, field="principal_id")
    relpath = validate_relpath(path)
    return "/".join(("scratch", tenant, principal, relpath))
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def derived_text_key(tenant_id: str, namespace: str, doc_id: str, page: int) -> str:
    """Build the ``derived/text/{tenant}/{ns}/{doc_id}/{page}.md`` key.

    The page number is zero-padded to ``PAGE_DIGITS`` digits.

    Raises:
        InvalidKeyError: any component fails validation (checked in the
            order tenant, namespace, doc_id, page).
    """
    tenant = validate_slug(tenant_id, field="tenant_id")
    ns = validate_slug(namespace, field="namespace")
    doc = validate_id(doc_id, field="doc_id")
    page_number = _validate_page(page)
    filename = format(page_number, f"0{PAGE_DIGITS}d") + _TEXT_SUFFIX
    return "/".join(("derived/text", tenant, ns, doc, filename))
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def derived_text_metadata_key(tenant_id: str, namespace: str, doc_id: str, page: int) -> str:
    """Build the sidecar key: the page's text key plus the metadata suffix.

    Validation (and any ``InvalidKeyError``) comes from ``derived_text_key``.
    """
    text_key = derived_text_key(tenant_id, namespace, doc_id, page)
    return f"{text_key}{_METADATA_SUFFIX}"
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def derived_meta_key(tenant_id: str, namespace: str, doc_id: str) -> str:
    """Build the ``derived/meta/{tenant}/{ns}/{doc_id}/manifest.json`` key.

    Raises:
        InvalidKeyError: any component fails validation (checked in the
            order tenant, namespace, doc_id).
    """
    tenant = validate_slug(tenant_id, field="tenant_id")
    ns = validate_slug(namespace, field="namespace")
    doc = validate_id(doc_id, field="doc_id")
    return "/".join(("derived/meta", tenant, ns, doc, _MANIFEST_NAME))
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def tree_key(tenant_id: str, namespace: str) -> str:
    """Build the ``derived/tree/{tenant}/{ns}.json.zst`` key.

    Raises:
        InvalidKeyError: tenant or namespace is not a valid slug.
    """
    tenant = validate_slug(tenant_id, field="tenant_id")
    ns = validate_slug(namespace, field="namespace")
    return "/".join(("derived/tree", tenant, ns + _TREE_SUFFIX))
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
# --- parsing -------------------------------------------------------------------
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def parse_key(key: str) -> ParsedKey | None:
    """Decompose a key, or return ``None`` if it doesn't conform. Never guesses.

    The key is routed by its channel prefix to the matching per-channel
    parser, which receives the remainder after the prefix. Prefixes are
    tried in a fixed order and the first match decides the outcome.
    """
    routes = (
        ("derived/tree/", _parse_tree),
        ("derived/text/", _parse_text),
        ("derived/meta/", _parse_meta),
        ("scratch/", _parse_scratch),
        ("tenants/", _parse_original),
    )
    for prefix, parser in routes:
        if key.startswith(prefix):
            return parser(key[len(prefix):])
    return None
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def _parse_tree(rest: str) -> ParsedKey | None:
    """Parse the remainder of a ``derived/tree/`` key.

    Expects exactly ``{tenant}/{namespace}.json.zst`` with both names being
    valid slugs; anything else yields ``None``.
    """
    parts = rest.split("/")
    if len(parts) != 2:
        return None
    tenant, leaf = parts
    if not leaf.endswith(_TREE_SUFFIX):
        return None
    namespace = leaf[: -len(_TREE_SUFFIX)]
    if _is_slug(tenant) and _is_slug(namespace):
        return ParsedKey(Channel.DERIVED_TREE, tenant_id=tenant, namespace=namespace)
    return None
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def _parse_text(rest: str) -> ParsedKey | None:
    """Parse the remainder of a ``derived/text/`` key.

    Expects exactly ``{tenant}/{namespace}/{doc_id}/{page}.md``, optionally
    followed by the ``.metadata.json`` sidecar suffix, where ``page`` is
    exactly ``PAGE_DIGITS`` ASCII digits.

    Returns:
        A ``DERIVED_TEXT`` :class:`ParsedKey` with ``page`` as an int and
        ``is_metadata`` set for sidecar keys, or ``None`` when the key does
        not conform. Never raises for malformed input.
    """
    segs = rest.split("/")
    if len(segs) != 4:
        return None
    tenant, ns, doc_id, filename = segs
    is_metadata = filename.endswith(_METADATA_SUFFIX)
    base = filename.removesuffix(_METADATA_SUFFIX) if is_metadata else filename
    if not base.endswith(_TEXT_SUFFIX):
        return None
    page_str = base.removesuffix(_TEXT_SUFFIX)
    # ``str.isdigit()`` on its own also accepts non-ASCII digits: superscripts
    # such as "²" make ``int()`` raise ValueError, and other scripts' digits
    # would parse to a page number the builder could never have produced.
    # Requiring ASCII keeps the "return None, never guess" contract.
    if not (len(page_str) == PAGE_DIGITS and page_str.isascii() and page_str.isdigit()):
        return None
    if not (_is_slug(tenant) and _is_slug(ns) and _is_id(doc_id)):
        return None
    return ParsedKey(
        Channel.DERIVED_TEXT,
        tenant_id=tenant,
        namespace=ns,
        doc_id=doc_id,
        page=int(page_str),
        is_metadata=is_metadata,
    )
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def _parse_meta(rest: str) -> ParsedKey | None:
    """Parse the remainder of a ``derived/meta/`` key.

    Expects exactly ``{tenant}/{namespace}/{doc_id}/manifest.json``;
    anything else yields ``None``.
    """
    parts = rest.split("/")
    if len(parts) != 4:
        return None
    tenant, namespace, doc_id, leaf = parts
    if leaf != _MANIFEST_NAME:
        return None
    if _is_slug(tenant) and _is_slug(namespace) and _is_id(doc_id):
        return ParsedKey(
            Channel.DERIVED_META, tenant_id=tenant, namespace=namespace, doc_id=doc_id
        )
    return None
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def _parse_scratch(rest: str) -> ParsedKey | None:
    """Parse the remainder of a ``scratch/`` key.

    Expects ``{tenant}/{principal}/{relpath}`` where the relpath may itself
    contain slashes; anything else yields ``None``.
    """
    # Split off only the two leading slugs; the tail is the relpath verbatim.
    parts = rest.split("/", 2)
    if len(parts) < 3:
        return None
    tenant, principal, relpath = parts
    if _is_slug(tenant) and _is_slug(principal) and _relpath_ok(relpath):
        return ParsedKey(
            Channel.SCRATCH, tenant_id=tenant, principal_id=principal, path=relpath
        )
    return None
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def _parse_original(rest: str) -> ParsedKey | None:
    """Parse the remainder of a ``tenants/`` key.

    Expects ``{tenant}/{namespace}/{relpath}`` where the relpath may itself
    contain slashes; anything else yields ``None``.
    """
    # Split off only the two leading slugs; the tail is the relpath verbatim.
    parts = rest.split("/", 2)
    if len(parts) < 3:
        return None
    tenant, namespace, relpath = parts
    if _is_slug(tenant) and _is_slug(namespace) and _relpath_ok(relpath):
        return ParsedKey(
            Channel.ORIGINAL, tenant_id=tenant, namespace=namespace, path=relpath
        )
    return None
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
def is_indexable(key: str) -> bool:
    """True only for raw canonical documents under ``tenants/`` (plan §3.2).

    A key qualifies when it parses successfully and lands in the ORIGINAL
    channel. This is the single predicate that keeps ``scratch/`` and
    ``derived/`` out of the cataloger, extractor, index-sync, tree builder,
    and search scope.
    """
    parsed = parse_key(key)
    if parsed is None:
        return False
    return parsed.channel is Channel.ORIGINAL
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
__all__ = [
|
|
258
|
+
"Channel",
|
|
259
|
+
"ParsedKey",
|
|
260
|
+
"derived_meta_key",
|
|
261
|
+
"derived_text_key",
|
|
262
|
+
"derived_text_metadata_key",
|
|
263
|
+
"is_indexable",
|
|
264
|
+
"originals_key",
|
|
265
|
+
"parse_key",
|
|
266
|
+
"scratch_key",
|
|
267
|
+
"tree_key",
|
|
268
|
+
"validate_id",
|
|
269
|
+
"validate_relpath",
|
|
270
|
+
"validate_slug",
|
|
271
|
+
]
|