afs-core 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
afs_core/__init__.py ADDED
@@ -0,0 +1,9 @@
1
+ """agentic-fs core: contracts, DTOs, the key scheme, and the error vocabulary.
2
+
3
+ Depends on pydantic only — importable without the server. (``afs_core.testing``
4
+ is intentionally not imported here: it depends on pytest and is opt-in.)
5
+ """
6
+
7
+ from afs_core import contracts, errors, keys, models
8
+
9
+ __all__ = ["contracts", "errors", "keys", "models"]
@@ -0,0 +1,11 @@
1
+ """The agentic-fs contracts: async ``typing.Protocol`` interfaces (plan §5).
2
+
3
+ Structural — adopters implement without importing our hierarchy. Each is proven
4
+ by a conformance kit in :mod:`afs_core.testing`.
5
+ """
6
+
7
+ from afs_core.contracts.catalog import CatalogStore
8
+ from afs_core.contracts.normalize import NormalizationError, Normalizer
9
+ from afs_core.contracts.objects import ObjectStore
10
+
11
+ __all__ = ["CatalogStore", "NormalizationError", "Normalizer", "ObjectStore"]
@@ -0,0 +1,90 @@
1
+ """The ``CatalogStore`` contract (plan §5.1).
2
+
3
+ One contract covers entries + control records + checkpoints + scratch quota, so a
4
+ self-hoster swaps **one** stateful dependency. Structural ``Protocol``, async.
5
+ Certify an impl with ``CatalogStoreConformance`` (afs_core.testing); DynamoDB and
6
+ Postgres are the two reference implementations.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from typing import Protocol, runtime_checkable
12
+
13
+ from afs_core.models import (
14
+ CatalogEntry,
15
+ ExtractionState,
16
+ NamespaceRecord,
17
+ Page,
18
+ PrincipalRecord,
19
+ ScratchUsage,
20
+ SyncCheckpoint,
21
+ TenantRecord,
22
+ )
23
+
24
+
25
@runtime_checkable
class CatalogStore(Protocol):
    """Async structural interface for the catalog's stateful storage.

    The methods fall into four families, marked by the section comments below:
    catalog entries, control records (tenants / namespaces / principals),
    connector checkpoints, and scratch-quota accounting. Every method is a
    coroutine whose body is a ``...`` stub; implementations supply behaviour.

    NOTE: ``runtime_checkable`` makes ``isinstance(obj, CatalogStore)`` verify
    only that the member names exist — not their signatures or semantics.
    """

    # -- entries (derived index of S3; healable FROM S3) --
    # Store one entry. NOTE(review): presumably an upsert keyed by the entry's
    # own tenant/namespace/path — confirm against the conformance kit.
    async def put_entry(self, entry: CatalogEntry) -> None: ...

    # Fetch one entry addressed by (tenant_id, namespace, path). The return
    # type allows ``None``; presumably that means "no such entry" — confirm.
    async def get_entry(self, tenant_id: str, namespace: str, path: str) -> CatalogEntry | None: ...

    async def delete_entry(
        self, tenant_id: str, namespace: str, path: str, *, hard: bool = False
    ) -> None:
        """Tombstone (soft) by default; ``hard=True`` removes the row entirely."""
        ...

    # Paged listing within one namespace. ``prefix`` and ``include_deleted``
    # are filters; ``cursor``/``limit`` control paging.
    # NOTE(review): presumably ``cursor`` is the continuation token carried by
    # the previous ``Page`` — verify against the ``Page`` model.
    async def list_entries(
        self,
        tenant_id: str,
        namespace: str,
        *,
        prefix: str = "",
        include_deleted: bool = False,
        cursor: str | None = None,
        limit: int = 1000,
    ) -> Page[CatalogEntry]: ...

    # Tenant-wide lookup by checksum (no namespace argument); returns a plain
    # list rather than a ``Page``.
    async def find_by_checksum(self, tenant_id: str, checksum: str) -> list[CatalogEntry]: ...

    # Record the extraction state for the entry at (tenant_id, namespace, path).
    async def set_extraction(
        self, tenant_id: str, namespace: str, path: str, state: ExtractionState
    ) -> None: ...

    # Paged listing by extraction status. Takes no tenant argument, so the
    # signature is not tenant-scoped. ``status`` is a plain ``str`` here.
    async def list_by_extraction_status(
        self, status: str, *, cursor: str | None = None, limit: int = 100
    ) -> Page[CatalogEntry]: ...

    async def tree_version(self, tenant_id: str, namespace: str) -> str:
        """A token bumped on any write to the namespace — the tree-cache key."""
        ...

    # -- control records (tenants / namespaces / principals) --
    # Each record kind has put/get/list; only namespaces also expose delete.
    async def put_tenant(self, tenant: TenantRecord) -> None: ...
    async def get_tenant(self, tenant_id: str) -> TenantRecord | None: ...
    async def list_tenants(
        self, *, cursor: str | None = None, limit: int = 100
    ) -> Page[TenantRecord]: ...

    # Namespace and principal listings are unpaged (``list[...]``), unlike
    # ``list_tenants`` which returns a ``Page``.
    async def put_namespace(self, ns: NamespaceRecord) -> None: ...
    async def get_namespace(self, tenant_id: str, name: str) -> NamespaceRecord | None: ...
    async def list_namespaces(self, tenant_id: str) -> list[NamespaceRecord]: ...
    async def delete_namespace(self, tenant_id: str, name: str) -> None: ...

    async def put_principal(self, p: PrincipalRecord) -> None: ...
    async def get_principal(self, tenant_id: str, principal_id: str) -> PrincipalRecord | None: ...
    async def list_principals(self, tenant_id: str) -> list[PrincipalRecord]: ...

    # -- connector checkpoints --
    # One checkpoint is addressed by (tenant_id, connector_id).
    async def get_checkpoint(self, tenant_id: str, connector_id: str) -> SyncCheckpoint | None: ...
    async def put_checkpoint(
        self, tenant_id: str, connector_id: str, cp: SyncCheckpoint
    ) -> None: ...

    # -- scratch quota (atomic; raises QuotaExceededError) --
    # Apply signed deltas to a principal's scratch counters and return the
    # resulting usage. Both deltas are required keyword-only arguments.
    async def adjust_scratch_usage(
        self, tenant_id: str, principal_id: str, *, delta_bytes: int, delta_objects: int
    ) -> ScratchUsage: ...

    # Read the current usage; the return type is non-optional, so
    # implementations must return a ``ScratchUsage`` even for unseen principals
    # (presumably zeroed — confirm).
    async def get_scratch_usage(self, tenant_id: str, principal_id: str) -> ScratchUsage: ...
@@ -0,0 +1,40 @@
1
+ """The ``Normalizer`` contract (plan §5.4) — the extractor/parser seam.
2
+
3
+ A normalizer turns one raw document into normalized per-page markdown. It is the
4
+ *only* place document parsing lives: text_native, docling, llamaparse, and any
5
+ custom parser are all just `Normalizer`s. The `ExtractionPipeline` (in
6
+ afs-server) orders them into a ladder, applies a quality gate, and degrades to
7
+ `catalog_only` — none of which a normalizer needs to know about.
8
+
9
+ Adding your own: implement this Protocol, certify it against
10
+ `afs_core.testing.NormalizerConformance`, register it via the `afs.normalizers`
11
+ entry-point group, and name it in the extraction ladder. No core changes.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ from typing import Protocol, runtime_checkable
17
+
18
+ from afs_core.models import NormalizedDocument, SourceDocument
19
+
20
+
21
class NormalizationError(Exception):
    """Raised by a normalizer that could not parse the document.

    ``reason`` is a closed-vocabulary code (events v1) kept on the instance so
    the pipeline can record why the rung failed before trying the next one.
    The exception text is ``message`` when one is given (and non-empty),
    otherwise the ``reason`` itself.
    """

    def __init__(self, reason: str, message: str | None = None) -> None:
        text = message if message else reason
        super().__init__(text)
        self.reason = reason
28
+
29
+
30
@runtime_checkable
class Normalizer(Protocol):
    """Structural interface for one document parser.

    An implementation exposes a ``name``, a synchronous ``accepts`` predicate,
    and an async ``normalize`` coroutine. ``runtime_checkable`` only verifies
    that these members exist, not their signatures.
    """

    # Identifier for this normalizer. NOTE(review): presumably the name used to
    # reference it in the extraction ladder — confirm in afs-server.
    name: str

    def accepts(self, doc: SourceDocument) -> bool:
        """Whether this normalizer claims the document (by MIME/extension)."""
        ...

    async def normalize(self, doc: SourceDocument) -> NormalizedDocument:
        """Parse ``doc`` into per-page markdown, or raise ``NormalizationError``."""
        ...
@@ -0,0 +1,44 @@
1
+ """The ``ObjectStore`` contract (plan §5.2).
2
+
3
+ Structural ``Protocol`` — adopters implement it without importing our hierarchy
4
+ or depending on ``afs-server``. S3 is the only production impl; the protocol
5
+ exists so MinIO/LocalStack back local dev and an in-memory fake backs tests.
6
+ Certify any impl with ``ObjectStoreConformance`` (afs_core.testing).
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from typing import Protocol, runtime_checkable
12
+
13
+ from afs_core.models import ObjectStat, Page, PresignedPut
14
+
15
+
16
@runtime_checkable
class ObjectStore(Protocol):
    """Async structural interface over a key-addressed blob store.

    Every method is a coroutine with a ``...`` stub body. ``runtime_checkable``
    only verifies that the method names exist on a candidate object.
    """

    async def get(self, key: str, *, start: int | None = None, end: int | None = None) -> bytes:
        """Fetch an object, optionally a byte range ``[start, end]`` (inclusive)."""
        ...

    # Write ``body`` under ``key`` and return the resulting object metadata.
    # ``content_type`` is optional.
    async def put(
        self, key: str, body: bytes, *, content_type: str | None = None
    ) -> ObjectStat: ...

    # Remove a single object. NOTE(review): behaviour for a missing key is not
    # specified by this signature — check the conformance kit.
    async def delete(self, key: str) -> None: ...

    async def delete_prefix(self, prefix: str) -> int:
        """Delete every object under ``prefix``; returns the count removed."""
        ...

    async def stat(self, key: str) -> ObjectStat | None:
        """Metadata for ``key``, or ``None`` if it does not exist."""
        ...

    # Paged listing of objects under ``prefix``; ``cursor``/``limit`` page it.
    # NOTE: the method name shadows the builtin ``list`` inside the class body
    # only; annotations in this class do not use the builtin.
    async def list(
        self, prefix: str, *, cursor: str | None = None, limit: int = 1000
    ) -> Page[ObjectStat]: ...

    # Produce a presigned upload for ``key``. ``content_type`` and ``max_bytes``
    # are required; ``expires_in`` defaults to 900 (presumably seconds — confirm).
    async def presigned_put(
        self, key: str, *, content_type: str, max_bytes: int, expires_in: int = 900
    ) -> PresignedPut: ...

    # Produce a presigned download, returned as a plain string (presumably a
    # URL). ``expires_in`` defaults to 300 (presumably seconds — confirm).
    async def presigned_get(self, key: str, *, expires_in: int = 300) -> str: ...
afs_core/errors.py ADDED
@@ -0,0 +1,212 @@
1
+ """The closed error vocabulary and the ``AfsError`` hierarchy.
2
+
3
+ Every error agentic-fs raises across the wire carries a code from the closed
4
+ :class:`ErrorCode` enum and serializes to an RFC 9457 ``application/problem+json``
5
+ envelope. The vocabulary is closed on purpose: clients (and the MCP tool layer)
6
+ can branch on a small, stable set of codes instead of parsing prose.
7
+
8
+ Design note — **misses are 404, never 403** (plan §4.1): a caller must not be
9
+ able to tell "exists but forbidden" from "does not exist", or they could
10
+ enumerate tenants/namespaces/documents. So the not-found errors below all map to
11
+ 404 and there is intentionally no 403 for resource access.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ from enum import StrEnum
17
+ from typing import Any
18
+
19
+
20
class ErrorCode(StrEnum):
    """Closed vocabulary of machine-readable error codes.

    Members are ``str`` instances (``StrEnum``); each value is the snake_case
    code that the error classes in this module expose via their ``code``
    attribute.
    """

    # Request / validation
    VALIDATION_ERROR = "validation_error"
    INVALID_KEY = "invalid_key"
    INVALID_NAMESPACE = "invalid_namespace"

    # Not found (also used to hide forbidden — see module docstring)
    NOT_FOUND = "not_found"
    TENANT_NOT_FOUND = "tenant_not_found"
    NAMESPACE_NOT_FOUND = "namespace_not_found"
    DOCUMENT_NOT_FOUND = "document_not_found"

    # AuthN / AuthZ
    UNAUTHENTICATED = "unauthenticated"
    INSUFFICIENT_SCOPE = "insufficient_scope"

    # Read-path / capability
    CATALOG_ONLY = "catalog_only"
    SEARCH_NOT_ENABLED = "search_not_enabled"
    BUDGET_EXCEEDED = "budget_exceeded"

    # Write-path / quota
    QUOTA_EXCEEDED = "quota_exceeded"
    PAYLOAD_TOO_LARGE = "payload_too_large"
    CONFLICT = "conflict"

    # Extraction
    EXTRACTION_FAILED = "extraction_failed"

    # Catch-all (the default ``code`` of the ``AfsError`` base class)
    INTERNAL = "internal"
53
+
54
+
55
class AfsError(Exception):
    """Base for every agentic-fs error.

    Carries a closed :class:`ErrorCode`, an HTTP status, and an optional
    ``detail`` map; serializes to an RFC 9457 problem object via
    :meth:`to_problem`. Subclasses override the three class attributes below.
    """

    code: ErrorCode = ErrorCode.INTERNAL
    http_status: int = 500
    title: str = "Internal error"

    def __init__(
        self,
        message: str | None = None,
        *,
        detail: dict[str, Any] | None = None,
    ) -> None:
        """Create the error.

        Args:
            message: Human-readable explanation; falls back to the class
                ``title`` when omitted or empty.
            detail: Extra members to expose on the problem object. The mapping
                is shallow-copied so later mutation of the caller's dict cannot
                change this error.
        """
        self.message = message or self.title
        self.detail = dict(detail) if detail else {}
        super().__init__(self.message)

    def to_problem(self, *, instance: str | None = None) -> dict[str, Any]:
        """Render as an RFC 9457 ``application/problem+json`` object.

        Args:
            instance: Optional value for the ``instance`` member; omitted from
                the result when ``None``.

        Returns:
            A new dict with ``type``, ``title``, ``status``, ``code`` and
            ``detail`` (the message), plus ``instance`` if given, followed by
            the entries of ``self.detail`` as extension members.
        """
        problem: dict[str, Any] = {
            "type": f"https://agentic-fs.dev/errors/{self.code.value}",
            "title": self.title,
            "status": self.http_status,
            "code": self.code.value,
            "detail": self.message,
        }
        if instance is not None:
            problem["instance"] = instance
        # Extension members must never clobber the standard members set above:
        # a stray ``detail={"status": ...}`` would otherwise rewrite the wire
        # status/code that clients branch on.
        for key, value in self.detail.items():
            problem.setdefault(key, value)
        return problem
90
+
91
+
92
+ # --- 4xx: client ---------------------------------------------------------------
93
+
94
+
95
class ValidationError(AfsError):
    """Request failed validation: HTTP 400, code ``validation_error``.

    Also the base class of the more specific key/namespace validation errors.
    """

    code = ErrorCode.VALIDATION_ERROR
    http_status = 400
    title = "Validation error"
99
+
100
+
101
class InvalidKeyError(ValidationError):
    """An object key (or one of its components) is malformed.

    Inherits HTTP 400 from ``ValidationError``; code ``invalid_key``.
    """

    code = ErrorCode.INVALID_KEY
    title = "Invalid object key"
104
+
105
+
106
class InvalidNamespaceError(ValidationError):
    """A namespace value was rejected.

    Inherits HTTP 400 from ``ValidationError``; code ``invalid_namespace``.
    """

    code = ErrorCode.INVALID_NAMESPACE
    title = "Invalid namespace"
109
+
110
+
111
class UnauthenticatedError(AfsError):
    """HTTP 401 with code ``unauthenticated``.

    NOTE(review): presumably raised when credentials are missing or invalid —
    confirm at the raise sites (not visible in this module).
    """

    code = ErrorCode.UNAUTHENTICATED
    http_status = 401
    title = "Unauthenticated"
115
+
116
+
117
class InsufficientScopeError(AfsError):
    """HTTP 403 with code ``insufficient_scope``.

    This is the only 403 defined in this module. Per the module docstring,
    forbidden *resource* access is reported as 404 instead, so this error is
    presumably reserved for scope/permission checks on the operation itself —
    confirm at the raise sites.
    """

    code = ErrorCode.INSUFFICIENT_SCOPE
    http_status = 403
    title = "Insufficient scope"
121
+
122
+
123
class NotFoundError(AfsError):
    """Generic 404. Also the disguise for forbidden resource access (§4.1).

    Base class of the tenant/namespace/document not-found errors, which keep
    the 404 status and only override ``code`` and ``title``.
    """

    code = ErrorCode.NOT_FOUND
    http_status = 404
    title = "Not found"
129
+
130
+
131
class TenantNotFoundError(NotFoundError):
    """Tenant lookup miss; inherits HTTP 404, code ``tenant_not_found``."""

    code = ErrorCode.TENANT_NOT_FOUND
    title = "Tenant not found"
134
+
135
+
136
class NamespaceNotFoundError(NotFoundError):
    """Namespace lookup miss; inherits HTTP 404, code ``namespace_not_found``."""

    code = ErrorCode.NAMESPACE_NOT_FOUND
    title = "Namespace not found"
139
+
140
+
141
class DocumentNotFoundError(NotFoundError):
    """Document lookup miss; inherits HTTP 404, code ``document_not_found``."""

    code = ErrorCode.DOCUMENT_NOT_FOUND
    title = "Document not found"
144
+
145
+
146
class ConflictError(AfsError):
    """HTTP 409 with code ``conflict``."""

    code = ErrorCode.CONFLICT
    http_status = 409
    title = "Conflict"
150
+
151
+
152
class PayloadTooLargeError(AfsError):
    """HTTP 413 with code ``payload_too_large``."""

    code = ErrorCode.PAYLOAD_TOO_LARGE
    http_status = 413
    title = "Payload too large"
156
+
157
+
158
class QuotaExceededError(AfsError):
    """HTTP 429 with code ``quota_exceeded``.

    The ``CatalogStore`` contract names this as the error raised by its
    scratch-quota accounting.
    """

    code = ErrorCode.QUOTA_EXCEEDED
    http_status = 429
    title = "Quota exceeded"
162
+
163
+
164
class BudgetExceededError(AfsError):
    """HTTP 422 with code ``budget_exceeded``.

    NOTE(review): grouped under "read-path / capability" in ``ErrorCode``;
    which budget is meant is not visible in this module.
    """

    code = ErrorCode.BUDGET_EXCEEDED
    http_status = 422
    title = "Budget exceeded"
168
+
169
+
170
class CatalogOnlyError(AfsError):
    """The document exists and is cite-able, but its contents aren't readable yet.

    Reported as HTTP 422 with code ``catalog_only``.
    """

    code = ErrorCode.CATALOG_ONLY
    http_status = 422
    title = "Document is catalog-only"
176
+
177
+
178
class SearchNotEnabledError(AfsError):
    """HTTP 422 with code ``search_not_enabled``."""

    code = ErrorCode.SEARCH_NOT_ENABLED
    http_status = 422
    title = "Search is not enabled"
182
+
183
+
184
+ # --- 5xx: server ---------------------------------------------------------------
185
+
186
+
187
class ExtractionFailedError(AfsError):
    """HTTP 500 with code ``extraction_failed`` — the only server-side (5xx)
    subclass defined in this module."""

    code = ErrorCode.EXTRACTION_FAILED
    http_status = 500
    title = "Extraction failed"
191
+
192
+
193
+ __all__ = [
194
+ "AfsError",
195
+ "BudgetExceededError",
196
+ "CatalogOnlyError",
197
+ "ConflictError",
198
+ "DocumentNotFoundError",
199
+ "ErrorCode",
200
+ "ExtractionFailedError",
201
+ "InsufficientScopeError",
202
+ "InvalidKeyError",
203
+ "InvalidNamespaceError",
204
+ "NamespaceNotFoundError",
205
+ "NotFoundError",
206
+ "PayloadTooLargeError",
207
+ "QuotaExceededError",
208
+ "SearchNotEnabledError",
209
+ "TenantNotFoundError",
210
+ "UnauthenticatedError",
211
+ "ValidationError",
212
+ ]
afs_core/keys.py ADDED
@@ -0,0 +1,271 @@
1
+ """The single definition of the S3 key scheme (plan §3.2).
2
+
3
+ Nothing else in agentic-fs concatenates an object key — every consumer builds,
4
+ parses, and validates through here. The scheme is **channel-first** so that one
5
+ EventBridge rule (``prefix: tenants/``) feeds extraction, Bedrock KB syncs from a
6
+ prefix containing only embeddable text, and lifecycle rules are plain prefix
7
+ rules (plan §3.1):
8
+
9
+ tenants/{tenant}/{namespace}/{relpath} raw canonical documents
10
+ scratch/{tenant}/{principal}/{relpath} agent scratch (TTL'd)
11
+ derived/text/{tenant}/{ns}/{doc_id}/{page:04d}.md extracted text layer
12
+ derived/text/{tenant}/{ns}/{doc_id}/{page:04d}.md.metadata.json KB sidecar
13
+ derived/meta/{tenant}/{ns}/{doc_id}/manifest.json extraction manifest
14
+ derived/tree/{tenant}/{ns}.json.zst path-tree artifact
15
+
16
+ ``parse_key`` returns ``None`` for anything nonconforming — it never guesses.
17
+ ``is_indexable`` is the one predicate every consumer (cataloger, extractor,
18
+ index-sync, tree builder, search scope) uses to exclude ``scratch/`` and
19
+ ``derived/``.
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import re
25
+ from dataclasses import dataclass
26
+ from enum import StrEnum
27
+
28
+ from afs_core.errors import InvalidKeyError
29
+
30
# Width of the zero-padded page number in derived text keys, and the largest
# page number that fits in that width (9999 for 4 digits).
PAGE_DIGITS = 4
MAX_PAGE = 10**PAGE_DIGITS - 1

# Lowercase slugs for tenant / namespace / principal ids: first character is a
# lowercase letter or digit, the rest may also contain "_" and "-".
_SLUG_RE = re.compile(r"^[a-z0-9][a-z0-9_-]*$")
# doc_id is a ULID (uppercase Crockford base32) or similar opaque id: one or
# more ASCII letters, digits, "_" or "-".
_ID_RE = re.compile(r"^[A-Za-z0-9_-]+$")

# Whole segments a relpath may never contain: the empty segment (so "//" and a
# trailing "/" are refused), the traversal segments "." and "..", and the
# channel words. Prefix-based segment rejections are applied in `_relpath_ok`.
_RESERVED_SEGMENTS = frozenset({"", ".", "..", "tenants", "scratch", "derived"})

# File-name pieces of the derived channels (see the scheme in the module docstring).
_TREE_SUFFIX = ".json.zst"
_TEXT_SUFFIX = ".md"
_METADATA_SUFFIX = ".metadata.json"
_MANIFEST_NAME = "manifest.json"
45
+
46
+
47
class Channel(StrEnum):
    """Top-level key channels.

    The member values are logical channel names, not the literal key prefixes;
    the trailing comment on each member gives the prefix it corresponds to.
    """

    ORIGINAL = "original"  # tenants/
    SCRATCH = "scratch"  # scratch/
    DERIVED_TEXT = "derived_text"  # derived/text/
    DERIVED_META = "derived_meta"  # derived/meta/
    DERIVED_TREE = "derived_tree"  # derived/tree/
55
+
56
+
57
@dataclass(frozen=True, slots=True)
class ParsedKey:
    """The structured decomposition of a conforming key.

    Fields not relevant to a channel stay ``None`` (e.g. an ORIGINAL key has no
    ``doc_id``; a DERIVED_TREE key has no ``path``). Instances are immutable
    (``frozen=True``) and use ``__slots__``.
    """

    # Which channel the key belongs to.
    channel: Channel
    # Tenant slug; the only component every channel carries.
    tenant_id: str
    # Namespace slug; set for ORIGINAL and all DERIVED_* keys, not for SCRATCH.
    namespace: str | None = None
    # Relative path below the channel root; set for ORIGINAL and SCRATCH keys.
    path: str | None = None
    # Principal slug; set for SCRATCH keys only.
    principal_id: str | None = None
    # Document id; set for DERIVED_TEXT and DERIVED_META keys.
    doc_id: str | None = None
    # Page number parsed from the file name; set for DERIVED_TEXT keys only.
    page: int | None = None
    # True for the ``.md.metadata.json`` sidecar of a DERIVED_TEXT page.
    is_metadata: bool = False
73
+
74
+
75
+ # --- validation helpers --------------------------------------------------------
76
+
77
+
78
def _is_slug(value: str) -> bool:
    """Return True when the whole of ``value`` matches ``_SLUG_RE``."""
    match = _SLUG_RE.fullmatch(value)
    return match is not None
80
+
81
+
82
def _is_id(value: str) -> bool:
    """Return True when the whole of ``value`` matches ``_ID_RE``."""
    match = _ID_RE.fullmatch(value)
    return match is not None
84
+
85
+
86
def _relpath_ok(relpath: str) -> bool:
    """Return whether ``relpath`` is acceptable as the path part of a key.

    A relpath is refused when it is empty, starts with ``/``, contains any
    segment listed in ``_RESERVED_SEGMENTS`` (which includes the empty segment,
    so ``a//b`` and a trailing ``/`` fail), or contains a segment starting with
    ``.`` or ``_``.

    Dot-prefixed segments are refused so hidden/dotfile names (``.env``,
    ``.git/config``) can never appear in a key; underscore-prefixed segments
    were already refused and remain so.
    """
    if not relpath or relpath.startswith("/"):
        return False
    for seg in relpath.split("/"):
        if seg in _RESERVED_SEGMENTS or seg.startswith((".", "_")):
            return False
    return True
92
+
93
+
94
def validate_slug(value: str, *, field: str) -> str:
    """Return ``value`` unchanged if it is a lowercase slug.

    Raises:
        InvalidKeyError: when ``value`` is not a slug; ``detail`` carries the
            offending ``field`` name and ``value``.
    """
    if _is_slug(value):
        return value
    raise InvalidKeyError(
        f"{field} must be a lowercase slug", detail={"field": field, "value": value}
    )
100
+
101
+
102
def validate_id(value: str, *, field: str) -> str:
    """Return ``value`` unchanged if it is a well-formed opaque id.

    An id is one or more ASCII letters, digits, ``_`` or ``-`` (``_ID_RE``).

    Raises:
        InvalidKeyError: when ``value`` does not match; ``detail`` carries the
            offending ``field`` name and ``value``.
    """
    if not _is_id(value):
        # The message spells out the real alphabet: "_" and "-" are accepted,
        # so calling the rule "alphanumeric" would mislead callers.
        raise InvalidKeyError(
            f"{field} must be a non-empty string of letters, digits, '_' or '-'",
            detail={"field": field, "value": value},
        )
    return value
108
+
109
+
110
def validate_relpath(relpath: str) -> str:
    """Return ``relpath`` unchanged when ``_relpath_ok`` accepts it.

    This is the raising counterpart of ``_relpath_ok``, so builders and parsers
    share one rule set.

    Raises:
        InvalidKeyError: when the relpath is refused; ``detail`` carries it.
    """
    if _relpath_ok(relpath):
        return relpath
    raise InvalidKeyError("invalid relpath", detail={"relpath": relpath})
115
+
116
+
117
def _validate_page(page: int) -> int:
    """Return ``page`` unchanged when it lies in ``[0, MAX_PAGE]``.

    Raises:
        InvalidKeyError: when the page is out of range; ``detail`` carries it.
    """
    if 0 <= page <= MAX_PAGE:
        return page
    raise InvalidKeyError(f"page must be in [0, {MAX_PAGE}]", detail={"page": page})
121
+
122
+
123
+ # --- builders ------------------------------------------------------------------
124
+
125
+
126
def originals_key(tenant_id: str, namespace: str, path: str) -> str:
    """Build ``tenants/{tenant_id}/{namespace}/{path}`` for a raw document.

    Raises:
        InvalidKeyError: if either slug or the relpath fails validation
            (checked in the order tenant, namespace, path).
    """
    for field, slug in (("tenant_id", tenant_id), ("namespace", namespace)):
        validate_slug(slug, field=field)
    validate_relpath(path)
    return f"tenants/{tenant_id}/{namespace}/{path}"
131
+
132
+
133
def scratch_key(tenant_id: str, principal_id: str, path: str) -> str:
    """Build ``scratch/{tenant_id}/{principal_id}/{path}`` for agent scratch.

    Raises:
        InvalidKeyError: if either slug or the relpath fails validation
            (checked in the order tenant, principal, path).
    """
    for field, slug in (("tenant_id", tenant_id), ("principal_id", principal_id)):
        validate_slug(slug, field=field)
    validate_relpath(path)
    return f"scratch/{tenant_id}/{principal_id}/{path}"
138
+
139
+
140
def derived_text_key(tenant_id: str, namespace: str, doc_id: str, page: int) -> str:
    """Build the key of one extracted-text page.

    The page number is zero-padded to ``PAGE_DIGITS`` and suffixed with
    ``_TEXT_SUFFIX``.

    Raises:
        InvalidKeyError: if a slug, the doc id, or the page fails validation
            (checked in the order tenant, namespace, doc_id, page).
    """
    for field, slug in (("tenant_id", tenant_id), ("namespace", namespace)):
        validate_slug(slug, field=field)
    validate_id(doc_id, field="doc_id")
    _validate_page(page)
    return f"derived/text/{tenant_id}/{namespace}/{doc_id}/{page:0{PAGE_DIGITS}d}{_TEXT_SUFFIX}"
146
+
147
+
148
def derived_text_metadata_key(tenant_id: str, namespace: str, doc_id: str, page: int) -> str:
    """Build the sidecar key for a text page: its text key plus ``_METADATA_SUFFIX``.

    Validation (and any ``InvalidKeyError``) comes from ``derived_text_key``.
    """
    text_key = derived_text_key(tenant_id, namespace, doc_id, page)
    return text_key + _METADATA_SUFFIX
150
+
151
+
152
def derived_meta_key(tenant_id: str, namespace: str, doc_id: str) -> str:
    """Build the key of a document's extraction manifest.

    Raises:
        InvalidKeyError: if a slug or the doc id fails validation
            (checked in the order tenant, namespace, doc_id).
    """
    for field, slug in (("tenant_id", tenant_id), ("namespace", namespace)):
        validate_slug(slug, field=field)
    validate_id(doc_id, field="doc_id")
    return f"derived/meta/{tenant_id}/{namespace}/{doc_id}/{_MANIFEST_NAME}"
157
+
158
+
159
def tree_key(tenant_id: str, namespace: str) -> str:
    """Build the key of a namespace's path-tree artifact.

    Raises:
        InvalidKeyError: if either slug fails validation (tenant first).
    """
    for field, slug in (("tenant_id", tenant_id), ("namespace", namespace)):
        validate_slug(slug, field=field)
    return f"derived/tree/{tenant_id}/{namespace}{_TREE_SUFFIX}"
163
+
164
+
165
+ # --- parsing -------------------------------------------------------------------
166
+
167
+
168
def parse_key(key: str) -> ParsedKey | None:
    """Decompose a key, or return ``None`` if it doesn't conform. Never guesses.

    The key is routed by its channel prefix to the matching private parser,
    which receives the remainder after the prefix. The first matching prefix
    decides; a key with no known prefix yields ``None``.
    """
    routes = (
        ("derived/tree/", _parse_tree),
        ("derived/text/", _parse_text),
        ("derived/meta/", _parse_meta),
        ("scratch/", _parse_scratch),
        ("tenants/", _parse_original),
    )
    for prefix, parser in routes:
        if key.startswith(prefix):
            return parser(key.removeprefix(prefix))
    return None
181
+
182
+
183
def _parse_tree(rest: str) -> ParsedKey | None:
    """Parse ``{tenant}/{ns}.json.zst`` (the part after ``derived/tree/``).

    Returns ``None`` unless there are exactly two segments, the second ends
    with ``_TREE_SUFFIX``, and both tenant and namespace are slugs.
    """
    parts = rest.split("/")
    if len(parts) != 2:
        return None
    tenant, leaf = parts
    if not leaf.endswith(_TREE_SUFFIX):
        return None
    ns = leaf.removesuffix(_TREE_SUFFIX)
    if _is_slug(tenant) and _is_slug(ns):
        return ParsedKey(Channel.DERIVED_TREE, tenant_id=tenant, namespace=ns)
    return None
191
+
192
+
193
def _parse_text(rest: str) -> ParsedKey | None:
    """Parse ``{tenant}/{ns}/{doc_id}/{page}.md[.metadata.json]``.

    ``rest`` is the part of the key after ``derived/text/``. Returns ``None``
    unless there are exactly four segments, the file name is a
    ``PAGE_DIGITS``-wide ASCII-digit page number followed by ``_TEXT_SUFFIX``
    (optionally followed by ``_METADATA_SUFFIX``), tenant and namespace are
    slugs, and ``doc_id`` is a valid id. Never raises for malformed input.
    """
    segs = rest.split("/")
    if len(segs) != 4:
        return None
    tenant, ns, doc_id, filename = segs
    is_metadata = filename.endswith(_METADATA_SUFFIX)
    base = filename.removesuffix(_METADATA_SUFFIX) if is_metadata else filename
    if not base.endswith(_TEXT_SUFFIX):
        return None
    page_str = base.removesuffix(_TEXT_SUFFIX)
    # ``str.isdigit()`` alone also accepts non-ASCII digits: superscripts such
    # as "²" pass it but make ``int()`` raise ValueError, and other scripts'
    # decimal digits would silently alias the canonical ASCII key. Requiring
    # ASCII keeps the "return None, never guess" contract.
    if not (len(page_str) == PAGE_DIGITS and page_str.isascii() and page_str.isdigit()):
        return None
    if not (_is_slug(tenant) and _is_slug(ns) and _is_id(doc_id)):
        return None
    return ParsedKey(
        Channel.DERIVED_TEXT,
        tenant_id=tenant,
        namespace=ns,
        doc_id=doc_id,
        page=int(page_str),
        is_metadata=is_metadata,
    )
215
+
216
+
217
def _parse_meta(rest: str) -> ParsedKey | None:
    """Parse ``{tenant}/{ns}/{doc_id}/manifest.json`` (after ``derived/meta/``).

    Returns ``None`` unless there are exactly four segments, the last is
    ``_MANIFEST_NAME``, tenant and namespace are slugs, and ``doc_id`` is a
    valid id.
    """
    parts = rest.split("/")
    if len(parts) != 4:
        return None
    tenant, ns, doc_id, leaf = parts
    if leaf != _MANIFEST_NAME:
        return None
    if _is_slug(tenant) and _is_slug(ns) and _is_id(doc_id):
        return ParsedKey(Channel.DERIVED_META, tenant_id=tenant, namespace=ns, doc_id=doc_id)
    return None
225
+
226
+
227
def _parse_scratch(rest: str) -> ParsedKey | None:
    """Parse ``{tenant}/{principal}/{relpath}`` (the part after ``scratch/``).

    Everything after the second ``/`` is the relpath. Returns ``None`` when
    fewer than three segments are present, either id is not a slug, or the
    relpath is refused by ``_relpath_ok``.
    """
    # maxsplit=2 leaves the relpath (which may itself contain "/") intact.
    parts = rest.split("/", 2)
    if len(parts) < 3:
        return None
    tenant, principal, relpath = parts
    if _is_slug(tenant) and _is_slug(principal) and _relpath_ok(relpath):
        return ParsedKey(Channel.SCRATCH, tenant_id=tenant, principal_id=principal, path=relpath)
    return None
235
+
236
+
237
def _parse_original(rest: str) -> ParsedKey | None:
    """Parse ``{tenant}/{namespace}/{relpath}`` (the part after ``tenants/``).

    Everything after the second ``/`` is the relpath. Returns ``None`` when
    fewer than three segments are present, either id is not a slug, or the
    relpath is refused by ``_relpath_ok``.
    """
    # maxsplit=2 leaves the relpath (which may itself contain "/") intact.
    parts = rest.split("/", 2)
    if len(parts) < 3:
        return None
    tenant, ns, relpath = parts
    if _is_slug(tenant) and _is_slug(ns) and _relpath_ok(relpath):
        return ParsedKey(Channel.ORIGINAL, tenant_id=tenant, namespace=ns, path=relpath)
    return None
245
+
246
+
247
def is_indexable(key: str) -> bool:
    """True only for raw canonical documents under ``tenants/`` (plan §3.2).

    A key is indexable when ``parse_key`` accepts it and classifies it as
    ``Channel.ORIGINAL``; ``scratch/``, ``derived/`` and nonconforming keys are
    all excluded. This is the single predicate shared by the cataloger,
    extractor, index-sync, tree builder, and search scope.
    """
    parsed = parse_key(key)
    if parsed is None:
        return False
    return parsed.channel is Channel.ORIGINAL
255
+
256
+
257
+ __all__ = [
258
+ "Channel",
259
+ "ParsedKey",
260
+ "derived_meta_key",
261
+ "derived_text_key",
262
+ "derived_text_metadata_key",
263
+ "is_indexable",
264
+ "originals_key",
265
+ "parse_key",
266
+ "scratch_key",
267
+ "tree_key",
268
+ "validate_id",
269
+ "validate_relpath",
270
+ "validate_slug",
271
+ ]