delimit-cli 4.6.0 → 4.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,339 @@
1
+ """LED-2268 P0 Phase 0.3 — first consumer of the tenant_data_root primitive.
2
+
3
+ Provides describe_tenant_data() — the read-only view of what's on disk
4
+ inside a given tenant's data root. Used by the /tenant/data endpoint
5
+ and intended to power the dashboard's "your data lives here" home tile
6
+ for browser-only operators.
7
+
8
+ The describe call is deliberately minimal:
9
+ - data_root: absolute path string the gateway resolved for this tenant
10
+ - exists: has the dir been created yet?
11
+ - files: relative paths of the files inside the dir (sorted by path; entries of a directory sort after the directory itself)
12
+ - dirs: relative paths of subdirectories
13
+ - total_size_bytes: sum of all file sizes (sentinel for usage display)
14
+ - cap_bytes: soft cap if configured (Phase 0.3 hard-codes None — no cap)
15
+
16
+ The describe_* calls ONLY read. The write/read/delete helpers defined further
17
+ down are the Phase 0.4 addition, for the dashboard's first "create note / save memory" surface.
18
+
19
+ Founder-data migration is handled by the SEPARATE manual script
20
+ scripts/delimit_seed_tenant_data.py (also in this PR), not by an
21
+ auto-trigger inside describe(). Keeps the read path side-effect-free.
22
+ """
23
+ from __future__ import annotations
24
+
25
+ import logging
26
+ import os
27
+ from pathlib import Path
28
+ from typing import Optional, TypedDict
29
+
30
+ from . import tenant_paths
31
+
32
logger = logging.getLogger("delimit.tenant_data")


# ─────────────────────────────────────────────────────────────────────
# Phase 0.4 — limits + allowlist for the write/read/delete helpers
# ─────────────────────────────────────────────────────────────────────

# Per-file size ceiling, in bytes (1 MiB). Comfortable for memory.jsonl /
# ledger.jsonl (typically <100KB per tenant) while stopping a runaway
# client from filling the disk. This limit is per file; a future quota
# would sum across files.
MAX_FILE_BYTES = 1 << 20

# Extensions a tenant may write or read. Deliberately limited to
# text-shaped data so the tenant data root can never hold .py / .sh /
# .so / .dll or other executable content (no code-drop, no LD-load
# source).
_ALLOWED_EXTENSIONS = frozenset(
    (".json", ".jsonl", ".md", ".txt", ".csv", ".yaml", ".yml")
)

# Maximum number of path segments in a tenant-relative path. Deep
# nesting complicates audit + backup, and no legitimate use needs more
# than 5 levels.
_MAX_PATH_DEPTH = 5
63
+
64
+
65
class TenantPathError(Exception):
    """Validation failure for a tenant-data path.

    The exception message is a short diagnostic code that is suitable
    for surfacing to the user, e.g. "path_too_deep",
    "extension_forbidden" or "path_escapes_root".

    Intended caller pattern: `except TenantPathError as e: return 400 ...`.
    """
72
+
73
+
74
def _resolve_tenant_file(user_id: str, rel_path: str, *, create_root: bool = False) -> Path:
    """Check `rel_path` and turn it into an absolute path under the tenant root.

    Checks run in this order, each raising TenantPathError with the
    quoted code:
      - "path_required": rel_path is not a non-empty string, or is
        blank once separators are normalised and whitespace stripped
      - "path_invalid": rel_path contains a NUL byte
      - "path_must_be_relative": rel_path starts with a separator
      - "path_traversal_forbidden": a segment is '.' or '..'
      - "path_too_deep": more than _MAX_PATH_DEPTH segments
      - "extension_forbidden": the last segment's extension is not in
        _ALLOWED_EXTENSIONS
      - "tenant_resolve_failed": tenant_paths.tenant_data_root returned
        None for this user_id
      - "path_escapes_root": the fully resolved path is not under the
        resolved tenant root

    `create_root` is passed through as `create=` to
    tenant_paths.tenant_data_root.

    Returns the resolved absolute Path.
    """
    if not (isinstance(rel_path, str) and rel_path):
        raise TenantPathError("path_required")
    if rel_path.find("\x00") != -1:
        raise TenantPathError("path_invalid")

    # Backslashes count as separators too, so Windows-style input is
    # validated the same way as POSIX-style input.
    cleaned = rel_path.replace("\\", "/").strip()
    if cleaned == "":
        raise TenantPathError("path_required")
    if cleaned[0] == "/":
        raise TenantPathError("path_must_be_relative")

    # Lexical traversal check first, so obviously hostile input is
    # rejected before the filesystem is touched; the resolved-path check
    # at the end is the second line of defence.
    segments = [piece for piece in cleaned.split("/") if piece]
    if "." in segments or ".." in segments:
        raise TenantPathError("path_traversal_forbidden")
    if len(segments) > _MAX_PATH_DEPTH:
        raise TenantPathError("path_too_deep")

    # Only the final segment (the file name) is subject to the allowlist.
    extension = Path(segments[-1]).suffix.lower()
    if extension not in _ALLOWED_EXTENSIONS:
        raise TenantPathError("extension_forbidden")

    tenant_root = tenant_paths.tenant_data_root(user_id, create=create_root)
    if tenant_root is None:
        raise TenantPathError("tenant_resolve_failed")

    # resolve() follows symlinks, so this catches anything the lexical
    # checks above could not see.
    resolved = (tenant_root / Path(*segments)).resolve()
    try:
        resolved.relative_to(tenant_root.resolve())
    except ValueError as exc:
        raise TenantPathError("path_escapes_root") from exc
    return resolved
130
+
131
+
132
def _mkdir_private(directory: Path) -> None:
    """Create `directory` and any missing ancestors, each with mode 0o700.

    Path.mkdir(parents=True, mode=...) applies `mode` to the leaf only;
    missing parents get the default permissions. Creating every missing
    level explicitly keeps the whole chain private.
    """
    missing = []
    probe = directory
    while not probe.exists():
        missing.append(probe)
        if probe.parent == probe:  # reached the filesystem root
            break
        probe = probe.parent
    for level in reversed(missing):
        level.mkdir(mode=0o700, exist_ok=True)


def write_tenant_file(user_id: str, rel_path: str, content: bytes) -> int:
    """Atomically write `content` to `rel_path` inside the tenant's data root.

    - Accepts bytes, bytearray or memoryview; anything else raises
      TenantPathError("content_must_be_bytes").
    - Enforces MAX_FILE_BYTES on the byte size of `content`
      (TenantPathError("content_too_large")).
    - Creates the tenant root and any missing intermediate directories
      with mode 0o700.
    - Writes to a uniquely named temporary file (mode 0o600) in the
      target directory, then os.replace()s it over the target. The
      temporary file is removed if anything fails.

    Returns the number of bytes written. Raises TenantPathError on
    validation failure or OSError on filesystem failure.
    """
    if not isinstance(content, (bytes, bytearray, memoryview)):
        raise TenantPathError("content_must_be_bytes")
    # len() of a memoryview counts items, not bytes (they differ when
    # itemsize > 1), so measure with nbytes before copying anything.
    size = memoryview(content).nbytes
    if size > MAX_FILE_BYTES:
        raise TenantPathError("content_too_large")
    payload = bytes(content)

    target = _resolve_tenant_file(user_id, rel_path, create_root=True)
    _mkdir_private(target.parent)

    # mkstemp creates the file exclusively with mode 0o600 under a unique
    # name, so concurrent writers to the same target never share (and
    # truncate) one temp file, and there is no chmod-after-create window.
    import tempfile
    fd, tmp_name = tempfile.mkstemp(
        prefix=".write-", suffix=".tmp", dir=str(target.parent)
    )
    try:
        # The buffered file object retries short writes; a bare
        # os.write() may legitimately write fewer bytes than asked.
        with os.fdopen(fd, "wb") as fh:
            fh.write(payload)
        os.replace(tmp_name, target)
    except BaseException:
        # Don't leave a stray temp file in the tenant's directory.
        try:
            os.unlink(tmp_name)
        except OSError:
            pass
        raise
    return size
159
+
160
+
161
def read_tenant_file(user_id: str, rel_path: str) -> Optional[bytes]:
    """Read a tenant file, or None if it doesn't exist.

    Returns None when the resolved path is not a regular file (absent,
    or a directory), when the file disappears while being opened, or
    when the file is larger than MAX_FILE_BYTES (logged as a warning;
    the caller surfaces it as "not found").

    Raises TenantPathError on validation failure. Other filesystem
    errors (e.g. PermissionError) propagate — those indicate a bug or
    hostile filesystem state, not normal client input.
    """
    target = _resolve_tenant_file(user_id, rel_path, create_root=False)
    if not target.is_file():
        return None
    try:
        with target.open("rb") as fh:
            # Size is taken from the open descriptor, so it describes the
            # same file that is about to be read.
            size = os.fstat(fh.fileno()).st_size
            if size <= MAX_FILE_BYTES:
                # Bounded read: even if the file grows after the fstat,
                # never pull more than cap + 1 bytes into memory.
                data = fh.read(MAX_FILE_BYTES + 1)
                if len(data) <= MAX_FILE_BYTES:
                    return data
                size = len(data)
    except FileNotFoundError:
        # Deleted between the is_file() check and the open.
        return None
    # Defence in depth: even if a write somehow bypassed the cap, don't
    # echo the over-large content back to a client.
    logger.warning(
        "read_tenant_file refusing oversize file: user=%s path=%s size=%d",
        user_id, rel_path, size,
    )
    return None
182
+
183
+
184
def delete_tenant_file(user_id: str, rel_path: str) -> bool:
    """Delete a tenant file. Returns True if deleted, False if absent.

    Raises TenantPathError on validation failure, and
    TenantPathError("path_is_directory") when the path is a directory.
    """
    target = _resolve_tenant_file(user_id, rel_path, create_root=False)
    if target.is_dir():
        # Directories are only ever created as a side effect of writing
        # a file path; this API must not let a tenant remove directory
        # trees.
        raise TenantPathError("path_is_directory")
    try:
        target.unlink()
    except FileNotFoundError:
        # Absent, or removed by a concurrent delete — both mean there is
        # nothing left to delete.
        return False
    return True
200
+
201
+
202
class TenantDataSummary(TypedDict):
    """What /tenant/data returns to a caller."""
    # The user_id the summary was built for.
    user_id: str
    # str() of the tenant's data root path.
    data_root: str
    # Whether the data root was present on disk when the summary was built.
    exists: bool
    # Paths of files under the data root, relative to it.
    files: list[str]
    # Paths of subdirectories under the data root, relative to it.
    dirs: list[str]
    # Sum of the sizes of the listed files, in bytes.
    total_size_bytes: int
    # Soft cap in bytes; the describe functions currently always set None.
    cap_bytes: Optional[int]
211
+
212
+
213
# Conservative cap on how many entries we'll enumerate / size-sum before
# bailing out. A tenant with 100k files shouldn't be able to make a
# single /tenant/data call walk and stat() every one of them on every
# dashboard refresh. Returning truncated counts is honest enough for
# "how full is my dir" UX; the dashboard can surface "(more — refresh to
# scan)".
_MAX_ENTRIES_PER_SUMMARY = 1000


def _scan_tree(
    root: Path,
    *,
    skip_top_level: frozenset = frozenset(),
    log_label: str = "tenant data scan",
) -> tuple[list[str], list[str], int]:
    """Walk `root` in sorted order, stopping after the entry cap.

    Entries are visited in the same order as sorting every path under
    `root` would give (a directory first, then its contents, siblings by
    name), but lazily: the walk stops once _MAX_ENTRIES_PER_SUMMARY
    entries have been visited instead of enumerating the whole tree
    first. Symlinked directories are listed but not descended into.

    `skip_top_level` names direct children of `root` that are ignored
    entirely (not listed, not counted, not descended into).

    A directory that cannot be listed is logged under `log_label` and
    skipped; everything gathered so far is still returned.

    Returns (files, dirs, total_size_bytes) with paths relative to root.
    """

    def children(directory: str) -> list:
        # Reverse-sorted so that list.pop() hands entries back in
        # ascending name order.
        try:
            with os.scandir(directory) as it:
                return sorted(it, key=lambda e: e.name, reverse=True)
        except OSError as e:
            logger.warning("%s partial: %s", log_label, e)
            return []

    files: list[str] = []
    dirs: list[str] = []
    total = 0
    seen = 0
    pending = [e for e in children(str(root)) if e.name not in skip_top_level]
    while pending:
        entry = pending.pop()
        seen += 1
        if seen > _MAX_ENTRIES_PER_SUMMARY:
            break
        rel_str = str(Path(entry.path).relative_to(root))
        try:
            if entry.is_file():
                files.append(rel_str)
                try:
                    total += entry.stat().st_size
                except OSError:
                    # Race: file was listed but vanished before stat.
                    # Treat as zero-size and continue. Not a fatal error.
                    pass
            elif entry.is_dir():
                dirs.append(rel_str)
                if not entry.is_symlink():
                    pending.extend(children(entry.path))
        except OSError as e:
            logger.warning("%s partial: %s", log_label, e)
    return files, dirs, total


def describe_tenant_data(user_id: str, *, create: bool = False) -> Optional[TenantDataSummary]:
    """Read-only summary of a tenant's on-disk data.

    Returns None if `user_id` is unsanitisable (same failure mode as
    tenant_paths.tenant_data_root). Caller treats that as "unauthorised".

    When `create=False` (default) and the dir doesn't exist yet, returns
    a summary with exists=False and empty lists. This is the normal
    first-call shape — operators see "no data yet, you're brand new."
    When `create=True`, the dir is mkdir'd and an empty summary returned
    (used by /tenant/setup-style flows; Phase 0.3 doesn't ship one yet).

    At most _MAX_ENTRIES_PER_SUMMARY entries are listed and size-summed.
    """
    root = tenant_paths.tenant_data_root(user_id, create=create)
    if root is None:
        return None

    summary: TenantDataSummary = {
        "user_id": user_id,
        "data_root": str(root),
        # is_dir(), matching describe_shared_data: a stray regular file
        # at the root path is not an existing data dir.
        "exists": root.is_dir(),
        "files": [],
        "dirs": [],
        "total_size_bytes": 0,
        "cap_bytes": None,
    }
    if not summary["exists"]:
        return summary

    files, dirs, total = _scan_tree(root, log_label="describe_tenant_data")
    summary["files"] = files
    summary["dirs"] = dirs
    summary["total_size_bytes"] = total
    return summary


def describe_shared_data() -> dict:
    """Read-only summary of the legacy single-tenant `~/.delimit/` view.

    Used by the shared-bearer (founder/system) path on /tenant/data.
    Returns the same keys as describe_tenant_data; `user_id` is the
    empty string because the shared-bearer caller (founder/system) has
    no tenant scope.

    The root is $DELIMIT_HOME when set, otherwise `~/.delimit`. The
    top-level `tenants` entry is left out: that's the per-tenant tree,
    which the founder views via the dashboard's tenant-list / admin
    surface, not as part of her own data.
    """
    # Same _MAX_ENTRIES_PER_SUMMARY cap as the tenant view. Founder's
    # `~/.delimit/` typically has hundreds of files (memory.jsonl,
    # ledger.jsonl, evidence/, daemon/, etc), so truncation is realistic.
    home = os.environ.get("DELIMIT_HOME")
    root = Path(home).expanduser().resolve() if home else (Path.home() / ".delimit")
    summary: dict = {
        "user_id": "",  # shared-bearer: no tenant scope
        "data_root": str(root),
        "exists": root.is_dir(),
        "files": [],
        "dirs": [],
        "total_size_bytes": 0,
        "cap_bytes": None,
    }
    if not summary["exists"]:
        return summary

    files, dirs, total = _scan_tree(
        root,
        skip_top_level=frozenset({"tenants"}),
        log_label="describe_shared_data",
    )
    summary["files"] = files
    summary["dirs"] = dirs
    summary["total_size_bytes"] = total
    return summary
@@ -0,0 +1,150 @@
1
+ """LED-2268 P0 Phase 0.2 — tenant-scoped filesystem layout.
2
+
3
+ The gateway today stores everything under `~/.delimit/` (memory.jsonl,
4
+ ledger.jsonl, evidence/, etc). That's correct for the single-tenant
5
+ founder install but doesn't generalize once paying customers run their
6
+ own tenants against a shared gateway host.
7
+
8
+ This module owns the path-resolver primitive for the per-tenant layout:
9
+
10
+ ~/.delimit/ ← legacy / shared root (unchanged)
11
+ ~/.delimit/tenants/
12
+ <safe-user-id>/ ← one dir per resolved API-key user
13
+ memory.jsonl
14
+ ledger.jsonl
15
+ evidence/
16
+ ...
17
+
18
+ Phase 0.2 ONLY ships the resolver + sanitiser + base-dir creation. No
19
+ existing storage is migrated; no endpoint is yet rerouted through here.
20
+ Phase 0.3 will add the first endpoint that uses tenant_data_root() and
21
+ copy the founder's existing single-tenant data into her own tenant
22
+ folder.
23
+
24
+ Security note: the user_id segment comes from Supabase
25
+ `user_api_keys.user_id` (which itself comes from NextAuth users.id, a
26
+ GitHub-OAuth-derived string). It's NEVER raw user input from the
27
+ request — but we still sanitise it defensively so a malformed value in
28
+ the DB can't escape into adjacent dirs via `..` or NUL bytes.
29
+ """
30
+ from __future__ import annotations
31
+
32
+ import os
33
+ import re
34
+ import string
35
+ from pathlib import Path
36
+ from typing import Optional
37
+
38
+
39
+ # Base of the whole per-tenant tree. Lives under the existing delimit
40
+ # home so backup/restore tooling sees it without extra wiring.
41
def _delimit_home() -> Path:
    """Return the delimit home directory.

    $DELIMIT_HOME, when set and non-empty, wins and is returned with
    `~` expanded and the path resolved. Otherwise the result is
    `.delimit` under the current user's home directory.
    """
    override = os.environ.get("DELIMIT_HOME")
    if not override:
        return Path.home() / ".delimit"
    return Path(override).expanduser().resolve()
47
+
48
+
49
_TENANTS_DIRNAME = "tenants"
# Allowed chars in a sanitised user-id segment. Conservative: ASCII
# alphanumerics + a small set of safe punctuation. Nothing that could
# be interpreted by the shell, the path parser, or a downstream tool.
_SAFE_CHARS = frozenset(string.ascii_letters + string.digits + "-_.")
# Max chars in a single user-id segment. Filesystems generally allow
# 255-byte basenames; we cap well below that and prefix-truncate +
# hash-suffix any longer input so distinct over-long IDs don't collide.
_MAX_SEGMENT_LEN = 64


def safe_user_segment(user_id: str) -> Optional[str]:
    """Sanitise a user_id into a filesystem-safe directory name.

    Returns None for empty / suspicious input so callers MUST handle
    the rejection rather than silently writing to a default dir.

    Strategy:
      - Strip whitespace, lowercase (IDs that differ only by case or
        surrounding whitespace deliberately share one segment).
      - Replace any char outside the safe set with '_'.
      - If result is empty or only underscores, reject.
      - Reject '.' and '..' (defence in depth against malformed DB rows
        like literally the string "..").
      - If any char had to be replaced, or the result is longer than
        _MAX_SEGMENT_LEN, truncate and append `-<8 hex>` taken from the
        SHA-256 of the stripped, lowercased ID. Without the suffix,
        distinct IDs such as "a/b", "a@b" and "a_b" would all map to
        "a_b" and share one tenant directory.

    IDs made only of safe characters and within the length cap are
    returned unchanged apart from the strip/lowercase step.
    """
    if not isinstance(user_id, str) or not user_id:
        return None
    s = user_id.strip().lower()
    if not s:
        return None
    # Substitute unsafe chars one-for-one — preserves length / readability
    # for the common case (NextAuth GitHub uses bare integer-ish strings).
    safe = "".join(c if c in _SAFE_CHARS else "_" for c in s)
    if not safe.strip("_"):
        return None
    if safe in (".", ".."):
        return None
    substituted = safe != s
    if substituted or len(safe) > _MAX_SEGMENT_LEN:
        # Truncate to (max - 9) so the suffix `-<8hex>` fits in budget.
        import hashlib
        # surrogatepass: a malformed value containing lone surrogates
        # must still hash rather than raise UnicodeEncodeError.
        digest = hashlib.sha256(s.encode("utf-8", "surrogatepass")).hexdigest()[:8]
        safe = safe[: _MAX_SEGMENT_LEN - 9] + "-" + digest
    return safe
97
+
98
+
99
def tenants_root() -> Path:
    """Return the parent directory of every per-tenant dir.

    This is the `_TENANTS_DIRNAME` child of the delimit home, so it
    always lives under DELIMIT_HOME.
    """
    return _delimit_home().joinpath(_TENANTS_DIRNAME)
102
+
103
+
104
def tenant_data_root(user_id: str, *, create: bool = False) -> Optional[Path]:
    """Resolve the on-disk root for a specific tenant's data.

    Returns None if `user_id` doesn't sanitise to a usable segment, if
    the tenant path does not resolve to a location strictly inside the
    tenants root, or if resolving the paths fails. Caller treats that as
    "unauthorised" — same shape as the validator.

    If `create=True`, ensures the directory exists (mkdir -p, mode 0700)
    and tightens the tenants root to 0700 as well. With the default
    `create=False` nothing is created or modified on disk; the paths are
    only resolved.
    """
    seg = safe_user_segment(user_id)
    if seg is None:
        return None
    base = tenants_root()
    root = base / seg
    # Defence in depth: the resolved path must be strictly below the
    # tenants root. Equality is rejected too — a tenant dir that resolves
    # to the tenants root itself (e.g. through a symlink) would expose
    # every tenant's data.
    try:
        if base.resolve() not in root.resolve().parents:
            return None
    except (OSError, RuntimeError):
        return None
    if create:
        root.mkdir(parents=True, exist_ok=True, mode=0o700)
        # mkdir applies `mode` to the leaf only; the tenants root, when
        # created as a parent here, would otherwise inherit umask.
        # Best-effort: a failed chmod must not fail the tenant's request.
        try:
            base.chmod(0o700)
        except OSError:
            pass
    return root
135
+
136
+
137
def list_tenants() -> list[str]:
    """Return the sorted directory names of all tenants with on-disk data.

    Meant for maintenance / audit / backup tooling. Only directories
    directly under the tenants root are reported, and names starting
    with '.' are left out. If the tenants root does not exist yet the
    result is an empty list.
    """
    base = tenants_root()
    if not base.is_dir():
        return []
    return sorted(
        child.name
        for child in base.iterdir()
        if child.is_dir() and child.name and not child.name.startswith(".")
    )