@ictechgy/context-guard 0.4.7 → 0.4.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +8 -0
- package/context-guard-kit/context_filter.py +212 -21
- package/context-guard-kit/context_guard_cli.py +174 -2
- package/context-guard-kit/context_pack.py +66 -21
- package/context-guard-kit/cost_guard.py +126 -59
- package/context-guard-kit/experimental_registry.py +362 -61
- package/package.json +1 -1
- package/packaging/homebrew/context-guard.rb.template +1 -1
- package/plugins/context-guard/.claude-plugin/plugin.json +1 -1
- package/plugins/context-guard/bin/context-guard +174 -2
- package/plugins/context-guard/bin/context-guard-cost +126 -59
- package/plugins/context-guard/bin/context-guard-experiments +362 -61
- package/plugins/context-guard/bin/context-guard-filter +212 -21
- package/plugins/context-guard/bin/context-guard-pack +66 -21
|
@@ -15,9 +15,12 @@ import hashlib
|
|
|
15
15
|
import ipaddress
|
|
16
16
|
import json
|
|
17
17
|
import math
|
|
18
|
+
import os
|
|
18
19
|
import re
|
|
20
|
+
import secrets
|
|
19
21
|
import shlex
|
|
20
22
|
from pathlib import Path
|
|
23
|
+
import stat
|
|
21
24
|
import sys
|
|
22
25
|
from typing import Any, NoReturn
|
|
23
26
|
import unicodedata
|
|
@@ -26,6 +29,7 @@ from urllib.parse import urlparse
|
|
|
26
29
|
TOOL_NAME = "context-guard-experiments"
|
|
27
30
|
CONFIG_SCHEMA_VERSION = "contextguard.experiments.v1"
|
|
28
31
|
DEFAULT_CONFIG = Path(".context-guard") / "experiments.json"
|
|
32
|
+
MAX_CONFIG_BYTES = 64_000
|
|
29
33
|
MAX_CONTEXT_DIFF_INPUT_BYTES = 256_000
|
|
30
34
|
MAX_VISUAL_OCR_TEXT_BYTES = 64_000
|
|
31
35
|
MAX_LEARNED_COMPRESSION_INPUT_BYTES = 128_000
|
|
@@ -49,6 +53,17 @@ LOCAL_PROXY_DEFAULT_BIND_PORT = 0
|
|
|
49
53
|
LOCAL_PROXY_DEFAULT_TARGET_HOST = "127.0.0.1"
|
|
50
54
|
LOCAL_PROXY_DEFAULT_TARGET_PORT = 0
|
|
51
55
|
LOCAL_PROXY_LOCALHOST_NAMES = {"localhost"}
|
|
56
|
+
# Top-level directory names that are allowed to be symlinks, mapped to the only
# target each one may point at (presumably the macOS /tmp and /var aliases -- confirm).
ALLOWED_FIRST_COMPONENT_SYMLINKS = {
    name: Path("/private") / name for name in ("tmp", "var")
}
# Capability probes: the dir_fd-based helpers refuse to run when the platform
# cannot perform the corresponding call relative to a directory descriptor.
DIR_FD_OPEN_SUPPORTED = os.open in getattr(os, "supports_dir_fd", set())
DIR_FD_MKDIR_SUPPORTED = os.mkdir in getattr(os, "supports_dir_fd", set())
DIR_FD_STAT_NOFOLLOW_SUPPORTED = all(
    os.stat in getattr(os, capability, set())
    for capability in ("supports_dir_fd", "supports_follow_symlinks")
)
NO_FOLLOW_SUPPORTED = getattr(os, "O_NOFOLLOW", None) is not None
|
|
52
67
|
|
|
53
68
|
|
|
54
69
|
@dataclass(frozen=True)
|
|
@@ -276,6 +291,306 @@ def fail(message: str, code: int = 2) -> NoReturn:
|
|
|
276
291
|
raise SystemExit(code)
|
|
277
292
|
|
|
278
293
|
|
|
294
|
+
def os_error_detail(exc: OSError) -> str:
    """Return a short description of *exc* suitable for error messages.

    The description is ``exc.strerror`` when it is set and non-empty, otherwise
    the exception class name. When ``exc.errno`` is set, ``" (errno N)"`` is
    appended.
    """
    summary = exc.strerror if exc.strerror else type(exc).__name__
    return summary if exc.errno is None else f"{summary} (errno {exc.errno})"
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
def _no_follow_flag(*, label: str) -> int:
    """Return ``os.O_NOFOLLOW``.

    Raises:
        RegistryError: if the platform does not expose ``O_NOFOLLOW``; *label*
            names the operation in the message.
    """
    if NO_FOLLOW_SUPPORTED:
        return os.O_NOFOLLOW
    raise RegistryError(f"{label} requires O_NOFOLLOW support")
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
def _directory_open_flags(*, follow_final: bool = False, label: str) -> int:
    """Build the ``os.open`` flags used to open a directory read-only.

    ``O_CLOEXEC`` and ``O_DIRECTORY`` are added when the platform defines them.
    Unless *follow_final* is true, ``O_NOFOLLOW`` is required as well, so a
    missing ``O_NOFOLLOW`` raises ``RegistryError`` (via ``_no_follow_flag``).
    """
    flags = os.O_RDONLY
    for optional in ("O_CLOEXEC", "O_DIRECTORY"):
        # A missing constant contributes 0, i.e. leaves the flags untouched.
        flags |= getattr(os, optional, 0)
    if follow_final:
        return flags
    return flags | _no_follow_flag(label=label)
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
def _file_open_flags(*, label: str, write: bool = False) -> int:
    """Build the ``os.open`` flags for opening a leaf file without following symlinks.

    With ``write=True`` the file is opened write-only, created if missing and
    truncated; otherwise it is opened read-only. ``O_NOFOLLOW`` is mandatory
    (``RegistryError`` when unavailable). ``O_CLOEXEC``, ``O_NONBLOCK`` and
    ``O_NOCTTY`` are added when the platform defines them.
    """
    if write:
        flags = os.O_WRONLY | os.O_CREAT | os.O_TRUNC
    else:
        flags = os.O_RDONLY
    flags |= _no_follow_flag(label=label)
    # NOTE(review): O_NONBLOCK/O_NOCTTY presumably guard against FIFOs and
    # terminals swapped in for the file -- confirm with the callers' checks.
    for optional in ("O_CLOEXEC", "O_NONBLOCK", "O_NOCTTY"):
        flags |= getattr(os, optional, 0)
    return flags
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
def _temp_file_open_flags(*, label: str) -> int:
    """Build the ``os.open`` flags for creating a brand-new temporary file.

    The file is opened write-only with ``O_CREAT | O_EXCL`` so an existing
    name is never reused, plus mandatory ``O_NOFOLLOW`` (``RegistryError``
    when unavailable). ``O_CLOEXEC`` and ``O_NOCTTY`` are added when defined.
    """
    flags = os.O_WRONLY | os.O_CREAT | os.O_EXCL
    flags |= _no_follow_flag(label=label)
    for optional in ("O_CLOEXEC", "O_NOCTTY"):
        flags |= getattr(os, optional, 0)
    return flags
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
def _leaf_name(path: Path, *, label: str) -> str:
    """Return the final component of *path*.

    Raises:
        RegistryError: if the final component is empty, ``"."`` or ``".."``,
            none of which can name a file inside its parent directory.
    """
    leaf = path.name
    if leaf and leaf not in (".", ".."):
        return leaf
    raise RegistryError(f"{label} must name a regular file")
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
def _normalized_link_target(anchor: Path, raw_target: str) -> Path:
    """Return the lexically normalized location a symlink target refers to.

    An absolute *raw_target* is used as-is; a relative one is interpreted
    relative to *anchor*. The result is collapsed with ``os.path.normpath``
    (purely textual -- no filesystem access, no symlink resolution).
    """
    link_target = Path(raw_target)
    located = link_target if link_target.is_absolute() else anchor / link_target
    return Path(os.path.normpath(str(located)))
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
def normalize_allowed_first_absolute_symlink(path: Path) -> Path:
    """Rewrite an allow-listed top-level symlink in *path* to its real target.

    Only the first component below the root is considered, and only when it is
    a key of ``ALLOWED_FIRST_COMPONENT_SYMLINKS``, is a symlink on disk, and
    its normalized target equals the allow-listed one. In every other case
    (relative path, bare root, unknown name, not a symlink, unexpected target,
    or an ``OSError`` while inspecting the link) *path* is returned unchanged.
    """
    if not path.is_absolute():
        return path
    components = path.parts
    if len(components) < 2:
        return path
    head = components[1]
    canonical = ALLOWED_FIRST_COMPONENT_SYMLINKS.get(head)
    if canonical is None:
        return path
    root_dir = Path(path.anchor)
    top_level = root_dir / head
    try:
        is_expected_alias = top_level.is_symlink() and (
            _normalized_link_target(root_dir, os.readlink(top_level)) == canonical
        )
    except OSError:
        # Inspection failed: leave the path alone rather than guess.
        return path
    if is_expected_alias:
        return canonical.joinpath(*components[2:])
    return path
|
|
371
|
+
|
|
372
|
+
|
|
373
|
+
def normalize_local_path(path: Path) -> Path:
    """Return *path* as an absolute, lexically normalized path.

    ``~`` is expanded, relative paths are anchored at the current working
    directory, ``.``/``..`` segments are collapsed textually with
    ``os.path.normpath``, and an allow-listed top-level symlink is rewritten
    via ``normalize_allowed_first_absolute_symlink``.
    """
    expanded = path.expanduser()
    absolute = expanded if expanded.is_absolute() else Path.cwd() / expanded
    collapsed = Path(os.path.normpath(str(absolute)))
    return normalize_allowed_first_absolute_symlink(collapsed)
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
def normalize_project_path(root: Path, candidate: Path, *, label: str) -> Path:
    """Normalize *candidate* and require that it lies inside *root*.

    ``~`` is expanded, a relative candidate is anchored at *root*, and the
    result is normalized lexically (``os.path.normpath`` plus the allow-listed
    top-level symlink rewrite). The containment check is textual
    (``Path.relative_to``); no symlinks are resolved here.

    Raises:
        RegistryError: if the normalized path is not located under *root*.
    """
    expanded = candidate.expanduser()
    anchored = expanded if expanded.is_absolute() else root / expanded
    collapsed = Path(os.path.normpath(str(anchored)))
    normalized = normalize_allowed_first_absolute_symlink(collapsed)
    try:
        normalized.relative_to(root)
    except ValueError as exc:
        raise RegistryError(f"{label} must stay inside project root: {normalized}") from exc
    return normalized
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
def open_directory_no_follow(path: Path, *, label: str, create: bool = False, missing_ok: bool = False) -> int | None:
    """Open *path* as a directory descriptor, one component at a time.

    Every component is opened relative to the previous descriptor with the
    no-follow directory flags, so a symlinked component makes the walk fail
    instead of being followed. The only exceptions are the starting point of
    an absolute path (its anchor is opened with ``follow_final=True``) and an
    allow-listed top-level symlink, which is rewritten before the walk.

    Args:
        path: Directory to open; relative paths start at the current directory.
        label: Name of the operation, used in error messages.
        create: Create a missing component with ``os.mkdir`` (mode ``0o755``).
        missing_ok: Return ``None`` when a component does not exist. This is
            checked before *create*.

    Returns:
        An open descriptor owned by the caller, or ``None`` (missing_ok only).

    Raises:
        RegistryError: on missing platform support, ``..`` components, missing
            or non-directory components, or OS errors while opening/creating.
    """

    def close_quietly(descriptor: int) -> None:
        # Best-effort close; a failure here must not mask the real outcome.
        try:
            os.close(descriptor)
        except OSError:
            pass

    def inspect_error(error: OSError) -> RegistryError:
        return RegistryError(f"could not inspect {label}: {os_error_detail(error)}")

    path = normalize_allowed_first_absolute_symlink(path)
    if not DIR_FD_OPEN_SUPPORTED:
        raise RegistryError(f"{label} requires dir_fd open support")
    if create and not DIR_FD_MKDIR_SUPPORTED:
        raise RegistryError(f"{label} requires dir_fd mkdir support")
    component_flags = _directory_open_flags(label=label)
    if path.is_absolute():
        start = path.anchor or os.sep
        start_flags = _directory_open_flags(follow_final=True, label=label)
        components = path.parts[1:]
    else:
        start = "."
        start_flags = component_flags
        components = path.parts
    try:
        walk_fd = os.open(start, start_flags)
    except OSError as exc:
        raise inspect_error(exc) from exc

    try:
        for component in components:
            if component in ("", "."):
                continue
            if component == "..":
                raise RegistryError(f"{label} must not contain parent traversal")
            child_fd = -1
            try:
                child_fd = os.open(component, component_flags, dir_fd=walk_fd)
            except FileNotFoundError:
                if missing_ok:
                    os.close(walk_fd)
                    walk_fd = -1
                    return None
                if not create:
                    raise RegistryError(f"could not inspect {label}: missing directory component") from None
                try:
                    os.mkdir(component, mode=0o755, dir_fd=walk_fd)
                except FileExistsError:
                    # Created concurrently by someone else; the open below decides.
                    pass
                except OSError as exc:
                    raise RegistryError(f"could not create {label}: {os_error_detail(exc)}") from exc
                try:
                    child_fd = os.open(component, component_flags, dir_fd=walk_fd)
                except OSError as exc:
                    raise inspect_error(exc) from exc
            except OSError as exc:
                raise inspect_error(exc) from exc
            try:
                # Re-check on the descriptor itself in case O_DIRECTORY is unavailable.
                if not stat.S_ISDIR(os.fstat(child_fd).st_mode):
                    raise RegistryError(f"{label} must not traverse non-directory components")
            except Exception:
                if child_fd >= 0:
                    close_quietly(child_fd)
                raise
            close_quietly(walk_fd)
            walk_fd = child_fd
        # Hand ownership to the caller so the finally clause leaves it open.
        result_fd, walk_fd = walk_fd, -1
        return result_fd
    finally:
        if walk_fd >= 0:
            close_quietly(walk_fd)
|
|
464
|
+
|
|
465
|
+
|
|
466
|
+
def _precheck_regular_leaf(parent_fd: int, leaf_name: str, *, label: str, missing_ok: bool = False) -> bool:
    """Check, without following symlinks, that *leaf_name* is a regular file.

    The entry is inspected with ``os.stat(..., dir_fd=parent_fd,
    follow_symlinks=False)``, so a symlink is reported as "not a regular file".

    Returns:
        ``True`` when the entry exists and is a regular file; ``False`` when it
        is missing and *missing_ok* is true.

    Raises:
        RegistryError: if dir_fd/no-follow ``stat`` is unsupported, the entry
            is missing (and not *missing_ok*), cannot be inspected, or is not
            a regular file.
    """
    if not DIR_FD_STAT_NOFOLLOW_SUPPORTED:
        raise RegistryError(f"{label} requires dir_fd stat support")
    try:
        leaf_mode = os.stat(leaf_name, dir_fd=parent_fd, follow_symlinks=False).st_mode
    except FileNotFoundError:
        if not missing_ok:
            raise RegistryError(f"could not inspect {label}: missing file") from None
        return False
    except OSError as exc:
        raise RegistryError(f"could not inspect {label}: {os_error_detail(exc)}") from exc
    if stat.S_ISREG(leaf_mode):
        return True
    raise RegistryError(f"{label} must be a regular file")
|
|
480
|
+
|
|
481
|
+
|
|
482
|
+
def read_bounded_regular_file(path: Path, *, max_bytes: int, label: str, missing_ok: bool = False) -> tuple[bytes, bool] | None:
    """Read at most *max_bytes* from a regular file without following symlinks.

    The parent directory is opened component-by-component with no-follow
    semantics, the leaf is checked before and after opening to be a regular
    file, and up to ``max_bytes + 1`` bytes are read so truncation can be
    detected.

    Returns:
        ``(data, truncated)`` where ``data`` holds at most *max_bytes* bytes and
        ``truncated`` tells whether the file had more; ``None`` when the file
        or one of its parent directories is missing and *missing_ok* is true.

    Raises:
        RegistryError: if the path cannot be inspected, is not a regular file,
            or reading fails.
    """
    path = normalize_local_path(path)
    parent_fd = open_directory_no_follow(path.parent, label=f"{label} parent", missing_ok=missing_ok)
    if parent_fd is None:
        return None
    file_fd = -1
    try:
        leaf = _leaf_name(path, label=label)
        if not _precheck_regular_leaf(parent_fd, leaf, label=label, missing_ok=missing_ok):
            return None
        file_fd = os.open(leaf, _file_open_flags(label=label), dir_fd=parent_fd)
        # Verify the descriptor itself: the entry may have changed since the precheck.
        if not stat.S_ISREG(os.fstat(file_fd).st_mode):
            raise RegistryError(f"{label} must be a regular file")
        buffer = bytearray()
        budget = max_bytes + 1  # one extra byte reveals an over-long file
        while len(buffer) < budget:
            piece = os.read(file_fd, min(64 * 1024, budget - len(buffer)))
            if not piece:
                break
            buffer += piece
        return bytes(buffer[:max_bytes]), len(buffer) > max_bytes
    except OSError as exc:
        raise RegistryError(f"could not read {label}: {os_error_detail(exc)}") from exc
    finally:
        for descriptor in (file_fd, parent_fd):
            if descriptor >= 0:
                try:
                    os.close(descriptor)
                except OSError:
                    pass
|
|
519
|
+
|
|
520
|
+
|
|
521
|
+
def write_all_fd(fd: int, data: bytes) -> None:
    """Write all of *data* to descriptor *fd*, retrying after partial writes.

    Raises:
        OSError: ``"short write"`` if ``os.write`` reports that no bytes were
            written, or whatever ``os.write`` itself raises.
    """
    pending = memoryview(data)
    while len(pending) > 0:
        count = os.write(fd, pending)
        if count <= 0:
            raise OSError("short write")
        # Slicing a memoryview does not copy the remaining bytes.
        pending = pending[count:]
|
|
529
|
+
|
|
530
|
+
|
|
531
|
+
def write_regular_file_no_follow(path: Path, data: bytes, *, label: str) -> None:
    """Atomically replace *path* with *data* without following symlinks.

    The parent directory is opened (and created if needed) component by
    component with no-follow semantics. The data is written to a uniquely
    named temporary file in that directory, flushed best-effort, and then
    moved over the destination with ``os.replace`` relative to the parent
    descriptor.

    When the destination already exists its permission bits are carried over
    to the replacement. ``os.open`` applies the process umask to the creation
    mode, so the mode is re-applied with ``os.fchmod`` where available.

    Raises:
        RegistryError: if the parent cannot be opened/created, the destination
            exists but is not a regular file, no temporary file could be
            created, or any write/replace step fails.
    """
    path = normalize_local_path(path)
    parent_fd = open_directory_no_follow(path.parent, label=f"{label} parent", create=True)
    if parent_fd is None:  # pragma: no cover - create=True never returns None.
        raise RegistryError(f"could not inspect {label} parent")
    fd = -1
    temp_leaf: str | None = None
    try:
        leaf = _leaf_name(path, label=label)
        exists = _precheck_regular_leaf(parent_fd, leaf, label=label, missing_ok=True)
        mode = 0o644
        preserved_mode: int | None = None
        if exists:
            try:
                preserved_mode = stat.S_IMODE(os.stat(leaf, dir_fd=parent_fd, follow_symlinks=False).st_mode) or None
            except OSError:
                preserved_mode = None
            if preserved_mode is not None:
                mode = preserved_mode
        for _attempt in range(20):
            candidate = _leaf_name(Path(f".{leaf}.{os.getpid()}.{secrets.token_hex(8)}.tmp"), label=f"{label} temp")
            try:
                fd = os.open(candidate, _temp_file_open_flags(label=f"{label} temp"), mode, dir_fd=parent_fd)
            except FileExistsError:
                # Name collision: try again with a fresh random suffix.
                continue
            temp_leaf = candidate
            break
        if fd < 0 or temp_leaf is None:
            raise RegistryError(f"could not create temporary {label}")
        if not stat.S_ISREG(os.fstat(fd).st_mode):
            raise RegistryError(f"{label} temp must be a regular file")
        if preserved_mode is not None and hasattr(os, "fchmod"):
            # The creation mode above was masked by the umask; restore the
            # destination's exact bits. Best effort: on failure the file keeps
            # the umask-limited mode it was created with.
            try:
                os.fchmod(fd, preserved_mode)
            except OSError:
                pass
        write_all_fd(fd, data)
        try:
            os.fsync(fd)
        except OSError:
            # Durability is best effort; some filesystems reject fsync.
            pass
        try:
            os.close(fd)
        except OSError:
            pass
        fd = -1
        os.replace(temp_leaf, leaf, src_dir_fd=parent_fd, dst_dir_fd=parent_fd)
        # The temporary name no longer exists, so cleanup must not unlink it.
        temp_leaf = None
    except OSError as exc:
        raise RegistryError(f"could not write {label}: {os_error_detail(exc)}") from exc
    finally:
        if fd >= 0:
            try:
                os.close(fd)
            except OSError:
                pass
        if temp_leaf is not None:
            # Failure path: remove the orphaned temporary file.
            try:
                os.unlink(temp_leaf, dir_fd=parent_fd)
            except OSError:
                pass
        try:
            os.fsync(parent_fd)
        except OSError:
            pass
        try:
            os.close(parent_fd)
        except OSError:
            pass
|
|
592
|
+
|
|
593
|
+
|
|
279
594
|
def resolve_root(raw_root: str | None) -> Path:
|
|
280
595
|
root = Path(raw_root) if raw_root else Path.cwd()
|
|
281
596
|
try:
|
|
@@ -286,27 +601,25 @@ def resolve_root(raw_root: str | None) -> Path:
|
|
|
286
601
|
|
|
287
602
|
def resolve_config_path(root: Path, raw_config: str | None) -> Path:
    """Return the config file location, constrained to the project *root*.

    Uses *raw_config* when it is non-empty and ``DEFAULT_CONFIG`` otherwise;
    the choice is normalized and checked by ``normalize_project_path``, which
    raises ``RegistryError`` when the result falls outside *root*.
    """
    candidate = Path(raw_config) if raw_config else DEFAULT_CONFIG
    return normalize_project_path(root, candidate, label="config path")
|
|
303
608
|
|
|
304
609
|
|
|
305
610
|
def load_config(path: Path) -> dict[str, Any]:
|
|
306
|
-
|
|
611
|
+
loaded = read_bounded_regular_file(path, max_bytes=MAX_CONFIG_BYTES, label="config", missing_ok=True)
|
|
612
|
+
if loaded is None:
|
|
307
613
|
return {"schema_version": CONFIG_SCHEMA_VERSION, "enabled": []}
|
|
614
|
+
raw, truncated = loaded
|
|
615
|
+
if truncated:
|
|
616
|
+
raise RegistryError("config exceeded max bytes")
|
|
617
|
+
try:
|
|
618
|
+
text = raw.decode("utf-8")
|
|
619
|
+
except UnicodeDecodeError as exc:
|
|
620
|
+
raise RegistryError(f"could not decode config UTF-8: {path}: {exc.reason}") from exc
|
|
308
621
|
try:
|
|
309
|
-
data = json.loads(
|
|
622
|
+
data = json.loads(text)
|
|
310
623
|
except json.JSONDecodeError as exc:
|
|
311
624
|
raise RegistryError(f"could not parse config JSON: {path}: {exc.msg}") from exc
|
|
312
625
|
except OSError as exc:
|
|
@@ -328,11 +641,8 @@ def write_config(path: Path, enabled: set[str]) -> dict[str, Any]:
|
|
|
328
641
|
"updated_at": datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z"),
|
|
329
642
|
"enabled": sorted(enabled),
|
|
330
643
|
}
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
path.write_text(json.dumps(data, indent=2, sort_keys=True) + "\n", encoding="utf-8")
|
|
334
|
-
except OSError as exc:
|
|
335
|
-
raise RegistryError(f"could not write config: {path}: {exc}") from exc
|
|
644
|
+
payload = (json.dumps(data, indent=2, sort_keys=True) + "\n").encode("utf-8")
|
|
645
|
+
write_regular_file_no_follow(path, payload, label="config")
|
|
336
646
|
return data
|
|
337
647
|
|
|
338
648
|
|
|
@@ -459,18 +769,16 @@ def read_bounded_input(args: argparse.Namespace) -> tuple[str, dict[str, Any]]:
|
|
|
459
769
|
if args.input:
|
|
460
770
|
path = Path(args.input)
|
|
461
771
|
source_label = source_label or str(path)
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
except OSError as exc:
|
|
466
|
-
raise RegistryError(f"could not read input: {path}: {exc}") from exc
|
|
772
|
+
loaded = read_bounded_regular_file(path, max_bytes=MAX_CONTEXT_DIFF_INPUT_BYTES, label="input")
|
|
773
|
+
assert loaded is not None
|
|
774
|
+
raw, truncated = loaded
|
|
467
775
|
else:
|
|
468
776
|
source_label = source_label or "stdin"
|
|
469
777
|
raw = sys.stdin.buffer.read(MAX_CONTEXT_DIFF_INPUT_BYTES + 1)
|
|
778
|
+
truncated = len(raw) > MAX_CONTEXT_DIFF_INPUT_BYTES
|
|
779
|
+
raw = raw[:MAX_CONTEXT_DIFF_INPUT_BYTES]
|
|
470
780
|
if not raw:
|
|
471
781
|
raise RegistryError("context-diff-compaction plan requires diff input on stdin or --input")
|
|
472
|
-
truncated = len(raw) > MAX_CONTEXT_DIFF_INPUT_BYTES
|
|
473
|
-
raw = raw[:MAX_CONTEXT_DIFF_INPUT_BYTES]
|
|
474
782
|
text = raw.decode("utf-8", errors="replace")
|
|
475
783
|
metadata = {
|
|
476
784
|
"source_label": source_label,
|
|
@@ -678,23 +986,21 @@ def read_visual_ocr_text(args: argparse.Namespace) -> dict[str, Any]:
|
|
|
678
986
|
if args.ocr_text_file is not None:
|
|
679
987
|
path = Path(args.ocr_text_file)
|
|
680
988
|
source_label = args.ocr_source_label.strip() if args.ocr_source_label else path.name
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
except OSError as exc:
|
|
685
|
-
raise RegistryError(f"could not read OCR text file: {path}: {exc}") from exc
|
|
989
|
+
loaded = read_bounded_regular_file(path, max_bytes=MAX_VISUAL_OCR_TEXT_BYTES, label="OCR text file")
|
|
990
|
+
assert loaded is not None
|
|
991
|
+
raw, truncated = loaded
|
|
686
992
|
source_type = "file"
|
|
687
993
|
elif args.ocr_text is not None:
|
|
688
994
|
raw = args.ocr_text.encode("utf-8")
|
|
689
995
|
source_label = args.ocr_source_label.strip() if args.ocr_source_label else "inline"
|
|
690
996
|
source_type = "inline"
|
|
997
|
+
truncated = len(raw) > MAX_VISUAL_OCR_TEXT_BYTES
|
|
998
|
+
raw = raw[:MAX_VISUAL_OCR_TEXT_BYTES]
|
|
691
999
|
else:
|
|
692
1000
|
raw = b""
|
|
693
1001
|
source_label = args.ocr_source_label.strip() if args.ocr_source_label else None
|
|
694
1002
|
source_type = None
|
|
695
|
-
|
|
696
|
-
truncated = len(raw) > MAX_VISUAL_OCR_TEXT_BYTES
|
|
697
|
-
raw = raw[:MAX_VISUAL_OCR_TEXT_BYTES]
|
|
1003
|
+
truncated = False
|
|
698
1004
|
try:
|
|
699
1005
|
text = raw.decode("utf-8")
|
|
700
1006
|
valid_encoding = True
|
|
@@ -1059,22 +1365,21 @@ def read_self_hosted_payload(args: argparse.Namespace) -> tuple[Any, dict[str, A
|
|
|
1059
1365
|
path = Path(args.input)
|
|
1060
1366
|
source_label = source_label or sanitize_self_hosted_text(path)
|
|
1061
1367
|
try:
|
|
1062
|
-
|
|
1063
|
-
|
|
1064
|
-
|
|
1065
|
-
|
|
1066
|
-
|
|
1067
|
-
if exc.errno is not None:
|
|
1068
|
-
detail = f"{detail} (errno {exc.errno})"
|
|
1069
|
-
raise RegistryError(f"could not read self-hosted metrics input: {safe_path}: {detail}") from exc
|
|
1368
|
+
loaded = read_bounded_regular_file(path, max_bytes=MAX_SELF_HOSTED_METRICS_INPUT_BYTES, label=f"self-hosted metrics input: {source_label}")
|
|
1369
|
+
except RegistryError as exc:
|
|
1370
|
+
raise RegistryError(f"could not read self-hosted metrics input: {source_label}: {exc}") from exc
|
|
1371
|
+
assert loaded is not None
|
|
1372
|
+
raw, loaded_truncated = loaded
|
|
1070
1373
|
else:
|
|
1071
1374
|
source_label = source_label or "stdin"
|
|
1072
1375
|
raw = sys.stdin.buffer.read(MAX_SELF_HOSTED_METRICS_INPUT_BYTES + 1)
|
|
1073
|
-
|
|
1376
|
+
loaded_truncated = len(raw) > MAX_SELF_HOSTED_METRICS_INPUT_BYTES
|
|
1377
|
+
raw = raw[:MAX_SELF_HOSTED_METRICS_INPUT_BYTES]
|
|
1378
|
+
if loaded_truncated:
|
|
1074
1379
|
return None, {
|
|
1075
1380
|
"source_label": source_label,
|
|
1076
1381
|
"bytes": MAX_SELF_HOSTED_METRICS_INPUT_BYTES,
|
|
1077
|
-
"sha256": hashlib.sha256(raw
|
|
1382
|
+
"sha256": hashlib.sha256(raw).hexdigest(),
|
|
1078
1383
|
"truncated": True,
|
|
1079
1384
|
"max_bytes": MAX_SELF_HOSTED_METRICS_INPUT_BYTES,
|
|
1080
1385
|
"envelope_source": None,
|
|
@@ -1333,18 +1638,16 @@ def read_local_proxy_payload(args: argparse.Namespace) -> tuple[dict[str, Any],
|
|
|
1333
1638
|
path = Path(args.input)
|
|
1334
1639
|
safe_path = sanitize_local_proxy_value(path)
|
|
1335
1640
|
try:
|
|
1336
|
-
|
|
1337
|
-
|
|
1338
|
-
|
|
1339
|
-
|
|
1340
|
-
|
|
1341
|
-
|
|
1342
|
-
raise RegistryError(f"could not read local-proxy input: {safe_path}: {detail}") from exc
|
|
1343
|
-
if len(raw) > MAX_SELF_HOSTED_METRICS_INPUT_BYTES:
|
|
1641
|
+
loaded = read_bounded_regular_file(path, max_bytes=MAX_SELF_HOSTED_METRICS_INPUT_BYTES, label=f"local-proxy input: {safe_path}")
|
|
1642
|
+
except RegistryError as exc:
|
|
1643
|
+
raise RegistryError(f"could not read local-proxy input: {safe_path}: {exc}") from exc
|
|
1644
|
+
assert loaded is not None
|
|
1645
|
+
raw, loaded_truncated = loaded
|
|
1646
|
+
if loaded_truncated:
|
|
1344
1647
|
return {}, {
|
|
1345
1648
|
"source_label": safe_path,
|
|
1346
1649
|
"bytes": MAX_SELF_HOSTED_METRICS_INPUT_BYTES,
|
|
1347
|
-
"sha256": hashlib.sha256(raw
|
|
1650
|
+
"sha256": hashlib.sha256(raw).hexdigest(),
|
|
1348
1651
|
"truncated": True,
|
|
1349
1652
|
"ignored_keys": [],
|
|
1350
1653
|
}
|
|
@@ -1691,16 +1994,14 @@ def read_learned_input(args: argparse.Namespace) -> tuple[str, dict[str, Any]]:
|
|
|
1691
1994
|
if args.input:
|
|
1692
1995
|
path = Path(args.input)
|
|
1693
1996
|
source_label = source_label or path.name
|
|
1694
|
-
|
|
1695
|
-
|
|
1696
|
-
|
|
1697
|
-
except OSError as exc:
|
|
1698
|
-
raise RegistryError(f"could not read learned-compression input: {path}: {exc}") from exc
|
|
1997
|
+
loaded = read_bounded_regular_file(path, max_bytes=MAX_LEARNED_COMPRESSION_INPUT_BYTES, label="learned-compression input")
|
|
1998
|
+
assert loaded is not None
|
|
1999
|
+
raw, truncated = loaded
|
|
1699
2000
|
else:
|
|
1700
2001
|
source_label = source_label or "stdin"
|
|
1701
2002
|
raw = sys.stdin.buffer.read(MAX_LEARNED_COMPRESSION_INPUT_BYTES + 1)
|
|
1702
|
-
|
|
1703
|
-
|
|
2003
|
+
truncated = len(raw) > MAX_LEARNED_COMPRESSION_INPUT_BYTES
|
|
2004
|
+
raw = raw[:MAX_LEARNED_COMPRESSION_INPUT_BYTES]
|
|
1704
2005
|
text = raw.decode("utf-8", errors="replace")
|
|
1705
2006
|
metadata = {
|
|
1706
2007
|
"source_label": source_label,
|