okstra 0.27.0 → 0.28.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -250,6 +250,24 @@ for worker in team_state.get("workers", []):
250
250
  )
251
251
  + "\n"
252
252
  )
253
+ # Mirror the audit sidecar contract — every completed worker-results
254
+ # file ships alongside `<worker>-audit-<task-type>-<seq>.md` carrying
255
+ # the Reading Confirmation block. Derive the sidecar path by
256
+ # inserting `-audit` after the worker-role segment of the
257
+ # result-file stem.
258
+ result_stem = result_path.stem # e.g. claude-worker-error-analysis-001
259
+ audit_stem = result_stem.replace("-worker-", "-worker-audit-", 1)
260
+ audit_path = result_path.with_name(f"{audit_stem}{result_path.suffix}")
261
+ audit_path.write_text(
262
+ "\n".join(
263
+ [
264
+ f"# {worker.get('role', worker_id)} Audit",
265
+ "",
266
+ "- Read task-brief.md end-to-end (validation fixture).",
267
+ ]
268
+ )
269
+ + "\n"
270
+ )
253
271
 
254
272
  lead = team_state.get("lead")
255
273
  if isinstance(lead, dict):
@@ -305,6 +323,16 @@ if not isinstance(required_status_entries, list):
305
323
  report_lines = [
306
324
  "# Validation Fixture Report",
307
325
  "",
326
+ "## Verdict Card",
327
+ "",
328
+ "| 항목 | 값 |",
329
+ "|------|----|",
330
+ "| Final Conclusion | validation fixture |",
331
+ "| Verdict Token | `not-applicable` |",
332
+ "| Direction | `continue-investigation` |",
333
+ "| Approval Required? | `no` |",
334
+ "| Next Step | fixture |",
335
+ "",
308
336
  "## Agent Execution Status",
309
337
  ]
310
338
  for label in required_status_entries:
@@ -312,6 +340,15 @@ for label in required_status_entries:
312
340
  report_lines.append(f"- {label}: fixture status recorded")
313
341
  report_lines.extend(
314
342
  [
343
+ "",
344
+ "## Token Usage Summary",
345
+ "",
346
+ "| 항목 | 처리 토큰 | 환산 토큰 | 비용 (USD) |",
347
+ "|------|-----------|-----------|------------|",
348
+ "| Lead | `1` | `1` | `$0.01` |",
349
+ "| Worker 합계 | `1` | `1` | `$0.01` |",
350
+ "| **전체 합계** | **`2`** | **`2`** | **`$0.02`** |",
351
+ "| Codex/Gemini CLI 추가 비용 | | | `$0.00` |",
315
352
  "",
316
353
  "## Final Verdict",
317
354
  "- Validation fixture report generated.",
@@ -465,6 +465,187 @@ TOKEN_PLACEHOLDERS = (
465
465
  )
466
466
 
467
467
 
468
# Token Usage Summary section between its `##` heading and the next `##`
# heading (or end-of-file). Matched non-greedily so the body of the next
# section never bleeds in.
_TOKEN_USAGE_SECTION_RE = re.compile(
    r"^##[ \t]+Token Usage Summary[ \t]*$\n(?P<body>.*?)(?=^##[ \t]|\Z)",
    re.DOTALL | re.MULTILINE,
)

# Backtick-wrapped cell values inside a Token Usage Summary row. We use
# this to inspect actual cell contents rather than fighting markdown
# table parsing rules.
_TOKEN_USAGE_BACKTICK_CELL_RE = re.compile(r"`([^`\n]*)`")

# Sentinel words workers have been observed typing INSTEAD of leaving the
# `{{...}}` placeholders verbatim. These bypass the placeholder check
# because they are valid string values; we must reject them by name.
# Compared against the lower-cased, whitespace-stripped cell value.
_TOKEN_USAGE_SENTINEL_VALUES = frozenset(
    {
        "pending",
        "n/a",
        "na",
        "tbd",
        "tba",
        "not-collected",
        "not collected",
        "--",
        "?",
        "unknown",
        "",
    }
)

# Numeric "valid zero" patterns. These ARE allowed in the CLI row when no
# Codex/Gemini CLI work was billed; rejected everywhere else.
_TOKEN_USAGE_ZERO_VALUES = frozenset({"0", "$0.00", "$0", "0.00"})

# Any all-zero number once `$`, thousands separators and whitespace are
# removed (`0`, `00`, `0.0`, `0.000`, `.00`). Used so that zero spellings
# outside the literal set above cannot slip past the zero check.
_TOKEN_USAGE_NUMERIC_ZERO_RE = re.compile(r"0+(?:\.0*)?|\.0+")


def _is_zero_token_value(value: str) -> bool:
    """Return True when *value* spells a numeric zero (count or USD amount).

    Accepts the literal spellings in `_TOKEN_USAGE_ZERO_VALUES` plus any
    other all-zero number after dropping `$`, `,` and whitespace, e.g.
    `0.0`, `$0.000`, `0,000`.
    """
    if value in _TOKEN_USAGE_ZERO_VALUES:
        return True
    normalized = "".join(value.replace("$", "").replace(",", "").split())
    return _TOKEN_USAGE_NUMERIC_ZERO_RE.fullmatch(normalized) is not None


def _scan_token_usage_summary(content: str, failures: list[str]) -> None:
    """Reject sentinel / zero values typed into the Token Usage Summary table.

    The placeholder check (`TOKEN_PLACEHOLDERS`) catches the "didn't
    substitute" case; this scanner catches the "substituted with a
    sentinel string" case, which is invisible to that check.

    Only backtick-wrapped values in the non-label cells of table rows are
    inspected. Rules actually enforced:

    - A missing `## Token Usage Summary` section is a failure.
    - Sentinel words (`pending`, `n/a`, an empty pair of backticks, ...)
      are rejected on every row, including the Codex/Gemini CLI row.
    - Zero values (`0`, `$0.00`, `0.0`, ...) are rejected on every row
      except the CLI row, where zero means no CLI work was billed.

    Cells are expected to hold a comma-grouped integer (`1,234,567`) or a
    USD value (`$5.43`), but that format is not validated here.

    Args:
        content: Full text of the final report.
        failures: List that failure messages are appended to (mutated in
            place).
    """
    match = _TOKEN_USAGE_SECTION_RE.search(content)
    if match is None:
        # The template renders this section unconditionally. The
        # placeholder check cannot notice its absence (no section means
        # no placeholders left to find), so report it explicitly here.
        failures.append(
            "final report is missing the `## Token Usage Summary` section — "
            "the template renders it unconditionally and Phase 7 substitution "
            "depends on it being present."
        )
        return

    for raw_line in match.group("body").splitlines():
        line = raw_line.strip()
        if not line.startswith("|") or line.startswith("|--"):
            # Skip non-table lines, the header separator (`|------|`), and
            # blank lines. Header rows have no backticks so they self-skip.
            continue
        cells = [c.strip() for c in line.strip("|").split("|")]
        label_cell = cells[0].strip("* `")
        row_name = label_cell or "<unlabeled>"
        # The CLI row's label always contains the word "CLI" — matching
        # `Codex/Gemini CLI 추가 비용` regardless of formatting variations.
        is_cli_row = "CLI" in label_cell
        for raw_cell in cells[1:]:
            for value in _TOKEN_USAGE_BACKTICK_CELL_RE.findall(raw_cell):
                stripped = value.strip()
                if stripped.lower() in _TOKEN_USAGE_SENTINEL_VALUES:
                    failures.append(
                        "Token Usage Summary cell contains sentinel value "
                        f"`{stripped}` on row labelled `{row_name}` — "
                        "leave the `{{...}}` placeholder verbatim until "
                        "`okstra-token-usage.py --substitute-final-report` runs "
                        "in Phase 7."
                    )
                    continue
                if not is_cli_row and _is_zero_token_value(stripped):
                    failures.append(
                        f"Token Usage Summary row `{row_name}` has "
                        f"a zero value `{stripped}` — no okstra run consumes zero "
                        "tokens. Re-run `python3 scripts/okstra-token-usage.py "
                        "<team-state> --write --summary --substitute-final-report "
                        "<report-path>` to repopulate from session jsonls. The "
                        "Codex/Gemini CLI row is the only place `$0.00` is "
                        "allowed (when no CLI work was billed)."
                    )
575
+
576
+
577
+ # Verdict Card heading (mandatory top-of-report at-a-glance block introduced
578
+ # with the report-format readability pass). Matches `## Verdict Card` only as
579
+ # a section heading line (not as inline text inside a paragraph or table).
580
+ _VERDICT_CARD_HEADING_RE = re.compile(r"^##[ \t]+Verdict Card\b", re.MULTILINE)
581
+
582
+ # Reading Confirmation heading must NOT appear in the final-report — it
583
+ # belongs in the worker audit sidecar (`<worker>-audit-<task-type>-<seq>.md`).
584
+ _READING_CONFIRMATION_HEADING_RE = re.compile(
585
+ r"^##[ \t]+0\.[ \t]+Reading Confirmation\b", re.MULTILINE
586
+ )
587
+
588
+ # Empty Section 0 (Clarification Response Carried In) stub. When no
589
+ # carry-in path is provided, the writer must OMIT the `## 0.` heading
590
+ # entirely — emitting the heading followed by the "No prior clarification
591
+ # response was provided" stub line is the recurring failure mode this
592
+ # regex catches. The 400-char window after the heading is sized to cover
593
+ # the stub line + any boilerplate; it is NOT clipped at the next heading.
594
+ _EMPTY_CARRY_IN_RE = re.compile(
595
+ r"^##[ \t]+0\.[ \t]+Clarification Response Carried In"
596
+ r"[\s\S]{0,400}?No prior clarification response was provided",
597
+ re.MULTILINE,
598
+ )
599
+
600
+ # Section 0 heading with an empty `Source file: \`\`` line — the second
601
+ # failure shape (writer keeps the heading + Source file row but with an
602
+ # empty backtick value because no carry-in was provided). Same remedy:
603
+ # omit the entire `## 0.` block when carry-in is absent.
604
+ _EMPTY_CARRY_IN_SOURCE_RE = re.compile(
605
+ r"^##[ \t]+0\.[ \t]+Clarification Response Carried In"
606
+ r"[\s\S]{0,400}?Source file:[ \t]*`\s*`",
607
+ re.MULTILINE,
608
+ )
609
+
610
+ # Deprecated section headings removed by the report-format readability
611
+ # pass. Each entry is (regex, human-readable remedy). The regexes are
612
+ # line-anchored to avoid false positives from inline references in prose
613
+ # (e.g. this file itself, or skill documentation that mentions the
614
+ # deprecated names).
615
+ _DEPRECATED_FINAL_REPORT_PATTERNS: tuple[tuple[re.Pattern, str], ...] = (
616
+ (
617
+ re.compile(r"^###[ \t]+4\.5\.8[ \t]+User Approval Request\b", re.MULTILINE),
618
+ "deprecated `### 4.5.8 User Approval Request` stub — the top-of-report "
619
+ "`## User Approval Request (사용자 승인 게이트)` block is the only one. "
620
+ "Delete the §4.5.8 heading + body.",
621
+ ),
622
+ (
623
+ re.compile(r"^###[ \t]+4\.5\.9[ \t]+Open Questions\b", re.MULTILINE),
624
+ "deprecated `### 4.5.9 Open Questions` block — promote each row into "
625
+ "`## 5. Clarification Items` with `Kind=decision` (and `Blocks=approval` "
626
+ "if it gated the User Approval Request).",
627
+ ),
628
+ (
629
+ re.compile(
630
+ r"^###[ \t]+5\.1[ \t]+(?:추가 자료 요청|Additional Materials)\b",
631
+ re.MULTILINE,
632
+ ),
633
+ "deprecated `### 5.1 추가 자료 요청` / `Additional Materials` sub-section — "
634
+ "every clarification item lives as one row of the unified `## 5. "
635
+ "Clarification Items` 8-column table (`Kind=material`).",
636
+ ),
637
+ (
638
+ re.compile(
639
+ r"^###[ \t]+5\.2[ \t]+(?:사용자 확인 질문|Questions for the User)\b",
640
+ re.MULTILINE,
641
+ ),
642
+ "deprecated `### 5.2 사용자 확인 질문` / `Questions for the User` "
643
+ "sub-section — collapse into the unified `## 5. Clarification Items` "
644
+ "8-column table (`Kind=decision`).",
645
+ ),
646
+ )
647
+
648
+
468
649
  def validate_report(
469
650
  report_path: Path, required_agent_status_entries: list[str], failures: list[str]
470
651
  ) -> None:
@@ -486,6 +667,126 @@ def validate_report(
486
667
  "run `okstra-token-usage.py ... --substitute-final-report <report-path>` during Phase 7"
487
668
  )
488
669
 
670
+ # Catch the "workers typed `0` / `pending` instead of the placeholder"
671
+ # failure mode that bypasses the placeholder check above.
672
+ _scan_token_usage_summary(content, failures)
673
+
674
+ # Verdict Card is mandatory in every final-report (introduced with the
675
+ # report-format readability pass). Missing card means the reader has no
676
+ # at-a-glance index — first decision lives 100+ lines down.
677
+ if _VERDICT_CARD_HEADING_RE.search(content) is None:
678
+ failures.append(
679
+ "final report is missing the top-of-report `## Verdict Card` block — "
680
+ "render it between the report header and the (conditional) Approval "
681
+ "block. Its Verdict Token / Direction / Next Step cells must byte-match "
682
+ "the corresponding cells in `## 2. Final Verdict` and `## 6.` first item."
683
+ )
684
+
685
+ # Reading Confirmation belongs in the worker audit sidecar, not the
686
+ # user-facing final-report.
687
+ if _READING_CONFIRMATION_HEADING_RE.search(content) is not None:
688
+ failures.append(
689
+ "final report contains a `## 0. Reading Confirmation` heading — "
690
+ "Reading Confirmation lives in the worker audit sidecar "
691
+ "(`runs/<task-type>/worker-results/<worker>-audit-<task-type>-<seq>.md`), "
692
+ "never in the final-report."
693
+ )
694
+
695
+ # Empty Section 0 stub — when no carry-in path was provided, the
696
+ # writer must OMIT the `## 0.` heading entirely.
697
+ if _EMPTY_CARRY_IN_RE.search(content) is not None or _EMPTY_CARRY_IN_SOURCE_RE.search(
698
+ content
699
+ ) is not None:
700
+ failures.append(
701
+ "final report has an empty `## 0. Clarification Response Carried In "
702
+ "From Previous Run` stub (either the `Source file:` cell is empty or "
703
+ "the body contains `No prior clarification response was provided`). "
704
+ "When no carry-in path was provided, OMIT the entire `## 0.` heading "
705
+ "and body — do NOT emit a placeholder stub."
706
+ )
707
+
708
+ # Deprecated section headings — pre-1.0 hard removal.
709
+ for pattern, remedy in _DEPRECATED_FINAL_REPORT_PATTERNS:
710
+ if pattern.search(content) is not None:
711
+ failures.append(f"final report contains {remedy}")
712
+
713
+
714
# Worker-results filename pattern: `<worker-role>-<task-type>-<seq>.md`.
# Every analysis-worker role name ends in `-worker` (`claude-worker`,
# `codex-worker`, `gemini-worker`, `report-writer-worker`), so anchor the
# split on that suffix — otherwise `gemini-worker-error-analysis-001.md`
# ambiguously parses as `worker=gemini, task=worker-error-analysis`.
# Audit sidecars (`*-audit-*`) and errors sidecars (`.json`) are not
# matched here.
_WORKER_RESULT_BASENAME_RE = re.compile(
    r"^(?P<worker>[a-z][a-z0-9-]*-worker)-(?P<task_type>[a-z][a-z-]*?)-(?P<seq>\d{3})\.md$"
)


def validate_worker_results_audit(
    report_path: Path, task_type: str, failures: list[str]
) -> None:
    """Enforce the worker audit sidecar contract.

    For every `worker-results/<worker>-<task-type>-<seq>.md` belonging to
    *task_type* (skipping the audit sidecars themselves), two checks run:

    1. The main worker-results file does NOT contain a `## 0. Reading
       Confirmation` heading. That block lives in the audit sidecar.
    2. The matching audit sidecar exists at
       `<worker>-audit-<task-type>-<seq>.md`.

    A file that cannot be read or decoded as UTF-8 is recorded as a
    failure and skipped; it never aborts the validation run.

    Args:
        report_path: Path of the final report. The worker artifacts are
            looked up in `worker-results/`, a sibling of the report's
            parent directory.
        task_type: Task type of the current run; files for other task
            types are ignored.
        failures: List that failure messages are appended to (mutated in
            place).
    """
    # `report_path` is `runs/<task-type>/reports/final-report-...md`; the
    # sibling `worker-results/` directory holds every worker artifact.
    worker_results_dir = report_path.parent.parent / "worker-results"
    if not worker_results_dir.is_dir():
        # No worker-results directory means no analysis workers ran (e.g.
        # `release-handoff` which is single-lead). Nothing to enforce.
        return

    for path in sorted(worker_results_dir.glob("*.md")):
        name = path.name
        # NOTE(review): this also skips main files of any task type whose
        # name itself contains `-audit-` — confirm no such task type exists.
        if "-audit-" in name:
            continue
        match = _WORKER_RESULT_BASENAME_RE.match(name)
        if match is None:
            # Files that don't match the canonical pattern (e.g. ad-hoc
            # notes left by the operator) are out of contract scope.
            continue
        if match.group("task_type") != task_type:
            # Cross-phase artifacts shouldn't appear here; skip rather
            # than fail to keep the check focused on the current phase.
            continue

        worker_role = match.group("worker")
        seq = match.group("seq")
        try:
            # Decode explicitly as UTF-8 instead of the locale default.
            # `UnicodeDecodeError` is a `ValueError`, not an `OSError`, so
            # it has to be caught by name to keep one bad file from
            # crashing the whole validator.
            content = path.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError) as exc:
            failures.append(f"worker-results file unreadable: {name} ({exc})")
            continue

        if _READING_CONFIRMATION_HEADING_RE.search(content) is not None:
            failures.append(
                f"worker-results file `{name}` contains a `## 0. Reading "
                f"Confirmation` heading — that block moved to the audit "
                f"sidecar (`{worker_role}-audit-{task_type}-{seq}.md`). "
                f"Remove the §0 heading + body from the main file and "
                f"write a fresh sidecar."
            )

        audit_path = worker_results_dir / f"{worker_role}-audit-{task_type}-{seq}.md"
        if not audit_path.exists():
            failures.append(
                f"worker `{worker_role}` produced `{name}` but no audit sidecar "
                f"at `{audit_path.name}` — the sidecar must carry the Reading "
                f"Confirmation block (one short line per input file). Workers "
                f"write this in the same step as the main worker-results file."
            )
789
+
489
790
 
490
791
  def validate_team_state_usage(team_state: dict, failures: list[str]) -> None:
491
792
  summary = team_state.get("usageSummary") or {}
@@ -812,7 +1113,16 @@ def attempt_token_usage_autofix(
812
1113
  team_state_path.write_text(
813
1114
  json.dumps(updated, indent=2, ensure_ascii=False) + "\n"
814
1115
  )
815
- replaced = substitute_final_report(report_path, updated)
1116
+ try:
1117
+ replaced = substitute_final_report(report_path, updated)
1118
+ except Exception as exc: # noqa: BLE001
1119
+ # `SubstituteRefusedError` (or any unexpected substitution
1120
+ # failure) — report it as an accuracy failure so the validator
1121
+ # surfaces a concrete remediation instead of silently shipping
1122
+ # a report with zeros / sentinels.
1123
+ return "accuracy-failed", [
1124
+ f"Phase 7 token-usage substitution refused: {exc}"
1125
+ ]
816
1126
  detail = (
817
1127
  f"replaced {replaced} placeholder(s)"
818
1128
  if replaced > 0
@@ -893,6 +1203,8 @@ def main() -> int:
893
1203
 
894
1204
  task_type = str(task_manifest.get("taskType") or run_manifest.get("taskType") or "").strip()
895
1205
  validate_phase_boundary(task_type, report_path, failures)
1206
+ if task_type:
1207
+ validate_worker_results_audit(report_path, task_type, failures)
896
1208
 
897
1209
  validation_status = "passed" if not failures else "failed"
898
1210
  update_validation_metadata(