npm - okstra - Versions diffs - 0.27.0 → 0.28.0 - Mend

okstra 0.27.0 → 0.28.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

package/package.json +1 -1
package/runtime/BUILD.json +2 -2
package/runtime/agents/workers/claude-worker.md +4 -3
package/runtime/agents/workers/codex-worker.md +4 -3
package/runtime/agents/workers/gemini-worker.md +4 -3
package/runtime/agents/workers/report-writer-worker.md +7 -2
package/runtime/prompts/launch.template.md +1 -1
package/runtime/prompts/profiles/_common-contract.md +12 -4
package/runtime/python/okstra_token_usage/cli.py +9 -2
package/runtime/python/okstra_token_usage/report.py +32 -3
package/runtime/skills/okstra-convergence/SKILL.md +2 -2
package/runtime/skills/okstra-report-writer/SKILL.md +6 -4
package/runtime/skills/okstra-team-contract/SKILL.md +14 -10
package/runtime/templates/reports/final-report.template.md +227 -207
package/runtime/validators/lib/fixtures.sh +37 -0
package/runtime/validators/validate-run.py +313 -1

package/runtime/validators/lib/fixtures.sh CHANGED Viewed

@@ -250,6 +250,24 @@ for worker in team_state.get("workers", []):
             )
             + "\n"
         )
+        # Mirror the audit sidecar contract — every completed worker-results
+        # file ships alongside `<worker>-audit-<task-type>-<seq>.md` carrying
+        # the Reading Confirmation block. Derive the sidecar path by
+        # inserting `-audit` after the worker-role segment of the
+        # result-file stem.
+        result_stem = result_path.stem  # e.g. claude-worker-error-analysis-001
+        audit_stem = result_stem.replace("-worker-", "-worker-audit-", 1)
+        audit_path = result_path.with_name(f"{audit_stem}{result_path.suffix}")
+        audit_path.write_text(
+            "\n".join(
+                [
+                    f"# {worker.get('role', worker_id)} Audit",
+                    "",
+                    "- Read task-brief.md end-to-end (validation fixture).",
+                ]
+            )
+            + "\n"
+        )
 lead = team_state.get("lead")
 if isinstance(lead, dict):
@@ -305,6 +323,16 @@ if not isinstance(required_status_entries, list):
 report_lines = [
     "# Validation Fixture Report",
     "",
+    "## Verdict Card",
+    "",
+    "| 항목 | 값 |",
+    "|------|----|",
+    "| Final Conclusion | validation fixture |",
+    "| Verdict Token | `not-applicable` |",
+    "| Direction | `continue-investigation` |",
+    "| Approval Required? | `no` |",
+    "| Next Step | fixture |",
+    "",
     "## Agent Execution Status",
 ]
 for label in required_status_entries:
@@ -312,6 +340,15 @@ for label in required_status_entries:
         report_lines.append(f"- {label}: fixture status recorded")
 report_lines.extend(
     [
+        "",
+        "## Token Usage Summary",
+        "",
+        "| 항목 | 처리 토큰 | 환산 토큰 | 비용 (USD) |",
+        "|------|-----------|-----------|------------|",
+        "| Lead | `1` | `1` | `$0.01` |",
+        "| Worker 합계 | `1` | `1` | `$0.01` |",
+        "| **전체 합계** | **`2`** | **`2`** | **`$0.02`** |",
+        "| Codex/Gemini CLI 추가 비용 |  |  | `$0.00` |",
         "",
         "## Final Verdict",
         "- Validation fixture report generated.",

package/runtime/validators/validate-run.py CHANGED Viewed

@@ -465,6 +465,187 @@ TOKEN_PLACEHOLDERS = (
 )
+# Matches the Token Usage Summary section from its `##` heading up to the next `##`
+# heading (or end-of-file). Matched non-greedily so the body of the next
+# section never bleeds in.
+_TOKEN_USAGE_SECTION_RE = re.compile(
+    r"^##[ \t]+Token Usage Summary[ \t]*$\n(?P<body>.*?)(?=^##[ \t]|\Z)",
+    re.DOTALL | re.MULTILINE,
+)
+# Backtick-wrapped cell values inside a Token Usage Summary row. We use
+# this to inspect actual cell contents rather than fighting markdown
+# table parsing rules.
+_TOKEN_USAGE_BACKTICK_CELL_RE = re.compile(r"`([^`\n]*)`")
+# Sentinel words workers have been observed typing INSTEAD of leaving the
+# `{{...}}` placeholders verbatim. These bypass the placeholder check
+# because they are valid string values; we must reject them by name.
+_TOKEN_USAGE_SENTINEL_VALUES = frozenset(
+    {
+        "pending",
+        "n/a",
+        "na",
+        "tbd",
+        "tba",
+        "not-collected",
+        "not collected",
+        "--",
+        "?",
+        "unknown",
+        "",
+    }
+)
+# Numeric "valid zero" patterns. These ARE allowed in the CLI row when no
+# Codex/Gemini CLI work was billed; rejected everywhere else.
+_TOKEN_USAGE_ZERO_VALUES = frozenset({"0", "$0.00", "$0", "0.00"})
+def _scan_token_usage_summary(content: str, failures: list[str]) -> None:
+    """Flag sentinel / zero values typed into the Token Usage Summary table.
+    Writers are expected to leave the `{{...}}` placeholders verbatim for
+    Phase 7 substitution. The placeholder check (`TOKEN_PLACEHOLDERS`)
+    catches the "didn't substitute" case; this scanner catches the
+    "substituted with a sentinel string" case, which is invisible to that
+    check and was the real source of `0` / `$0.00` / `pending` shipping in
+    real reports.
+    Only backtick-wrapped values in the non-label cells of table rows
+    inside the section are inspected. Rules, as implemented:
+    - A value found in `_TOKEN_USAGE_SENTINEL_VALUES` (compared
+      case-insensitively; an empty pair of backticks counts) is rejected
+      on every row, including the Codex/Gemini CLI row.
+    - A value found in `_TOKEN_USAGE_ZERO_VALUES` is rejected on every row
+      except the CLI row (any row whose label contains `CLI`), where a
+      zero means no CLI work was billed.
+    - No positive format check is performed: a value that is neither a
+      sentinel nor a listed zero is accepted as-is.
+    Args:
+        content: Full text of the final report.
+        failures: List that failure messages are appended to in place.
+    """
+    match = _TOKEN_USAGE_SECTION_RE.search(content)
+    if match is None:
+        # The template emits this section unconditionally, so its absence
+        # is a defect. The placeholder check does not fire when the whole
+        # section is gone, so record a dedicated failure here.
+        failures.append(
+            "final report is missing the `## Token Usage Summary` section — "
+            "the template renders it unconditionally and Phase 7 substitution "
+            "depends on it being present."
+        )
+        return
+    for raw_line in match.group("body").splitlines():
+        line = raw_line.strip()
+        if not line.startswith("|") or line.startswith("|--"):
+            # Skip non-table lines, the header separator (`|------|`), and
+            # blank lines. Header rows have no backticks so they self-skip.
+            continue
+        # `str.split` always yields at least one element, so `cells[0]`
+        # is safe without an emptiness guard.
+        cells = [c.strip() for c in line.strip("|").split("|")]
+        label_cell = cells[0].strip("* `")
+        row_label = label_cell or "<unlabeled>"
+        # The CLI row's label always contains the word "CLI" — matching
+        # the Codex/Gemini CLI row regardless of formatting variations.
+        is_cli_row = "CLI" in label_cell
+        for raw_cell in cells[1:]:
+            for value in _TOKEN_USAGE_BACKTICK_CELL_RE.findall(raw_cell):
+                stripped = value.strip()
+                if stripped.lower() in _TOKEN_USAGE_SENTINEL_VALUES:
+                    failures.append(
+                        "Token Usage Summary cell contains sentinel value "
+                        f"`{stripped}` on row labelled `{row_label}` — "
+                        "leave the `{{...}}` placeholder verbatim until "
+                        "`okstra-token-usage.py --substitute-final-report` runs "
+                        "in Phase 7."
+                    )
+                    continue
+                if stripped in _TOKEN_USAGE_ZERO_VALUES and not is_cli_row:
+                    failures.append(
+                        f"Token Usage Summary row `{row_label}` has "
+                        f"a zero value `{stripped}` — no okstra run consumes zero "
+                        "tokens. Re-run `python3 scripts/okstra-token-usage.py "
+                        "<team-state> --write --summary --substitute-final-report "
+                        "<report-path>` to repopulate from session jsonls. The "
+                        "Codex/Gemini CLI row is the only place `$0.00` is "
+                        "allowed (when no CLI work was billed)."
+                    )
+# Verdict Card heading (mandatory top-of-report at-a-glance block introduced
+# with the report-format readability pass). Matches `## Verdict Card` only as
+# a section heading line (not as inline text inside a paragraph or table).
+_VERDICT_CARD_HEADING_RE = re.compile(r"^##[ \t]+Verdict Card\b", re.MULTILINE)
+# Reading Confirmation heading must NOT appear in the final-report — it
+# belongs in the worker audit sidecar (`<worker>-audit-<task-type>-<seq>.md`).
+_READING_CONFIRMATION_HEADING_RE = re.compile(
+    r"^##[ \t]+0\.[ \t]+Reading Confirmation\b", re.MULTILINE
+)
+# Empty Section 0 (Clarification Response Carried In) stub. When no
+# carry-in path is provided, the writer must OMIT the `## 0.` heading
+# entirely — emitting the heading followed by the "No prior clarification
+# response was provided" stub line is the recurring failure mode this
+# regex catches. The lazy 400-char window after the heading is sized to
+# cover the stub line + any boilerplate; it is a length bound only and
+# does not itself stop at the next `##` heading.
+_EMPTY_CARRY_IN_RE = re.compile(
+    r"^##[ \t]+0\.[ \t]+Clarification Response Carried In"
+    r"[\s\S]{0,400}?No prior clarification response was provided",
+    re.MULTILINE,
+)
+# Section 0 heading with an empty `Source file: \`\`` line — the second
+# failure shape (writer keeps the heading + Source file row but with an
+# empty backtick value because no carry-in was provided). Same remedy:
+# omit the entire `## 0.` block when carry-in is absent.
+_EMPTY_CARRY_IN_SOURCE_RE = re.compile(
+    r"^##[ \t]+0\.[ \t]+Clarification Response Carried In"
+    r"[\s\S]{0,400}?Source file:[ \t]*`\s*`",
+    re.MULTILINE,
+)
+# Deprecated section headings removed by the report-format readability
+# pass. Each entry is (regex, human-readable remedy). The regexes are
+# line-anchored to avoid false positives from inline references in prose
+# (e.g. this file itself, or skill documentation that mentions the
+# deprecated names).
+_DEPRECATED_FINAL_REPORT_PATTERNS: tuple[tuple[re.Pattern, str], ...] = (
+    (
+        re.compile(r"^###[ \t]+4\.5\.8[ \t]+User Approval Request\b", re.MULTILINE),
+        "deprecated `### 4.5.8 User Approval Request` stub — the top-of-report "
+        "`## User Approval Request (사용자 승인 게이트)` block is the only one. "
+        "Delete the §4.5.8 heading + body.",
+    ),
+    (
+        re.compile(r"^###[ \t]+4\.5\.9[ \t]+Open Questions\b", re.MULTILINE),
+        "deprecated `### 4.5.9 Open Questions` block — promote each row into "
+        "`## 5. Clarification Items` with `Kind=decision` (and `Blocks=approval` "
+        "if it gated the User Approval Request).",
+    ),
+    (
+        re.compile(
+            r"^###[ \t]+5\.1[ \t]+(?:추가 자료 요청|Additional Materials)\b",
+            re.MULTILINE,
+        ),
+        "deprecated `### 5.1 추가 자료 요청` / `Additional Materials` sub-section — "
+        "every clarification item lives as one row of the unified `## 5. "
+        "Clarification Items` 8-column table (`Kind=material`).",
+    ),
+    (
+        re.compile(
+            r"^###[ \t]+5\.2[ \t]+(?:사용자 확인 질문|Questions for the User)\b",
+            re.MULTILINE,
+        ),
+        "deprecated `### 5.2 사용자 확인 질문` / `Questions for the User` "
+        "sub-section — collapse into the unified `## 5. Clarification Items` "
+        "8-column table (`Kind=decision`).",
+    ),
+)
 def validate_report(
     report_path: Path, required_agent_status_entries: list[str], failures: list[str]
 ) -> None:
@@ -486,6 +667,126 @@ def validate_report(
                 "run `okstra-token-usage.py ... --substitute-final-report <report-path>` during Phase 7"
             )
+    # Catch the "workers typed `0` / `pending` instead of the placeholder"
+    # failure mode that bypasses the placeholder check above.
+    _scan_token_usage_summary(content, failures)
+    # Verdict Card is mandatory in every final-report (introduced with the
+    # report-format readability pass). Missing card means the reader has no
+    # at-a-glance index — first decision lives 100+ lines down.
+    if _VERDICT_CARD_HEADING_RE.search(content) is None:
+        failures.append(
+            "final report is missing the top-of-report `## Verdict Card` block — "
+            "render it between the report header and the (conditional) Approval "
+            "block. Its Verdict Token / Direction / Next Step cells must byte-match "
+            "the corresponding cells in `## 2. Final Verdict` and `## 6.` first item."
+        )
+    # Reading Confirmation belongs in the worker audit sidecar, not the
+    # user-facing final-report.
+    if _READING_CONFIRMATION_HEADING_RE.search(content) is not None:
+        failures.append(
+            "final report contains a `## 0. Reading Confirmation` heading — "
+            "Reading Confirmation lives in the worker audit sidecar "
+            "(`runs/<task-type>/worker-results/<worker>-audit-<task-type>-<seq>.md`), "
+            "never in the final-report."
+        )
+    # Empty Section 0 stub — when no carry-in path was provided, the
+    # writer must OMIT the `## 0.` heading entirely.
+    if _EMPTY_CARRY_IN_RE.search(content) is not None or _EMPTY_CARRY_IN_SOURCE_RE.search(
+        content
+    ) is not None:
+        failures.append(
+            "final report has an empty `## 0. Clarification Response Carried In "
+            "From Previous Run` stub (either the `Source file:` cell is empty or "
+            "the body contains `No prior clarification response was provided`). "
+            "When no carry-in path was provided, OMIT the entire `## 0.` heading "
+            "and body — do NOT emit a placeholder stub."
+        )
+    # Deprecated section headings — pre-1.0 hard removal.
+    for pattern, remedy in _DEPRECATED_FINAL_REPORT_PATTERNS:
+        if pattern.search(content) is not None:
+            failures.append(f"final report contains {remedy}")
+# Worker-results filename pattern: `<worker-role>-<task-type>-<seq>.md`.
+# Every analysis-worker role name ends in `-worker` (`claude-worker`,
+# `codex-worker`, `gemini-worker`, `report-writer-worker`), so anchor the
+# split on that suffix — otherwise `gemini-worker-error-analysis-001.md`
+# ambiguously parses as `worker=gemini, task=worker-error-analysis`.
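+# Errors sidecars (`.json`) never match. Audit sidecars (`*-audit-*`)
+# DO match this pattern (parsed as `task_type=audit-...`), so callers
+# must skip them by name before applying it.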
+_WORKER_RESULT_BASENAME_RE = re.compile(
+    r"^(?P<worker>[a-z][a-z0-9-]*-worker)-(?P<task_type>[a-z][a-z-]*?)-(?P<seq>\d{3})\.md$"
+)
+def validate_worker_results_audit(
+    report_path: Path, task_type: str, failures: list[str]
+) -> None:
+    """Enforce the worker audit sidecar contract.
+    For every `worker-results/<worker>-<task-type>-<seq>.md` produced by a
+    worker for the current `task_type` (skipping audit sidecars and files
+    that do not match the canonical basename pattern), the validator
+    checks:
+    1. The main worker-results file does NOT contain a `## 0. Reading
+       Confirmation` heading. That block moved to the audit sidecar with
+       the report-format readability pass.
+    2. The matching audit sidecar exists at
+       `<worker>-audit-<task-type>-<seq>.md`. Missing sidecar means the
+       worker silently skipped the reading-confirmation step.
+    Args:
+        report_path: Path of the final report; the `worker-results/`
+            directory is resolved as a sibling of the report's parent
+            directory.
+        task_type: Task type of the current run; results for other task
+            types are ignored.
+        failures: List that failure messages are appended to in place.
+    """
+    # `report_path` is `runs/<task-type>/reports/final-report-...md`; the
+    # sibling `worker-results/` directory holds every worker artifact.
+    worker_results_dir = report_path.parent.parent / "worker-results"
+    if not worker_results_dir.is_dir():
+        # No worker-results directory means no analysis workers ran (e.g.
+        # `release-handoff` which is single-lead). Nothing to enforce.
+        return
+    for path in sorted(worker_results_dir.glob("*.md")):
+        name = path.name
+        if "-audit-" in name:
+            # Audit sidecars would otherwise match the basename pattern
+            # with an `audit-...` task type, so filter them by name first.
+            continue
+        match = _WORKER_RESULT_BASENAME_RE.match(name)
+        if match is None:
+            # Files that don't match the canonical pattern (e.g. ad-hoc
+            # notes left by the operator) are out of contract scope.
+            continue
+        if match.group("task_type") != task_type:
+            # Cross-phase artifacts shouldn't appear here; skip rather
+            # than fail to keep the check focused on the current phase.
+            continue
+        worker_role = match.group("worker")
+        seq = match.group("seq")
+        try:
+            # Decode explicitly as UTF-8 and replace undecodable bytes:
+            # only an ASCII heading is searched below, so a stray bad byte
+            # must not crash the validator or depend on the host locale.
+            content = path.read_text(encoding="utf-8", errors="replace")
+        except OSError as exc:
+            failures.append(f"worker-results file unreadable: {name} ({exc})")
+            continue
+        if _READING_CONFIRMATION_HEADING_RE.search(content) is not None:
+            failures.append(
+                f"worker-results file `{name}` contains a `## 0. Reading "
+                f"Confirmation` heading — that block moved to the audit "
+                f"sidecar (`{worker_role}-audit-{task_type}-{seq}.md`). "
+                f"Remove the §0 heading + body from the main file and "
+                f"write a fresh sidecar."
+            )
+        audit_path = worker_results_dir / f"{worker_role}-audit-{task_type}-{seq}.md"
+        if not audit_path.exists():
+            failures.append(
+                f"worker `{worker_role}` produced `{name}` but no audit sidecar "
+                f"at `{audit_path.name}` — the sidecar must carry the Reading "
+                f"Confirmation block (one short line per input file). Workers "
+                f"write this in the same step as the main worker-results file."
+            )
 def validate_team_state_usage(team_state: dict, failures: list[str]) -> None:
     summary = team_state.get("usageSummary") or {}
@@ -812,7 +1113,16 @@ def attempt_token_usage_autofix(
     team_state_path.write_text(
         json.dumps(updated, indent=2, ensure_ascii=False) + "\n"
     )
-    replaced = substitute_final_report(report_path, updated)
+    try:
+        replaced = substitute_final_report(report_path, updated)
+    except Exception as exc:  # noqa: BLE001
+        # `SubstituteRefusedError` (or any unexpected substitution
+        # failure) — report it as an accuracy failure so the validator
+        # surfaces a concrete remediation instead of silently shipping
+        # a report with zeros / sentinels.
+        return "accuracy-failed", [
+            f"Phase 7 token-usage substitution refused: {exc}"
+        ]
     detail = (
         f"replaced {replaced} placeholder(s)"
         if replaced > 0
@@ -893,6 +1203,8 @@ def main() -> int:
     task_type = str(task_manifest.get("taskType") or run_manifest.get("taskType") or "").strip()
     validate_phase_boundary(task_type, report_path, failures)
+    if task_type:
+        validate_worker_results_audit(report_path, task_type, failures)
     validation_status = "passed" if not failures else "failed"
     update_validation_metadata(