npm - @ngocsangairvds/vsaf - Versions diffs - 3.2.13 → 3.2.15 - Mend

@ngocsangairvds/vsaf 3.2.13 → 3.2.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1441) hide show

package/tools/vds-scripts/audit_orchestrator/tests/unit/engine/test_llm_row_evaluator_retries.py DELETED Viewed

@@ -1,3684 +0,0 @@
-# FR-118: Bounded Retry and Degraded Finalize Policy (Phase 72)
-from __future__ import annotations
-import asyncio
-import sys
-from pathlib import Path
-from types import SimpleNamespace
-import pytest
-from vds_audit_orchestrator.engine import llm_row_evaluator as module
-from vds_audit_orchestrator.llm.audit_schemas import RowEvaluationRequest, RowEvaluationResponse
-from vds_audit_orchestrator.models.checklist import RowStatus
-class _FakeAgent:
-    created_retries: list[int] = []
-    created_kwargs: list[dict[str, object]] = []
-    def __init__(self, **kwargs):
-        self.kwargs = kwargs
-        _FakeAgent.created_retries.append(int(kwargs.get("retries", -1)))
-        _FakeAgent.created_kwargs.append(dict(kwargs))
-    def run_sync(self, *_args, **_kwargs):
-        return SimpleNamespace(
-            output={
-                "reasoning": "reasoning",
-                "reason": "reason",
-                "score": 50.0,
-                "confidence": 0.8,
-                "finding": "finding",
-            }
-        )
-    def run_stream(self, *_args, **_kwargs):
-        return _FakeStreamContext(_FakeStream())
-class _SlowAsyncAgent:
-    created_retries: list[int] = []
-    created_kwargs: list[dict[str, object]] = []
-    def __init__(self, **kwargs):
-        self.kwargs = kwargs
-        _SlowAsyncAgent.created_retries.append(int(kwargs.get("retries", -1)))
-        _SlowAsyncAgent.created_kwargs.append(dict(kwargs))
-    async def run(self, *_args, **_kwargs):
-        await asyncio.sleep(3600)
-    def run_stream(self, *_args, **_kwargs):
-        return _SlowStreamContext()
-class _AsyncRunResult:
-    def __init__(self) -> None:
-        self.output = {
-            "reasoning": "reasoning",
-            "reason": "reason",
-            "score": 50.0,
-            "confidence": 0.8,
-            "finding": "finding",
-        }
-    def all_messages(self):
-        return []
-    def usage(self):
-        return None
-class FunctionToolCallEvent:
-    def __init__(self, tool_name: str, tool_call_id: str = "call-1", args: dict | None = None) -> None:
-        self.part = SimpleNamespace(tool_name=tool_name, tool_call_id=tool_call_id, args=args or {})
-class FunctionToolResultEvent:
-    def __init__(
-        self,
-        tool_name: str,
-        *,
-        tool_call_id: str = "call-1",
-        output: dict | None = None,
-    ) -> None:
-        self.result = SimpleNamespace(tool_name=tool_name, tool_call_id=tool_call_id, output=output or {"hits": [1]})
-class _ProgressAsyncAgent:
-    created_kwargs: list[dict[str, object]] = []
-    def __init__(self, **kwargs):
-        self.kwargs = kwargs
-        _ProgressAsyncAgent.created_kwargs.append(dict(kwargs))
-    async def run(self, *_args, **kwargs):
-        handler = kwargs.get("event_stream_handler")
-        if callable(handler):
-            events = [
-                FunctionToolCallEvent("read_file", tool_call_id="call-1"),
-                FunctionToolResultEvent("read_file", tool_call_id="call-1"),
-                FunctionToolCallEvent("grep_search", tool_call_id="call-2"),
-                FunctionToolResultEvent("grep_search", tool_call_id="call-2"),
-            ]
-            async def _event_stream():
-                for event in events:
-                    await asyncio.sleep(0.015)
-                    yield event
-            await handler(None, _event_stream())
-        return _AsyncRunResult()
-    def run_stream(self, *_args, **kwargs):
-        return _ProgressStreamContext(self, *(_args), **(kwargs))
-class _EventStreamFallbackAgent:
-    call_kwargs: list[dict[str, object]] = []
-    def __init__(self, **_kwargs):
-        pass
-    async def run(self, *_args, **kwargs):
-        _EventStreamFallbackAgent.call_kwargs.append(dict(kwargs))
-        if "event_stream_handler" in kwargs:
-            raise RuntimeError("event_stream_handler unsupported")
-        return _AsyncRunResult()
-    def run_stream(self, *_args, **kwargs):
-        _EventStreamFallbackAgent.call_kwargs.append(dict(kwargs))
-        if "event_stream_handler" in kwargs:
-            raise RuntimeError("event_stream_handler unsupported")
-        return _FakeStreamContext(_FakeStream())
-class _EventStreamFallbackThenTimeoutAgent:
-    call_kwargs: list[dict[str, object]] = []
-    def __init__(self, **_kwargs):
-        pass
-    async def run(self, *_args, **kwargs):
-        _EventStreamFallbackThenTimeoutAgent.call_kwargs.append(dict(kwargs))
-        if "event_stream_handler" in kwargs:
-            raise RuntimeError("event_stream_handler unsupported by provider")
-        raise RuntimeError("prompt_agent_timeout_after_1.000s")
-    def run_stream(self, *_args, **kwargs):
-        _EventStreamFallbackThenTimeoutAgent.call_kwargs.append(dict(kwargs))
-        if "event_stream_handler" in kwargs:
-            raise RuntimeError("event_stream_handler unsupported by provider")
-        raise RuntimeError("prompt_agent_timeout_after_1.000s")
-class _IdleStallAgent:
-    def __init__(self, **_kwargs):
-        pass
-    async def run(self, *_args, **kwargs):
-        handler = kwargs.get("event_stream_handler")
-        if callable(handler):
-            async def _event_stream():
-                yield FunctionToolCallEvent("read_file", tool_call_id="stall-1")
-                yield FunctionToolResultEvent("read_file", tool_call_id="stall-1")
-            await handler(None, _event_stream())
-        await asyncio.sleep(0.2)
-        return _AsyncRunResult()
-    def run_stream(self, *_args, **kwargs):
-        return _IdleStallStreamContext()
-class _ChurnAgent:
-    def __init__(self, **_kwargs):
-        pass
-    async def run(self, *_args, **kwargs):
-        handler = kwargs.get("event_stream_handler")
-        if callable(handler):
-            async def _event_stream():
-                for idx in range(20):
-                    tool_name = "list_directory" if idx < 18 else ("read_file" if idx == 18 else "grep_search")
-                    tool_call_id = f"churn-{idx}"
-                    yield FunctionToolCallEvent(tool_name, tool_call_id=tool_call_id)
-                    yield FunctionToolResultEvent(tool_name, tool_call_id=tool_call_id)
-            await handler(None, _event_stream())
-        await asyncio.sleep(0.2)
-        return _AsyncRunResult()
-    def run_stream(self, *_args, **kwargs):
-        return _ChurnStreamContext()
-class _ProviderErrorAgent:
-    def __init__(self, **_kwargs):
-        pass
-    async def run(self, *_args, **_kwargs):
-        raise RuntimeError("status_code: 503 service unavailable")
-    def run_stream(self, *_args, **_kwargs):
-        raise RuntimeError("status_code: 503 service unavailable")
-class _FakeLogger:
-    def __init__(self) -> None:
-        self.records: list[tuple[str, dict[str, object]]] = []
-    def bind(self, **_kwargs):
-        return self
-    def info(self, event: str, **kwargs):
-        self.records.append((event, dict(kwargs)))
-    def warning(self, event: str, **kwargs):
-        self.records.append((event, dict(kwargs)))
-class _FakeStream:
-    def __init__(self) -> None:
-        self._output = {
-            "reasoning": "reasoning",
-            "reason": "reason",
-            "score": 50.0,
-            "confidence": 0.8,
-            "finding": "finding",
-        }
-    async def get_output(self):
-        return self._output
-    def all_messages(self):
-        return []
-    def usage(self):
-        return None
-class _FakeStreamContext:
-    def __init__(self, stream: _FakeStream):
-        self._stream = stream
-    async def __aenter__(self):
-        return self._stream
-    async def __aexit__(self, exc_type, exc, tb):
-        return False
-class _SlowStreamContext:
-    async def __aenter__(self):
-        return self
-    async def __aexit__(self, exc_type, exc, tb):
-        return False
-    async def get_output(self):
-        await asyncio.sleep(3600)
-class _ProgressStreamContext:
-    def __init__(self, agent, *args, **kwargs):
-        self._agent = agent
-        self._args = args
-        self._kwargs = kwargs
-    async def __aenter__(self):
-        return _ProgressStream(self._agent, *self._args, **self._kwargs)
-    async def __aexit__(self, exc_type, exc, tb):
-        return False
-class _ProgressStream:
-    def __init__(self, agent, *args, **kwargs):
-        self._agent = agent
-        self._kwargs = kwargs
-        self._output = {
-            "reasoning": "reasoning",
-            "reason": "reason",
-            "score": 50.0,
-            "confidence": 0.8,
-            "finding": "finding",
-        }
-    async def get_output(self):
-        handler = self._kwargs.get("event_stream_handler")
-        if callable(handler):
-            events = [
-                FunctionToolCallEvent("read_file", tool_call_id="call-1"),
-                FunctionToolResultEvent("read_file", tool_call_id="call-1"),
-                FunctionToolCallEvent("grep_search", tool_call_id="call-2"),
-                FunctionToolResultEvent("grep_search", tool_call_id="call-2"),
-            ]
-            async def _event_stream():
-                for event in events:
-                    await asyncio.sleep(0.015)
-                    yield event
-            await handler(None, _event_stream())
-        return self._output
-    def all_messages(self):
-        return []
-    def usage(self):
-        return None
-class _IdleStallStream:
-    def __init__(self) -> None:
-        self._output = {
-            "reasoning": "reasoning",
-            "reason": "stalled_after_tool_activity",
-            "score": 50.0,
-            "confidence": 0.8,
-            "finding": "finding",
-        }
-    async def get_output(self):
-        # Sleep longer than the 1.0s timeout so the heartbeat loop fires timeout.
-        # Note: streaming path does not populate run_telemetry with tool events,
-        # so idle/churn detection cannot trigger — timeout falls through to agent_timeout.
-        await asyncio.sleep(5.0)
-        return self._output
-    def all_messages(self):
-        return []
-    def usage(self):
-        return None
-class _IdleStallStreamContext:
-    async def __aenter__(self):
-        return _IdleStallStream()
-    async def __aexit__(self, exc_type, exc, tb):
-        return False
-class _ChurnStream:
-    def __init__(self) -> None:
-        self._output = {
-            "reasoning": "reasoning",
-            "reason": "list_directory_churn detected",
-            "score": 50.0,
-            "confidence": 0.8,
-            "finding": "finding",
-        }
-    async def get_output(self):
-        # Sleep longer than the 1.0s timeout so the heartbeat loop fires timeout.
-        # Note: streaming path does not populate run_telemetry with tool events,
-        # so churn detection cannot trigger — timeout falls through to agent_timeout.
-        await asyncio.sleep(5.0)
-        return self._output
-    def all_messages(self):
-        return []
-    def usage(self):
-        return None
-class _ChurnStreamContext:
-    async def __aenter__(self):
-        return _ChurnStream()
-    async def __aexit__(self, exc_type, exc, tb):
-        return False
-class _CodexStream:
-    def __init__(self) -> None:
-        self._output = {
-            "reasoning": "reasoning",
-            "reason": "reason",
-            "score": 50.0,
-            "confidence": 0.8,
-            "finding": "finding",
-        }
-    async def get_output(self):
-        return self._output
-    def all_messages(self):
-        return []
-    def usage(self):
-        return None
-class _CodexStreamContext:
-    def __init__(self, stream: _CodexStream):
-        self._stream = stream
-    async def __aenter__(self):
-        return self._stream
-    async def __aexit__(self, exc_type, exc, tb):
-        return False
-class _CodexAgent:
-    run_sync_calls = 0
-    run_calls = 0
-    run_stream_calls = 0
-    created_kwargs: list[dict] = []
-    def __init__(self, **kwargs):
-        self.kwargs = kwargs
-        _CodexAgent.created_kwargs.append(dict(kwargs))
-    def run_sync(self, *_args, **_kwargs):
-        _CodexAgent.run_sync_calls += 1
-        raise AssertionError("run_sync should not be used for openai-codex prompt backend")
-    async def run(self, *_args, **_kwargs):
-        _CodexAgent.run_calls += 1
-        raise AssertionError("run should not be used for openai-codex prompt backend")
-    def run_stream(self, *_args, **_kwargs):
-        _CodexAgent.run_stream_calls += 1
-        return _CodexStreamContext(_CodexStream())
-class _NonCodexStream:
-    def __init__(self, status: str = "streamed") -> None:
-        self._output = {
-            "reasoning": "reasoning",
-            "reason": "reason",
-            "score": 50.0,
-            "confidence": 0.8,
-            "finding": status,
-        }
-    async def get_output(self):
-        return self._output
-    def all_messages(self):
-        return []
-    def usage(self):
-        return None
-class _NonCodexStreamContext:
-    def __init__(self, stream: _NonCodexStream):
-        self._stream = stream
-    async def __aenter__(self):
-        return self._stream
-    async def __aexit__(self, exc_type, exc, tb):
-        return False
-class _NonCodexAgent:
-    run_sync_calls = 0
-    run_stream_calls = 0
-    created_kwargs: list[dict] = []
-    def __init__(self, **kwargs):
-        self.kwargs = kwargs
-        _NonCodexAgent.created_kwargs.append(dict(kwargs))
-    def run_sync(self, *_args, **_kwargs):
-        _NonCodexAgent.run_sync_calls += 1
-        return SimpleNamespace(
-            output={
-                "reasoning": "reasoning",
-                "reason": "reason",
-                "score": 50.0,
-                "confidence": 0.8,
-                "finding": "non-stream",
-            }
-        )
-    def run_stream(self, *_args, **_kwargs):
-        _NonCodexAgent.run_stream_calls += 1
-        return _NonCodexStreamContext(_NonCodexStream())
-def _fake_model_settings(**kwargs):
-    return SimpleNamespace(**kwargs)
-def test_row_evaluator_uses_max_of_agent_and_output_retries(monkeypatch):
-    fake_pydantic_ai = SimpleNamespace(Agent=_FakeAgent, RunContext=object)
-    fake_settings_module = SimpleNamespace(ModelSettings=_fake_model_settings)
-    monkeypatch.setitem(sys.modules, "pydantic_ai", fake_pydantic_ai)
-    monkeypatch.setitem(sys.modules, "pydantic_ai.settings", fake_settings_module)
-    import vds_audit_orchestrator.engine.row_evaluator as row_module
-    from vds_audit_orchestrator.agents import pydantic_base as _pydantic_base
-    llm_cfg = SimpleNamespace(
-        agent_retries=1,
-        output_retries=4,
-        max_tokens=1024,
-        model_standard="claude-haiku-4-5-20251001",
-        protocol=SimpleNamespace(value="anthropic"),
-        reasoning_effort="high",
-        agent_stream=False,
-    )
-    monkeypatch.setattr(row_module, "get_config", lambda: SimpleNamespace(llm=llm_cfg))
-    monkeypatch.setattr(_pydantic_base, "_build_model", lambda *_args, **_kwargs: "fake-model")
-    _FakeAgent.created_retries.clear()
-    _FakeAgent.created_kwargs = []
-    module._ROW_EVAL_AGENTS.clear()
-    evaluator = module.LLMRowEvaluator(
-        config=SimpleNamespace(
-            protocol=SimpleNamespace(value="anthropic"),
-            model="claude-haiku-4-5-20251001",
-            use_dspy=False,
-        ),
-        template_hash="tpl",
-        evidence_hash="evidence",
-    )
-    response = evaluator._invoke_sync_agent(
-        key="test-agent",
-        name="RowEvaluator",
-        system_prompt="system",
-        user_prompt="user",
-        toolsets=[],
-        profile={},
-    )
-    assert _FakeAgent.created_retries == [4]
-    assert _FakeAgent.created_kwargs[0]["output_retries"] == 4
-    model_settings = _FakeAgent.created_kwargs[0].get("model_settings")
-    assert getattr(model_settings, "anthropic_effort", None) == "high"
-    assert response.reason == "reason"
-def test_row_evaluator_sync_agent_suppresses_tool_choice_for_dashscope_openai(monkeypatch):
-    fake_pydantic_ai = SimpleNamespace(Agent=_FakeAgent, RunContext=object)
-    fake_settings_module = SimpleNamespace(ModelSettings=_fake_model_settings)
-    monkeypatch.setitem(sys.modules, "pydantic_ai", fake_pydantic_ai)
-    monkeypatch.setitem(sys.modules, "pydantic_ai.settings", fake_settings_module)
-    import vds_audit_orchestrator.engine.row_evaluator as row_module
-    from vds_audit_orchestrator.agents import pydantic_base as _pydantic_base
-    llm_cfg = SimpleNamespace(
-        agent_retries=0,
-        output_retries=0,
-        max_tokens=256,
-        model_standard="qwen3.5-plus",
-        protocol=SimpleNamespace(value="openai"),
-        base_url="https://coding-intl.dashscope.aliyuncs.com/v1",
-        reasoning_effort=None,
-    )
-    monkeypatch.setattr(row_module, "get_config", lambda: SimpleNamespace(llm=llm_cfg))
-    monkeypatch.setattr(_pydantic_base, "_build_model", lambda *_args, **_kwargs: "fake-model")
-    _FakeAgent.created_retries.clear()
-    _FakeAgent.created_kwargs = []
-    module._ROW_EVAL_AGENTS.clear()
-    module._ROW_EVAL_AGENT_TOOLSET_SIGNATURES.clear()
-    evaluator = module.LLMRowEvaluator(
-        config=SimpleNamespace(
-            protocol=SimpleNamespace(value="openai"),
-            model="qwen3.5-plus",
-            use_dspy=False,
-        ),
-        template_hash="tpl",
-        evidence_hash="evidence",
-    )
-    evaluator._invoke_sync_agent(
-        key="test-agent-dashscope-openai",
-        name="RowEvaluator",
-        system_prompt="system",
-        user_prompt="user",
-        toolsets=[],
-        profile={},
-    )
-    model_settings = _FakeAgent.created_kwargs[0].get("model_settings")
-    assert getattr(model_settings, "extra_body", None) == {"tool_choice": None}
-def test_row_evaluator_sync_agent_suppresses_tool_choice_for_anthropic_proxy(monkeypatch):
-    fake_pydantic_ai = SimpleNamespace(Agent=_FakeAgent, RunContext=object)
-    fake_settings_module = SimpleNamespace(ModelSettings=_fake_model_settings)
-    monkeypatch.setitem(sys.modules, "pydantic_ai", fake_pydantic_ai)
-    monkeypatch.setitem(sys.modules, "pydantic_ai.settings", fake_settings_module)
-    import vds_audit_orchestrator.engine.row_evaluator as row_module
-    from vds_audit_orchestrator.agents import pydantic_base as _pydantic_base
-    llm_cfg = SimpleNamespace(
-        agent_retries=0,
-        output_retries=0,
-        max_tokens=256,
-        model_standard="qwen3.5-plus",
-        protocol=SimpleNamespace(value="anthropic"),
-        base_url="https://coding-intl.dashscope.aliyuncs.com/apps/anthropic",
-        reasoning_effort=None,
-    )
-    monkeypatch.setattr(row_module, "get_config", lambda: SimpleNamespace(llm=llm_cfg))
-    monkeypatch.setattr(_pydantic_base, "_build_model", lambda *_args, **_kwargs: "fake-model")
-    _FakeAgent.created_retries.clear()
-    _FakeAgent.created_kwargs = []
-    module._ROW_EVAL_AGENTS.clear()
-    module._ROW_EVAL_AGENT_TOOLSET_SIGNATURES.clear()
-    evaluator = module.LLMRowEvaluator(
-        config=SimpleNamespace(
-            protocol=SimpleNamespace(value="anthropic"),
-            model="qwen3.5-plus",
-            use_dspy=False,
-        ),
-        template_hash="tpl",
-        evidence_hash="evidence",
-    )
-    evaluator._invoke_sync_agent(
-        key="test-agent-dashscope-anthropic",
-        name="RowEvaluator",
-        system_prompt="system",
-        user_prompt="user",
-        toolsets=[],
-        profile={},
-    )
-    model_settings = _FakeAgent.created_kwargs[0].get("model_settings")
-    assert getattr(model_settings, "extra_body", None) == {"tool_choice": None}
-@pytest.mark.asyncio
-async def test_row_evaluator_async_agent_passes_output_retries(monkeypatch):
-    fake_pydantic_ai = SimpleNamespace(Agent=_ProgressAsyncAgent, RunContext=object)
-    fake_settings_module = SimpleNamespace(ModelSettings=_fake_model_settings)
-    monkeypatch.setitem(sys.modules, "pydantic_ai", fake_pydantic_ai)
-    monkeypatch.setitem(sys.modules, "pydantic_ai.settings", fake_settings_module)
-    import vds_audit_orchestrator.engine.row_evaluator as row_module
-    from vds_audit_orchestrator.agents import pydantic_base as _pydantic_base
-    llm_cfg = SimpleNamespace(
-        agent_retries=1,
-        output_retries=4,
-        max_tokens=1024,
-        model_standard="z-ai/glm5",
-        protocol=SimpleNamespace(value="anthropic"),
-        reasoning_effort=None,
-        agent_stream=False,
-        agent_event_stream_enabled=False,
-        agent_timeout_seconds=5.0,
-        agent_timeout_max_seconds=5.0,
-        agent_timeout_extension_seconds=0.0,
-        agent_timeout_extension_attempts=0,
-        agent_timeout_heartbeat_seconds=0.1,
-        agent_idle_timeout_seconds=5.0,
-    )
-    monkeypatch.setattr(row_module, "get_config", lambda: SimpleNamespace(llm=llm_cfg))
-    monkeypatch.setattr(_pydantic_base, "_build_model", lambda *_args, **_kwargs: "fake-model")
-    _ProgressAsyncAgent.created_kwargs.clear()
-    module._ROW_EVAL_AGENTS.clear()
-    module._ROW_EVAL_AGENT_TOOLSET_SIGNATURES.clear()
-    evaluator = module.LLMRowEvaluator(
-        config=SimpleNamespace(
-            protocol=SimpleNamespace(value="anthropic"),
-            model="z-ai/glm5",
-            use_dspy=False,
-            runtime_context={},
-        ),
-        template_hash="tpl",
-        evidence_hash="evidence",
-    )
-    response = await evaluator._invoke_async_agent(
-        key="test-async-agent",
-        name="RowEvaluator",
-        system_prompt="system",
-        user_prompt="user",
-        toolsets=[],
-        profile={},
-    )
-    assert _ProgressAsyncAgent.created_kwargs[0]["output_retries"] == 4
-    assert response.reason == "reason"
-def test_row_evaluator_sync_agent_uses_max_tokens_per_request(monkeypatch):
-    fake_pydantic_ai = SimpleNamespace(Agent=_FakeAgent, RunContext=object)
-    fake_settings_module = SimpleNamespace(ModelSettings=_fake_model_settings)
-    monkeypatch.setitem(sys.modules, "pydantic_ai", fake_pydantic_ai)
-    monkeypatch.setitem(sys.modules, "pydantic_ai.settings", fake_settings_module)
-    import vds_audit_orchestrator.engine.row_evaluator as row_module
-    from vds_audit_orchestrator.agents import pydantic_base as _pydantic_base
-    llm_cfg = SimpleNamespace(
-        agent_retries=0,
-        output_retries=0,
-        max_tokens_per_request=1024,
-        model_standard="z-ai/glm5",
-        protocol=SimpleNamespace(value="anthropic"),
-        reasoning_effort=None,
-        agent_stream=False,
-    )
-    monkeypatch.setattr(row_module, "get_config", lambda: SimpleNamespace(llm=llm_cfg))
-    monkeypatch.setattr(_pydantic_base, "_build_model", lambda *_args, **_kwargs: "fake-model")
-    _FakeAgent.created_kwargs = []
-    module._ROW_EVAL_AGENTS.clear()
-    module._ROW_EVAL_AGENT_TOOLSET_SIGNATURES.clear()
-    evaluator = module.LLMRowEvaluator(
-        config=SimpleNamespace(
-            protocol=SimpleNamespace(value="anthropic"),
-            model="z-ai/glm5",
-            use_dspy=False,
-        ),
-        template_hash="tpl",
-        evidence_hash="evidence",
-    )
-    evaluator._invoke_sync_agent(
-        key="test-max-tokens",
-        name="RowEvaluator",
-        system_prompt="system",
-        user_prompt="user",
-        toolsets=[],
-        profile={},
-    )
-    model_settings = _FakeAgent.created_kwargs[0]["model_settings"]
-    assert getattr(model_settings, "max_tokens", None) == 1024
-@pytest.mark.asyncio
-async def test_invoke_async_agent_applies_hard_timeout(monkeypatch):
-    fake_pydantic_ai = SimpleNamespace(Agent=_SlowAsyncAgent, RunContext=object)
-    fake_settings_module = SimpleNamespace(ModelSettings=_fake_model_settings)
-    monkeypatch.setitem(sys.modules, "pydantic_ai", fake_pydantic_ai)
-    monkeypatch.setitem(sys.modules, "pydantic_ai.settings", fake_settings_module)
-    import vds_audit_orchestrator.engine.row_evaluator as row_module
-    from vds_audit_orchestrator.agents import pydantic_base as _pydantic_base
-    llm_cfg = SimpleNamespace(
-        agent_retries=0,
-        output_retries=0,
-        max_tokens=256,
-        model_standard="gemini-3-flash",
-        protocol=SimpleNamespace(value="gemini"),
-        agent_timeout_seconds=0.05,
-    )
-    monkeypatch.setattr(row_module, "get_config", lambda: SimpleNamespace(llm=llm_cfg))
-    monkeypatch.setattr(_pydantic_base, "_build_model", lambda *_args, **_kwargs: "fake-model")
-    _SlowAsyncAgent.created_retries.clear()
-    _SlowAsyncAgent.created_kwargs = []
-    module._ROW_EVAL_AGENTS.clear()
-    module._ROW_EVAL_AGENT_TOOLSET_SIGNATURES.clear()
-    evaluator = module.LLMRowEvaluator(
-        config=SimpleNamespace(
-            protocol=SimpleNamespace(value="gemini"),
-            model="gemini-3-flash",
-            use_dspy=False,
-            runtime_context={},
-        ),
-        template_hash="tpl",
-        evidence_hash="evidence",
-    )
-    with pytest.raises(module.RowEvaluationBackendError) as exc_info:
-        await evaluator._invoke_async_agent(
-            key="test-agent-timeout",
-            name="RowEvaluator",
-            system_prompt="system",
-            user_prompt="user",
-            toolsets=[],
-            profile={},
-        )
-    assert _SlowAsyncAgent.created_retries == [0]
-    model_settings = _SlowAsyncAgent.created_kwargs[0].get("model_settings")
-    assert not hasattr(model_settings, "openai_reasoning_effort")
-    assert exc_info.value.retry_metadata.get("reason_code") == "agent_timeout"
-    assert exc_info.value.retry_metadata.get("timeout_policy", {}).get("initial_timeout_seconds") == pytest.approx(0.05)
-def test_resolve_async_agent_timeout_policy_clamps_to_row_timeout_budget(monkeypatch):
-    import vds_audit_orchestrator.engine.row_evaluator as row_module
-    llm_cfg = SimpleNamespace(
-        agent_timeout_seconds=300.0,
-        agent_timeout_max_seconds=900.0,
-        agent_timeout_extension_seconds=180.0,
-        agent_timeout_extension_attempts=4,
-        agent_timeout_heartbeat_seconds=30.0,
-        agent_idle_timeout_seconds=600.0,
-    )
-    monkeypatch.setattr(row_module, "get_config", lambda: SimpleNamespace(llm=llm_cfg))
-    evaluator = module.LLMRowEvaluator(
-        config=SimpleNamespace(
-            protocol=SimpleNamespace(value="gemini"),
-            model="gemini-3-flash",
-            use_dspy=False,
-            runtime_context={"row_timeout_ms": 20000},
-        ),
-        template_hash="tpl",
-        evidence_hash="evidence",
-    )
-    policy = evaluator._resolve_async_agent_timeout_policy()
-    assert policy.initial_timeout_seconds == pytest.approx(25.0)
-    assert policy.max_timeout_seconds == pytest.approx(25.0)
-def test_resolve_async_agent_timeout_policy_uses_lease_budget_when_enabled(monkeypatch):
-    import vds_audit_orchestrator.engine.row_evaluator as row_module
-    llm_cfg = SimpleNamespace(
-        agent_timeout_seconds=300.0,
-        agent_timeout_max_seconds=900.0,
-        agent_timeout_extension_seconds=180.0,
-        agent_timeout_extension_attempts=4,
-        agent_timeout_heartbeat_seconds=30.0,
-        agent_idle_timeout_seconds=600.0,
-    )
-    monkeypatch.setattr(row_module, "get_config", lambda: SimpleNamespace(llm=llm_cfg))
-    evaluator = module.LLMRowEvaluator(
-        config=SimpleNamespace(
-            protocol=SimpleNamespace(value="gemini"),
-            model="gemini-3-flash",
-            use_dspy=False,
-            runtime_context={
-                "row_timeout_ms": 120000,
-                "row_progress_lease_seconds": 45,
-                "row_absolute_timeout_ms": 180000,
-            },
-        ),
-        template_hash="tpl",
-        evidence_hash="evidence",
-    )
-    policy = evaluator._resolve_async_agent_timeout_policy()
-    assert policy.initial_timeout_seconds == pytest.approx(45.0)
-    assert policy.max_timeout_seconds == pytest.approx(180.0)
-@pytest.mark.asyncio
-async def test_invoke_async_agent_extends_timeout_when_event_stream_progress_continues(monkeypatch):
-    fake_pydantic_ai = SimpleNamespace(Agent=_ProgressAsyncAgent, RunContext=object)
-    fake_settings_module = SimpleNamespace(ModelSettings=_fake_model_settings)
-    monkeypatch.setitem(sys.modules, "pydantic_ai", fake_pydantic_ai)
-    monkeypatch.setitem(sys.modules, "pydantic_ai.settings", fake_settings_module)
-    import vds_audit_orchestrator.engine.row_evaluator as row_module
-    from vds_audit_orchestrator.agents import pydantic_base as _pydantic_base
-    llm_cfg = SimpleNamespace(
-        agent_retries=0,
-        output_retries=0,
-        max_tokens=256,
-        model_standard="gemini-3-flash",
-        protocol=SimpleNamespace(value="gemini"),
-        agent_timeout_seconds=0.02,
-        agent_timeout_max_seconds=0.08,
-        agent_timeout_extension_seconds=0.03,
-        agent_timeout_extension_attempts=1,
-        agent_timeout_heartbeat_seconds=0.005,
-        agent_idle_timeout_seconds=1.0,
-    )
-    monkeypatch.setattr(row_module, "get_config", lambda: SimpleNamespace(llm=llm_cfg))
-    monkeypatch.setattr(_pydantic_base, "_build_model", lambda *_args, **_kwargs: "fake-model")
-    module._ROW_EVAL_AGENTS.clear()
-    module._ROW_EVAL_AGENT_TOOLSET_SIGNATURES.clear()
-    _ProgressAsyncAgent.created_kwargs = []
-    evaluator = module.LLMRowEvaluator(
-        config=SimpleNamespace(
-            protocol=SimpleNamespace(value="gemini"),
-            model="gemini-3-flash",
-            use_dspy=False,
-            runtime_context={"row_timeout_ms": 1000},
-        ),
-        template_hash="tpl",
-        evidence_hash="evidence",
-    )
-    response = await evaluator._invoke_async_agent(
-        key="test-agent-progress",
-        name="RowEvaluator",
-        system_prompt="system",
-        user_prompt="user",
-        toolsets=[],
-        profile={},
-    )
-    assert response.reason == "reason"
-    telemetry = evaluator.get_last_prompt_tool_telemetry_snapshot()
-    assert telemetry is not None
-    assert telemetry["prompt_tool_telemetry"]["event_tool_calls_completed"] == 2
-    assert "read_file" in telemetry["prompt_tool_telemetry"]["event_tool_names"]
-@pytest.mark.asyncio
-async def test_invoke_async_agent_heartbeat_includes_runtime_context(monkeypatch):
-    fake_pydantic_ai = SimpleNamespace(Agent=_ProgressAsyncAgent, RunContext=object)
-    fake_settings_module = SimpleNamespace(ModelSettings=_fake_model_settings)
-    monkeypatch.setitem(sys.modules, "pydantic_ai", fake_pydantic_ai)
-    monkeypatch.setitem(sys.modules, "pydantic_ai.settings", fake_settings_module)
-    import vds_audit_orchestrator.engine.row_evaluator as row_module
-    from vds_audit_orchestrator.agents import pydantic_base as _pydantic_base
-    llm_cfg = SimpleNamespace(
-        agent_retries=0,
-        output_retries=0,
-        max_tokens=256,
-        model_standard="gemini-3-flash",
-        protocol=SimpleNamespace(value="gemini"),
-        agent_timeout_seconds=0.02,
-        agent_timeout_max_seconds=0.08,
-        agent_timeout_extension_seconds=0.03,
-        agent_timeout_extension_attempts=1,
-        agent_timeout_heartbeat_seconds=0.005,
-        agent_idle_timeout_seconds=1.0,
-    )
-    monkeypatch.setattr(row_module, "get_config", lambda: SimpleNamespace(llm=llm_cfg))
-    monkeypatch.setattr(_pydantic_base, "_build_model", lambda *_args, **_kwargs: "fake-model")
-    fake_logger = _FakeLogger()
-    monkeypatch.setattr(module, "logger", fake_logger)
-    module._ROW_EVAL_AGENTS.clear()
-    module._ROW_EVAL_AGENT_TOOLSET_SIGNATURES.clear()
-    _ProgressAsyncAgent.created_kwargs = []
-    evaluator = module.LLMRowEvaluator(
-        config=SimpleNamespace(
-            protocol=SimpleNamespace(value="gemini"),
-            model="gemini-3-flash",
-            use_dspy=False,
-            runtime_context={
-                "thread_id": "thread-1",
-                "run_id": "run-1",
-                "audit_id": "run-1",
-                "project_key": "project-1",
-                "repo_key": "repo-1",
-            },
-        ),
-        template_hash="tpl",
-        evidence_hash="evidence",
-    )
-    response = await evaluator._invoke_async_agent(
-        key="test-agent-progress-runtime-fields",
-        name="RowEvaluator",
-        system_prompt="system",
-        user_prompt="user",
-        toolsets=[],
-        profile={},
-    )
-    assert response.reason == "reason"
-    heartbeat_payloads = [payload for event, payload in fake_logger.records if event == "prompt_agent_call_heartbeat"]
-    assert heartbeat_payloads
-    assert any(payload.get("thread_id") == "thread-1" for payload in heartbeat_payloads)
-    assert any(payload.get("repo_key") == "repo-1" for payload in heartbeat_payloads)
-@pytest.mark.asyncio
-async def test_invoke_async_agent_retries_without_event_stream(monkeypatch):
-    fake_pydantic_ai = SimpleNamespace(Agent=_EventStreamFallbackAgent, RunContext=object)
-    fake_settings_module = SimpleNamespace(ModelSettings=_fake_model_settings)
-    monkeypatch.setitem(sys.modules, "pydantic_ai", fake_pydantic_ai)
-    monkeypatch.setitem(sys.modules, "pydantic_ai.settings", fake_settings_module)
-    import vds_audit_orchestrator.engine.row_evaluator as row_module
-    from vds_audit_orchestrator.agents import pydantic_base as _pydantic_base
-    llm_cfg = SimpleNamespace(
-        agent_retries=0,
-        output_retries=0,
-        max_tokens=256,
-        model_standard="gemini-3-flash",
-        protocol=SimpleNamespace(value="gemini"),
-        agent_timeout_seconds=0.2,
-        agent_timeout_max_seconds=0.2,
-        agent_timeout_extension_seconds=0.0,
-        agent_timeout_extension_attempts=0,
-        agent_timeout_heartbeat_seconds=0.01,
-        agent_idle_timeout_seconds=1.0,
-    )
-    monkeypatch.setattr(row_module, "get_config", lambda: SimpleNamespace(llm=llm_cfg))
-    monkeypatch.setattr(_pydantic_base, "_build_model", lambda *_args, **_kwargs: "fake-model")
-    module._ROW_EVAL_AGENTS.clear()
-    module._ROW_EVAL_AGENT_TOOLSET_SIGNATURES.clear()
-    _EventStreamFallbackAgent.call_kwargs = []
-    evaluator = module.LLMRowEvaluator(
-        config=SimpleNamespace(
-            protocol=SimpleNamespace(value="gemini"),
-            model="gemini-3-flash",
-            use_dspy=False,
-            runtime_context={},
-        ),
-        template_hash="tpl",
-        evidence_hash="evidence",
-    )
-    response = await evaluator._invoke_async_agent(
-        key="test-agent-event-stream-fallback",
-        name="RowEvaluator",
-        system_prompt="system",
-        user_prompt="user",
-        toolsets=[],
-        profile={},
-    )
-    assert response.reason == "reason"
-    assert len(_EventStreamFallbackAgent.call_kwargs) == 2
-    assert "event_stream_handler" in _EventStreamFallbackAgent.call_kwargs[0]
-    assert "event_stream_handler" not in _EventStreamFallbackAgent.call_kwargs[1]
-@pytest.mark.asyncio
-async def test_invoke_async_agent_normalizes_fallback_failure(monkeypatch):
-    fake_pydantic_ai = SimpleNamespace(Agent=_EventStreamFallbackThenTimeoutAgent, RunContext=object)
-    fake_settings_module = SimpleNamespace(ModelSettings=_fake_model_settings)
-    monkeypatch.setitem(sys.modules, "pydantic_ai", fake_pydantic_ai)
-    monkeypatch.setitem(sys.modules, "pydantic_ai.settings", fake_settings_module)
-    import vds_audit_orchestrator.engine.row_evaluator as row_module
-    from vds_audit_orchestrator.agents import pydantic_base as _pydantic_base
-    llm_cfg = SimpleNamespace(
-        agent_retries=0,
-        output_retries=0,
-        max_tokens=256,
-        model_standard="gemini-3-flash",
-        protocol=SimpleNamespace(value="gemini"),
-        agent_timeout_seconds=0.2,
-        agent_timeout_max_seconds=0.2,
-        agent_timeout_extension_seconds=0.0,
-        agent_timeout_extension_attempts=0,
-        agent_timeout_heartbeat_seconds=0.01,
-        agent_idle_timeout_seconds=1.0,
-    )
-    monkeypatch.setattr(row_module, "get_config", lambda: SimpleNamespace(llm=llm_cfg))
-    monkeypatch.setattr(_pydantic_base, "_build_model", lambda *_args, **_kwargs: "fake-model")
-    module._ROW_EVAL_AGENTS.clear()
-    module._ROW_EVAL_AGENT_TOOLSET_SIGNATURES.clear()
-    _EventStreamFallbackThenTimeoutAgent.call_kwargs = []
-    evaluator = module.LLMRowEvaluator(
-        config=SimpleNamespace(
-            protocol=SimpleNamespace(value="gemini"),
-            model="gemini-3-flash",
-            use_dspy=False,
-            runtime_context={},
-        ),
-        template_hash="tpl",
-        evidence_hash="evidence",
-    )
-    with pytest.raises(module.RowEvaluationBackendError) as exc_info:
-        await evaluator._invoke_async_agent(
-            key="test-agent-event-stream-fallback-timeout",
-            name="RowEvaluator",
-            system_prompt="system",
-            user_prompt="user",
-            toolsets=[],
-            profile={},
-        )
-    assert len(_EventStreamFallbackThenTimeoutAgent.call_kwargs) == 2
-    assert exc_info.value.retry_metadata.get("reason_code") == "agent_timeout"
-@pytest.mark.asyncio
-async def test_invoke_async_agent_reports_idle_timeout_reason_code(monkeypatch):
-    fake_pydantic_ai = SimpleNamespace(Agent=_IdleStallAgent, RunContext=object)
-    fake_settings_module = SimpleNamespace(ModelSettings=_fake_model_settings)
-    monkeypatch.setitem(sys.modules, "pydantic_ai", fake_pydantic_ai)
-    monkeypatch.setitem(sys.modules, "pydantic_ai.settings", fake_settings_module)
-    import vds_audit_orchestrator.engine.row_evaluator as row_module
-    from vds_audit_orchestrator.agents import pydantic_base as _pydantic_base
-    llm_cfg = SimpleNamespace(
-        agent_retries=0,
-        output_retries=0,
-        max_tokens=256,
-        model_standard="gemini-3-flash",
-        protocol=SimpleNamespace(value="gemini"),
-        agent_timeout_seconds=1.0,
-        agent_timeout_max_seconds=1.0,
-        agent_timeout_extension_seconds=0.0,
-        agent_timeout_extension_attempts=0,
-        agent_timeout_heartbeat_seconds=0.005,
-        agent_idle_timeout_seconds=0.02,
-        agent_stream=False,
-    )
-    monkeypatch.setattr(row_module, "get_config", lambda: SimpleNamespace(llm=llm_cfg))
-    monkeypatch.setattr(_pydantic_base, "_build_model", lambda *_args, **_kwargs: "fake-model")
-    module._ROW_EVAL_AGENTS.clear()
-    module._ROW_EVAL_AGENT_TOOLSET_SIGNATURES.clear()
-    evaluator = module.LLMRowEvaluator(
-        config=SimpleNamespace(
-            protocol=SimpleNamespace(value="gemini"),
-            model="gemini-3-flash",
-            use_dspy=False,
-            runtime_context={},
-        ),
-        template_hash="tpl",
-        evidence_hash="evidence",
-    )
-    with pytest.raises(module.RowEvaluationBackendError) as exc_info:
-        await evaluator._invoke_async_agent(
-            key="test-agent-idle-timeout",
-            name="RowEvaluator",
-            system_prompt="system",
-            user_prompt="user",
-            toolsets=[],
-            profile={},
-        )
-    assert exc_info.value.retry_metadata.get("reason_code") == "agent_idle_timeout"
-@pytest.mark.asyncio
-async def test_invoke_async_agent_reports_tool_churn_reason_code(monkeypatch):
-    fake_pydantic_ai = SimpleNamespace(Agent=_ChurnAgent, RunContext=object)
-    fake_settings_module = SimpleNamespace(ModelSettings=_fake_model_settings)
-    monkeypatch.setitem(sys.modules, "pydantic_ai", fake_pydantic_ai)
-    monkeypatch.setitem(sys.modules, "pydantic_ai.settings", fake_settings_module)
-    import vds_audit_orchestrator.engine.row_evaluator as row_module
-    from vds_audit_orchestrator.agents import pydantic_base as _pydantic_base
-    llm_cfg = SimpleNamespace(
-        agent_retries=0,
-        output_retries=0,
-        max_tokens=256,
-        model_standard="gemini-3-flash",
-        protocol=SimpleNamespace(value="gemini"),
-        agent_timeout_seconds=1.0,
-        agent_timeout_max_seconds=1.0,
-        agent_timeout_extension_seconds=0.0,
-        agent_timeout_extension_attempts=0,
-        agent_timeout_heartbeat_seconds=0.005,
-        agent_idle_timeout_seconds=1.0,
-        agent_stream=False,
-    )
-    monkeypatch.setattr(row_module, "get_config", lambda: SimpleNamespace(llm=llm_cfg))
-    monkeypatch.setattr(_pydantic_base, "_build_model", lambda *_args, **_kwargs: "fake-model")
-    module._ROW_EVAL_AGENTS.clear()
-    module._ROW_EVAL_AGENT_TOOLSET_SIGNATURES.clear()
-    evaluator = module.LLMRowEvaluator(
-        config=SimpleNamespace(
-            protocol=SimpleNamespace(value="gemini"),
-            model="gemini-3-flash",
-            use_dspy=False,
-            runtime_context={},
-        ),
-        template_hash="tpl",
-        evidence_hash="evidence",
-    )
-    with pytest.raises(module.RowEvaluationBackendError) as exc_info:
-        await evaluator._invoke_async_agent(
-            key="test-agent-tool-churn",
-            name="RowEvaluator",
-            system_prompt="system",
-            user_prompt="user",
-            toolsets=[],
-            profile={},
-        )
-    assert exc_info.value.retry_metadata.get("reason_code") == "agent_tool_churn"
-@pytest.mark.asyncio
-async def test_invoke_async_agent_preserves_provider_server_error_reason_code(monkeypatch):
-    fake_pydantic_ai = SimpleNamespace(Agent=_ProviderErrorAgent, RunContext=object)
-    fake_settings_module = SimpleNamespace(ModelSettings=_fake_model_settings)
-    monkeypatch.setitem(sys.modules, "pydantic_ai", fake_pydantic_ai)
-    monkeypatch.setitem(sys.modules, "pydantic_ai.settings", fake_settings_module)
-    import vds_audit_orchestrator.engine.row_evaluator as row_module
-    from vds_audit_orchestrator.agents import pydantic_base as _pydantic_base
-    llm_cfg = SimpleNamespace(
-        agent_retries=0,
-        output_retries=0,
-        max_tokens=256,
-        model_standard="gemini-3-flash",
-        protocol=SimpleNamespace(value="gemini"),
-        agent_timeout_seconds=1.0,
-        agent_timeout_max_seconds=1.0,
-        agent_timeout_extension_seconds=0.0,
-        agent_timeout_extension_attempts=0,
-        agent_timeout_heartbeat_seconds=0.01,
-        agent_idle_timeout_seconds=1.0,
-    )
-    monkeypatch.setattr(row_module, "get_config", lambda: SimpleNamespace(llm=llm_cfg))
-    monkeypatch.setattr(_pydantic_base, "_build_model", lambda *_args, **_kwargs: "fake-model")
-    module._ROW_EVAL_AGENTS.clear()
-    module._ROW_EVAL_AGENT_TOOLSET_SIGNATURES.clear()
-    evaluator = module.LLMRowEvaluator(
-        config=SimpleNamespace(
-            protocol=SimpleNamespace(value="gemini"),
-            model="gemini-3-flash",
-            use_dspy=False,
-            runtime_context={},
-        ),
-        template_hash="tpl",
-        evidence_hash="evidence",
-    )
-    with pytest.raises(module.RowEvaluationBackendError) as exc_info:
-        await evaluator._invoke_async_agent(
-            key="test-agent-provider-server-error",
-            name="RowEvaluator",
-            system_prompt="system",
-            user_prompt="user",
-            toolsets=[],
-            profile={},
-        )
-    assert exc_info.value.retry_metadata.get("reason_code") == "provider_server_error"
-@pytest.mark.asyncio
-async def test_invoke_async_agent_with_provider_retry_recovers_from_provider_server_error(monkeypatch):
-    evaluator = module.LLMRowEvaluator(
-        config=SimpleNamespace(
-            protocol=SimpleNamespace(value="gemini"),
-            model="gemini-3-flash",
-            use_dspy=False,
-            runtime_context={},
-        ),
-        template_hash="tpl",
-        evidence_hash="evidence",
-    )
-    calls = {"count": 0}
-    async def _fake_invoke_async_agent(**_kwargs):
-        calls["count"] += 1
-        if calls["count"] == 1:
-            raise module.RowEvaluationBackendError(
-                "provider failed",
-                retry_metadata={"reason_code": "provider_server_error"},
-            )
-        return RowEvaluationResponse(
-            reasoning="reasoning",
-            reason="reason",
-            score=50.0,
-            confidence=0.8,
-            finding="finding",
-        )
-    monkeypatch.setattr(module, "_prompt_provider_retry_attempts", lambda: 2)
-    monkeypatch.setattr(module, "_prompt_provider_retry_backoff_seconds", lambda: 0.0)
-    monkeypatch.setattr(evaluator, "_invoke_async_agent", _fake_invoke_async_agent)
-    response = await evaluator._invoke_async_agent_with_provider_retry(
-        key="retry-test",
-        name="RowEvaluator",
-        system_prompt="system",
-        user_prompt="user",
-        toolsets=[],
-        profile={},
-        row_id="CL-001",
-        runtime_fields={},
-    )
-    assert response.reason == "reason"
-    assert calls["count"] == 2
-    telemetry = evaluator.get_last_prompt_tool_telemetry_snapshot() or {}
-    provider_retry = telemetry.get("prompt_provider_retry") or evaluator._last_prompt_tool_telemetry.get(
-        "prompt_provider_retry"
-    )
-    assert provider_retry["reason_code"] == "provider_server_error"
-    assert provider_retry["recovered"] is True
-@pytest.mark.asyncio
-async def test_invoke_async_agent_with_provider_retry_recovers_from_wrapped_internal_network_failure(monkeypatch):
-    evaluator = module.LLMRowEvaluator(
-        config=SimpleNamespace(
-            protocol=SimpleNamespace(value="anthropic"),
-            model="glm-5",
-            use_dspy=False,
-            runtime_context={},
-        ),
-        template_hash="tpl",
-        evidence_hash="evidence",
-    )
-    calls = {"count": 0}
-    async def _fake_invoke_async_agent(**_kwargs):
-        calls["count"] += 1
-        if calls["count"] == 1:
-            raise module.RowEvaluationBackendError(
-                "{'type': 'error', 'error': {'type': 'api_error', 'message': 'Internal Network Failure'}}",
-                retry_metadata={"reason_code": "agent_timeout"},
-            )
-        return RowEvaluationResponse(
-            reasoning="reasoning",
-            reason="reason",
-            score=50.0,
-            confidence=0.8,
-            finding="finding",
-        )
-    monkeypatch.setattr(module, "_prompt_provider_retry_attempts", lambda: 2)
-    monkeypatch.setattr(module, "_prompt_provider_retry_backoff_seconds", lambda: 0.0)
-    monkeypatch.setattr(evaluator, "_invoke_async_agent", _fake_invoke_async_agent)
-    response = await evaluator._invoke_async_agent_with_provider_retry(
-        key="retry-test-wrapped-network-failure",
-        name="RowEvaluator",
-        system_prompt="system",
-        user_prompt="user",
-        toolsets=[],
-        profile={},
-        row_id="CL-011",
-        runtime_fields={},
-    )
-    assert response.reason == "reason"
-    assert calls["count"] == 2
-    telemetry = evaluator.get_last_prompt_tool_telemetry_snapshot() or {}
-    provider_retry = telemetry.get("prompt_provider_retry") or evaluator._last_prompt_tool_telemetry.get(
-        "prompt_provider_retry"
-    )
-    assert provider_retry["reason_code"] == "provider_transient_error"
-    assert provider_retry["recovered"] is True
-@pytest.mark.asyncio
-async def test_invoke_async_agent_with_provider_retry_fails_over_to_fallback_profile(monkeypatch):
-    evaluator = module.LLMRowEvaluator(
-        config=SimpleNamespace(
-            protocol=SimpleNamespace(value="openai"),
-            model="primary-model",
-            use_dspy=False,
-            runtime_context={},
-            row_failover_profiles=["fallback-openai"],
-        ),
-        template_hash="tpl",
-        evidence_hash="evidence",
-    )
-    primary_cfg = SimpleNamespace(
-        agent_retries=0,
-        output_retries=0,
-        max_tokens=256,
-        model_standard="primary-model",
-        protocol=SimpleNamespace(value="openai"),
-        base_url="https://primary.example/v1",
-        agent_stream=False,
-        stream=False,
-        row_failover_profiles=["fallback-openai"],
-        row_failover_max_provider_hops=1,
-    )
-    fallback_cfg = SimpleNamespace(
-        agent_retries=0,
-        output_retries=0,
-        max_tokens=256,
-        model_standard="fallback-model",
-        protocol=SimpleNamespace(value="openai"),
-        base_url="https://fallback.example/v1",
-        agent_stream=False,
-        stream=False,
-        row_failover_profiles=[],
-        row_failover_max_provider_hops=1,
-    )
-    async def _fake_invoke_async_agent(**kwargs):
-        llm_cfg = kwargs.get("llm_cfg")
-        if getattr(llm_cfg, "model_standard", "") == "primary-model":
-            raise module.RowEvaluationBackendError(
-                "status_code: 503 service unavailable",
-                retry_metadata={"reason_code": "provider_server_error"},
-            )
-        return RowEvaluationResponse(
-            reasoning="reasoning",
-            reason="reason",
-            score=50.0,
-            confidence=0.8,
-            finding="finding",
-        )
-    monkeypatch.setenv("VDS_AUDIT_ACTIVE_PROFILE", "primary-openai")
-    monkeypatch.setenv("VDS_AUDIT_LLM__ROW_FAILOVER_PROFILES", "configured")
-    monkeypatch.setattr(module, "_prompt_provider_retry_attempts", lambda: 1)
-    monkeypatch.setattr(module, "_prompt_provider_retry_backoff_seconds", lambda: 0.0)
-    monkeypatch.setattr(module, "inherit_runtime_llm_policy", lambda profile_name, *, source_llm=None: fallback_cfg)
-    monkeypatch.setattr(evaluator, "_invoke_async_agent", _fake_invoke_async_agent)
-    response = await evaluator._invoke_async_agent_with_provider_retry(
-        key="retry-test-provider-failover",
-        name="RowEvaluator",
-        system_prompt="system",
-        user_prompt="user",
-        toolsets=[],
-        profile={},
-        row_id="CL-001",
-        runtime_fields={"check_id": "CL-001"},
-        llm_cfg=primary_cfg,
-        effective_model="primary-model",
-        runtime_profile_name="primary-openai",
-    )
-    assert response.reason == "reason"
-    retry_telemetry = evaluator.get_last_retry_telemetry_snapshot() or {}
-    assert retry_telemetry["provider_failover_attempted"] is True
-    assert retry_telemetry["provider_failover_final_provider"] == "fallback-openai"
-    assert retry_telemetry["provider_failover_chain"] == ["primary-openai", "fallback-openai"]
-    assert retry_telemetry["provider_failover_hops"] == 1
-@pytest.mark.asyncio
-async def test_invoke_async_agent_with_provider_retry_marks_failover_exhausted_when_no_fallback_remains(monkeypatch):
-    evaluator = module.LLMRowEvaluator(
-        config=SimpleNamespace(
-            protocol=SimpleNamespace(value="openai"),
-            model="primary-model",
-            use_dspy=False,
-            runtime_context={},
-            row_failover_profiles=["fallback-openai", "dead-openai"],
-        ),
-        template_hash="tpl",
-        evidence_hash="evidence",
-    )
-    primary_cfg = SimpleNamespace(
-        agent_retries=0,
-        output_retries=0,
-        max_tokens=256,
-        model_standard="primary-model",
-        protocol=SimpleNamespace(value="openai"),
-        base_url="https://primary.example/v1",
-        agent_stream=False,
-        stream=False,
-        row_failover_profiles=["fallback-openai", "dead-openai"],
-        row_failover_max_provider_hops=1,
-    )
-    async def _fake_invoke_async_agent(**_kwargs):
-        raise module.RowEvaluationBackendError(
-            "status_code: 503 service unavailable",
-            retry_metadata={"reason_code": "provider_server_error"},
-        )
-    monkeypatch.setenv("VDS_AUDIT_ACTIVE_PROFILE", "primary-openai")
-    monkeypatch.setenv("VDS_AUDIT_ROW_FAILOVER_AVAILABLE_PROFILES", "fallback-openai")
-    monkeypatch.setenv("VDS_AUDIT_LLM__ROW_FAILOVER_PROFILES", "configured")
-    monkeypatch.setattr(module, "_prompt_provider_retry_attempts", lambda: 1)
-    monkeypatch.setattr(module, "_prompt_provider_retry_backoff_seconds", lambda: 0.0)
-    monkeypatch.setattr(evaluator, "_invoke_async_agent", _fake_invoke_async_agent)
-    with pytest.raises(module.RowEvaluationBackendError) as exc_info:
-        await evaluator._invoke_async_agent_with_provider_retry(
-            key="retry-test-failover-exhausted",
-            name="RowEvaluator",
-            system_prompt="system",
-            user_prompt="user",
-            toolsets=[],
-            profile={},
-            row_id="CL-011",
-            runtime_fields={"check_id": "CL-011"},
-            llm_cfg=primary_cfg,
-            effective_model="primary-model",
-            runtime_profile_name="primary-openai",
-            row_failover_context=module.RowFailoverContext(
-                row_id="CL-011",
-                check_id="CL-011",
-                original_provider="primary-openai",
-                failover_count=1,
-                failover_chain=["primary-openai", "fallback-openai"],
-            ),
-            provider_health_memory=module.ProviderHealthMemory(),
-        )
-    retry_metadata = exc_info.value.retry_metadata or {}
-    assert retry_metadata["timeout_kind"] == "timeout_failover_exhausted"
-    retry_telemetry = evaluator.get_last_retry_telemetry_snapshot() or {}
-    assert retry_telemetry["provider_failover_final_provider"] == "primary-openai"
-    assert retry_telemetry["provider_failover_profiles_considered"] == ["fallback-openai"]
-def test_classify_prompt_provider_retry_reason_treats_429_as_retryable():
-    exc = RuntimeError("status_code: 429, body: {'error': {'type': 'rate_limit_error'}}")
-    assert module._classify_prompt_provider_retry_reason(exc) == "provider_transient_error"
-@pytest.mark.asyncio
-async def test_invoke_async_agent_uses_run_stream_for_openai_codex(monkeypatch):
-    fake_pydantic_ai = SimpleNamespace(Agent=_CodexAgent, RunContext=object)
-    fake_settings_module = SimpleNamespace(ModelSettings=_fake_model_settings)
-    monkeypatch.setitem(sys.modules, "pydantic_ai", fake_pydantic_ai)
-    monkeypatch.setitem(sys.modules, "pydantic_ai.settings", fake_settings_module)
-    import vds_audit_orchestrator.engine.row_evaluator as row_module
-    from vds_audit_orchestrator.agents import pydantic_base as _pydantic_base
-    llm_cfg = SimpleNamespace(
-        agent_retries=0,
-        output_retries=0,
-        max_tokens=256,
-        model_standard="gpt-5.3-codex",
-        protocol=SimpleNamespace(value="openai-codex"),
-        agent_timeout_seconds=5.0,
-        reasoning_effort="medium",
-    )
-    monkeypatch.setattr(row_module, "get_config", lambda: SimpleNamespace(llm=llm_cfg))
-    monkeypatch.setattr(_pydantic_base, "_build_model", lambda *_args, **_kwargs: "fake-model")
-    module._ROW_EVAL_AGENTS.clear()
-    module._ROW_EVAL_AGENT_TOOLSET_SIGNATURES.clear()
-    _CodexAgent.run_sync_calls = 0
-    _CodexAgent.run_calls = 0
-    _CodexAgent.run_stream_calls = 0
-    _CodexAgent.created_kwargs = []
-    evaluator = module.LLMRowEvaluator(
-        config=SimpleNamespace(
-            protocol=SimpleNamespace(value="openai-codex"),
-            model="gpt-5.3-codex",
-            use_dspy=False,
-            runtime_context={},
-        ),
-        template_hash="tpl",
-        evidence_hash="evidence",
-    )
-    response = await evaluator._invoke_async_agent(
-        key="test-agent-codex-async",
-        name="RowEvaluator",
-        system_prompt="system",
-        user_prompt="user",
-        toolsets=[],
-        profile={},
-    )
-    assert response.reason == "reason"
-    assert _CodexAgent.run_stream_calls == 1
-    assert _CodexAgent.run_calls == 0
-    model_settings = _CodexAgent.created_kwargs[0].get("model_settings")
-    assert getattr(model_settings, "max_tokens", None) is None
-    assert getattr(model_settings, "openai_reasoning_effort", None) == "medium"
-def test_invoke_sync_agent_uses_run_stream_for_openai_codex(monkeypatch):
-    fake_pydantic_ai = SimpleNamespace(Agent=_CodexAgent, RunContext=object)
-    fake_settings_module = SimpleNamespace(ModelSettings=_fake_model_settings)
-    monkeypatch.setitem(sys.modules, "pydantic_ai", fake_pydantic_ai)
-    monkeypatch.setitem(sys.modules, "pydantic_ai.settings", fake_settings_module)
-    import vds_audit_orchestrator.engine.row_evaluator as row_module
-    from vds_audit_orchestrator.agents import pydantic_base as _pydantic_base
-    llm_cfg = SimpleNamespace(
-        agent_retries=1,
-        output_retries=1,
-        max_tokens=256,
-        model_standard="gpt-5.3-codex",
-        protocol=SimpleNamespace(value="openai-codex"),
-        reasoning_effort="xhigh",
-    )
-    monkeypatch.setattr(row_module, "get_config", lambda: SimpleNamespace(llm=llm_cfg))
-    monkeypatch.setattr(_pydantic_base, "_build_model", lambda *_args, **_kwargs: "fake-model")
-    module._ROW_EVAL_AGENTS.clear()
-    module._ROW_EVAL_AGENT_TOOLSET_SIGNATURES.clear()
-    _CodexAgent.run_sync_calls = 0
-    _CodexAgent.run_calls = 0
-    _CodexAgent.run_stream_calls = 0
-    _CodexAgent.created_kwargs = []
-    evaluator = module.LLMRowEvaluator(
-        config=SimpleNamespace(
-            protocol=SimpleNamespace(value="openai-codex"),
-            model="gpt-5.3-codex",
-            use_dspy=False,
-        ),
-        template_hash="tpl",
-        evidence_hash="evidence",
-    )
-    response = evaluator._invoke_sync_agent(
-        key="test-agent-codex-sync",
-        name="RowEvaluator",
-        system_prompt="system",
-        user_prompt="user",
-        toolsets=[],
-        profile={},
-    )
-    assert response.reason == "reason"
-    assert _CodexAgent.run_stream_calls == 1
-    assert _CodexAgent.run_sync_calls == 0
-    model_settings = _CodexAgent.created_kwargs[0].get("model_settings")
-    assert getattr(model_settings, "max_tokens", None) is None
-    assert getattr(model_settings, "openai_reasoning_effort", None) == "xhigh"
-@pytest.mark.parametrize(
-    ("agent_stream", "stream", "expected_run_sync", "expected_run_stream"),
-    [
-        (False, False, 1, 0),
-        (True, False, 0, 1),
-        (False, True, 0, 1),
-    ],
-)
-def test_invoke_sync_agent_honors_non_codex_stream_mode_matrix(
-    monkeypatch, agent_stream, stream, expected_run_sync, expected_run_stream
-):
-    fake_pydantic_ai = SimpleNamespace(Agent=_NonCodexAgent, RunContext=object)
-    fake_settings_module = SimpleNamespace(ModelSettings=_fake_model_settings)
-    monkeypatch.setitem(sys.modules, "pydantic_ai", fake_pydantic_ai)
-    monkeypatch.setitem(sys.modules, "pydantic_ai.settings", fake_settings_module)
-    import vds_audit_orchestrator.engine.row_evaluator as row_module
-    from vds_audit_orchestrator.agents import pydantic_base as _pydantic_base
-    llm_cfg = SimpleNamespace(
-        agent_retries=1,
-        output_retries=1,
-        max_tokens=256,
-        model_standard="gpt-4o-mini",
-        protocol=SimpleNamespace(value="openai"),
-        agent_stream=agent_stream,
-        stream=stream,
-        reasoning_effort=None,
-    )
-    monkeypatch.setattr(row_module, "get_config", lambda: SimpleNamespace(llm=llm_cfg))
-    monkeypatch.setattr(_pydantic_base, "_build_model", lambda *_args, **_kwargs: "fake-model")
-    module._ROW_EVAL_AGENTS.clear()
-    module._ROW_EVAL_AGENT_TOOLSET_SIGNATURES.clear()
-    _NonCodexAgent.run_sync_calls = 0
-    _NonCodexAgent.run_stream_calls = 0
-    _NonCodexAgent.created_kwargs = []
-    evaluator = module.LLMRowEvaluator(
-        config=SimpleNamespace(
-            protocol=SimpleNamespace(value="openai"),
-            model="gpt-4o-mini",
-            use_dspy=False,
-        ),
-        template_hash="tpl",
-        evidence_hash="evidence",
-    )
-    response = evaluator._invoke_sync_agent(
-        key=f"test-agent-openai-sync-{agent_stream}-{stream}",
-        name="RowEvaluator",
-        system_prompt="system",
-        user_prompt="user",
-        toolsets=[],
-        profile={},
-    )
-    assert response.reason == "reason"
-    assert _NonCodexAgent.run_sync_calls == expected_run_sync
-    assert _NonCodexAgent.run_stream_calls == expected_run_stream
-def test_sanitize_status_score_contradiction_caps_strong_non_compliance_to_fail():
-    response = RowEvaluationResponse(
-        reasoning="reasoning",
-        reason="Yêu cầu chưa được đáp ứng do thiếu bằng chứng triển khai.",
-        score=96.0,
-        confidence=0.94,
-        finding="Thiếu cấu hình và minh chứng vận hành.",
-        evidence_anchors=[],
-    )
-    sanitized = module.LLMRowEvaluator._sanitize_status_score_contradiction(response)
-    assert sanitized.score == 39.0
-    assert sanitized.confidence == 0.8
-def test_sanitize_status_score_contradiction_caps_partial_non_compliance_to_partial():
-    response = RowEvaluationResponse(
-        reasoning="reasoning",
-        reason="Yêu cầu chưa được đáp ứng đầy đủ ở một số tiêu chí phụ.",
-        score=95.0,
-        confidence=0.9,
-        finding="Đáp ứng một phần, cần bổ sung runbook.",
-        evidence_anchors=[],
-    )
-    sanitized = module.LLMRowEvaluator._sanitize_status_score_contradiction(response)
-    assert sanitized.score == 89.0
-    assert sanitized.confidence == 0.85
-def test_sanitize_status_score_contradiction_keeps_positive_pass_intact():
-    response = RowEvaluationResponse(
-        reasoning="reasoning",
-        reason="Yêu cầu đã được đáp ứng đầy đủ và có bằng chứng rõ ràng.",
-        score=95.0,
-        confidence=0.92,
-        finding="Không phát hiện khoảng trống cần khắc phục.",
-        evidence_anchors=[],
-    )
-    sanitized = module.LLMRowEvaluator._sanitize_status_score_contradiction(response)
-    assert sanitized.score == 95.0
-    assert sanitized.confidence == 0.92
-def test_normalize_failure_retry_metadata_defaults_to_synthesis_when_evidence_retrieved():
-    metadata = module.LLMRowEvaluator._normalize_failure_retry_metadata(
-        retry_metadata={},
-        evidence_retrieved=True,
-        fallback_reason_code="fallback_reason",
-    )
-    assert metadata["reason_code"] == "fallback_reason"
-    assert metadata["failure_stage"] == "synthesis"
-    assert metadata["stage"] == "synthesis_failed"
-    assert metadata["evidence_retrieved"] is True
-def test_normalize_failure_retry_metadata_defaults_to_retrieval_when_no_evidence():
-    metadata = module.LLMRowEvaluator._normalize_failure_retry_metadata(
-        retry_metadata={},
-        evidence_retrieved=False,
-        fallback_reason_code="fallback_reason",
-    )
-    assert metadata["reason_code"] == "fallback_reason"
-    assert metadata["failure_stage"] == "evidence_retrieval"
-    assert metadata["stage"] == "evidence_retrieval_failed"
-    assert metadata["evidence_retrieved"] is False
-def test_normalize_failure_retry_metadata_maps_tool_first_stage_to_retrieval():
-    metadata = module.LLMRowEvaluator._normalize_failure_retry_metadata(
-        retry_metadata={"stage": "tool_first_timeout"},
-        evidence_retrieved=True,
-        fallback_reason_code="fallback_reason",
-    )
-    assert metadata["failure_stage"] == "evidence_retrieval"
-    assert metadata["stage"] == "tool_first_timeout"
-def test_convert_response_derives_status_from_final_score():
-    evaluator = module.LLMRowEvaluator.__new__(module.LLMRowEvaluator)
-    evaluator.config = SimpleNamespace(mode=SimpleNamespace(value="selective"))
-    evaluator.template_hash = "tpl"
-    evaluator.evidence_hash = "evidence"
-    evaluator._effective_protocol = "gemini"
-    evaluator._effective_model = "gemini-3-flash"
-    evaluator._last_retry_telemetry = {}
-    evaluator._instruction_trace_metadata = {
-        "instruction_profile": "none|inline|vi|pydanticai",
-        "instruction_inputs": {},
-        "instruction_hash": "abc123",
-    }
-    response = RowEvaluationResponse(
-        reasoning="reasoning",
-        reason="Đáp ứng đầy đủ.",
-        score=80.0,
-        confidence=0.4,
-        finding="Cần bổ sung.",
-        evidence_anchors=[],
-    )
-    result = evaluator._convert_response(
-        response=response,
-        check=SimpleNamespace(id="CL-TEST"),
-        row_id="CL-TEST:row_0",
-        fallback_evidence_refs=[],
-    )
-    assert result.score == 32.0
-    assert result.score_breakdown.final_score == 32.0
-    assert result.status == RowStatus.FAIL
-    assert result.score_1_5 == 2
-def test_convert_response_records_duration_and_anchor_allowlist_violation():
-    evaluator = module.LLMRowEvaluator.__new__(module.LLMRowEvaluator)
-    evaluator.config = SimpleNamespace(mode=SimpleNamespace(value="selective"))
-    evaluator.template_hash = "tpl"
-    evaluator.evidence_hash = "evidence"
-    evaluator._effective_protocol = "openai-codex"
-    evaluator._effective_model = "gpt-5.3-codex"
-    evaluator._last_retry_telemetry = {}
-    evaluator._instruction_trace_metadata = {
-        "instruction_profile": "none|inline|vi|pydanticai",
-        "instruction_inputs": {},
-        "instruction_hash": "abc123",
-    }
-    response = RowEvaluationResponse(
-        reasoning="reasoning",
-        reason="Đáp ứng một phần.",
-        score=70.0,
-        confidence=0.8,
-        finding="test",
-        evidence_anchors=[{"ref_type": "code_path", "ref_value": "src/not-allowed.py", "excerpt": ""}],
-        cited_anchor_ids=["src/not-allowed.py"],
-    )
-    result = evaluator._convert_response(
-        response=response,
-        check=SimpleNamespace(id="CL-TEST"),
-        row_id="CL-TEST:row_0",
-        fallback_evidence_refs=["src/allowed.py"],
-        evaluation_duration_ms=123,
-    )
-    assert result.provenance.evaluation_duration_ms == 123
-    assert result.status == RowStatus.ERROR
-    assert result.error_message == "anchor_allowlist_violation"
-    assert isinstance(result.retrieval_trace, dict)
-    assert result.retrieval_trace.get("anchor_identity_contract", {}).get("invalid_anchor_count") == 1
-    assert (
-        result.retrieval_trace.get("anchor_identity_contract", {}).get("invalid_reason_code") == "anchor_id_not_allowed"
-    )
-    assert result.retrieval_trace.get("anchor_id_not_allowed", {}).get("count") == 1
-def test_convert_response_recovers_allowlisted_cited_anchor_when_model_anchor_objects_are_invalid():
-    evaluator = module.LLMRowEvaluator.__new__(module.LLMRowEvaluator)
-    evaluator.config = SimpleNamespace(mode=SimpleNamespace(value="selective"))
-    evaluator.template_hash = "tpl"
-    evaluator.evidence_hash = "evidence"
-    evaluator._effective_protocol = "openai-codex"
-    evaluator._effective_model = "gpt-5.3-codex"
-    evaluator._last_retry_telemetry = {}
-    evaluator._instruction_trace_metadata = {
-        "instruction_profile": "none|inline|vi|pydanticai",
-        "instruction_inputs": {},
-        "instruction_hash": "abc123",
-    }
-    response = RowEvaluationResponse(
-        reasoning="reasoning",
-        reason="Đã có một phần bằng chứng hợp lệ.",
-        score=70.0,
-        confidence=0.8,
-        finding="test",
-        evidence_anchors=[{"ref_type": "code_path", "ref_value": "src/not-allowed.py", "excerpt": ""}],
-        cited_anchor_ids=["src/not-allowed.py", "chunk:allowed-doc"],
-    )
-    result = evaluator._convert_response(
-        response=response,
-        check=SimpleNamespace(id="CL-TEST"),
-        row_id="CL-TEST:row_0",
-        fallback_evidence_refs=["chunk:allowed-doc"],
-        evaluation_duration_ms=123,
-    )
-    assert result.provenance.evaluation_duration_ms == 123
-    assert result.status != RowStatus.ERROR
-    assert result.error_message is None
-    assert [anchor.ref_value for anchor in result.evidence_anchors] == ["chunk:allowed-doc"]
-    assert result.evidence_anchors[0].verification_reason == "fallback_ref_inherited"
-    assert result.retrieval_trace.get("anchor_identity_contract", {}).get("invalid_anchor_count") == 1
-    assert result.retrieval_trace.get("anchor_id_not_allowed", {}).get("count") == 1
-def test_convert_response_docs_primary_uses_requirement_interpretation_without_name_error():
-    evaluator = module.LLMRowEvaluator.__new__(module.LLMRowEvaluator)
-    evaluator.config = SimpleNamespace(mode=SimpleNamespace(value="selective"))
-    evaluator.template_hash = "tpl"
-    evaluator.evidence_hash = "evidence"
-    evaluator._effective_protocol = "openai-codex"
-    evaluator._effective_model = "gpt-5.3-codex"
-    evaluator._last_retry_telemetry = {}
-    evaluator._instruction_trace_metadata = {
-        "instruction_profile": "none|inline|vi|pydanticai",
-        "instruction_inputs": {},
-        "instruction_hash": "abc123",
-    }
-    response = RowEvaluationResponse(
-        reasoning="reasoning",
-        reason="Đã có một phần bằng chứng hợp lệ.",
-        score=70.0,
-        confidence=0.8,
-        finding="test",
-        evidence_anchors=[{"ref_type": "code_path", "ref_value": "src/not-allowed.py", "excerpt": ""}],
-        cited_anchor_ids=["src/not-allowed.py", "chunk:allowed-doc"],
-    )
-    result = evaluator._convert_response(
-        response=response,
-        check=SimpleNamespace(id="CL-052"),
-        row_id="CL-052:row_51",
-        fallback_evidence_refs=["chunk:allowed-doc", "src/not-allowed.py"],
-        requirement_interpretation={
-            "required_anchor_modalities": ["docs"],
-            "finalization_policy": "docs_primary",
-        },
-        evaluation_duration_ms=123,
-    )
-    assert result.provenance.evaluation_duration_ms == 123
-    assert result.status != RowStatus.ERROR
-    assert result.error_message is None
-    assert [anchor.ref_value for anchor in result.evidence_anchors] == ["chunk:allowed-doc"]
-def test_convert_response_accepts_canonical_equivalent_anchor_ids():
-    evaluator = module.LLMRowEvaluator.__new__(module.LLMRowEvaluator)
-    evaluator.config = SimpleNamespace(mode=SimpleNamespace(value="selective"))
-    evaluator.template_hash = "tpl"
-    evaluator.evidence_hash = "evidence"
-    evaluator._effective_protocol = "openai-codex"
-    evaluator._effective_model = "gpt-5.3-codex"
-    evaluator._last_retry_telemetry = {}
-    evaluator._instruction_trace_metadata = {
-        "instruction_profile": "none|inline|vi|pydanticai",
-        "instruction_inputs": {},
-        "instruction_hash": "abc123",
-    }
-    response = RowEvaluationResponse(
-        reasoning="reasoning",
-        reason="Đã có bằng chứng hợp lệ.",
-        score=72.0,
-        confidence=0.8,
-        finding="Bằng chứng đã được neo đúng.",
-        evidence_anchors=[
-            {
-                "ref_type": "code_path",
-                "ref_value": "code:src/main/java/com/example/DatabaseConfig.java:18",
-                "excerpt": "@Bean",
-            }
-        ],
-        cited_anchor_ids=["code:src/main/java/com/example/DatabaseConfig.java:18"],
-    )
-    result = evaluator._convert_response(
-        response=response,
-        check=SimpleNamespace(id="CL-TEST"),
-        row_id="CL-TEST:row_0",
-        fallback_evidence_refs=["src/main/java/com/example/DatabaseConfig.java"],
-        evaluation_duration_ms=123,
-    )
-    assert result.status != RowStatus.ERROR
-    assert [anchor.ref_value for anchor in result.evidence_anchors] == [
-        "code:src/main/java/com/example/DatabaseConfig.java:18"
-    ]
-    assert result.retrieval_trace.get("anchor_identity_contract", {}).get("invalid_anchor_count") == 0
-    assert result.retrieval_trace.get("anchor_identity_contract", {}).get("invalid_cited_anchor_ids") == []
-def test_convert_response_ignores_internal_analysis_refs_for_allowlist_enforcement():
-    evaluator = module.LLMRowEvaluator.__new__(module.LLMRowEvaluator)
-    evaluator.config = SimpleNamespace(mode=SimpleNamespace(value="selective"))
-    evaluator.template_hash = "tpl"
-    evaluator.evidence_hash = "evidence"
-    evaluator._effective_protocol = "openai-codex"
-    evaluator._effective_model = "gpt-5.3-codex"
-    evaluator._last_retry_telemetry = {}
-    evaluator._instruction_trace_metadata = {
-        "instruction_profile": "none|inline|vi|pydanticai",
-        "instruction_inputs": {},
-        "instruction_hash": "abc123",
-    }
-    response = RowEvaluationResponse(
-        reasoning="reasoning",
-        reason="Có bằng chứng docs hợp lệ và một số ref nội bộ để giải thích phân tích.",
-        score=70.0,
-        confidence=0.8,
-        finding="test",
-        evidence_anchors=[
-            {"ref_type": "doc", "ref_value": "chunk:6eb413f3e9764797", "excerpt": "Version 1.2.3"},
-            {"ref_type": "doc", "ref_value": "references/row-analysis.md", "excerpt": "analysis"},
-        ],
-        cited_anchor_ids=[
-            "chunk:6eb413f3e9764797",
-            "http://confluence.digital.vn/display/TTCN24/PAR+Project+Audit",
-            "references/row-analysis.md",
-        ],
-    )
-    result = evaluator._convert_response(
-        response=response,
-        check=SimpleNamespace(id="CL-TEST"),
-        row_id="CL-TEST:row_0",
-        fallback_evidence_refs=["chunk:6eb413f3e9764797"],
-        evaluation_duration_ms=123,
-    )
-    assert result.status != RowStatus.ERROR
-    assert result.error_message != "anchor_allowlist_violation"
-    assert result.retrieval_trace.get("anchor_identity_contract", {}).get("invalid_anchor_count") == 0
-    assert result.retrieval_trace.get("anchor_identity_contract", {}).get("invalid_cited_anchor_ids") == []
-def test_create_error_result_records_evaluation_duration():
-    evaluator = module.LLMRowEvaluator.__new__(module.LLMRowEvaluator)
-    evaluator.config = SimpleNamespace(mode=SimpleNamespace(value="selective"))
-    evaluator.template_hash = "tpl"
-    evaluator.evidence_hash = "evidence"
-    evaluator._effective_protocol = "gemini"
-    evaluator._effective_model = "gemini-3-flash"
-    evaluator._last_retry_telemetry = {}
-    evaluator._instruction_trace_metadata = {
-        "instruction_profile": "none|inline|vi|pydanticai",
-        "instruction_inputs": {},
-        "instruction_hash": "abc123",
-    }
-    result = evaluator._create_error_result(
-        check=SimpleNamespace(id="CL-ERR"),
-        row_id="CL-ERR:row_1",
-        error_message="synthetic error",
-        evaluation_duration_ms=77,
-    )
-    assert result.status == RowStatus.ERROR
-    assert result.provenance.evaluation_duration_ms == 77
-def test_create_error_result_applies_failover_provenance_from_retry_telemetry():
-    evaluator = module.LLMRowEvaluator.__new__(module.LLMRowEvaluator)
-    evaluator.config = SimpleNamespace(mode=SimpleNamespace(value="selective"))
-    evaluator.template_hash = "tpl"
-    evaluator.evidence_hash = "evidence"
-    evaluator._effective_protocol = "openai"
-    evaluator._effective_model = "fallback-model"
-    evaluator._instruction_trace_metadata = {
-        "instruction_profile": "none|inline|vi|pydanticai",
-        "instruction_inputs": {},
-        "instruction_hash": "abc123",
-    }
-    evaluator._last_retry_telemetry = {
-        "provider_failover_attempted": True,
-        "provider_failover_reason": "quota_or_capacity_signal",
-        "provider_failover_final_provider": "fallback-openai",
-        "provider_failover_chain": ["primary-openai", "fallback-openai"],
-        "provider_failover_hops": 1,
-        "provider_failover_original_provider": "primary-openai",
-    }
-    result = evaluator._create_error_result(
-        check=SimpleNamespace(id="CL-ERR"),
-        row_id="CL-ERR:row_1",
-        error_message="synthetic error",
-        evaluation_duration_ms=77,
-    )
-    assert result.provenance.original_provider == "primary-openai"
-    assert result.provenance.final_provider == "fallback-openai"
-    assert result.provenance.failover_count == 1
-    assert result.provenance.failover_chain == ["primary-openai", "fallback-openai"]
-    assert result.provenance.failover_reason == "quota_or_capacity_signal"
-def test_filter_prompt_toolsets_keeps_skill_tools_optional_when_skills_not_needed():
-    evaluator = module.LLMRowEvaluator.__new__(module.LLMRowEvaluator)
-    toolset = SimpleNamespace(
-        tools={
-            "list_skills": object(),
-            "load_skill": object(),
-            "read_skill_resource": object(),
-            "run_skill_script": object(),
-            "search_docs": object(),
-        }
-    )
-    request = RowEvaluationRequest(
-        row_id="CL-055:row_54",
-        requirement_text="MTTR policy",
-        requirement_category="SLA",
-        requirement_guidance="",
-        evidence_context="grounded",
-        evidence_refs=["docs/sla.md", "src/service.py"],
-        project_profile={},
-        requirement_interpretation={"skills_needed": False},
-    )
-    filtered, policy = evaluator._filter_prompt_toolsets_for_request(toolsets=[toolset], request=request)
-    assert policy["applied"] is True
-    assert policy["mode"] == "skills_unavailable_disable"
-    assert policy["removed_tool_bindings"] == 4
-    assert len(filtered) == 1
-    assert set(filtered[0].tools.keys()) == {"search_docs"}
-def test_effective_user_prompt_places_excerpt_discipline_in_output_schema_section(monkeypatch):
-    evaluator = module.LLMRowEvaluator.__new__(module.LLMRowEvaluator)
-    evaluator.config = SimpleNamespace(
-        row_unified_mode="off",
-        instruction_mode="none",
-        evidence_strategy="inline",
-        instruction_language="vi",
-        agent_runtime_mode="pydanticai",
-    )
-    monkeypatch.setattr(evaluator, "_resolve_global_llm_config", lambda: None)
-    request = RowEvaluationRequest(
-        row_id="CL-386:row_1",
-        requirement_text="Ground every cited anchor with excerpt discipline",
-        requirement_category="Security",
-        requirement_guidance="",
-        evidence_context="context",
-        evidence_refs=["docs/security.md", "src/service.py"],
-        project_profile={},
-        requirement_interpretation={},
-    )
-    prompt = evaluator._effective_user_prompt(request)
-    assert "[Anchor identity contract]" in prompt
-    assert "You MUST cite only from the allowed anchor ids below." in prompt
-    assert "Return cited_anchor_ids as a subset of allowed_anchor_ids." in prompt
-    assert "YÊU CẦU OUTPUT SCHEMA (ưu tiên cao, bắt buộc):" in prompt
-    assert "tool-read content" in prompt
-    assert 'Ví dụ hợp lệ: excerpt: "spring.datasource.url=jdbc:postgresql://..."' in prompt
-    assert 'Ví dụ không hợp lệ: excerpt: "" (thiếu excerpt_unavailable_reason).' in prompt
-    anchor_contract = prompt.split("[Anchor identity contract]", 1)[1]
-    assert "verbatim 1-2 line excerpt from retrieved content" not in anchor_contract
-    assert 'allowed_anchor_ids=["docs/security.md", "src/service.py"]' in prompt
-def test_build_instruction_config_disables_skills_when_none_are_resolved(monkeypatch):
-    evaluator = module.LLMRowEvaluator.__new__(module.LLMRowEvaluator)
-    evaluator.config = SimpleNamespace(
-        row_unified_mode="off",
-        instruction_mode="minimal",
-        evidence_strategy="tool_first",
-        instruction_language="vi",
-        agent_runtime_mode="pydanticai",
-    )
-    monkeypatch.setattr(
-        evaluator,
-        "_resolve_global_llm_config",
-        lambda: SimpleNamespace(skills_toolset_enabled=True, mcp_toolsets_enabled=False),
-    )
-    monkeypatch.setattr(evaluator, "_resolve_available_skills", lambda _cfg: [])
-    cfg = evaluator._build_instruction_config()
-    assert cfg.skills_toolset_enabled is False
-    assert cfg.available_skills == []
-def test_build_instruction_trace_metadata_can_focus_available_skills(monkeypatch):
-    evaluator = module.LLMRowEvaluator.__new__(module.LLMRowEvaluator)
-    evaluator.config = SimpleNamespace(
-        row_unified_mode="off",
-        instruction_mode="minimal",
-        evidence_strategy="tool_first",
-        instruction_language="vi",
-        agent_runtime_mode="pydanticai",
-    )
-    monkeypatch.setattr(
-        evaluator,
-        "_resolve_global_llm_config",
-        lambda: SimpleNamespace(skills_toolset_enabled=True, mcp_toolsets_enabled=True),
-    )
-    monkeypatch.setattr(
-        evaluator,
-        "_resolve_available_skills",
-        lambda _cfg: [
-            module.SkillDescriptor(name="audit-orchestrator-skill", description="audit"),
-            module.SkillDescriptor(name="hexagonal-compliance-skill", description="hex"),
-            module.SkillDescriptor(name="research-skill", description="research"),
-        ],
-    )
-    trace = evaluator._build_instruction_trace_metadata(
-        available_skill_names_override=["hexagonal-compliance-skill", "research-skill"]
-    )
-    assert trace["instruction_inputs"]["available_skills"] == [
-        "hexagonal-compliance-skill",
-        "research-skill",
-    ]
-    assert trace["instruction_inputs"]["available_skill_count"] == 2
-def test_skill_operation_directives_forbid_skill_calls_when_available_skills_empty():
-    from vds_audit_orchestrator.llm.prompts.instruction_templates import AgentRuntimeMode, _runtime_directives
-    directives = _runtime_directives(AgentRuntimeMode.PYDANTICAI, "vi", skills_toolset_enabled=True)
-    assert "Nếu `Available Skills` đang rỗng" in directives
-    assert "không gọi `list_skills`, `load_skill`, `read_skill_resource`, hoặc `run_skill_script`" in directives
-def test_resolve_skill_directories_falls_back_when_configured_directory_is_missing(monkeypatch, tmp_path: Path):
-    evaluator = module.LLMRowEvaluator.__new__(module.LLMRowEvaluator)
-    missing_dir = tmp_path / "missing-skills"
-    fallback_dir = tmp_path / "default-skills"
-    fallback_dir.mkdir()
-    import sys
-    fake_module = SimpleNamespace(DEFAULT_AUDIT_SKILLS=[str(fallback_dir)])
-    monkeypatch.setitem(sys.modules, "vds_audit_orchestrator.agents.toolsets.skills_toolset", fake_module)
-    resolved = evaluator._resolve_skill_directories(SimpleNamespace(skills_directories=[str(missing_dir)]))
-    assert resolved == [str(fallback_dir)]
-def test_filter_prompt_toolsets_disables_skill_tools_when_tool_first_skill_already_effective():
-    evaluator = module.LLMRowEvaluator.__new__(module.LLMRowEvaluator)
-    evaluator._tool_first_skill_usage_hint = {
-        "skill_calls": 1,
-        "skill_execution_calls": 1,
-        "skill_effective_calls": 1,
-    }
-    toolset = SimpleNamespace(
-        tools={
-            "list_skills": object(),
-            "load_skill": object(),
-            "read_skill_resource": object(),
-            "run_skill_script": object(),
-            "search_docs": object(),
-        }
-    )
-    request = RowEvaluationRequest(
-        row_id="CL-050:row_49",
-        requirement_text="SAST/DAST policy",
-        requirement_category="Security",
-        requirement_guidance="",
-        evidence_context="grounded",
-        evidence_refs=["docs/security.md", "src/security.py"],
-        project_profile={},
-        requirement_interpretation={"skills_needed": True},
-    )
-    filtered, policy = evaluator._filter_prompt_toolsets_for_request(toolsets=[toolset], request=request)
-    assert policy["applied"] is True
-    assert policy["mode"] == "tool_first_skill_already_effective"
-    assert policy["removed_tool_bindings"] == 4
-    assert policy["tool_first_skill_effective_calls"] == 1
-    assert len(filtered) == 1
-    assert set(filtered[0].tools.keys()) == {"search_docs"}
-def test_filter_prompt_toolsets_disables_skill_tools_when_no_skills_are_available():
-    evaluator = module.LLMRowEvaluator.__new__(module.LLMRowEvaluator)
-    toolset = SimpleNamespace(
-        tools={
-            "list_skills": object(),
-            "load_skill": object(),
-            "read_skill_resource": object(),
-            "run_skill_script": object(),
-            "search_docs": object(),
-        },
-        skills={},
-        _vds_skill_metadata=[],
-    )
-    request = RowEvaluationRequest(
-        row_id="CL-051:row_50",
-        requirement_text="Need grounded evidence without skill drift",
-        requirement_category="Delivery workflow",
-        requirement_guidance="",
-        evidence_context="thin context",
-        evidence_refs=["chunk:abc"],
-        project_profile={},
-        requirement_interpretation={"skills_needed": False},
-    )
-    filtered, policy = evaluator._filter_prompt_toolsets_for_request(toolsets=[toolset], request=request)
-    assert policy["applied"] is True
-    assert policy["mode"] == "skills_unavailable_disable"
-    assert policy["removed_tool_bindings"] == 4
-    assert len(filtered) == 1
-    assert set(filtered[0].tools.keys()) == {"search_docs"}
-def test_call_llm_appends_override_prompt_when_skills_are_unavailable(monkeypatch):
-    evaluator = module.LLMRowEvaluator.__new__(module.LLMRowEvaluator)
-    evaluator.config = SimpleNamespace(tool_first_enabled=True, runtime_context=None)
-    captured: dict[str, object] = {}
-    monkeypatch.setattr(evaluator, "_effective_system_prompt", lambda **_kwargs: "SYSTEM")
-    monkeypatch.setattr(evaluator, "_effective_user_prompt", lambda _request: "USER")
-    monkeypatch.setattr(evaluator, "_build_prompt_evaluator_toolsets", lambda: ["toolset"])
-    evaluator._effective_protocol = "openai"
-    evaluator._effective_model = "glm-5"
-    monkeypatch.setattr(
-        evaluator,
-        "_filter_prompt_toolsets_for_request",
-        lambda *, toolsets, request: (
-            ["toolset-no-skills"],
-            {"mode": "skills_unavailable_disable", "applied": True, "removed_tool_bindings": 4},
-        ),
-    )
-    monkeypatch.setattr(module, "_resolved_protocol_model", lambda *_args: ("openai", "glm-5"))
-    monkeypatch.setattr(module.LLMRowEvaluator, "_runtime_log_fields", staticmethod(lambda _ctx: {}))
-    def _fake_call_prompt_backend(*, user_prompt, **kwargs):
-        captured["user_prompt"] = user_prompt
-        return module.RowEvaluationResponse(
-            row_id="CL-052:row_51",
-            status="PASS",
-            summary="ok",
-            findings=[],
-            evidence_refs=[],
-        ), {}
-    monkeypatch.setattr(evaluator, "_build_agent_key", lambda **_kwargs: "agent-key")
-    monkeypatch.setattr(evaluator, "_invoke_sync_agent_with_provider_retry", _fake_call_prompt_backend)
-    request = RowEvaluationRequest(
-        row_id="CL-052:row_51",
-        requirement_text="Need grounded output",
-        requirement_category="Security",
-        requirement_guidance="",
-        evidence_context="grounded",
-        evidence_refs=["docs/security.md"],
-        project_profile={},
-        requirement_interpretation={"skills_needed": False},
-    )
-    evaluator._call_llm(request, row_id=request.row_id)
-    user_prompt = str(captured["user_prompt"])
-    assert "runtime không resolve được skill khả dụng nào" in user_prompt
-    assert "Không gọi list_skills/load_skill/read_skill_resource/run_skill_script" in user_prompt
-def test_call_llm_retries_with_reinforced_prompt_for_unavailable_focused_skill(monkeypatch):
-    evaluator = module.LLMRowEvaluator.__new__(module.LLMRowEvaluator)
-    evaluator.config = SimpleNamespace(tool_first_enabled=True, runtime_context=None)
-    evaluator._prompt_telemetry_merged_exception_ids = set()
-    evaluator._pending_prompt_telemetry = {}
-    evaluator._last_prompt_tool_telemetry = {}
-    monkeypatch.setattr(evaluator, "_merge_prompt_telemetry_from_exception", lambda _exc: None)
-    captured_prompts: list[str] = []
-    monkeypatch.setattr(evaluator, "_effective_system_prompt", lambda **_kwargs: "SYSTEM")
-    monkeypatch.setattr(evaluator, "_effective_user_prompt", lambda _request: "USER")
-    monkeypatch.setattr(
-        evaluator,
-        "_build_prompt_evaluator_toolsets",
-        lambda: [SimpleNamespace(tools={"load_skill": object(), "read_skill_resource": object()}, skills={})],
-    )
-    monkeypatch.setattr(
-        evaluator,
-        "_filter_prompt_toolsets_for_request",
-        lambda *, toolsets, request: (
-            [SimpleNamespace(tools={"load_skill": object(), "read_skill_resource": object()}, skills={})],
-            {
-                "mode": "skills_needed_focus",
-                "applied": True,
-                "focused_skills": ["hexagonal-compliance-skill"],
-                "removed_tool_bindings": 0,
-                "removed_skill_entries": 24,
-            },
-        ),
-    )
-    evaluator._effective_protocol = "openai"
-    evaluator._effective_model = "qwen3.5-plus"
-    monkeypatch.setattr(module, "_resolved_protocol_model", lambda *_args: ("openai", "qwen3.5-plus"))
-    monkeypatch.setattr(module.LLMRowEvaluator, "_runtime_log_fields", staticmethod(lambda _ctx: {}))
-    monkeypatch.setattr(evaluator, "_build_agent_key", lambda **_kwargs: "agent-key")
-    call_counter = {"count": 0}
-    def _fake_call_prompt_backend(*, user_prompt, **kwargs):
-        captured_prompts.append(str(user_prompt))
-        call_counter["count"] += 1
-        if call_counter["count"] == 1:
-            raise RuntimeError("Skill 'audit-orchestrator-skill' not found. Available: hexagonal-compliance-skill")
-        return module.RowEvaluationResponse(summary="ok", findings=[], evidence_refs=[])
-    monkeypatch.setattr(evaluator, "_invoke_sync_agent_with_provider_retry", _fake_call_prompt_backend)
-    request = RowEvaluationRequest(
-        row_id="CL-001:row_0",
-        requirement_text="Hexagonal / Clean Architecture",
-        requirement_category="1. Decoupling",
-        requirement_guidance="",
-        evidence_context="grounded",
-        evidence_refs=["src/main/java/com/example/domain/OrderAggregate.java"],
-        project_profile={},
-        requirement_interpretation={"skills_needed": True, "skill_objectives": ["analyze_architecture_compliance"]},
-    )
-    response = evaluator._call_llm(request, row_id=request.row_id)
-    assert isinstance(response, module.RowEvaluationResponse)
-    assert call_counter["count"] == 2
-    assert len(captured_prompts) == 2
-    assert "OVERRIDE SKILL DISCIPLINE CHO ROW NÀY" in captured_prompts[1]
-    assert "allowlist" in captured_prompts[1]
-    assert "audit-orchestrator-skill" not in captured_prompts[1]
-    assert "hexagonal-compliance-skill" in captured_prompts[1]
-    assert "research-skill" not in captured_prompts[1]
-    assert "code-review-graph-skill" not in captured_prompts[1]
-def test_prompt_tool_telemetry_marks_missing_skill_resource_as_unsuccessful():
-    result = SimpleNamespace(
-        all_messages=[
-            {
-                "parts": [
-                    {
-                        "part_kind": "tool-call",
-                        "tool_name": "read_skill_resource",
-                        "tool_call_id": "call-1",
-                        "args": {
-                            "skill_name": "audit-orchestrator-skill",
-                            "resource_path": "sonarqube_orchestrator/src/vds_sonarqube_orchestrator/external_sca.py",
-                        },
-                    },
-                    {
-                        "part_kind": "tool-return",
-                        "tool_name": "read_skill_resource",
-                        "tool_call_id": "call-1",
-                        "content": "[skill-resource-missing] Requested resource is unavailable for the selected skill.",
-                    },
-                ]
-            }
-        ]
-    )
-    telemetry = module.LLMRowEvaluator._extract_prompt_tool_telemetry(result)["prompt_tool_telemetry"]
-    usage = telemetry["event_skill_tool_usage"]
-    assert len(usage) == 1
-    assert usage[0]["tool"] == "read_skill_resource"
-    assert usage[0]["success"] is False
-    assert usage[0]["effective"] is False
-    assert usage[0]["error"] == "resource_not_found"
-def test_filter_prompt_toolsets_disables_skill_tools_for_advisory_skip(monkeypatch):
-    evaluator = module.LLMRowEvaluator.__new__(module.LLMRowEvaluator)
-    monkeypatch.setenv("VDS_AUDIT_AGENTIC_STRICT_REQUIRE_EFFECTIVE_SKILL", "false")
-    toolset = SimpleNamespace(
-        tools={
-            "list_skills": object(),
-            "load_skill": object(),
-            "read_skill_resource": object(),
-            "run_skill_script": object(),
-            "search_docs": object(),
-        }
-    )
-    request = RowEvaluationRequest(
-        row_id="CL-055:row_54",
-        requirement_text="MTTR policy",
-        requirement_category="SLA",
-        requirement_guidance="",
-        evidence_context="A" * 900,
-        evidence_refs=[
-            "docs/sla.md",
-            "chunk:abc",
-            "src/service.py",
-            "src/controller.ts",
-            "src/config.yaml",
-        ],
-        project_profile={},
-        requirement_interpretation={
-            "skills_needed": True,
-            "evidence_targets": ["docs", "code"],
-            "skill_objectives": ["analyze_documentation_artifacts"],
-        },
-    )
-    filtered, policy = evaluator._filter_prompt_toolsets_for_request(toolsets=[toolset], request=request)
-    assert policy["applied"] is True
-    assert policy["mode"] == "advisory_skip_grounded"
-    assert policy["removed_tool_bindings"] == 4
-    assert len(filtered) == 1
-    assert set(filtered[0].tools.keys()) == {"search_docs"}
-def test_toolset_cache_signature_changes_when_tool_names_change():
-    toolset_with_skills = SimpleNamespace(
-        tools={
-            "list_skills": object(),
-            "search_docs": object(),
-        }
-    )
-    toolset_without_skills = SimpleNamespace(
-        tools={
-            "search_docs": object(),
-        }
-    )
-    sig_with_skills = module.LLMRowEvaluator._toolset_cache_signature([toolset_with_skills])
-    sig_without_skills = module.LLMRowEvaluator._toolset_cache_signature([toolset_without_skills])
-    assert sig_with_skills != sig_without_skills
-def test_filter_prompt_toolsets_focuses_skill_allowlist_for_skills_needed_rows():
-    evaluator = module.LLMRowEvaluator.__new__(module.LLMRowEvaluator)
-    toolset = SimpleNamespace(
-        tools={
-            "list_skills": object(),
-            "load_skill": object(),
-            "read_skill_resource": object(),
-            "run_skill_script": object(),
-            "search_docs": object(),
-        },
-        skills={
-            "audit-orchestrator-skill": object(),
-            "research-skill": object(),
-            "vds-scripts-skill": object(),
-            "grepai-skill": object(),
-        },
-        _vds_skill_metadata=[
-            SimpleNamespace(name="audit-orchestrator-skill"),
-            SimpleNamespace(name="research-skill"),
-            SimpleNamespace(name="vds-scripts-skill"),
-            SimpleNamespace(name="grepai-skill"),
-        ],
-    )
-    request = RowEvaluationRequest(
-        row_id="CL-037:row_36",
-        requirement_text="Need workflow automation commands and runbook validation",
-        requirement_category="Delivery workflow",
-        requirement_guidance="",
-        evidence_context="limited",
-        evidence_refs=["chunk:abc"],
-        project_profile={},
-        requirement_interpretation={
-            "skills_needed": True,
-            "skill_objectives": ["analyze_tooling_automation_guidance"],
-            "evidence_targets": ["docs"],
-        },
-    )
-    filtered, policy = evaluator._filter_prompt_toolsets_for_request(toolsets=[toolset], request=request)
-    assert policy["applied"] is True
-    assert policy["mode"] == "skills_needed_focus"
-    assert "vds-scripts-skill" in policy["focused_skills"]
-    assert len(filtered) == 1
-    assert set(filtered[0].skills.keys()) == {
-        "vds-scripts-skill",
-        "audit-orchestrator-skill",
-        "research-skill",
-    }
-def test_filter_prompt_toolsets_removes_doc_tools_for_code_only_decoupling_rows():
-    evaluator = module.LLMRowEvaluator.__new__(module.LLMRowEvaluator)
-    toolset = SimpleNamespace(
-        tools={
-            "search_docs": object(),
-            "read_doc_chunk": object(),
-            "search_code": object(),
-            "read_code_file": object(),
-            "load_skill": object(),
-            "read_skill_resource": object(),
-        },
-        skills={
-            "hexagonal-compliance-skill": object(),
-            "research-skill": object(),
-        },
-        _vds_skill_metadata=[
-            SimpleNamespace(name="hexagonal-compliance-skill"),
-            SimpleNamespace(name="research-skill"),
-        ],
-    )
-    request = RowEvaluationRequest(
-        row_id="CL-001:row_0",
-        requirement_text=(
-            "1. Hexagonal / Clean Architecture: Domain Logic không phụ thuộc vào Framework, UI, hay Database."
-        ),
-        requirement_category="1. Decoupling||SourceCode Organization",
-        requirement_guidance="",
-        evidence_context="limited",
-        evidence_refs=[
-            "src/main/java/com/example/config/AppConfig.java",
-            "src/main/java/com/example/domain/OrderAggregate.java",
-        ],
-        project_profile={},
-        requirement_interpretation={
-            "skills_needed": True,
-            "skill_objectives": ["analyze_architecture_compliance"],
-            "evidence_targets": ["docs", "code"],
-            "docs_markers": ["architecture"],
-            "code_markers": ["decoupling"],
-        },
-    )
-    filtered, policy = evaluator._filter_prompt_toolsets_for_request(toolsets=[toolset], request=request)
-    assert len(filtered) == 1
-    assert "search_docs" not in filtered[0].tools
-    assert "read_doc_chunk" not in filtered[0].tools
-    assert "search_code" in filtered[0].tools
-    assert "read_code_file" in filtered[0].tools
-    assert policy["removed_tool_bindings"] >= 2
-def test_filter_prompt_toolsets_disables_skill_tools_for_explicit_interface_rows() -> None:
-    evaluator = module.LLMRowEvaluator.__new__(module.LLMRowEvaluator)
-    toolset = SimpleNamespace(
-        tools={
-            "search_docs": object(),
-            "read_doc_chunk": object(),
-            "search_code": object(),
-            "read_code_file": object(),
-            "load_skill": object(),
-            "read_skill_resource": object(),
-            "run_skill_script": object(),
-        },
-        skills={
-            "audit-orchestrator-skill": object(),
-            "research-skill": object(),
-        },
-        _vds_skill_metadata=[
-            SimpleNamespace(name="audit-orchestrator-skill"),
-            SimpleNamespace(name="research-skill"),
-        ],
-    )
-    request = RowEvaluationRequest(
-        row_id="CL-002:row_1",
-        requirement_text=(
-            "2. Explicit Public Interface: Cross-module call phải thông qua interface, API client, "
-            "message contract. Không import concrete implementation từ module khác."
-        ),
-        requirement_category="1. Decoupling||SourceCode Organization",
-        requirement_guidance="",
-        evidence_context="limited",
-        evidence_refs=["src/main/java/com/example/client/PaymentApiClient.java"],
-        project_profile={},
-        requirement_interpretation={
-            "skills_needed": False,
-            "skill_objectives": [],
-            "evidence_targets": ["code"],
-        },
-    )
-    filtered, policy = evaluator._filter_prompt_toolsets_for_request(toolsets=[toolset], request=request)
-    assert len(filtered) == 1
-    assert "search_docs" not in filtered[0].tools
-    assert "read_doc_chunk" not in filtered[0].tools
-    assert "load_skill" not in filtered[0].tools
-    assert "read_skill_resource" not in filtered[0].tools
-    assert "run_skill_script" not in filtered[0].tools
-    assert "search_code" in filtered[0].tools
-    assert "read_code_file" in filtered[0].tools
-    assert policy["mode"] == "explicit_interface_disable_optional_skills"
-def test_apply_skill_guided_code_focus_replaces_generic_refs_with_substantive_code_targets():
-    request = RowEvaluationRequest(
-        row_id="CL-001:row_0",
-        requirement_text=(
-            "1. Hexagonal / Clean Architecture: Domain Logic không phụ thuộc vào Framework, UI, hay Database."
-        ),
-        requirement_category="1. Decoupling||SourceCode Organization",
-        requirement_guidance="",
-        evidence_context="limited",
-        evidence_refs=[
-            "Dockerfile",
-            "pom.xml",
-            "src/main/java/com/example/domain/OrderAggregate.java",
-        ],
-        project_profile={},
-        requirement_interpretation={
-            "skills_needed": True,
-            "skill_objectives": ["analyze_architecture_compliance"],
-            "code_targets": [
-                "pom.xml",
-                "src/main/java/com/example/domain/OrderAggregate.java",
-                "src/main/java/com/example/repository/OrderRepositoryImpl.java",
-            ],
-        },
-    )
-    focused = module.LLMRowEvaluator._apply_skill_guided_code_focus(request)
-    assert focused is not request
-    assert focused.evidence_refs == [
-        "src/main/java/com/example/domain/OrderAggregate.java",
-        "src/main/java/com/example/repository/OrderRepositoryImpl.java",
-    ]
-    assert "SKILL-GUIDED CODE FOCUS" in focused.evidence_context
-def test_apply_skill_guided_code_focus_keeps_original_refs_when_substantive_targets_missing():
-    request = RowEvaluationRequest(
-        row_id="CL-001:row_0",
-        requirement_text=(
-            "1. Hexagonal / Clean Architecture: Domain Logic không phụ thuộc vào Framework, UI, hay Database."
-        ),
-        requirement_category="1. Decoupling||SourceCode Organization",
-        requirement_guidance="",
-        evidence_context="limited",
-        evidence_refs=["Dockerfile", "pom.xml"],
-        project_profile={},
-        requirement_interpretation={
-            "skills_needed": True,
-            "skill_objectives": ["analyze_architecture_compliance"],
-            "code_targets": ["pom.xml"],
-        },
-    )
-    focused = module.LLMRowEvaluator._apply_skill_guided_code_focus(request)
-    assert focused is request
-def test_apply_skill_guided_code_focus_prefers_domain_repository_over_config_aop():
-    request = RowEvaluationRequest(
-        row_id="CL-001:row_0",
-        requirement_text=(
-            "1. Hexagonal / Clean Architecture: Domain Logic không phụ thuộc vào Framework, UI, hay Database."
-        ),
-        requirement_category="1. Decoupling||SourceCode Organization",
-        requirement_guidance="",
-        evidence_context="limited",
-        evidence_refs=[
-            "src/main/java/com/example/config/LogBookConfig.java",
-            "src/main/java/com/example/aop/RepositoryAspect.java",
-        ],
-        project_profile={},
-        requirement_interpretation={
-            "skills_needed": True,
-            "skill_objectives": ["analyze_architecture_compliance"],
-            "code_targets": [
-                "src/main/java/com/example/config/LogBookConfig.java",
-                "src/main/java/com/example/aop/RepositoryAspect.java",
-                "src/main/java/com/example/domain/OrderAggregate.java",
-                "src/main/java/com/example/repository/OrderRepositoryImpl.java",
-            ],
-        },
-    )
-    focused = module.LLMRowEvaluator._apply_skill_guided_code_focus(request)
-    assert focused is not request
-    assert focused.evidence_refs == [
-        "src/main/java/com/example/domain/OrderAggregate.java",
-        "src/main/java/com/example/repository/OrderRepositoryImpl.java",
-    ]
-def test_preload_prompt_skill_context_strips_live_skill_tools_for_exact_match():
-    evaluator = object.__new__(module.LLMRowEvaluator)
-    request = RowEvaluationRequest(
-        row_id="CL-001:row_0",
-        requirement_text=(
-            "1. Hexagonal / Clean Architecture: Domain Logic không phụ thuộc vào Framework, UI, hay Database."
-        ),
-        requirement_category="1. Decoupling||SourceCode Organization",
-        requirement_guidance="",
-        evidence_context="limited",
-        evidence_refs=["src/main/java/com/example/domain/OrderAggregate.java"],
-        project_profile={},
-        requirement_interpretation={
-            "skills_needed": True,
-            "skill_objectives": ["analyze_architecture_compliance"],
-            "code_targets": [
-                "src/main/java/com/example/domain/OrderAggregate.java",
-                "src/main/java/com/example/repository/OrderRepositoryImpl.java",
-            ],
-        },
-    )
-    fake_skill = SimpleNamespace(
-        description="Hexagonal architecture guidance",
-        content="Prefer domain/model/entity and repository adapter evidence before config classes.",
-        resources=[SimpleNamespace(name="references/row-analysis.md")],
-        scripts=[],
-    )
-    toolset = SimpleNamespace(
-        tools={
-            "list_skills": object(),
-            "load_skill": object(),
-            "read_skill_resource": object(),
-            "run_skill_script": object(),
-            "search_code": object(),
-        },
-        skills={"hexagonal-compliance-skill": object()},
-        get_skill=lambda name: fake_skill if name == "hexagonal-compliance-skill" else None,
-        _vds_skill_metadata=[SimpleNamespace(name="hexagonal-compliance-skill")],
-    )
-    filtered_toolsets, policy, preloaded = evaluator._preload_prompt_skill_context_if_exact_match(
-        toolsets=[toolset],
-        request=request,
-        toolset_policy={
-            "mode": "skills_needed_focus",
-            "focused_skills": ["hexagonal-compliance-skill"],
-            "removed_tool_bindings": 0,
-        },
-    )
-    assert policy["mode"] == "skills_needed_preloaded_exact_match"
-    assert policy["preloaded_skills"] == ["hexagonal-compliance-skill"]
-    assert "Hexagonal architecture guidance" in preloaded
-    assert "hexagonal-compliance-skill" in preloaded
-    assert "list_skills" not in filtered_toolsets[0].tools
-    assert "load_skill" not in filtered_toolsets[0].tools
-    assert "read_skill_resource" not in filtered_toolsets[0].tools
-    assert "run_skill_script" not in filtered_toolsets[0].tools
-    assert "search_code" in filtered_toolsets[0].tools
-def test_instruction_skill_override_for_policy_prefers_preloaded_then_focused():
-    assert module.LLMRowEvaluator._instruction_skill_override_for_policy(
-        {
-            "mode": "skills_needed_preloaded_exact_match",
-            "preloaded_skills": ["hexagonal-compliance-skill"],
-            "focused_skills": ["research-skill"],
-        }
-    ) == ["hexagonal-compliance-skill"]
-    assert module.LLMRowEvaluator._instruction_skill_override_for_policy(
-        {
-            "mode": "skills_needed_focus",
-            "focused_skills": ["hexagonal-compliance-skill"],
-        }
-    ) == ["hexagonal-compliance-skill"]
-    assert module.LLMRowEvaluator._instruction_skill_override_for_policy({"mode": "none"}) is None
-def test_preloaded_exact_match_counts_as_effective_for_single_required_skill():
-    evaluator = object.__new__(module.LLMRowEvaluator)
-    request = RowEvaluationRequest(
-        row_id="CL-001:row_0",
-        requirement_text="Hexagonal architecture",
-        requirement_category="1. Decoupling||SourceCode Organization",
-        requirement_guidance="",
-        evidence_context="grounded",
-        evidence_refs=["src/main/java/com/example/domain/OrderAggregate.java"],
-        project_profile={},
-        requirement_interpretation={
-            "skills_needed": True,
-            "skill_objectives": ["analyze_architecture_compliance"],
-        },
-    )
-    evaluator._instruction_skills_enabled = lambda: True
-    evaluator._instruction_available_skills = lambda: ["hexagonal-compliance-skill"]
-    evaluator._recommend_skills_for_request = lambda _request, limit=3: ["hexagonal-compliance-skill"]
-    assert evaluator._preloaded_exact_match_counts_as_effective(request=request, execution_calls=1) is True
-def test_needs_skill_policy_retry_skips_false_retry_for_preloaded_exact_match():
-    evaluator = object.__new__(module.LLMRowEvaluator)
-    request = RowEvaluationRequest(
-        row_id="CL-001:row_0",
-        requirement_text="Hexagonal architecture",
-        requirement_category="1. Decoupling||SourceCode Organization",
-        requirement_guidance="",
-        evidence_context="grounded",
-        evidence_refs=["src/main/java/com/example/domain/OrderAggregate.java"],
-        project_profile={},
-        requirement_interpretation={
-            "skills_needed": True,
-            "skill_objectives": ["analyze_architecture_compliance"],
-        },
-    )
-    evaluator._request_skills_needed = lambda _request: True
-    evaluator._strict_require_effective_skill = lambda: True
-    evaluator._instruction_skills_enabled = lambda: True
-    evaluator._instruction_available_skills = lambda: ["hexagonal-compliance-skill"]
-    evaluator._recommend_skills_for_request = lambda _request, limit=3: ["hexagonal-compliance-skill"]
-    evaluator._extract_skill_usage_counts = lambda _trace: (1, 1, 0)
-    assert evaluator._needs_skill_policy_retry(request=request, backend_trace={}) is False
-def test_maybe_unverified_ref_retry_skips_for_authoritative_single_skill_result():
-    evaluator = object.__new__(module.LLMRowEvaluator)
-    evaluator._last_retry_telemetry = {}
-    request = RowEvaluationRequest(
-        row_id="CL-007:row_6",
-        requirement_text="Migration script detection",
-        requirement_category="1. Decoupling||Database migration",
-        requirement_guidance="",
-        evidence_context="grounded",
-        evidence_refs=["src/main/resources/application.properties"],
-        project_profile={},
-        requirement_interpretation={
-            "skills_needed": True,
-            "skill_objectives": ["detect_migration_scripts"],
-        },
-    )
-    response = RowEvaluationResponse(
-        reasoning="Skill result is already authoritative for this row.",
-        reason="Không tìm thấy migration script độc lập.",
-        score=35.0,
-        confidence=0.9,
-        finding="Repo thiếu migration asset chuẩn.",
-        evidence_anchors=[
-            {
-                "ref_type": "config",
-                "ref_value": "src/main/resources/application.properties",
-                "excerpt": "",
-            }
-        ],
-        allowed_anchor_ids=[],
-        cited_anchor_ids=[],
-        fix_suggestions=[],
-    )
-    evaluator._request_skills_needed = lambda _request: True
-    evaluator._strict_require_effective_skill = lambda: True
-    evaluator._instruction_skills_enabled = lambda: True
-    evaluator._instruction_available_skills = lambda: ["migration-script-detection-skill"]
-    evaluator._recommend_skills_for_request = lambda _request, limit=3: ["migration-script-detection-skill"]
-    evaluator._extract_skill_usage_counts = lambda _trace: (1, 1, 1)
-    call_count = {"n": 0}
-    def _unexpected_call_llm(*_args, **_kwargs):
-        call_count["n"] += 1
-        return response
-    evaluator._call_llm = _unexpected_call_llm
-    updated, trace = evaluator._maybe_unverified_ref_retry(
-        response=response,
-        request=request,
-        row_id="CL-007:row_6",
-        arbitration_metadata={
-            "tool_first_loop": {
-                "trace_steps": [
-                    {"tool": "read_skill_resource", "output": {"success": True}},
-                    {
-                        "tool": "record_evidence_refs",
-                        "output": {"refs": [{"ref_value": "src/main/resources/application.properties"}]},
-                    },
-                ]
-            }
-        },
-    )
-    assert updated is response
-    assert call_count["n"] == 0
-    assert (trace.get("retry_outcome") or "") == "retry_skipped_authoritative_skill_result"
-    assert trace.get("retry_attempted") is False
-    assert trace.get("strict_skill_short_circuit_reason") == "authoritative_skill_result"
-    assert trace.get("authoritative_skill_result") == {
-        "recommended_skills": ["migration-script-detection-skill"],
-        "observed_skill_calls": 1,
-        "observed_skill_execution_calls": 1,
-        "observed_skill_effective_calls": 1,
-    }
-def test_authoritative_skill_trace_payload_includes_recommended_skills_and_usage_counts() -> None:
-    evaluator = object.__new__(module.LLMRowEvaluator)
-    request = RowEvaluationRequest(
-        row_id="CL-004:row_trace",
-        requirement_text="No shared datasource across service boundaries",
-        requirement_category="1. Decoupling||Database",
-        requirement_guidance="",
-        evidence_context="grounded",
-        evidence_refs=["alpha/application.properties"],
-        project_profile={},
-        requirement_interpretation={
-            "skills_needed": True,
-            "skill_objectives": ["analyze_datasource_isolation"],
-        },
-    )
-    evaluator._recommend_skills_for_request = lambda _request, limit=3: ["app-config-discovery-skill"]
-    evaluator._resolve_skill_usage_counts_for_policy = lambda request, backend_trace: (1, 1, 1)
-    payload = evaluator._build_authoritative_skill_result_trace_payload(
-        request=request,
-        retrieval_trace={"skill_policy_retry": {"observed_skill_effective_calls": 1}},
-    )
-    assert payload == {
-        "strict_skill_short_circuit_reason": "authoritative_skill_result",
-        "authoritative_skill_result": {
-            "recommended_skills": ["app-config-discovery-skill"],
-            "observed_skill_calls": 1,
-            "observed_skill_execution_calls": 1,
-            "observed_skill_effective_calls": 1,
-        },
-    }
-def test_authoritative_skill_result_still_short_circuits_retry_for_decisive_low_confidence_fail() -> None:
-    evaluator = object.__new__(module.LLMRowEvaluator)
-    request = RowEvaluationRequest(
-        row_id="CL-004:row_3",
-        requirement_text="No shared datasource across service boundaries",
-        requirement_category="1. Decoupling||Database",
-        requirement_guidance="",
-        evidence_context="grounded",
-        evidence_refs=["alpha/application.properties"],
-        project_profile={},
-        requirement_interpretation={
-            "skills_needed": True,
-            "skill_objectives": ["analyze_datasource_isolation"],
-        },
-    )
-    response = RowEvaluationResponse(
-        reasoning="Strict app-config skill found a shared datasource violation.",
-        reason="Service đang dùng chung datasource nghiệp vụ với boundary participant_management.",
-        score=13.8,
-        confidence=0.2,
-        finding="Datasource isolation violated.",
-        evidence_anchors=[
-            {
-                "ref_type": "config",
-                "ref_value": "alpha/application.properties",
-                "excerpt": "",
-            }
-        ],
-        allowed_anchor_ids=[],
-        cited_anchor_ids=[],
-        fix_suggestions=[],
-    )
-    evaluator._request_skills_needed = lambda _request: True
-    evaluator._strict_require_effective_skill = lambda: True
-    evaluator._instruction_skills_enabled = lambda: True
-    evaluator._instruction_available_skills = lambda: ["app-config-discovery-skill"]
-    evaluator._recommend_skills_for_request = lambda _request, limit=3: ["app-config-discovery-skill"]
-    evaluator._extract_skill_usage_counts = lambda _trace: (1, 1, 1)
-    assert (
-        evaluator._should_short_circuit_unverified_ref_retry_for_authoritative_skill_result(
-            request=request,
-            retrieval_trace={"tool_first_loop": {"skill_effective_call_count": 1}},
-            response=response,
-        )
-        is True
-    )
-def test_authoritative_skill_result_short_circuits_retry_even_with_grounding_language() -> None:
-    evaluator = object.__new__(module.LLMRowEvaluator)
-    request = RowEvaluationRequest(
-        row_id="CL-004:row_grounding",
-        requirement_text="No shared datasource across service boundaries",
-        requirement_category="1. Decoupling||Database",
-        requirement_guidance="",
-        evidence_context="grounded",
-        evidence_refs=["alpha/application.properties"],
-        project_profile={},
-        requirement_interpretation={
-            "skills_needed": True,
-            "skill_objectives": ["analyze_datasource_isolation"],
-        },
-    )
-    response = RowEvaluationResponse(
-        reasoning="Strict app-config skill found the decisive datasource reference.",
-        reason="Chưa đủ grounding tổng quát, nhưng app-config đã xác nhận shared datasource.",
-        score=20.0,
-        confidence=0.3,
-        finding="Datasource isolation violated.",
-        evidence_anchors=[
-            {
-                "ref_type": "config",
-                "ref_value": "alpha/application.properties",
-                "excerpt": "",
-            }
-        ],
-        allowed_anchor_ids=[],
-        cited_anchor_ids=[],
-        fix_suggestions=[],
-    )
-    evaluator._request_skills_needed = lambda _request: True
-    evaluator._strict_require_effective_skill = lambda: True
-    evaluator._instruction_skills_enabled = lambda: True
-    evaluator._instruction_available_skills = lambda: ["app-config-discovery-skill"]
-    evaluator._recommend_skills_for_request = lambda _request, limit=3: ["app-config-discovery-skill"]
-    evaluator._extract_skill_usage_counts = lambda _trace: (1, 1, 1)
-    assert (
-        evaluator._should_short_circuit_unverified_ref_retry_for_authoritative_skill_result(
-            request=request,
-            retrieval_trace={"tool_first_loop": {"skill_effective_call_count": 1}},
-            response=response,
-        )
-        is True
-    )
-def test_authoritative_skill_lane_short_circuits_retry_before_usage_counts_are_recorded() -> None:
-    evaluator = object.__new__(module.LLMRowEvaluator)
-    request = RowEvaluationRequest(
-        row_id="CL-004:row_pre_policy",
-        requirement_text="No shared datasource across service boundaries",
-        requirement_category="1. Decoupling||Database",
-        requirement_guidance="",
-        evidence_context="grounded",
-        evidence_refs=["alpha/application.properties"],
-        project_profile={},
-        requirement_interpretation={
-            "skills_needed": True,
-            "skill_objectives": ["analyze_datasource_isolation"],
-        },
-    )
-    evaluator._request_skills_needed = lambda _request: True
-    evaluator._strict_require_effective_skill = lambda: True
-    evaluator._instruction_skills_enabled = lambda: True
-    evaluator._instruction_available_skills = lambda: ["app-config-discovery-skill"]
-    evaluator._recommend_skills_for_request = lambda _request, limit=3: ["app-config-discovery-skill"]
-    evaluator._extract_skill_usage_counts = lambda _trace: (0, 0, 0)
-    assert (
-        evaluator._should_short_circuit_unverified_ref_retry_for_authoritative_skill_result(
-            request=request,
-            retrieval_trace={"prompt_tool_telemetry": {"event_tool_calls_completed": 0}},
-        )
-        is True
-    )
-def test_toolset_cache_signature_changes_when_skill_names_change_even_with_same_tool_names():
-    toolset_skills_a = SimpleNamespace(
-        tools={"search_docs": object(), "load_skill": object()},
-        skills={"research-skill": object()},
-    )
-    toolset_skills_b = SimpleNamespace(
-        tools={"search_docs": object(), "load_skill": object()},
-        skills={"audit-orchestrator-skill": object()},
-    )
-    sig_a = module.LLMRowEvaluator._toolset_cache_signature([toolset_skills_a])
-    sig_b = module.LLMRowEvaluator._toolset_cache_signature([toolset_skills_b])
-    assert sig_a != sig_b
-def test_authoritative_single_skill_result_skips_dspy_judge():
-    evaluator = object.__new__(module.LLMRowEvaluator)
-    request = RowEvaluationRequest(
-        row_id="CL-004:row_3",
-        requirement_text="No circular dependency across project",
-        requirement_category="1. Decoupling",
-        requirement_guidance="Use project dependency artifacts",
-        evidence_context="context",
-        evidence_refs=["project-dependency-graph.json", "project-cycle-report.json"],
-        project_profile={},
-        requirement_interpretation={},
-        code_evidence=None,
-        traceability=[],
-    )
-    evaluator._request_skills_needed = lambda _request: True
-    evaluator._strict_require_effective_skill = lambda: True
-    evaluator._instruction_skills_enabled = lambda: True
-    evaluator._instruction_available_skills = lambda: ["circular-dependency-skill"]
-    evaluator._recommend_skills_for_request = lambda _request, limit=3: ["circular-dependency-skill"]
-    evaluator._extract_skill_usage_counts = lambda _trace: (1, 1, 1)
-    assert (
-        evaluator._should_skip_judge_for_authoritative_skill_result(
-            request=request,
-            retrieval_trace={
-                "tool_first_loop": {
-                    "trace_steps": [
-                        {"tool": "read_skill_resource", "output": {"success": True}},
-                        {
-                            "tool": "record_evidence_refs",
-                            "output": {"refs": [{"ref_value": "project-cycle-report.json"}]},
-                        },
-                    ]
-                }
-            },
-        )
-        is True
-    )
-@pytest.mark.parametrize(
-    ("raw", "expected"),
-    [
-        (True, True),
-        (False, False),
-        ("true", True),
-        ("false", False),
-        ("1", True),
-        ("0", False),
-        ("yes", True),
-        ("no", False),
-        ("", False),
-    ],
-)
-def test_coerce_bool_flag_handles_string_flags(raw, expected):
-    assert module._coerce_bool_flag(raw) is expected
-def test_infer_ref_type_classifies_chunk_as_confluence_chunk():
-    assert module.LLMRowEvaluator._infer_ref_type("chunk:abc123") == "confluence_chunk"
-def test_filter_allowed_anchor_ids_for_docs_primary_excludes_code_refs():
-    filtered = module.LLMRowEvaluator._filter_allowed_anchor_ids_for_requirement(
-        [
-            "src/main/java/com/example/Dto.java",
-            "https://confluence.local/pages/viewpage.action?pageId=123",
-            "docs/security.md",
-        ],
-        requirement_interpretation={
-            "required_anchor_modalities": ["docs"],
-            "finalization_policy": "docs_primary",
-        },
-    )
-    assert "src/main/java/com/example/dto.java" not in filtered
-    assert "https://confluence.local/pages/viewpage.action?pageid=123" in filtered
-def test_sanitize_unverified_retry_rejection_omits_raw_preview_for_project_scope_rows():
-    response = RowEvaluationResponse(
-        reasoning="reasoning",
-        reason="Base reason.",
-        score=62.0,
-        confidence=0.8,
-        finding="Base finding.",
-        evidence_anchors=[],
-        allowed_anchor_ids=[],
-        cited_anchor_ids=[],
-    )
-    updated = module.LLMRowEvaluator._sanitize_unverified_retry_rejection(
-        response,
-        evidence_refs=[
-            "https://confluence.local/pages/viewpage.action?pageId=25123049",
-            "pom.xml",
-            "https://confluence.local/pages/viewpage.action?pageId=33916401",
-        ],
-        ratio=1.0,
-        project_scope_required=True,
-    )
-    assert "refs ưu tiên" not in updated.reason
-    assert "pom.xml" not in updated.reason
-    assert "unverified_ratio=1.00" in updated.reason
-    assert "refs ưu tiên" not in updated.finding
-    assert "artifact dependency graph/cycle ở phạm vi project" in updated.finding
-def test_sanitize_unverified_retry_rejection_keeps_preview_for_non_project_scope_rows():
-    response = RowEvaluationResponse(
-        reasoning="reasoning",
-        reason="Base reason.",
-        score=62.0,
-        confidence=0.8,
-        finding="Base finding.",
-        evidence_anchors=[],
-        allowed_anchor_ids=[],
-        cited_anchor_ids=[],
-    )
-    updated = module.LLMRowEvaluator._sanitize_unverified_retry_rejection(
-        response,
-        evidence_refs=["src/main/java/AppConfig.java", "docs/adr-001.md"],
-        ratio=0.75,
-        project_scope_required=False,
-    )
-    assert "refs ưu tiên" in updated.reason
-    assert "src/main/java/AppConfig.java" in updated.reason
-def test_sanitize_no_evidence_contradiction_omits_raw_preview_for_project_scope_rows():
-    response = RowEvaluationResponse(
-        reasoning="reasoning",
-        reason="Base reason.",
-        score=62.0,
-        confidence=0.8,
-        finding="Base finding.",
-        evidence_anchors=[],
-        allowed_anchor_ids=[],
-        cited_anchor_ids=[],
-    )
-    updated = module.LLMRowEvaluator._sanitize_no_evidence_contradiction(
-        response,
-        evidence_refs=[
-            "attachment://57037192/143098445",
-            "https://confluence.local/pages/viewpage.action?pageId=25123049",
-            "pom.xml",
-        ],
-        project_scope_required=True,
-    )
-    assert "attachment://" not in updated.reason
-    assert "pageId=" not in updated.reason
-    assert "pom.xml" not in updated.reason
-    assert "Evidence references are present in the input context." in updated.reason
-def test_resolve_terminal_repair_llm_context_preserves_active_runtime_policy(monkeypatch):
-    active_llm_cfg = SimpleNamespace(
-        output_retries=4,
-        agent_retries=2,
-        agent_stream=False,
-        stream=False,
-        model_standard="z-ai/glm5",
-        base_url="http://localhost:8082",
-        protocol=SimpleNamespace(value="anthropic"),
-    )
-    evaluator = module.LLMRowEvaluator(
-        config=SimpleNamespace(
-            protocol=SimpleNamespace(value="anthropic"),
-            model="z-ai/glm5",
-            use_dspy=False,
-            runtime_context={},
-            llm=active_llm_cfg,
-        ),
-        template_hash="tpl",
-        evidence_hash="evidence",
-    )
-    monkeypatch.setattr(
-        evaluator,
-        "get_last_retry_telemetry_snapshot",
-        lambda: {"provider_failover_final_provider": "nvidia-glm-anthropic"},
-    )
-    monkeypatch.setattr(evaluator, "_resolve_active_runtime_profile_name", lambda: "nvidia-glm-anthropic")
-    monkeypatch.setattr(
-        module,
-        "inherit_runtime_llm_policy",
-        lambda *_args, **_kwargs: (_ for _ in ()).throw(AssertionError("should not rebuild active provider config")),
-    )
-    llm_cfg, protocol, model = evaluator._resolve_terminal_repair_llm_context()
-    assert llm_cfg is active_llm_cfg
-    assert protocol == "anthropic"
-    assert model == "z-ai/glm5"
-@pytest.mark.asyncio
-async def test_invoke_async_agent_with_provider_retry_preserves_runtime_policy_on_failover(monkeypatch):
-    """When the primary invocation raises a 503 backend error, the provider-retry wrapper must
-    return the fallback invocation's response and must have resolved the fallback profile via
-    ``inherit_runtime_llm_policy("fallback-openai", source_llm=primary_cfg)`` exactly once,
-    i.e. the primary config is passed along as the source of the runtime policy."""
-    evaluator = module.LLMRowEvaluator(
-        config=SimpleNamespace(
-            protocol=SimpleNamespace(value="openai"),
-            model="primary-model",
-            use_dspy=False,
-            runtime_context={},
-            row_failover_profiles=["fallback-openai"],
-        ),
-        template_hash="tpl",
-        evidence_hash="evidence",
-    )
-    # Primary and fallback configs are identical except for model_standard and base_url.
-    primary_cfg = SimpleNamespace(
-        agent_retries=2,
-        output_retries=4,
-        max_tokens_per_request=1024,
-        model_standard="primary-model",
-        protocol=SimpleNamespace(value="openai"),
-        base_url="https://primary.example/v1",
-        agent_stream=False,
-        stream=False,
-        row_failover_profiles=["fallback-openai"],
-        row_failover_max_provider_hops=1,
-        row_progress_lease_seconds=180,
-        row_stall_detection_seconds=45,
-        row_absolute_timeout_ms=900000,
-        agent_event_stream_enabled=False,
-    )
-    fallback_cfg = SimpleNamespace(
-        agent_retries=2,
-        output_retries=4,
-        max_tokens_per_request=1024,
-        model_standard="fallback-model",
-        protocol=SimpleNamespace(value="openai"),
-        base_url="https://fallback.example/v1",
-        agent_stream=False,
-        stream=False,
-        row_failover_profiles=["fallback-openai"],
-        row_failover_max_provider_hops=1,
-        row_progress_lease_seconds=180,
-        row_stall_detection_seconds=45,
-        row_absolute_timeout_ms=900000,
-        agent_event_stream_enabled=False,
-    )
-    # Records every (profile_name, source_llm) pair passed to the policy-inheritance stub.
-    inherited_calls: list[tuple[str, object]] = []
-    async def _fake_invoke_async_agent(**kwargs):
-        # Fail only when invoked with the primary config; any other config succeeds.
-        llm_cfg = kwargs.get("llm_cfg")
-        if getattr(llm_cfg, "model_standard", "") == "primary-model":
-            raise module.RowEvaluationBackendError(
-                "status_code: 503 service unavailable",
-                retry_metadata={"reason_code": "provider_server_error"},
-            )
-        return RowEvaluationResponse(
-            reasoning="reasoning",
-            reason="reason",
-            score=50.0,
-            confidence=0.8,
-            finding="finding",
-        )
-    def _fake_inherit_runtime_llm_policy(profile_name, *, source_llm=None):
-        # Capture the arguments and always hand back the prepared fallback config.
-        inherited_calls.append((profile_name, source_llm))
-        return fallback_cfg
-    monkeypatch.setenv("VDS_AUDIT_ACTIVE_PROFILE", "primary-openai")
-    monkeypatch.setenv("VDS_AUDIT_LLM__ROW_FAILOVER_PROFILES", "configured")
-    # Pin the retry knobs: attempt count stubbed to 1 and zero backoff so the test does not sleep.
-    monkeypatch.setattr(module, "_prompt_provider_retry_attempts", lambda: 1)
-    monkeypatch.setattr(module, "_prompt_provider_retry_backoff_seconds", lambda: 0.0)
-    monkeypatch.setattr(module, "inherit_runtime_llm_policy", _fake_inherit_runtime_llm_policy)
-    monkeypatch.setattr(evaluator, "_invoke_async_agent", _fake_invoke_async_agent)
-    response = await evaluator._invoke_async_agent_with_provider_retry(
-        key="retry-test-provider-failover-policy",
-        name="RowEvaluator",
-        system_prompt="system",
-        user_prompt="user",
-        toolsets=[],
-        profile={},
-        row_id="CL-004",
-        runtime_fields={"check_id": "CL-004"},
-        llm_cfg=primary_cfg,
-        effective_model="primary-model",
-        runtime_profile_name="primary-openai",
-    )
-    # The response can only come from the non-primary branch of the fake invoker.
-    assert response.reason == "reason"
-    # Exactly one inheritance call, with the primary config passed as source_llm.
-    assert inherited_calls == [("fallback-openai", primary_cfg)]
-def test_build_agent_key_varies_with_runtime_policy() -> None:
-    """``_build_agent_key`` must produce different keys for two LLM configs that differ only
-    in ``output_retries`` (1 vs 4) while every other argument is identical."""
-    evaluator = module.LLMRowEvaluator(
-        config=SimpleNamespace(
-            protocol=SimpleNamespace(value="anthropic"),
-            model="z-ai/glm5",
-            use_dspy=False,
-            runtime_context={},
-        ),
-        template_hash="tpl",
-        evidence_hash="evidence",
-    )
-    # The two configs below are field-for-field equal except for output_retries.
-    llm_low_retry = SimpleNamespace(
-        base_url="http://localhost:8082",
-        stream=False,
-        agent_stream=False,
-        max_tokens_per_request=1024,
-        max_tokens=1024,
-        agent_retries=1,
-        output_retries=1,
-    )
-    llm_high_retry = SimpleNamespace(
-        base_url="http://localhost:8082",
-        stream=False,
-        agent_stream=False,
-        max_tokens_per_request=1024,
-        max_tokens=1024,
-        agent_retries=1,
-        output_retries=4,
-    )
-    # Both key builds use the same arguments; only llm_cfg changes between them.
-    low_key = evaluator._build_agent_key(
-        async_mode=False,
-        protocol="anthropic",
-        model="z-ai/glm5",
-        instruction_suffix="tf",
-        system_prompt="system",
-        fallback_suffix=":terminal_repair",
-        llm_cfg=llm_low_retry,
-    )
-    high_key = evaluator._build_agent_key(
-        async_mode=False,
-        protocol="anthropic",
-        model="z-ai/glm5",
-        instruction_suffix="tf",
-        system_prompt="system",
-        fallback_suffix=":terminal_repair",
-        llm_cfg=llm_high_retry,
-    )
-    assert low_key != high_key
-# ---------------------------------------------------------------------------
-# TSK-166.6: Route-aware failover tests for LLMRowEvaluator prompt paths
-# ---------------------------------------------------------------------------
-def _make_p166_llm_cfg(
-    *,
-    model: str = "primary-model",
-    failover_profiles: list[str] | None = None,
-    max_hops: int = 1,
-) -> SimpleNamespace:
-    """Build a stand-in LLM config namespace for the TSK-166.6 failover tests.
-    ``model`` is stored as ``model_standard`` and embedded in ``base_url``;
-    ``failover_profiles`` (``None`` or empty becomes ``[]``) is stored as
-    ``row_failover_profiles``; ``max_hops`` is stored as ``row_failover_max_provider_hops``.
-    Both retry counters are 0 and streaming is disabled."""
-    return SimpleNamespace(
-        agent_retries=0,
-        output_retries=0,
-        max_tokens=256,
-        model_standard=model,
-        protocol=SimpleNamespace(value="openai"),
-        base_url=f"https://{model}.example/v1",
-        agent_stream=False,
-        stream=False,
-        # `or []` gives each config its own fresh list when no profiles are supplied.
-        row_failover_profiles=failover_profiles or [],
-        row_failover_max_provider_hops=max_hops,
-    )
-def _make_p166_evaluator(*, failover_profiles: list[str] | None = None) -> module.LLMRowEvaluator:
-    """Build an ``LLMRowEvaluator`` for the TSK-166.6 failover tests with an "openai" protocol,
-    the "primary-model" model, DSPy disabled, an empty runtime context and the given
-    ``row_failover_profiles`` (``None`` or empty becomes ``[]``)."""
-    return module.LLMRowEvaluator(
-        config=SimpleNamespace(
-            protocol=SimpleNamespace(value="openai"),
-            model="primary-model",
-            use_dspy=False,
-            runtime_context={},
-            row_failover_profiles=failover_profiles or [],
-        ),
-        template_hash="tpl",
-        evidence_hash="evidence",
-    )
-@pytest.mark.asyncio
-async def test_route_credential_gap_triggers_immediate_failover_to_fallback_profile(monkeypatch):
-    """HTTP 400 + 'No credentials for provider:' must skip same-provider retry and
-    failover directly to the fallback profile (TSK-166.6 — route gap + failover)."""
-    evaluator = _make_p166_evaluator(failover_profiles=["fallback-openai"])
-    primary_cfg = _make_p166_llm_cfg(model="primary-model", failover_profiles=["fallback-openai"])
-    fallback_cfg = _make_p166_llm_cfg(model="fallback-model", failover_profiles=[])
-    # Ordered log of model_standard values seen by the fake invoker.
-    calls: list[str] = []
-    async def _fake_invoke_async_agent(**kwargs):
-        llm_cfg = kwargs.get("llm_cfg")
-        model = getattr(llm_cfg, "model_standard", "")
-        calls.append(model)
-        if model == "primary-model":
-            # Production error shape: proxy returns 400 when route has no credential for provider
-            raise module.RowEvaluationBackendError(
-                "no credentials for provider: anthropic",
-                retry_metadata={"reason_code": "provider_route_unavailable"},
-            )
-        return RowEvaluationResponse(
-            reasoning="ok",
-            reason="pass",
-            score=80.0,
-            confidence=0.9,
-            finding="finding",
-        )
-    monkeypatch.setenv("VDS_AUDIT_ACTIVE_PROFILE", "primary-openai")
-    monkeypatch.setenv("VDS_AUDIT_LLM__ROW_FAILOVER_PROFILES", "configured")
-    # Retry attempts are stubbed to 3 so that a same-provider retry, if it happened,
-    # would show up as extra "primary-model" entries in `calls`.
-    monkeypatch.setattr(module, "_prompt_provider_retry_attempts", lambda: 3)
-    monkeypatch.setattr(module, "_prompt_provider_retry_backoff_seconds", lambda: 0.0)
-    # Resolve any fallback profile name to the prepared fallback config.
-    monkeypatch.setattr(module, "inherit_runtime_llm_policy", lambda profile_name, *, source_llm=None: fallback_cfg)
-    monkeypatch.setattr(evaluator, "_invoke_async_agent", _fake_invoke_async_agent)
-    response = await evaluator._invoke_async_agent_with_provider_retry(
-        key="route-gap-failover-test",
-        name="RowEvaluator",
-        system_prompt="system",
-        user_prompt="user",
-        toolsets=[],
-        profile={},
-        row_id="CL-001",
-        runtime_fields={"check_id": "CL-001"},
-        llm_cfg=primary_cfg,
-        effective_model="primary-model",
-        runtime_profile_name="primary-openai",
-    )
-    # Must succeed via fallback — NOT retry same provider (only 2 calls total)
-    assert response.reason == "pass"
-    assert calls == ["primary-model", "fallback-model"], "must not retry same provider before failover"
-    # Telemetry must record that a failover happened, where it ended up, and why.
-    retry_telemetry = evaluator.get_last_retry_telemetry_snapshot() or {}
-    assert retry_telemetry["provider_failover_attempted"] is True
-    assert retry_telemetry["provider_failover_final_provider"] == "fallback-openai"
-    assert retry_telemetry["provider_failover_reason"] == "route_provider_credential_gap"
-@pytest.mark.asyncio
-async def test_route_credential_gap_raises_when_no_fallback_configured(monkeypatch):
-    """Route-credential gap with no fallback profiles must raise RowEvaluationBackendError
-    immediately — no retry, no silent swallow (TSK-166.6 — route gap exhaustion)."""
-    # Both the evaluator and the primary config are built with an empty failover list.
-    evaluator = _make_p166_evaluator(failover_profiles=[])
-    primary_cfg = _make_p166_llm_cfg(model="primary-model", failover_profiles=[])
-    async def _fake_invoke_async_agent(**_kwargs):
-        # Every invocation fails with the route-unavailable error, whatever config is used.
-        raise module.RowEvaluationBackendError(
-            "no credentials for provider: openai",
-            retry_metadata={"reason_code": "provider_route_unavailable"},
-        )
-    monkeypatch.setenv("VDS_AUDIT_ACTIVE_PROFILE", "primary-openai")
-    monkeypatch.setenv("VDS_AUDIT_LLM__ROW_FAILOVER_PROFILES", "configured")
-    monkeypatch.setattr(module, "_prompt_provider_retry_attempts", lambda: 3)
-    monkeypatch.setattr(module, "_prompt_provider_retry_backoff_seconds", lambda: 0.0)
-    monkeypatch.setattr(evaluator, "_invoke_async_agent", _fake_invoke_async_agent)
-    with pytest.raises(module.RowEvaluationBackendError) as exc_info:
-        await evaluator._invoke_async_agent_with_provider_retry(
-            key="route-gap-no-fallback-test",
-            name="RowEvaluator",
-            system_prompt="system",
-            user_prompt="user",
-            toolsets=[],
-            profile={},
-            row_id="CL-002",
-            runtime_fields={"check_id": "CL-002"},
-            llm_cfg=primary_cfg,
-            effective_model="primary-model",
-            runtime_profile_name="primary-openai",
-        )
-    # The surfaced error must still carry the original reason code.
-    # NOTE(review): the invocation count is not asserted here, so the "no retry" part of the
-    # docstring is not actually checked by this test — confirm whether that is intended.
-    retry_metadata = exc_info.value.retry_metadata or {}
-    assert retry_metadata["reason_code"] == "provider_route_unavailable"
-@pytest.mark.asyncio
-async def test_terminal_auth_401_does_not_enter_failover_path(monkeypatch):
-    """HTTP 401 is TERMINAL_AUTH (is_failoverable=False) — must raise immediately,
-    never touching the failover branch (TSK-166.6 — auth remains non-failoverable)."""
-    import httpx
-    evaluator = _make_p166_evaluator(failover_profiles=["fallback-openai"])
-    primary_cfg = _make_p166_llm_cfg(model="primary-model", failover_profiles=["fallback-openai"])
-    failover_calls: list[str] = []
-    async def _fake_invoke_async_agent(**_kwargs):
-        request = httpx.Request("POST", "https://api.openai.com/v1/chat/completions")
-        response = httpx.Response(401, request=request)
-        raise httpx.HTTPStatusError("401 Unauthorized", request=request, response=response)
-    monkeypatch.setenv("VDS_AUDIT_ACTIVE_PROFILE", "primary-openai")
-    monkeypatch.setenv("VDS_AUDIT_LLM__ROW_FAILOVER_PROFILES", "configured")
-    monkeypatch.setattr(module, "_prompt_provider_retry_attempts", lambda: 1)
-    monkeypatch.setattr(module, "_prompt_provider_retry_backoff_seconds", lambda: 0.0)
-    monkeypatch.setattr(evaluator, "_invoke_async_agent", _fake_invoke_async_agent)
-    with pytest.raises(httpx.HTTPStatusError):
-        await evaluator._invoke_async_agent_with_provider_retry(
-            key="terminal-auth-test",
-            name="RowEvaluator",
-            system_prompt="system",
-            user_prompt="user",
-            toolsets=[],
-            profile={},
-            row_id="CL-003",
-            runtime_fields={"check_id": "CL-003"},
-            llm_cfg=primary_cfg,
-            effective_model="primary-model",
-            runtime_profile_name="primary-openai",
-        )
-    assert failover_calls == [], "401 must not reach failover path"