PyPI - aetherdialect - Versions diffs - 0.1.0__py3-none-any.whl - Mend

aetherdialect 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

aetherdialect-0.1.0.dist-info/METADATA +197 -0
aetherdialect-0.1.0.dist-info/RECORD +34 -0
aetherdialect-0.1.0.dist-info/WHEEL +5 -0
aetherdialect-0.1.0.dist-info/licenses/LICENSE +7 -0
aetherdialect-0.1.0.dist-info/top_level.txt +1 -0
text2sql/__init__.py +7 -0
text2sql/config.py +1063 -0
text2sql/contracts_base.py +952 -0
text2sql/contracts_core.py +1890 -0
text2sql/core_utils.py +834 -0
text2sql/dialect.py +1134 -0
text2sql/expansion_ops.py +1218 -0
text2sql/expansion_rules.py +496 -0
text2sql/intent_expr.py +1759 -0
text2sql/intent_process.py +2133 -0
text2sql/intent_repair.py +1733 -0
text2sql/intent_resolve.py +1292 -0
text2sql/live_testing.py +1117 -0
text2sql/main_execution.py +799 -0
text2sql/pipeline.py +1662 -0
text2sql/qsim_ops.py +1286 -0
text2sql/qsim_sample.py +609 -0
text2sql/qsim_struct.py +569 -0
text2sql/schema.py +973 -0
text2sql/schema_profiling.py +2075 -0
text2sql/simulator.py +970 -0
text2sql/sql_gen.py +1537 -0
text2sql/templates.py +1037 -0
text2sql/text2sql.py +726 -0
text2sql/utils.py +973 -0
text2sql/validation_agg.py +1033 -0
text2sql/validation_execute.py +1092 -0
text2sql/validation_schema.py +1847 -0
text2sql/validation_semantic.py +2122 -0

text2sql/live_testing.py ADDED Viewed

@@ -0,0 +1,1117 @@
+"""Reusable live-testing framework for the text2sql pipeline.
+Provides data models (scenarios, expected outcomes, step results), a soft-assertion collector, a pipeline-capture context manager that monkey-patches interactive I/O, and a LiveTestRunner that orchestrates end-to-end execution of a scenario against a real LLM and live database.
+All classes and helpers are database-agnostic; the only database-specific pieces live in the scenario definitions and conftest fixtures provided by the caller.
+"""
+from __future__ import annotations
+import os
+import time
+import traceback
+from collections.abc import Callable
+from contextlib import contextmanager
+from dataclasses import dataclass, field, replace as dc_replace
+from typing import Any
+from unittest.mock import patch
+from .config import PolicyConfig
+from .contracts_core import RuntimeIntent
+from .core_utils import debug, normalize_question, substitute_params
+from .intent_process import match_template_for_union
+from .pipeline import (
+    check_and_handle_hard_block,
+    check_template_reuse,
+    compute_final_metrics,
+    confirm_intent_with_user,
+    display_final_results,
+    generate_and_validate_sql,
+    generate_join_candidates,
+    handle_direct_sql_reuse,
+    handle_user_feedback,
+    load_pipeline_resources,
+    parse_intent_via_llm,
+    save_result_csv,
+)
+from .templates import save_template_store
+from .utils import flatten_param_values, intent_key, validate_question
+from .validation_execute import execute_sql, get_spark_sql_for_execution
+@dataclass
+class Expected:
+    """Declarative expectations for a single pipeline run.
+    Every field is optional. When a field is ``None`` or the default, the corresponding assertion is skipped by the runner.
+    Where two fields target the same property, ``_assert_scenario`` consults only one of them: ``status`` before ``status_in``, ``tables_one_of`` before ``tables``, and ``grain`` before ``grain_in``.
+    Args:
+        tables: Expected tables referenced by the generated intent (compared after sorting, so order does not matter). Ignored when ``tables_one_of`` is set or when no intent was captured.
+        min_rows: Minimum row count (inclusive) for the result set.
+        max_rows: Maximum row count (inclusive) for the result set.
+        min_confidence: Lower-bound confidence score (inclusive).
+        reuse_type: Expected template-match type. A single string must equal the captured reuse type; a tuple passes when the captured reuse type is any of its members. The runner itself assigns ``"direct_reuse"``, ``"intent_direct_reuse"`` and ``"intent_reuse"``; other values come straight from ``check_template_reuse``.
+        contains_join: When ``True`` the SQL must include a JOIN clause; when ``False`` it must not.
+        contains_group_by: When ``True`` the SQL must include GROUP BY; when ``False`` it must not.
+        contains_cte: When ``True`` the SQL must start with a CTE (``WITH``); when ``False`` it must not.
+        sql_contains: List of substrings that must all appear in the final SQL (compared case-insensitively).
+        sql_excludes: List of substrings that must not appear in the final SQL.
+        grain: Expected intent grain (``"row_level"`` or ``"grouped"``). A tuple passes when the intent grain is any of its members.
+        should_hard_block: When ``True`` the pipeline should trigger a hard-block.
+        should_fail_validation: When ``True`` schema or result validation should fail.
+        column_names_one_of: Allowed column header lists; result must match one exactly.
+        row_value_check: Optional callable ``(rows) -> bool`` for custom value checks.
+        min_semantic_warnings: Minimum number of semantic warnings expected.
+        status: Expected pipeline exit status when the default ``"ok"`` is not appropriate (for example ``"restricted"`` or ``"hard_blocked"``).
+        status_in: When set (and ``status`` is ``None``), status must be one of the given values.
+        tables_one_of: When set, tables must equal one of the given lists (order-independent).
+        grain_in: When set (and ``grain`` is ``None``), grain must be one of the given values.
+    """
+    tables: list[str] | None = None
+    tables_one_of: list[list[str]] | None = None
+    grain_in: tuple[str, ...] | None = None
+    min_rows: int | None = None
+    max_rows: int | None = None
+    min_confidence: float | None = None
+    reuse_type: str | tuple[str, ...] | None = None
+    contains_join: bool | None = None
+    contains_group_by: bool | None = None
+    contains_cte: bool | None = None
+    sql_contains: list[str] | None = None
+    sql_excludes: list[str] | None = None
+    grain: str | tuple[str, ...] | None = None
+    should_hard_block: bool = False
+    should_fail_validation: bool = False
+    column_names_one_of: list[list[str]] | None = None
+    row_value_check: Callable[[list[tuple]], bool] | None = None
+    min_semantic_warnings: int | None = None
+    status: str | None = None
+    status_in: tuple[str, ...] | None = None
+@dataclass
+class Scenario:
+    """A single question-and-expectation pair sent to the live pipeline.
+    Args:
+        id: Unique scenario identifier (for example ``"ST-001"``).
+        question: Natural-language question fed to the pipeline.
+        expected: Declarative expectations to assert against the result.
+        category: Logical grouping label used for pytest parametrization.
+        auto_responses: Pre-programmed ``y``/``n`` answers consumed in FIFO order by every ``ask_user_choice`` call during the run. The field default is ``None``; ``LiveTestRunner.run`` then substitutes ``["y", "y", "y"]`` (accept everything). Once the list is exhausted the patched responder keeps answering ``"y"``.
+        feedback: Final ``y``/``n`` feedback value for the "is this correct?" prompt; defaults to ``"y"``. The runner replaces it with ``"y"`` when the computed confidence reaches ``PolicyConfig.FINAL_SQL_AUTO_ACCEPT_THRESHOLD``.
+        reject_reason: Pre-canned rejection reason text supplied when *feedback* is ``"n"`` and the pipeline prompts for a reason. It is returned by the first patched ``input()`` call; later calls return ``"n"``.
+        sequence_id: When this scenario is run as part of a sequence, the sequence id (for result storage so failures show full logs). ``LiveTestRunner.run_sequence`` fills it in on a copy of the step.
+    """
+    id: str
+    question: str
+    expected: Expected
+    category: str = ""
+    auto_responses: list[str] | None = None
+    feedback: str = "y"
+    reject_reason: str = "incorrect results"
+    sequence_id: str | None = None
+@dataclass
+class SequenceScenario:
+    """An ordered list of scenarios executed in series to test statefulness.
+    Args:
+        id: Unique sequence identifier; ``LiveTestRunner.run_sequence`` copies it into each step's ``sequence_id``.
+        steps: Ordered ``Scenario`` objects run one after the other against the runner's shared template store.
+        category: Logical grouping label.
+    """
+    id: str
+    steps: list[Scenario]
+    category: str = ""
+@dataclass
+class SoftFailure:
+    """A single soft-assertion failure recorded during evaluation.
+    Instances are created by ``SoftAssert.check`` whenever a checked condition is false.
+    Args:
+        field: The name of the checked property (for example ``"min_rows"``).
+        expected: The expected value.
+        actual: The observed value.
+        message: Human-readable description of the mismatch.
+    """
+    field: str
+    expected: Any
+    actual: Any
+    message: str
+@dataclass
+class StepResult:
+    """Captured output from a single pipeline execution.
+    Args:
+        scenario_id: The ``Scenario.id`` that produced this result.
+        question: The raw question string.
+        status: Pipeline exit status. The runner code in this module assigns ``"ok"``, ``"restricted"``, ``"invalid_question"``, ``"intent_parse_failed"``, ``"intent_rejected"``, ``"join_failed"``, ``"hard_blocked"``, ``"validation_failed"`` and ``"error"``; the field default is ``"unknown"``.
+        intent: Parsed ``RuntimeIntent``, if the pipeline got far enough.
+        sql: Generated SQL string, if available.
+        rows: Result-set rows, if execution succeeded.
+        confidence: Final confidence score, if computed.
+        reuse_type: Template-match reuse type string.
+        template_id: Matched or created template id.
+        hard_blocked: Whether a hard-block fired.
+        validation_failed: Whether schema or result validation failed.
+        feedback: The feedback value (``"y"`` or ``"n"``) that was applied.
+        error: Stringified traceback when the run raised an exception.
+        duration_seconds: Wall-clock time for the run.
+        captured_logs: ``[LOG]`` and ``[DEBUG]`` messages recorded by the patched ``log``/``debug`` functions during the run.
+        semantic_warnings: Semantic warning messages collected during parsing.
+        soft_warnings: Informational result-validation messages (non-blocking). NOTE(review): not populated by the runner code visible here; confirm where it is filled.
+        hard_remaining: Unresolved hard result-validation failures. NOTE(review): not populated by the runner code visible here; confirm where it is filled.
+        rejection_classifications: Keyword categories for rejection reasons. NOTE(review): not populated by the runner code visible here; confirm where it is filled.
+        llm_calls: Total number of LLM API calls during intent parsing.
+        reject_reason_actual: When feedback is ``"n"``, the raw reason text (e.g. from scenario).
+        classified_category: When feedback is ``"n"``, the LLM-classified category.
+        classified_reason: When feedback is ``"n"``, the normalized reason summary.
+        generation_path: Which SQL generation branch was taken — ``"direct_reuse"``, ``"union_match"``, or ``"fresh"``.  ``None`` when the pipeline exited before reaching SQL generation.
+    """
+    scenario_id: str
+    question: str
+    status: str = "unknown"
+    intent: RuntimeIntent | None = None
+    sql: str | None = None
+    rows: list[tuple] | None = None
+    confidence: float | None = None
+    reuse_type: str | None = None
+    template_id: str | None = None
+    hard_blocked: bool = False
+    validation_failed: bool = False
+    feedback: str | None = None
+    error: str | None = None
+    duration_seconds: float = 0.0
+    captured_logs: list[str] = field(default_factory=list)
+    semantic_warnings: list[str] = field(default_factory=list)
+    soft_warnings: list[str] = field(default_factory=list)
+    hard_remaining: list[str] = field(default_factory=list)
+    rejection_classifications: list[str] = field(default_factory=list)
+    llm_calls: int = 0
+    reject_reason_actual: str | None = None
+    classified_category: str | None = None
+    classified_reason: str | None = None
+    generation_path: str | None = None
+class SoftAssert:
+    """Accumulates assertion failures instead of raising immediately.
+    Call :meth:`check` for each condition. After all checks, call :meth:`report` to raise a single ``AssertionError`` that lists every failure, or do nothing when all checks passed.
+    """
+    def __init__(self) -> None:
+        self.failures: list[SoftFailure] = []
+    def check(
+        self,
+        condition: bool,
+        field_name: str,
+        expected: Any,
+        actual: Any,
+        message: str = "",
+    ) -> None:
+        """Record a failure when *condition* is ``False``.
+        Args:
+            condition: The boolean result of the assertion expression.
+            field_name: Symbolic name of the property being checked.
+            expected: The expected value (for reporting).
+            actual: The observed value (for reporting).
+            message: Optional human-readable explanation.
+        """
+        if not condition:
+            msg = message or f"{field_name}: expected {expected!r}, got {actual!r}"
+            self.failures.append(SoftFailure(field=field_name, expected=expected, actual=actual, message=msg))
+    @property
+    def passed(self) -> bool:
+        """Return ``True`` when no failures have been recorded."""
+        return len(self.failures) == 0
+    def report(self, header: str = "") -> None:
+        """Raise ``AssertionError`` listing all accumulated failures.
+        Does nothing when :attr:`passed` is ``True``.
+        Args:
+            header: Optional leading line for the error message.
+        """
+        if self.passed:
+            return
+        lines = [header] if header else []
+        for f in self.failures:
+            lines.append(f"  [{f.field}] {f.message}")
+        raise AssertionError("\n".join(lines))
+def _make_auto_responder(responses: list[str]) -> Callable:
+    """Build a replacement for ``ask_user_choice`` that drains a response queue.
+    Args:
+        responses: FIFO list of ``"y"``/``"n"`` strings.
+    Returns:
+        A callable with the same signature as ``ask_user_choice``.
+    """
+    queue = list(responses)
+    def _responder(prompt: str, options: list[str], silent_no: bool = False) -> str | None:
+        if queue:
+            return queue.pop(0)
+        return "y"
+    return _responder
+def _make_input_responder(reject_reason: str = "incorrect results") -> Callable:
+    """Build a replacement for ``builtins.input`` that supplies canned text.
+    The first call returns the *reject_reason*. Subsequent calls return ``"n"`` to bail out of any further interactive prompts.
+    Args:
+        reject_reason: Text returned on the first ``input()`` call.
+    Returns:
+        A callable replacing the built-in ``input``.
+    """
+    call_count = {"n": 0}
+    def _fake_input(prompt: str = "") -> str:
+        call_count["n"] += 1
+        if call_count["n"] == 1:
+            return reject_reason
+        return "n"
+    return _fake_input
+@contextmanager
+def _pipeline_capture(
+    auto_responses: list[str],
+    reject_reason: str = "incorrect results",
+    csv_dir: str = "",
+):
+    """Context manager that patches interactive I/O for programmatic pipeline runs.
+    Replaces ``ask_user_choice`` with a FIFO auto-responder and ``builtins.input`` with a canned-text responder so that the pipeline runs without blocking on stdin. When *csv_dir* is set, ``save_result_csv`` is redirected so that the CSV file lands in that directory instead of the current working directory.
+    Args:
+        auto_responses: FIFO list of ``"y"``/``"n"`` strings for ``ask_user_choice``.
+        reject_reason: Canned rejection reason for ``input()`` prompts.
+        csv_dir: If non-empty, redirect ``results.csv`` writes into this directory.
+    Yields:
+        A dict ``{"logs": []}`` populated with captured log lines during the run.
+    """
+    capture: dict[str, Any] = {"logs": []}
+    responder = _make_auto_responder(auto_responses)
+    input_responder = _make_input_responder(reject_reason)
+    import text2sql.core_utils as _cu
+    import text2sql.dialect as _di
+    import text2sql.expansion_ops as _eo
+    import text2sql.intent_expr as _ie
+    import text2sql.intent_process as _ip
+    import text2sql.intent_repair as _ir
+    import text2sql.intent_resolve as _irs
+    import text2sql.main_execution as _me
+    import text2sql.pipeline as _pl
+    import text2sql.qsim_ops as _qo
+    import text2sql.qsim_sample as _qs
+    import text2sql.qsim_struct as _qst
+    import text2sql.schema as _sc
+    import text2sql.schema_profiling as _sp
+    import text2sql.simulator as _sim
+    import text2sql.sql_gen as _sg
+    import text2sql.templates as _tp
+    import text2sql.utils as _ut
+    import text2sql.validation_agg as _va
+    import text2sql.validation_execute as _ve
+    import text2sql.validation_schema as _vs
+    import text2sql.validation_semantic as _vsm
+    original_log = _cu.log
+    original_debug = _cu.debug
+    def _capturing_log(msg: str) -> None:
+        capture["logs"].append(f"[LOG] {msg}")
+        original_log(msg)
+    def _capturing_debug(msg: str) -> None:
+        capture["logs"].append(f"[DEBUG] {msg}")
+        original_debug(msg)
+    _debug_modules = [
+        _cu,
+        _pl,
+        _sg,
+        _va,
+        _ve,
+        _vs,
+        _vsm,
+        _ie,
+        _ip,
+        _ir,
+        _irs,
+        _di,
+        _eo,
+        _ut,
+        _tp,
+        _sc,
+        _sp,
+        _qs,
+        _qst,
+        _qo,
+        _me,
+        _sim,
+    ]
+    _log_modules = [_cu, _pl, _me, _eo, _sim]
+    extra_patches: list[Any] = []
+    for mod in _debug_modules:
+        if hasattr(mod, "debug"):
+            extra_patches.append(patch.object(mod, "debug", _capturing_debug))
+    for mod in _log_modules:
+        if hasattr(mod, "log"):
+            extra_patches.append(patch.object(mod, "log", _capturing_log))
+    if csv_dir:
+        _original_save = _pl.save_result_csv
+        def _redirected_save(rows, intent, sql):
+            orig_cwd = os.getcwd()
+            try:
+                os.chdir(csv_dir)
+                _original_save(rows, intent, sql)
+            finally:
+                os.chdir(orig_cwd)
+        import text2sql.live_testing as _lt
+        extra_patches.append(patch.object(_pl, "save_result_csv", _redirected_save))
+        extra_patches.append(patch.object(_lt, "save_result_csv", _redirected_save))
+    with (
+        patch.object(_cu, "ask_user_choice", responder),
+        patch.object(_pl, "ask_user_choice", responder),
+        patch.object(_me, "ask_user_choice", responder),
+        patch("builtins.input", input_responder),
+    ):
+        for p in extra_patches:
+            p.start()
+        try:
+            yield capture
+        finally:
+            for p in extra_patches:
+                p.stop()
+def _extract_reuse_sql(tmpl: Any, q_norm: str) -> str:
+    """Reconstruct the final SQL that ``handle_direct_sql_reuse`` would produce."""
+    vh = tmpl.value_history
+    matched_params: dict[str, str] = {}
+    for i, hq in enumerate(vh.questions):
+        if hq and q_norm == hq:
+            matched_params = dict(vh.param_values[i])
+            break
+    if matched_params:
+        return substitute_params(tmpl.sql_param, matched_params)
+    return tmpl.sql_param
+def _build_reuse_intent(tmpl: Any) -> RuntimeIntent:
+    """Build a lightweight ``RuntimeIntent`` from a template's intent signature."""
+    sig = tmpl.intent_signature
+    return RuntimeIntent(
+        tables=sig.tables or [],
+        grain=sig.grain or "row_level",
+        select_cols=sig.select_cols or [],
+        group_by_cols=sig.group_by_cols or [],
+        order_by_cols=sig.order_by_cols or [],
+        filters_param=sig.filters_param or [],
+        having_param=getattr(sig, "having_param", None) or [],
+        column_map=getattr(sig, "column_map", None) or {},
+        natural_language="",
+    )
+def _run_pipeline_core(
+    question: str,
+    schema: Any,
+    store: dict[str, Any],
+    templates: dict,
+    rejected: dict,
+    schema_terms: set[str],
+    feedback: str,
+    captured_logs: list[str],
+    reject_reason: str = "",
+) -> StepResult:
+    """Execute the pipeline steps for a single question and return captured state.
+    This mirrors the control flow of ``interactive_run_once`` but accepts all arguments programmatically and returns a ``StepResult`` instead of printing.
+    The returned ``StepResult.status`` is one of ``"restricted"``, ``"invalid_question"``, ``"intent_parse_failed"``, ``"intent_rejected"``, ``"join_failed"``, ``"hard_blocked"``, ``"validation_failed"`` or ``"ok"``. ``scenario_id`` is left empty here; the caller fills it in.
+    Args:
+        question: Natural-language question string.
+        schema: Loaded ``SchemaGraph``.
+        store: Mutable template store dict.
+        templates: Accepted templates dict.
+        rejected: Rejected templates dict.
+        schema_terms: Set of schema term tokens.
+        feedback: Pre-determined ``"y"``/``"n"`` feedback value.
+        captured_logs: Mutable list to which log lines are appended.
+        reject_reason: When feedback is ``"n"``, the canned reason supplied to the pipeline (for recording on ``StepResult``).
+    Returns:
+        Populated ``StepResult``.
+    """
+    result = StepResult(scenario_id="", question=question, captured_logs=captured_logs)
+    # Bail out on restricted/invalid questions before any pipeline resources are loaded.
+    valid, query_type, corrected = validate_question(question)
+    if not valid:
+        result.status = "restricted" if query_type == "restricted" else "invalid_question"
+        return result
+    # Continue with the corrected text; result.question keeps the original input.
+    if corrected != question:
+        question = corrected
+    q_norm = normalize_question(question)
+    # NOTE(review): the passed-in resources are rebound to whatever load_pipeline_resources
+    # returns — presumably the same objects with defaults filled in; confirm.
+    dialect, engine, schema, store, templates, rejected, schema_terms = load_pipeline_resources(
+        schema, store, templates, rejected, schema_terms
+    )
+    tmpl_match = check_template_reuse(q_norm, templates)
+    result.reuse_type = tmpl_match.reuse_type
+    if tmpl_match.reuse_type == "direct_reuse":
+        # Redundant with the assignment just above (same value).
+        result.reuse_type = "direct_reuse"
+        result.template_id = tmpl_match.best_template.id if tmpl_match.best_template else None
+        result.generation_path = "direct_reuse"
+        handled = handle_direct_sql_reuse(
+            q_norm,
+            tmpl_match.best_template,
+            dialect,
+            store,
+            templates,
+            schema,
+            engine=engine,
+            existing_nl=None,
+        )
+        if handled:
+            # The reuse handler does not hand back SQL or an intent, so both are
+            # reconstructed from the template for the assertions.
+            result.status = "ok"
+            result.sql = _extract_reuse_sql(tmpl_match.best_template, q_norm)
+            result.intent = _build_reuse_intent(tmpl_match.best_template)
+            return result
+        # A falsy ``handled`` falls through to the regular path below.
+    # Use the intent supplied by the template match when there is one; otherwise ask the LLM.
+    intent = tmpl_match.intent
+    semantic_warnings: list[dict[str, Any]] = []
+    if intent is None:
+        parsed = parse_intent_via_llm(q_norm, schema, templates, store)
+        if parsed is None:
+            result.status = "intent_parse_failed"
+            return result
+        intent, semantic_warnings, llm_calls = parsed
+        result.llm_calls = llm_calls
+        debug(f"[live_testing] intent parsed: tables={intent.tables} grain={intent.grain} llm_calls={llm_calls}")
+        if llm_calls > 2:
+            debug(f"[live_testing] WARNING: intent parse required {llm_calls} LLM calls for: {q_norm}")
+    result.intent = intent
+    ikey = intent_key(intent)
+    # Flatten warnings to plain strings: dict entries contribute their "message" value.
+    result.semantic_warnings = [w.get("message", "") if isinstance(w, dict) else str(w) for w in semantic_warnings]
+    union_result = match_template_for_union(intent, templates)
+    matched_template = None
+    union_select_cols = None
+    cols_changed = False
+    has_union_match = union_result is not None
+    if union_result is not None:
+        matched_template, union_select_cols, cols_changed = union_result
+        # A union match overrides the reuse type recorded from check_template_reuse.
+        result.reuse_type = "intent_reuse" if cols_changed else "intent_direct_reuse"
+        debug(f"[live_testing] union match: template={matched_template.id if matched_template else None} cols_changed={cols_changed}")
+    # Overwrites "direct_reuse" when the direct-reuse attempt above did not complete.
+    result.generation_path = "union_match" if has_union_match else "fresh"
+    if not confirm_intent_with_user(
+        intent, store, semantic_warnings, has_union_match=has_union_match,
+    ):
+        result.status = "intent_rejected"
+        return result
+    join_candidates, cmap, cte_join_hints = generate_join_candidates(intent, schema)
+    if join_candidates is None:
+        # Persist the store before the early exit.
+        save_template_store(store)
+        result.status = "join_failed"
+        return result
+    (
+        hard_block_override,
+        hard_block_rejected_template,
+        matched_rejected_template,
+        proceed,
+    ) = check_and_handle_hard_block(rejected, ikey, intent)
+    result.hard_blocked = not proceed
+    if not proceed:
+        save_template_store(store)
+        result.status = "hard_blocked"
+        return result
+    sql, ok = generate_and_validate_sql(
+        q_norm,
+        intent,
+        schema,
+        join_candidates,
+        cmap,
+        dialect,
+        store,
+        engine=engine,
+        cte_join_hints=cte_join_hints,
+        matched_template=matched_template,
+        union_select_cols=union_select_cols,
+        cols_changed=cols_changed,
+    )
+    # Record the SQL even when validation failed so it is available for diagnosis.
+    result.sql = sql
+    if not ok:
+        debug(f"[live_testing] SQL validation failed for: {q_norm}")
+        result.status = "validation_failed"
+        result.validation_failed = True
+        return result
+    spark_sql = get_spark_sql_for_execution(
+        intent.sql_param or "",
+        dict(flatten_param_values(intent)),
+        schema,
+        intent,
+        dialect,
+    )
+    # A falsy spark_sql is passed on as None.
+    rows = execute_sql(
+        dialect,
+        sql,
+        spark_sql_for_execution=spark_sql if spark_sql else None,
+    )
+    # Redundant: result.sql already holds this value from the assignment above.
+    result.sql = sql
+    result.rows = rows
+    debug(f"[live_testing] SQL generated ({result.generation_path}): rows={len(rows) if rows else 0}")
+    conf = compute_final_metrics(sql, intent, schema, templates, join_candidates, store)
+    result.confidence = conf
+    ux_summary = display_final_results(q_norm, intent, sql, rows)
+    # At or above the auto-accept threshold the scenario's feedback is ignored and "y" is applied.
+    if conf >= PolicyConfig.FINAL_SQL_AUTO_ACCEPT_THRESHOLD:
+        effective_feedback = "y"
+    else:
+        effective_feedback = feedback
+    result.feedback = effective_feedback
+    # The CSV is written only for accepted results.
+    if effective_feedback == "y":
+        save_result_csv(rows, intent, sql)
+    reject_info = handle_user_feedback(
+        effective_feedback,
+        intent,
+        sql,
+        schema,
+        store,
+        templates,
+        rejected,
+        q_norm,
+        hard_block_override,
+        hard_block_rejected_template,
+        matched_rejected_template,
+        ux_summary,
+        dialect=dialect,
+    )
+    if effective_feedback == "n" and reject_info:
+        # The caller-supplied reason wins; the pipeline's own value is the fallback.
+        result.reject_reason_actual = reject_reason or reject_info.get("reject_reason")
+        result.classified_category = reject_info.get("category")
+        result.classified_reason = reject_info.get("normalized_reason")
+    # Status is "ok" whenever the run completed, including runs that ended in a rejection.
+    result.status = "ok"
+    return result
+class LiveTestRunner:
+    """Orchestrates single-scenario and sequence-scenario execution.
+    Holds pre-loaded pipeline resources and exposes :meth:`run` and :meth:`run_sequence` which wrap the pipeline in a capture context, execute the question or questions, and return ``StepResult`` objects ready for assertion.
+    Args:
+        schema: Profiled ``SchemaGraph``.
+        store: Mutable template store dict.
+        templates: Accepted templates dict.
+        rejected: Rejected templates dict.
+        schema_terms: Set of schema term tokens.
+        csv_dir: Directory for ``results.csv`` output (empty means current working directory).
+    """
+    def __init__(
+        self,
+        schema: Any,
+        store: dict[str, Any],
+        templates: dict,
+        rejected: dict,
+        schema_terms: set[str],
+        csv_dir: str = "",
+    ) -> None:
+        self.schema = schema
+        self.store = store
+        self.templates = templates
+        self.rejected = rejected
+        self.schema_terms = schema_terms
+        self.csv_dir = csv_dir
+    def run(self, scenario: Scenario, retries: int = 0) -> StepResult:
+        """Execute a single scenario against the live pipeline.
+        Args:
+            scenario: The ``Scenario`` to execute.
+            retries: Number of additional attempts on failure (0 means a single try).
+        Returns:
+            ``StepResult`` from the last attempt.
+        """
+        auto = scenario.auto_responses if scenario.auto_responses is not None else ["y", "y", "y"]
+        last_result: StepResult | None = None
+        for _ in range(1 + retries):
+            t0 = time.monotonic()
+            try:
+                with _pipeline_capture(list(auto), scenario.reject_reason, csv_dir=self.csv_dir) as cap:
+                    step = _run_pipeline_core(
+                        question=scenario.question,
+                        schema=self.schema,
+                        store=self.store,
+                        templates=self.templates,
+                        rejected=self.rejected,
+                        schema_terms=self.schema_terms,
+                        feedback=scenario.feedback,
+                        captured_logs=cap["logs"],
+                        reject_reason=scenario.reject_reason,
+                    )
+            except Exception:
+                step = StepResult(
+                    scenario_id=scenario.id,
+                    question=scenario.question,
+                    status="error",
+                    error=traceback.format_exc(),
+                    captured_logs=cap.get("logs", []) if "cap" in dir() else [],
+                )
+            step.scenario_id = scenario.id
+            step.duration_seconds = time.monotonic() - t0
+            last_result = step
+            if step.status == "ok":
+                break
+        return last_result
+    def run_sequence(self, seq: SequenceScenario, retries: int = 0) -> list[StepResult]:
+        """Execute an ordered list of scenarios sharing template state.
+        Args:
+            seq: The ``SequenceScenario`` whose steps are run in order.
+            retries: Per-step retry count passed to :meth:`run`.
+        Returns:
+            List of ``StepResult`` objects, one per step.
+        """
+        results: list[StepResult] = []
+        for idx, step_scenario in enumerate(seq.steps):
+            debug(
+                f"[LiveTestRunner.run_sequence] step {idx}/{len(seq.steps)} "
+                f"id={step_scenario.id} rejected_keys={sorted(self.rejected.keys())}"
+            )
+            step_with_seq = dc_replace(step_scenario, sequence_id=seq.id)
+            r = self.run(step_with_seq, retries=retries)
+            debug(f"[LiveTestRunner.run_sequence] step {idx} result: status={r.status} hard_blocked={r.hard_blocked}")
+            results.append(r)
+        return results
+def _assert_scenario(result: StepResult, expected: Expected, soft: SoftAssert | None = None) -> SoftAssert:
+    """Evaluate a ``StepResult`` against an ``Expected`` specification.
+    When *soft* is ``None`` a new ``SoftAssert`` is created. All applicable assertions are run and failures accumulated on the returned ``SoftAssert``.
+    Args:
+        result: The ``StepResult`` from a pipeline run.
+        expected: The ``Expected`` to assert against.
+        soft: Optional existing ``SoftAssert`` to append to.
+    Returns:
+        The ``SoftAssert`` instance (same object when one was passed in).
+    """
+    if soft is None:
+        soft = SoftAssert()
+    if expected.status is not None:
+        soft.check(
+            result.status == expected.status,
+            "status",
+            expected.status,
+            result.status,
+        )
+    elif expected.status_in is not None:
+        soft.check(
+            result.status in expected.status_in,
+            "status_in",
+            expected.status_in,
+            result.status,
+        )
+    if result.intent is not None:
+        actual_tables = sorted(result.intent.tables or [])
+        if expected.tables_one_of is not None:
+            allowed = [sorted(t) for t in expected.tables_one_of]
+            soft.check(
+                actual_tables in allowed,
+                "tables",
+                expected.tables_one_of,
+                actual_tables,
+            )
+        elif expected.tables is not None:
+            expected_tables = sorted(expected.tables)
+            soft.check(
+                actual_tables == expected_tables,
+                "tables",
+                expected_tables,
+                actual_tables,
+            )
+    if expected.grain is not None and result.intent is not None:
+        if isinstance(expected.grain, tuple):
+            soft.check(
+                result.intent.grain in expected.grain,
+                "grain",
+                expected.grain,
+                result.intent.grain,
+            )
+        else:
+            soft.check(
+                result.intent.grain == expected.grain,
+                "grain",
+                expected.grain,
+                result.intent.grain,
+            )
+    elif expected.grain_in is not None and result.intent is not None:
+        soft.check(
+            result.intent.grain in expected.grain_in,
+            "grain",
+            expected.grain_in,
+            result.intent.grain,
+        )
+    if expected.reuse_type is not None:
+        if isinstance(expected.reuse_type, tuple):
+            soft.check(
+                result.reuse_type in expected.reuse_type,
+                "reuse_type",
+                expected.reuse_type,
+                result.reuse_type,
+            )
+        else:
+            soft.check(
+                result.reuse_type == expected.reuse_type,
+                "reuse_type",
+                expected.reuse_type,
+                result.reuse_type,
+            )
+    sql_upper = (result.sql or "").upper()
+    if expected.contains_join is not None:
+        has_join = "JOIN" in sql_upper
+        soft.check(
+            has_join == expected.contains_join,
+            "contains_join",
+            expected.contains_join,
+            has_join,
+        )
+    if expected.contains_group_by is not None:
+        has_gb = "GROUP BY" in sql_upper
+        soft.check(
+            has_gb == expected.contains_group_by,
+            "contains_group_by",
+            expected.contains_group_by,
+            has_gb,
+        )
+    if expected.contains_cte is not None:
+        has_cte = sql_upper.lstrip().startswith("WITH ")
+        soft.check(
+            has_cte == expected.contains_cte,
+            "contains_cte",
+            expected.contains_cte,
+            has_cte,
+        )
+    if expected.sql_contains is not None and result.sql is not None:
+        for substr in expected.sql_contains:
+            found = substr.upper() in sql_upper
+            soft.check(found, "sql_contains", substr, f"not found in: {result.sql[:120]}")
+    if expected.sql_excludes is not None and result.sql is not None:
+        for substr in expected.sql_excludes:
+            found = substr.upper() in sql_upper
+            soft.check(
+                not found,
+                "sql_excludes",
+                f"absent: {substr}",
+                f"found in: {result.sql[:120]}",
+            )
+    if expected.min_rows is not None and result.rows is not None:
+        soft.check(
+            len(result.rows) >= expected.min_rows,
+            "min_rows",
+            expected.min_rows,
+            len(result.rows),
+        )
+    if expected.max_rows is not None and result.rows is not None:
+        soft.check(
+            len(result.rows) <= expected.max_rows,
+            "max_rows",
+            expected.max_rows,
+            len(result.rows),
+        )
+    if expected.min_confidence is not None and result.confidence is not None:
+        soft.check(
+            result.confidence >= expected.min_confidence,
+            "min_confidence",
+            expected.min_confidence,
+            result.confidence,
+        )
+    if expected.column_names_one_of is not None and result.rows is not None and result.intent is not None:
+        actual_cols = []
+        for c in result.intent.select_cols or []:
+            name = getattr(c, "alias", None) or c.expr.primary_term
+            actual_cols.append(name.split(".")[-1] if name and "." in name else (name or ""))
+        allowed = [sorted(cols) for cols in expected.column_names_one_of]
+        soft.check(
+            sorted(actual_cols) in allowed,
+            "column_names",
+            expected.column_names_one_of,
+            actual_cols,
+        )
+    if expected.row_value_check is not None and result.rows is not None:
+        check_ok = expected.row_value_check(result.rows)
+        soft.check(check_ok, "row_value_check", "True", check_ok)
+    if expected.min_semantic_warnings is not None:
+        soft.check(
+            len(result.semantic_warnings) >= expected.min_semantic_warnings,
+            "min_semantic_warnings",
+            expected.min_semantic_warnings,
+            len(result.semantic_warnings),
+        )
+    if expected.should_hard_block:
+        soft.check(
+            result.hard_blocked,
+            "should_hard_block",
+            True,
+            result.hard_blocked,
+        )
+    if expected.should_fail_validation:
+        soft.check(
+            result.validation_failed,
+            "should_fail_validation",
+            True,
+            result.validation_failed,
+        )
+    return soft
+def run_and_assert(
+    runner: LiveTestRunner,
+    scenario: Scenario,
+    header: str,
+    max_attempts: int = 2,
+    retries: int = 1,
+) -> None:
+    """Run a scenario and assert expectations, retrying from scratch on failure.
+    On the first attempt the pipeline is run and assertions are
+    checked.  When any assertion fails and *max_attempts* > 1 the
+    entire pipeline is re-executed from scratch and assertions are
+    re-evaluated on the fresh result.
+    Args:
+        runner: ``LiveTestRunner`` instance configured for the target
+            database.
+        scenario: The ``Scenario`` to execute.
+        header: Label used in the ``AssertionError`` message.
+        max_attempts: Total attempts including the initial run.
+        retries: Per-attempt pipeline retry count passed to
+            ``runner.run``.
+    """
+    last_soft: SoftAssert | None = None
+    for _ in range(max_attempts):
+        result = runner.run(scenario, retries=retries)
+        last_soft = _assert_scenario(result, scenario.expected)
+        if last_soft.passed:
+            return
+    if last_soft is not None:
+        last_soft.report(header=header)
+def run_sequence_and_assert(
+    runner: LiveTestRunner,
+    seq: SequenceScenario,
+    max_attempts: int = 2,
+    retries: int = 1,
+) -> None:
+    """Run a sequence of scenarios and assert each step, retrying on failure.
+    When any step's assertions fail and *max_attempts* > 1, the
+    entire sequence is re-executed from scratch.
+    Args:
+        runner: ``LiveTestRunner`` instance configured for the target
+            database.
+        seq: The ``SequenceScenario`` whose steps are run in order.
+        max_attempts: Total attempts including the initial run.
+        retries: Per-step pipeline retry count passed to
+            ``runner.run``.
+    """
+    last_soft: SoftAssert | None = None
+    for _ in range(max_attempts):
+        results = runner.run_sequence(seq, retries=retries)
+        last_soft = SoftAssert()
+        for step_scenario, result in zip(seq.steps, results):
+            _assert_scenario(result, step_scenario.expected, soft=last_soft)
+        if last_soft.passed:
+            return
+    if last_soft is not None:
+        last_soft.report(header=f"[{seq.id}]")