PyPI - touchstone-platform - Versions diffs - 1.0.2__py3-none-any.whl - Mend

touchstone-platform 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (313) hide show

app/__init__.py +1 -0
app/agent_tasks.py +34 -0
app/audit/__init__.py +4 -0
app/audit/ledger.py +186 -0
app/audit/normalizer.py +62 -0
app/audit/orchestrator.py +117 -0
app/audit/task_matcher.py +85 -0
app/auth.py +437 -0
app/billing.py +337 -0
app/catalog/__init__.py +1 -0
app/catalog/certify.py +109 -0
app/catalog/certify_store.py +175 -0
app/catalog/custom.py +198 -0
app/catalog/custom_scenario.py +92 -0
app/catalog/custom_store.py +101 -0
app/catalog/discover.py +375 -0
app/catalog/govern.py +175 -0
app/catalog/llm.py +166 -0
app/catalog/monitor.py +119 -0
app/catalog/monitor_store.py +132 -0
app/catalog/outcomes.py +169 -0
app/catalog/outcomes_store.py +141 -0
app/catalog/pipeline.py +362 -0
app/catalog/pipeline_store.py +206 -0
app/catalog/plan.py +391 -0
app/catalog/plan_store.py +117 -0
app/catalog/portfolio.py +360 -0
app/catalog/prism.py +202 -0
app/catalog/prism_classifier.py +241 -0
app/catalog/prism_store.py +254 -0
app/catalog/ranking.py +250 -0
app/catalog/registry_link.py +37 -0
app/catalog/router_catalog.py +44 -0
app/catalog/router_certify.py +161 -0
app/catalog/router_custom.py +89 -0
app/catalog/router_discover.py +298 -0
app/catalog/router_govern.py +42 -0
app/catalog/router_monitor.py +53 -0
app/catalog/router_outcomes.py +174 -0
app/catalog/router_pipeline.py +127 -0
app/catalog/router_plan.py +117 -0
app/catalog/router_portfolio.py +30 -0
app/catalog/router_prism.py +298 -0
app/catalog/router_select.py +99 -0
app/catalog/schema.py +146 -0
app/catalog/select.py +261 -0
app/catalog/store.py +271 -0
app/certification.py +189 -0
app/connectors/base.py +36 -0
app/connectors/billing_aws.py +169 -0
app/connectors/billing_azure.py +162 -0
app/connectors/cloud.py +382 -0
app/connectors/factory.py +33 -0
app/connectors/git.py +156 -0
app/connectors/local.py +70 -0
app/connectors/log_cloudwatch.py +183 -0
app/connectors/manager.py +78 -0
app/database.py +409 -0
app/deepeval_integration.py +108 -0
app/engine.py +312 -0
app/env_compat.py +40 -0
app/eval/__init__.py +19 -0
app/eval/adapter.py +35 -0
app/eval/agentic_runner.py +100 -0
app/eval/aggregation.py +57 -0
app/eval/atr/__init__.py +9 -0
app/eval/atr/loader.py +171 -0
app/eval/attestation.py +217 -0
app/eval/attestation_pdf.py +190 -0
app/eval/attestation_revocation.py +180 -0
app/eval/config.py +28 -0
app/eval/cost.py +66 -0
app/eval/drift.py +158 -0
app/eval/drift_alerts.py +129 -0
app/eval/evidence_bundle.py +284 -0
app/eval/horizontal/__init__.py +12 -0
app/eval/horizontal/loader.py +406 -0
app/eval/inspect_export.py +278 -0
app/eval/leaderboard.py +44 -0
app/eval/leaderboards/__init__.py +0 -0
app/eval/leaderboards/agent_threat_safety.py +46 -0
app/eval/leaderboards/compliance_boards.py +205 -0
app/eval/leaderboards/genai_red_team.py +50 -0
app/eval/leaderboards/horizontal_boards.py +49 -0
app/eval/leaderboards/medhelm.py +89 -0
app/eval/leaderboards/medical_safety.py +45 -0
app/eval/leaderboards/modernization_agentic.py +79 -0
app/eval/leaderboards/modernization_classic.py +44 -0
app/eval/leaderboards/modernization_evidence.py +71 -0
app/eval/leaderboards/modernization_robustness.py +53 -0
app/eval/leaderboards/modernization_safety.py +53 -0
app/eval/leaderboards/schemas/agent_threat_safety.yaml +10 -0
app/eval/leaderboards/schemas/compliance_ca_ai_laws.yaml +46 -0
app/eval/leaderboards/schemas/compliance_cms_interop.yaml +22 -0
app/eval/leaderboards/schemas/compliance_gdpr.yaml +89 -0
app/eval/leaderboards/schemas/compliance_hipaa.yaml +33 -0
app/eval/leaderboards/schemas/compliance_hti_1.yaml +30 -0
app/eval/leaderboards/schemas/compliance_il_ai_laws.yaml +26 -0
app/eval/leaderboards/schemas/compliance_mitre_atlas.yaml +90 -0
app/eval/leaderboards/schemas/compliance_nyc_ll144.yaml +26 -0
app/eval/leaderboards/schemas/compliance_owasp_agentic.yaml +42 -0
app/eval/leaderboards/schemas/compliance_owasp_llm_top10.yaml +90 -0
app/eval/leaderboards/schemas/compliance_pci_dss.yaml +37 -0
app/eval/leaderboards/schemas/compliance_sox.yaml +61 -0
app/eval/leaderboards/schemas/genai_red_team.yaml +11 -0
app/eval/leaderboards/schemas/horizontal_customer_support.yaml +10 -0
app/eval/leaderboards/schemas/horizontal_finance_ap.yaml +10 -0
app/eval/leaderboards/schemas/horizontal_hr_assist.yaml +10 -0
app/eval/leaderboards/schemas/horizontal_it_servicedesk.yaml +10 -0
app/eval/leaderboards/schemas/horizontal_procurement.yaml +10 -0
app/eval/leaderboards/schemas/horizontal_sales_ops.yaml +10 -0
app/eval/leaderboards/schemas/medhelm_modernization.yaml +46 -0
app/eval/leaderboards/schemas/medical_safety.yaml +10 -0
app/eval/leaderboards/schemas/modernization_agentic.yaml +31 -0
app/eval/leaderboards/schemas/modernization_classic.yaml +42 -0
app/eval/leaderboards/schemas/modernization_evidence.yaml +48 -0
app/eval/leaderboards/schemas/modernization_robustness.yaml +45 -0
app/eval/leaderboards/schemas/modernization_safety.yaml +30 -0
app/eval/leaderboards/schemas/swe_bench_verified.yaml +10 -0
app/eval/leaderboards/schemas/vertical_edu_k12.yaml +8 -0
app/eval/leaderboards/schemas/vertical_fs_insurance_pc.yaml +9 -0
app/eval/leaderboards/schemas/vertical_fs_payments.yaml +8 -0
app/eval/leaderboards/schemas/vertical_fs_retail_banking.yaml +9 -0
app/eval/leaderboards/schemas/vertical_hcls_ambulatory.yaml +9 -0
app/eval/leaderboards/schemas/vertical_hcls_lab.yaml +9 -0
app/eval/leaderboards/schemas/vertical_hcls_medtech.yaml +9 -0
app/eval/leaderboards/schemas/vertical_hcls_payer.yaml +8 -0
app/eval/leaderboards/schemas/vertical_hcls_pbm.yaml +9 -0
app/eval/leaderboards/schemas/vertical_hcls_pharma.yaml +9 -0
app/eval/leaderboards/schemas/vertical_hitech_semi.yaml +8 -0
app/eval/leaderboards/schemas/vertical_hitech_software.yaml +9 -0
app/eval/leaderboards/schemas/vertical_industry_energy.yaml +9 -0
app/eval/leaderboards/schemas/vertical_industry_pubsec.yaml +9 -0
app/eval/leaderboards/schemas/vertical_retail_apparel.yaml +9 -0
app/eval/leaderboards/schemas/vertical_retail_bigbox.yaml +9 -0
app/eval/leaderboards/schemas/vertical_retail_cpg_brands.yaml +8 -0
app/eval/leaderboards/schemas/vertical_retail_cstore.yaml +9 -0
app/eval/leaderboards/schemas/vertical_retail_grocery.yaml +9 -0
app/eval/leaderboards/swe_bench.py +58 -0
app/eval/leaderboards/vertical_boards.py +71 -0
app/eval/metric.py +51 -0
app/eval/metrics/__init__.py +0 -0
app/eval/metrics/agentic/__init__.py +1 -0
app/eval/metrics/agentic/action_sequence.py +103 -0
app/eval/metrics/agentic/replan_count.py +36 -0
app/eval/metrics/agentic/step_success_rate.py +37 -0
app/eval/metrics/agentic/task_completion.py +90 -0
app/eval/metrics/agentic/trajectory_length.py +27 -0
app/eval/metrics/code/__init__.py +1 -0
app/eval/metrics/code/patch_execution.py +111 -0
app/eval/metrics/code/patch_validity.py +142 -0
app/eval/metrics/compliance/__init__.py +7 -0
app/eval/metrics/compliance/control_match.py +231 -0
app/eval/metrics/evidence/__init__.py +6 -0
app/eval/metrics/evidence/audit_ledger_integrity.py +63 -0
app/eval/metrics/evidence/citation_f1.py +58 -0
app/eval/metrics/evidence/hallucination_rate.py +35 -0
app/eval/metrics/evidence/kg_groundedness.py +57 -0
app/eval/metrics/evidence/lineage_f1.py +47 -0
app/eval/metrics/exact_match.py +12 -0
app/eval/metrics/hcls/__init__.py +1 -0
app/eval/metrics/hcls/bertscore.py +139 -0
app/eval/metrics/hcls/jury_score.py +111 -0
app/eval/metrics/hcls/phi_leak.py +101 -0
app/eval/metrics/judge_rubric.py +89 -0
app/eval/metrics/robustness_delta.py +25 -0
app/eval/metrics/rouge.py +114 -0
app/eval/metrics/safety/__init__.py +1 -0
app/eval/metrics/safety/atr_detection.py +57 -0
app/eval/metrics/safety/bias.py +181 -0
app/eval/metrics/safety/bias_extended.py +185 -0
app/eval/metrics/safety/medical_red_team.py +77 -0
app/eval/metrics/safety/redteam_detection.py +83 -0
app/eval/metrics/safety/wmdp_score.py +67 -0
app/eval/perturbations/__init__.py +7 -0
app/eval/perturbations/base.py +31 -0
app/eval/perturbations/comment_noise.py +23 -0
app/eval/perturbations/dead_code_injection.py +26 -0
app/eval/perturbations/dialect_drift.py +23 -0
app/eval/perturbations/identifier_mangling.py +36 -0
app/eval/perturbations/partial_copybook.py +22 -0
app/eval/perturbations/registry.py +19 -0
app/eval/perturbed_scenario.py +35 -0
app/eval/redteam/__init__.py +8 -0
app/eval/redteam/loader.py +289 -0
app/eval/request_cache.py +169 -0
app/eval/risk_tier.py +134 -0
app/eval/run_spec.py +29 -0
app/eval/runner.py +107 -0
app/eval/runners/__init__.py +5 -0
app/eval/runners/swe_bench_docker.py +336 -0
app/eval/runners/tau_bench/__init__.py +57 -0
app/eval/runners/tau_bench/airline_state.py +154 -0
app/eval/runners/tau_bench/airline_tools.py +205 -0
app/eval/runners/tau_bench/golden.py +90 -0
app/eval/runners/tau_bench/retail_state.py +208 -0
app/eval/runners/tau_bench/runner.py +259 -0
app/eval/runners/tau_bench/tools.py +241 -0
app/eval/runspec_overrides.py +82 -0
app/eval/scenario.py +54 -0
app/eval/schema.py +28 -0
app/eval/snapshot.py +85 -0
app/eval/specs/__init__.py +0 -0
app/eval/specs/agentic_cobol_modernization.py +31 -0
app/eval/specs/atr_red_team.py +44 -0
app/eval/specs/cobol_billing_scenario.py +11 -0
app/eval/specs/cobol_lineage_scenario.py +10 -0
app/eval/specs/edelweiss_investing_scenario.py +10 -0
app/eval/specs/genai_red_team.py +51 -0
app/eval/specs/healthcare_data_scenario.py +10 -0
app/eval/specs/horizontal_agent_scenarios.py +99 -0
app/eval/specs/huggingface_dataset_scenario.py +357 -0
app/eval/specs/java_refactor_scenario.py +10 -0
app/eval/specs/medical_red_team.py +128 -0
app/eval/specs/sap_modernization_scenario.py +10 -0
app/eval/specs/swe_bench_scenarios.py +92 -0
app/eval/specs/tau_bench_scenarios.py +98 -0
app/eval/specs/vertical_scenarios.py +317 -0
app/eval/specs/wmdp_scenarios.py +106 -0
app/eval/specs/yaml_spec_scenario.py +53 -0
app/eval/trace.py +39 -0
app/eval/trajectory.py +43 -0
app/fitment.py +843 -0
app/frameworks/__init__.py +40 -0
app/frameworks/autogen_runner.py +40 -0
app/frameworks/base.py +29 -0
app/frameworks/crewai_runner.py +123 -0
app/frameworks/datagol_runner.py +36 -0
app/frameworks/direct_api.py +441 -0
app/frameworks/galileo_runner.py +88 -0
app/frameworks/langchain_runner.py +35 -0
app/frameworks/langgraph_runner.py +38 -0
app/frameworks/letta_runner.py +35 -0
app/frameworks/llamaindex_runner.py +38 -0
app/frameworks/mem0_runner.py +38 -0
app/frameworks/n8n_runner.py +37 -0
app/frameworks/openclawd_runner.py +84 -0
app/frameworks/semantic_kernel_runner.py +35 -0
app/frameworks/stackai_runner.py +35 -0
app/frameworks/superagi_runner.py +35 -0
app/frameworks/zep_runner.py +35 -0
app/git_utils.py +53 -0
app/health.py +132 -0
app/knowledge.py +894 -0
app/licensing.py +170 -0
app/main.py +1765 -0
app/models.py +151 -0
app/onboarding.py +274 -0
app/outputs.py +91 -0
app/parsers/apex.py +45 -0
app/parsers/asp.py +49 -0
app/parsers/base.py +20 -0
app/parsers/cobol.py +146 -0
app/parsers/config.py +35 -0
app/parsers/enterprise.py +101 -0
app/parsers/factory.py +76 -0
app/parsers/legacy_c.py +35 -0
app/parsers/modern.py +53 -0
app/parsers/sql.py +91 -0
app/pricing.py +46 -0
app/scoring.py +251 -0
app/security.py +206 -0
app/spec_loader.py +186 -0
app/spec_v1.py +192 -0
app/static/agent_catalog.json +1198 -0
app/static/ai_inventory.html +111 -0
app/static/app.js +2231 -0
app/static/audit.js +89 -0
app/static/auth.js +245 -0
app/static/catalog.html +55 -0
app/static/css/catalog.css +128 -0
app/static/css/leaderboards.css +50 -0
app/static/css/portfolio.css +218 -0
app/static/css/prism.css +187 -0
app/static/custom.html +137 -0
app/static/discover.html +133 -0
app/static/govern.html +183 -0
app/static/index.html +698 -0
app/static/instance_trace.html +18 -0
app/static/js/catalog.js +267 -0
app/static/js/custom.js +146 -0
app/static/js/discover.js +112 -0
app/static/js/govern.js +167 -0
app/static/js/instance_trace.js +28 -0
app/static/js/leaderboard_compare.js +31 -0
app/static/js/leaderboard_detail.js +62 -0
app/static/js/leaderboards.js +90 -0
app/static/js/monitor.js +192 -0
app/static/js/plan.js +180 -0
app/static/js/portfolio.js +205 -0
app/static/js/prism.js +399 -0
app/static/leaderboard_compare.html +18 -0
app/static/leaderboard_detail.html +22 -0
app/static/leaderboards/.gitkeep +0 -0
app/static/leaderboards/index.html +9 -0
app/static/leaderboards/modernization_classic.html +29 -0
app/static/leaderboards.html +17 -0
app/static/login.html +266 -0
app/static/logo_mark.png +0 -0
app/static/monitor.html +147 -0
app/static/onboarding.html +85 -0
app/static/onboarding.js +96 -0
app/static/plan.html +87 -0
app/static/portfolio.html +107 -0
app/static/prism.html +47 -0
app/static/styles.css +2290 -0
app/terraform.py +1028 -0
app/vector_stores.py +303 -0
touchstone_platform-1.0.2.dist-info/METADATA +512 -0
touchstone_platform-1.0.2.dist-info/RECORD +313 -0
touchstone_platform-1.0.2.dist-info/WHEEL +4 -0
touchstone_platform-1.0.2.dist-info/entry_points.txt +2 -0
touchstone_platform-1.0.2.dist-info/licenses/LICENSE.md +242 -0

app/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ # HuggingFace Model Benchmarking Gym

app/agent_tasks.py ADDED Viewed

@@ -0,0 +1,34 @@
+"""Agent tasks — spec-driven v2. All content comes from Gym Spec v1.0."""
+from __future__ import annotations
+from app.spec_loader import (
+    get_active_spec,
+    get_artifacts_info,
+    get_roles_info,
+    load_artifact_content,
+)
def get_artifacts_list() -> list[dict]:
    """Return the artifact info list produced by ``get_artifacts_info()``."""
    artifacts = get_artifacts_info()
    return artifacts
def get_artifact_content(name: str) -> str | None:
    """Return what ``load_artifact_content()`` yields for the artifact *name*.

    The result is passed through unchanged, so it is ``None`` whenever the
    loader returns ``None``.
    """
    content = load_artifact_content(name)
    return content
def get_all_roles_info() -> list[dict]:
    """Return the role info list produced by ``get_roles_info()``."""
    roles = get_roles_info()
    return roles
def get_tasks_for_role(agent_id: str) -> list:
    """Return the active spec's tasks whose ``agent_id`` equals *agent_id*.

    Tasks keep the order they have in ``spec.tasks``; the list is empty when
    no task belongs to the given agent.
    """
    selected = []
    for task in get_active_spec().tasks:
        if task.agent_id == agent_id:
            selected.append(task)
    return selected
def build_prompt(task) -> str:
    """Render a task's prompt with its required artifacts filled in.

    For each name in ``task.required_artifacts`` (in order) every
    ``{name}`` placeholder in ``task.prompt`` is replaced with the loaded
    artifact content.  When the loader returns ``None`` or an empty string the
    placeholder is replaced with a ``[Artifact <name> not found]`` marker
    instead.

    Replacements are applied one after another on the evolving text, so a
    placeholder that appears inside already-inserted content is expanded too
    if its artifact comes later in the list.
    """
    rendered = task.prompt
    for name in task.required_artifacts:
        body = load_artifact_content(name)
        replacement = body if body else f"[Artifact {name} not found]"
        rendered = rendered.replace(f"{{{name}}}", replacement)
    return rendered

app/audit/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+from .ledger import AuditLedger, audit_ledger
+from .normalizer import InvocationRecord, normalize_artifact, normalize_batch
+from .orchestrator import AuditOrchestrator
+from .task_matcher import TaskMatcher

app/audit/ledger.py ADDED Viewed

@@ -0,0 +1,186 @@
+"""Immutable audit ledger for SOC2 compliance."""
+import hashlib
+import hmac
+import json
+import logging
+import os
+import time
+from pathlib import Path
+logger = logging.getLogger(__name__)
+AUDIT_LOG_FILE = os.environ.get(
+    "MODEL_GYM_AUDIT_LOG",
+    os.path.join("scratch", "audit", "audit_ledger.jsonl"),
+)
+# ---------------------------------------------------------------------------
+# HMAC secret key
+# The key is loaded from the AUDIT_HMAC_SECRET environment variable.
+# In production, set this to a strong random value (e.g. 32+ random bytes,
+# base64-encoded).  If the variable is absent we fall back to a well-known
+# dev-only placeholder and emit a warning — never use this in production.
+# ---------------------------------------------------------------------------
+_SECRET_FROM_ENV: str | None = os.environ.get("AUDIT_HMAC_SECRET")
+_IS_PRODUCTION: bool = os.environ.get("MODEL_GYM_ENV", "development").lower() == "production"
+if _SECRET_FROM_ENV:
+    AUDIT_HMAC_KEY: bytes = _SECRET_FROM_ENV.encode()
+elif _IS_PRODUCTION:
+    raise RuntimeError(
+        "AUDIT_HMAC_SECRET must be set in production. "
+        "Generate with: python -c \"import secrets,base64; print(base64.b64encode(secrets.token_bytes(32)).decode())\""
+    )
+else:
+    logger.warning(
+        "AUDIT_HMAC_SECRET env var not set — using insecure dev-only key. "
+        "Set TOUCHSTONE_ENV=production (or MODEL_GYM_ENV=production) to enforce this at startup."
+    )
+    AUDIT_HMAC_KEY = b"dev-only-secret"
def _compute_hmac(previous_hash: str, event_json_bytes: bytes) -> str:
    """Sign one ledger entry together with the hash of its predecessor.

    The message is the encoded *previous_hash* immediately followed by
    *event_json_bytes*; it is authenticated with HMAC-SHA256 under
    ``AUDIT_HMAC_KEY`` and returned as a hex digest.
    """
    signer = hmac.new(AUDIT_HMAC_KEY, digestmod=hashlib.sha256)
    # Feeding the two parts separately is equivalent to signing their
    # concatenation.
    signer.update(previous_hash.encode())
    signer.update(event_json_bytes)
    return signer.hexdigest()
class AuditLedger:
    """Append-only, HMAC-signed, hash-linked audit log stored as JSON Lines.

    Every line of the log file is one JSON object with the keys ``seq``,
    ``timestamp``, ``action``, ``actor``, ``run_id``, ``payload``,
    ``previous_hash`` and ``hash``.  ``hash`` is the HMAC of the previous
    entry's hash plus the entry's own JSON (keys sorted), and the first entry
    links to the literal ``"GENESIS"``; ``verify_integrity()`` walks that chain
    from the top of the file.

    NOTE(review): the chain is only checked forwards from the first line, so
    dropping whole lines from the *end* of the file still verifies.  No
    locking is done here either — confirm that callers never append from
    several threads or processes at once.
    """

    # ``previous_hash`` value of the very first entry.
    _GENESIS = "GENESIS"
    # Number of bytes read per step while scanning backwards for the last line.
    _TAIL_CHUNK = 4096

    def __init__(self, log_file: str = AUDIT_LOG_FILE):
        """Open the ledger at *log_file*, creating the file if it is missing."""
        self.log_file = Path(log_file)
        self._ensure_log_exists()
        # Cache of the chain tail: restored from the last line on startup and
        # updated after every append.
        self._last_hash: str = self._GENESIS
        self._last_seq: int = 0
        self._restore_cache()

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _ensure_log_exists(self):
        """Create the log file and its parent directories when absent."""
        self.log_file.parent.mkdir(parents=True, exist_ok=True)
        if not self.log_file.exists():
            self.log_file.touch()

    def _read_last_line(self) -> str:
        """Return the last non-empty line of the log without reading it all.

        The file is read backwards in ``_TAIL_CHUNK``-byte steps until a
        newline is found in front of the final line, so the cost depends on
        the length of that line rather than on the size of the ledger.
        Trailing newlines (including stray blank lines) are ignored.
        """
        with open(self.log_file, "rb") as fh:
            fh.seek(0, os.SEEK_END)
            remaining = fh.tell()
            tail = b""
            while remaining > 0:
                step = min(self._TAIL_CHUNK, remaining)
                remaining -= step
                fh.seek(remaining)
                tail = fh.read(step) + tail
                body = tail.rstrip(b"\r\n")
                cut = body.rfind(b"\n")
                if cut != -1:
                    return body[cut + 1:].decode("utf-8", errors="replace").strip()
        # No separating newline found: the whole file is a single line.
        return tail.decode("utf-8", errors="replace").strip()

    def _restore_cache(self):
        """Prime ``_last_hash`` / ``_last_seq`` from the last ledger line.

        A missing, empty or unreadable tail leaves the defaults in place;
        ``verify_integrity()`` is the tool that reports such corruption.
        """
        if not self.log_file.exists() or self.log_file.stat().st_size == 0:
            return
        last_line = self._read_last_line()
        if not last_line:
            return
        try:
            data = json.loads(last_line)
        except json.JSONDecodeError:
            logger.warning(
                "Last line of audit ledger %s is not valid JSON; chain tail not restored.",
                self.log_file,
            )
            return
        if not isinstance(data, dict):
            # A JSON scalar or array cannot be a ledger entry.
            return
        last_hash = data.get("hash")
        last_seq = data.get("seq")
        # Only accept values log_action() can build on: the hash is encoded
        # and the sequence number is incremented.
        if isinstance(last_hash, str):
            self._last_hash = last_hash
        if isinstance(last_seq, int):
            self._last_seq = last_seq

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def log_action(self, action: str, actor: str, payload: dict, run_id: str | None = None):
        """Append an HMAC-signed, hash-linked entry and return its hash.

        Args:
            action: Name of the action being recorded.
            actor: Who performed the action.
            payload: JSON-serialisable details of the event.
            run_id: Optional run identifier stored with the entry.

        Returns:
            The hex HMAC of the new entry, which becomes the
            ``previous_hash`` of the next one.

        Raises:
            TypeError: If *payload* cannot be serialised to JSON; nothing is
                written in that case.
        """
        previous_hash = self._last_hash
        seq = self._last_seq + 1
        entry = {
            "seq": seq,
            "timestamp": time.time(),
            "action": action,
            "actor": actor,
            "run_id": run_id,
            "payload": payload,
            "previous_hash": previous_hash,
        }
        # Sign the entry in the form verify_integrity() will read back.  JSON
        # turns non-string dict keys into strings, which can change the sorted
        # key order; signing the pre-round-trip form would make an untouched
        # entry fail verification.
        entry = json.loads(json.dumps(entry))
        event_json_bytes = json.dumps(entry, sort_keys=True).encode()
        entry_hash = _compute_hmac(previous_hash, event_json_bytes)
        with open(self.log_file, "a", encoding="utf-8") as f:
            f.write(json.dumps({**entry, "hash": entry_hash}) + "\n")
        # Update cache only after the line has been written.
        self._last_hash = entry_hash
        self._last_seq = seq
        return entry_hash

    # Keep the old method name as an alias for backward compatibility.
    def log_event(self, action: str, actor: str, payload: dict, run_id: str | None = None):
        """Alias of ``log_action()`` kept for older callers."""
        return self.log_action(action, actor, payload, run_id=run_id)

    def verify_integrity(self) -> bool:
        """Return True when every entry's link and HMAC check out.

        An absent or empty log counts as intact.  Any line that is not a JSON
        object, lacks a string ``hash``, does not link to the entry before it,
        or whose recomputed HMAC differs makes the result False.
        """
        if not self.log_file.exists() or self.log_file.stat().st_size == 0:
            return True
        expected_prev_hash = self._GENESIS
        # errors="replace": undecodable bytes become U+FFFD, which then fails
        # the JSON or HMAC check instead of raising out of this method.
        with open(self.log_file, encoding="utf-8", errors="replace") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    return False
                if not isinstance(data, dict):
                    return False
                stored_hash = data.pop("hash", None)
                if not isinstance(stored_hash, str):
                    return False
                # Verify the previous-hash link.
                if data.get("previous_hash") != expected_prev_hash:
                    return False
                # Recompute HMAC over the entry without its own hash.
                event_json_bytes = json.dumps(data, sort_keys=True).encode()
                recomputed_hash = _compute_hmac(expected_prev_hash, event_json_bytes)
                # Constant-time comparison; bytes are compared because
                # compare_digest() rejects non-ASCII str arguments.
                if not hmac.compare_digest(stored_hash.encode(), recomputed_hash.encode()):
                    return False
                expected_prev_hash = stored_hash
        return True

    def get_logs(self, limit: int = 100) -> list[dict]:
        """Return up to *limit* of the most recent entries, oldest first.

        A *limit* of zero or less yields an empty list (a plain ``[-limit:]``
        slice would return the whole log for ``limit=0``).

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
        """
        if limit <= 0 or not self.log_file.exists():
            return []
        logs = []
        with open(self.log_file, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line:
                    logs.append(json.loads(line))
        return logs[-limit:]


# Shared ledger bound to AUDIT_LOG_FILE.  Constructing it creates the log file
# (and parent directories) at import time when they do not exist yet.
audit_ledger = AuditLedger()

app/audit/normalizer.py ADDED Viewed

@@ -0,0 +1,62 @@
+from dataclasses import dataclass, field
+from typing import Any
@dataclass
class InvocationRecord:
    """One LLM invocation in a provider-independent shape.

    Attributes:
        uid: Taken from the connector artifact's ``uid``.
        provider: ``"aws_bedrock"``, ``"azure_openai"`` or ``"local"``.
        model_id: Identifier of the model that was invoked.
        prompt: Prompt text of the invocation.
        completion: Completion text of the invocation.
        timestamp_iso: Timestamp string of the invocation.
        input_tokens: Input token count (default 0).
        output_tokens: Output token count (default 0).
        cost_usd: Cost in USD (default 0.0).
        latency_ms: Latency in milliseconds (default 0).
        task_id: Populated during matching; empty until then.
        task_title: Populated during matching; empty until then.
        role: Populated during matching; empty until then.
        metadata: The raw source dict.
    """

    # Required fields, in positional order.
    uid: str
    provider: str
    model_id: str
    prompt: str
    completion: str
    timestamp_iso: str

    # Usage figures.
    input_tokens: int = 0
    output_tokens: int = 0
    cost_usd: float = 0.0
    latency_ms: int = 0

    # Task-matching results.
    task_id: str = ""
    task_title: str = ""
    role: str = ""

    # Each record gets its own dict.
    metadata: dict[str, Any] = field(default_factory=dict)
def normalize_artifact(artifact: dict[str, Any]) -> InvocationRecord | None:
    """Convert a connector artifact dict to an ``InvocationRecord``.

    Args:
        artifact: Raw connector artifact.  ``uid``, ``model_id`` and
            ``timestamp_iso`` are required for invocation logs; all other
            keys are optional.

    Returns:
        The normalized record, or ``None`` when ``artifact["type"]`` is not
        ``"invocation_log"`` or the prompt is missing, ``None`` or blank.

    Raises:
        KeyError: If an invocation log with a non-empty prompt lacks
            ``uid``, ``model_id`` or ``timestamp_iso``.
    """
    if artifact.get("type") != "invocation_log":
        return None

    # Prompt and completion are cut to this many characters.
    max_chars = 8000

    # A key that is present but None must behave like a missing key; calling
    # .strip() on it would raise and str(None) would store the text "None".
    raw_prompt = artifact.get("prompt")
    prompt = "" if raw_prompt is None else str(raw_prompt).strip()
    if not prompt:
        return None
    raw_completion = artifact.get("completion")
    completion = "" if raw_completion is None else str(raw_completion)

    return InvocationRecord(
        uid=artifact["uid"],
        provider=artifact.get("provider") or "unknown",
        model_id=artifact["model_id"],
        prompt=prompt[:max_chars],
        completion=completion[:max_chars],
        timestamp_iso=artifact["timestamp_iso"],
        # "or" keeps explicit None values out of the numeric fields, which
        # are summed later on.
        input_tokens=artifact.get("input_tokens") or 0,
        output_tokens=artifact.get("output_tokens") or 0,
        cost_usd=artifact.get("cost_usd") or 0.0,
        latency_ms=artifact.get("latency_ms") or 0,
        metadata=artifact,
    )
def normalize_batch(artifacts: list[dict[str, Any]]) -> list[InvocationRecord]:
    """Normalize many connector artifacts at once.

    Artifacts that ``normalize_artifact()`` rejects are dropped.  The
    remaining records are returned sorted by their ``timestamp_iso`` string,
    ascending; records with equal timestamps keep their input order.
    """
    candidates = (normalize_artifact(item) for item in artifacts)
    kept = [rec for rec in candidates if rec]
    # Chronological order, relying on plain string comparison of the stamps.
    return sorted(kept, key=lambda rec: rec.timestamp_iso)

app/audit/orchestrator.py ADDED Viewed

@@ -0,0 +1,117 @@
+import datetime
+import logging
+from typing import Any
+from app.audit.normalizer import InvocationRecord, normalize_batch
+from app.audit.task_matcher import TaskMatcher
+from app.connectors.factory import ConnectorFactory
+from app.spec_v1 import GymSpecV1
logger = logging.getLogger(__name__)


class AuditOrchestrator:
    """Main orchestration engine for Audit Mode.

    Ties together the four audit stages: ingestion of invocation logs through
    a connector, normalization into ``InvocationRecord`` objects, matching of
    records to spec tasks, and scoring of the already-recorded completions.
    """

    def __init__(self, spec: GymSpecV1, scorer: Any = None):
        """Bind the orchestrator to an audit-mode spec.

        Args:
            spec: Gym spec; its ``mode`` must be ``"audit"``.
            scorer: Optional object exposing
                ``score_task(task_id=..., prompt=..., completion=...)``
                (expected to be a ScoringEngine instance).

        Raises:
            ValueError: If ``spec.mode`` is not ``"audit"``.
        """
        self.spec = spec
        if spec.mode != "audit":
            raise ValueError("GymSpec is not in audit mode")
        self.matcher = TaskMatcher(spec)
        self.scorer = scorer
        self.incumbent_config = spec.audit.incumbent

    def run_audit(self) -> dict[str, Any]:
        """Execute the full audit workflow and return the report dict.

        The report contains ``spec_id``, a UTC ``timestamp``, ``status``, a
        ``summary`` (invocation counts, average score, total cost and cost per
        scored record), the per-record ``results`` and the per-task match
        counts under ``tasks``.  ``total_cost_usd`` covers every normalized
        record, matched or not.
        """
        logger.info("Starting audit for %s", self.spec.gym.name)

        # 1. Ingestion
        artifacts = self._ingest_logs()
        # 2. Normalization
        records = normalize_batch(artifacts)
        # 3. Task matching (fills task fields on the records in place)
        self.matcher.match_batch(records)
        stats = self.matcher.match_stats(records)
        # 4. Replay scoring (scoring existing outputs)
        results = self._score_matched(records)

        scored_count = len(results)
        total_score = sum((row["score"] for row in results), 0.0)
        avg_score = total_score / scored_count if scored_count > 0 else 0.0
        total_cost = sum(r.cost_usd for r in records)

        return {
            "spec_id": self.spec.gym.id,
            "timestamp": datetime.datetime.now(datetime.UTC).isoformat(),
            "status": "completed",
            "summary": {
                "total_invocations": len(records),
                "matched_invocations": stats["matched"],
                "scored_invocations": scored_count,
                "average_score": round(avg_score, 4),
                "total_cost_usd": round(total_cost, 4),
                "cost_per_matched_outcome": round(total_cost / scored_count, 4) if scored_count > 0 else 0.0,
            },
            "results": results,
            "tasks": stats["by_task"],
        }

    def _score_matched(self, records: list[InvocationRecord]) -> list[dict[str, Any]]:
        """Score every record that was matched to a task.

        Records without a ``task_id`` are skipped, as are records whose
        scoring returned nothing (a failed or empty score result).
        """
        rows: list[dict[str, Any]] = []
        for record in records:
            if not record.task_id:
                continue
            score_result = self._score_record(record)
            if not score_result:
                continue
            rows.append({
                "record_uid": record.uid,
                "task_id": record.task_id,
                "score": score_result.get("overall_score", 0.0),
                "cost": record.cost_usd,
                "dimensions": score_result.get("dimensions", {}),
            })
        return rows

    def _ingest_logs(self) -> list[dict[str, Any]]:
        """Pull raw artifacts from the connector configured for the incumbent."""
        conn_id = self.incumbent_config.connection_id or "audit_source"
        # Build connector config from incumbent settings
        config = {
            "provider": self.incumbent_config.provider,
            "model_id": self.incumbent_config.model_id,
            "region_name": self.incumbent_config.region,
            "log_group_name": self.incumbent_config.log_group,
            "start_time_iso": self.incumbent_config.billing_start,
            "end_time_iso": self.incumbent_config.billing_end,
        }
        connector = ConnectorFactory.get_connector(conn_id, config)
        return connector.list_artifacts()

    def _score_record(self, record: InvocationRecord) -> dict[str, Any] | None:
        """Score a single invocation using the configured scorer.

        Returns:
            The scorer's result dict; a zero-score placeholder when no scorer
            was supplied (for testing/demo); or ``None`` when the scorer
            raised, in which case the failure is logged with its traceback.
        """
        if not self.scorer:
            # Fallback mock score if no scorer provided (for testing/demo).
            # Note it is non-empty, so such records still count as scored.
            return {"overall_score": 0.0, "dimensions": {}}
        try:
            return self.scorer.score_task(
                task_id=record.task_id,
                prompt=record.prompt,
                completion=record.completion,
            )
        except Exception:
            # Best effort: one failing record must not abort the whole audit.
            logger.exception("Scoring failed for record %s", record.uid)
            return None

app/audit/task_matcher.py ADDED Viewed

@@ -0,0 +1,85 @@
+import logging
+from typing import Any
+from app.audit.normalizer import InvocationRecord
+from app.spec_v1 import GymSpecV1
+from app.vector_stores import LexicalVectorStore
logger = logging.getLogger(__name__)


class TaskMatcher:
    """Match raw LLM invocation prompts to GymSpec task definitions.

    Each task of the spec is indexed in a ``LexicalVectorStore`` as
    ``"{task.name} {task.prompt}"``; an invocation is assigned to the task
    whose indexed text scores best against the invocation's prompt, provided
    that score reaches ``min_score``.
    """

    # Key under which all task documents are stored in the vector store.
    _INDEX_RUN_ID = "task_index"
    # Lowest accepted similarity score; kept moderately high to avoid false
    # positives.  Instances may override it through ``__init__``.
    min_score: float = 0.3

    def __init__(self, spec: GymSpecV1, min_score: float = 0.3):
        """Build the task index for *spec*.

        Args:
            spec: Gym spec whose ``tasks`` are indexed.
            min_score: Best-match scores strictly below this value are
                treated as "no match"; a score equal to it is accepted.
        """
        self.spec = spec
        self.tasks = spec.tasks
        self.min_score = min_score
        self.vector_store = LexicalVectorStore(chunk_size=200, overlap=20)
        # Build index of task prompts
        # Index format: "{task.name} {task.prompt}"
        for task in self.tasks:
            self.vector_store.add_document(
                run_id=self._INDEX_RUN_ID,
                content=f"{task.name} {task.prompt}",
                metadata={
                    "task_id": task.id,
                    "name": task.name,
                    "agent_id": task.agent_id,
                },
            )

    def match(self, record: InvocationRecord, top_k: int = 1) -> str | None:
        """Match a single record to a task by prompt similarity.

        On success the record's ``task_id``, ``task_title`` and ``role`` are
        filled in from the best hit's metadata and the task id is returned.
        When the search finds nothing, or the best hit scores below
        ``min_score``, the record is left untouched and ``None`` is returned.
        Only the first search result is considered, whatever *top_k* is.
        """
        results = self.vector_store.search(
            run_id=self._INDEX_RUN_ID,
            query=record.prompt,
            top_k=top_k,
        )
        if not results:
            return None
        best_match = results[0]
        # A hit without a score is treated as score 0.
        if best_match.get("score", 0) < self.min_score:
            return None
        metadata = best_match["metadata"]
        record.task_id = metadata["task_id"]
        record.task_title = metadata["name"]
        record.role = metadata["agent_id"]
        return record.task_id

    def match_batch(self, records: list[InvocationRecord]) -> None:
        """Match all records in place, logging a warning for each miss."""
        for record in records:
            if not self.match(record):
                logger.warning("Unmatched prompt for record %s", record.uid)

    def match_stats(self, records: list[InvocationRecord]) -> dict[str, Any]:
        """Return matching statistics for already-matched *records*.

        Returns:
            A dict with ``total``, ``matched``, ``unmatched`` and ``by_task``
            (task id -> number of records assigned to it).  A record counts
            as matched when its ``task_id`` is non-empty.
        """
        total = len(records)
        by_task: dict[str, int] = {}
        for record in records:
            if record.task_id:
                by_task[record.task_id] = by_task.get(record.task_id, 0) + 1
        matched_count = sum(by_task.values())
        return {
            "total": total,
            "matched": matched_count,
            "unmatched": total - matched_count,
            "by_task": by_task,
        }