touchstone-platform 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (313) hide show
  1. app/__init__.py +1 -0
  2. app/agent_tasks.py +34 -0
  3. app/audit/__init__.py +4 -0
  4. app/audit/ledger.py +186 -0
  5. app/audit/normalizer.py +62 -0
  6. app/audit/orchestrator.py +117 -0
  7. app/audit/task_matcher.py +85 -0
  8. app/auth.py +437 -0
  9. app/billing.py +337 -0
  10. app/catalog/__init__.py +1 -0
  11. app/catalog/certify.py +109 -0
  12. app/catalog/certify_store.py +175 -0
  13. app/catalog/custom.py +198 -0
  14. app/catalog/custom_scenario.py +92 -0
  15. app/catalog/custom_store.py +101 -0
  16. app/catalog/discover.py +375 -0
  17. app/catalog/govern.py +175 -0
  18. app/catalog/llm.py +166 -0
  19. app/catalog/monitor.py +119 -0
  20. app/catalog/monitor_store.py +132 -0
  21. app/catalog/outcomes.py +169 -0
  22. app/catalog/outcomes_store.py +141 -0
  23. app/catalog/pipeline.py +362 -0
  24. app/catalog/pipeline_store.py +206 -0
  25. app/catalog/plan.py +391 -0
  26. app/catalog/plan_store.py +117 -0
  27. app/catalog/portfolio.py +360 -0
  28. app/catalog/prism.py +202 -0
  29. app/catalog/prism_classifier.py +241 -0
  30. app/catalog/prism_store.py +254 -0
  31. app/catalog/ranking.py +250 -0
  32. app/catalog/registry_link.py +37 -0
  33. app/catalog/router_catalog.py +44 -0
  34. app/catalog/router_certify.py +161 -0
  35. app/catalog/router_custom.py +89 -0
  36. app/catalog/router_discover.py +298 -0
  37. app/catalog/router_govern.py +42 -0
  38. app/catalog/router_monitor.py +53 -0
  39. app/catalog/router_outcomes.py +174 -0
  40. app/catalog/router_pipeline.py +127 -0
  41. app/catalog/router_plan.py +117 -0
  42. app/catalog/router_portfolio.py +30 -0
  43. app/catalog/router_prism.py +298 -0
  44. app/catalog/router_select.py +99 -0
  45. app/catalog/schema.py +146 -0
  46. app/catalog/select.py +261 -0
  47. app/catalog/store.py +271 -0
  48. app/certification.py +189 -0
  49. app/connectors/base.py +36 -0
  50. app/connectors/billing_aws.py +169 -0
  51. app/connectors/billing_azure.py +162 -0
  52. app/connectors/cloud.py +382 -0
  53. app/connectors/factory.py +33 -0
  54. app/connectors/git.py +156 -0
  55. app/connectors/local.py +70 -0
  56. app/connectors/log_cloudwatch.py +183 -0
  57. app/connectors/manager.py +78 -0
  58. app/database.py +409 -0
  59. app/deepeval_integration.py +108 -0
  60. app/engine.py +312 -0
  61. app/env_compat.py +40 -0
  62. app/eval/__init__.py +19 -0
  63. app/eval/adapter.py +35 -0
  64. app/eval/agentic_runner.py +100 -0
  65. app/eval/aggregation.py +57 -0
  66. app/eval/atr/__init__.py +9 -0
  67. app/eval/atr/loader.py +171 -0
  68. app/eval/attestation.py +217 -0
  69. app/eval/attestation_pdf.py +190 -0
  70. app/eval/attestation_revocation.py +180 -0
  71. app/eval/config.py +28 -0
  72. app/eval/cost.py +66 -0
  73. app/eval/drift.py +158 -0
  74. app/eval/drift_alerts.py +129 -0
  75. app/eval/evidence_bundle.py +284 -0
  76. app/eval/horizontal/__init__.py +12 -0
  77. app/eval/horizontal/loader.py +406 -0
  78. app/eval/inspect_export.py +278 -0
  79. app/eval/leaderboard.py +44 -0
  80. app/eval/leaderboards/__init__.py +0 -0
  81. app/eval/leaderboards/agent_threat_safety.py +46 -0
  82. app/eval/leaderboards/compliance_boards.py +205 -0
  83. app/eval/leaderboards/genai_red_team.py +50 -0
  84. app/eval/leaderboards/horizontal_boards.py +49 -0
  85. app/eval/leaderboards/medhelm.py +89 -0
  86. app/eval/leaderboards/medical_safety.py +45 -0
  87. app/eval/leaderboards/modernization_agentic.py +79 -0
  88. app/eval/leaderboards/modernization_classic.py +44 -0
  89. app/eval/leaderboards/modernization_evidence.py +71 -0
  90. app/eval/leaderboards/modernization_robustness.py +53 -0
  91. app/eval/leaderboards/modernization_safety.py +53 -0
  92. app/eval/leaderboards/schemas/agent_threat_safety.yaml +10 -0
  93. app/eval/leaderboards/schemas/compliance_ca_ai_laws.yaml +46 -0
  94. app/eval/leaderboards/schemas/compliance_cms_interop.yaml +22 -0
  95. app/eval/leaderboards/schemas/compliance_gdpr.yaml +89 -0
  96. app/eval/leaderboards/schemas/compliance_hipaa.yaml +33 -0
  97. app/eval/leaderboards/schemas/compliance_hti_1.yaml +30 -0
  98. app/eval/leaderboards/schemas/compliance_il_ai_laws.yaml +26 -0
  99. app/eval/leaderboards/schemas/compliance_mitre_atlas.yaml +90 -0
  100. app/eval/leaderboards/schemas/compliance_nyc_ll144.yaml +26 -0
  101. app/eval/leaderboards/schemas/compliance_owasp_agentic.yaml +42 -0
  102. app/eval/leaderboards/schemas/compliance_owasp_llm_top10.yaml +90 -0
  103. app/eval/leaderboards/schemas/compliance_pci_dss.yaml +37 -0
  104. app/eval/leaderboards/schemas/compliance_sox.yaml +61 -0
  105. app/eval/leaderboards/schemas/genai_red_team.yaml +11 -0
  106. app/eval/leaderboards/schemas/horizontal_customer_support.yaml +10 -0
  107. app/eval/leaderboards/schemas/horizontal_finance_ap.yaml +10 -0
  108. app/eval/leaderboards/schemas/horizontal_hr_assist.yaml +10 -0
  109. app/eval/leaderboards/schemas/horizontal_it_servicedesk.yaml +10 -0
  110. app/eval/leaderboards/schemas/horizontal_procurement.yaml +10 -0
  111. app/eval/leaderboards/schemas/horizontal_sales_ops.yaml +10 -0
  112. app/eval/leaderboards/schemas/medhelm_modernization.yaml +46 -0
  113. app/eval/leaderboards/schemas/medical_safety.yaml +10 -0
  114. app/eval/leaderboards/schemas/modernization_agentic.yaml +31 -0
  115. app/eval/leaderboards/schemas/modernization_classic.yaml +42 -0
  116. app/eval/leaderboards/schemas/modernization_evidence.yaml +48 -0
  117. app/eval/leaderboards/schemas/modernization_robustness.yaml +45 -0
  118. app/eval/leaderboards/schemas/modernization_safety.yaml +30 -0
  119. app/eval/leaderboards/schemas/swe_bench_verified.yaml +10 -0
  120. app/eval/leaderboards/schemas/vertical_edu_k12.yaml +8 -0
  121. app/eval/leaderboards/schemas/vertical_fs_insurance_pc.yaml +9 -0
  122. app/eval/leaderboards/schemas/vertical_fs_payments.yaml +8 -0
  123. app/eval/leaderboards/schemas/vertical_fs_retail_banking.yaml +9 -0
  124. app/eval/leaderboards/schemas/vertical_hcls_ambulatory.yaml +9 -0
  125. app/eval/leaderboards/schemas/vertical_hcls_lab.yaml +9 -0
  126. app/eval/leaderboards/schemas/vertical_hcls_medtech.yaml +9 -0
  127. app/eval/leaderboards/schemas/vertical_hcls_payer.yaml +8 -0
  128. app/eval/leaderboards/schemas/vertical_hcls_pbm.yaml +9 -0
  129. app/eval/leaderboards/schemas/vertical_hcls_pharma.yaml +9 -0
  130. app/eval/leaderboards/schemas/vertical_hitech_semi.yaml +8 -0
  131. app/eval/leaderboards/schemas/vertical_hitech_software.yaml +9 -0
  132. app/eval/leaderboards/schemas/vertical_industry_energy.yaml +9 -0
  133. app/eval/leaderboards/schemas/vertical_industry_pubsec.yaml +9 -0
  134. app/eval/leaderboards/schemas/vertical_retail_apparel.yaml +9 -0
  135. app/eval/leaderboards/schemas/vertical_retail_bigbox.yaml +9 -0
  136. app/eval/leaderboards/schemas/vertical_retail_cpg_brands.yaml +8 -0
  137. app/eval/leaderboards/schemas/vertical_retail_cstore.yaml +9 -0
  138. app/eval/leaderboards/schemas/vertical_retail_grocery.yaml +9 -0
  139. app/eval/leaderboards/swe_bench.py +58 -0
  140. app/eval/leaderboards/vertical_boards.py +71 -0
  141. app/eval/metric.py +51 -0
  142. app/eval/metrics/__init__.py +0 -0
  143. app/eval/metrics/agentic/__init__.py +1 -0
  144. app/eval/metrics/agentic/action_sequence.py +103 -0
  145. app/eval/metrics/agentic/replan_count.py +36 -0
  146. app/eval/metrics/agentic/step_success_rate.py +37 -0
  147. app/eval/metrics/agentic/task_completion.py +90 -0
  148. app/eval/metrics/agentic/trajectory_length.py +27 -0
  149. app/eval/metrics/code/__init__.py +1 -0
  150. app/eval/metrics/code/patch_execution.py +111 -0
  151. app/eval/metrics/code/patch_validity.py +142 -0
  152. app/eval/metrics/compliance/__init__.py +7 -0
  153. app/eval/metrics/compliance/control_match.py +231 -0
  154. app/eval/metrics/evidence/__init__.py +6 -0
  155. app/eval/metrics/evidence/audit_ledger_integrity.py +63 -0
  156. app/eval/metrics/evidence/citation_f1.py +58 -0
  157. app/eval/metrics/evidence/hallucination_rate.py +35 -0
  158. app/eval/metrics/evidence/kg_groundedness.py +57 -0
  159. app/eval/metrics/evidence/lineage_f1.py +47 -0
  160. app/eval/metrics/exact_match.py +12 -0
  161. app/eval/metrics/hcls/__init__.py +1 -0
  162. app/eval/metrics/hcls/bertscore.py +139 -0
  163. app/eval/metrics/hcls/jury_score.py +111 -0
  164. app/eval/metrics/hcls/phi_leak.py +101 -0
  165. app/eval/metrics/judge_rubric.py +89 -0
  166. app/eval/metrics/robustness_delta.py +25 -0
  167. app/eval/metrics/rouge.py +114 -0
  168. app/eval/metrics/safety/__init__.py +1 -0
  169. app/eval/metrics/safety/atr_detection.py +57 -0
  170. app/eval/metrics/safety/bias.py +181 -0
  171. app/eval/metrics/safety/bias_extended.py +185 -0
  172. app/eval/metrics/safety/medical_red_team.py +77 -0
  173. app/eval/metrics/safety/redteam_detection.py +83 -0
  174. app/eval/metrics/safety/wmdp_score.py +67 -0
  175. app/eval/perturbations/__init__.py +7 -0
  176. app/eval/perturbations/base.py +31 -0
  177. app/eval/perturbations/comment_noise.py +23 -0
  178. app/eval/perturbations/dead_code_injection.py +26 -0
  179. app/eval/perturbations/dialect_drift.py +23 -0
  180. app/eval/perturbations/identifier_mangling.py +36 -0
  181. app/eval/perturbations/partial_copybook.py +22 -0
  182. app/eval/perturbations/registry.py +19 -0
  183. app/eval/perturbed_scenario.py +35 -0
  184. app/eval/redteam/__init__.py +8 -0
  185. app/eval/redteam/loader.py +289 -0
  186. app/eval/request_cache.py +169 -0
  187. app/eval/risk_tier.py +134 -0
  188. app/eval/run_spec.py +29 -0
  189. app/eval/runner.py +107 -0
  190. app/eval/runners/__init__.py +5 -0
  191. app/eval/runners/swe_bench_docker.py +336 -0
  192. app/eval/runners/tau_bench/__init__.py +57 -0
  193. app/eval/runners/tau_bench/airline_state.py +154 -0
  194. app/eval/runners/tau_bench/airline_tools.py +205 -0
  195. app/eval/runners/tau_bench/golden.py +90 -0
  196. app/eval/runners/tau_bench/retail_state.py +208 -0
  197. app/eval/runners/tau_bench/runner.py +259 -0
  198. app/eval/runners/tau_bench/tools.py +241 -0
  199. app/eval/runspec_overrides.py +82 -0
  200. app/eval/scenario.py +54 -0
  201. app/eval/schema.py +28 -0
  202. app/eval/snapshot.py +85 -0
  203. app/eval/specs/__init__.py +0 -0
  204. app/eval/specs/agentic_cobol_modernization.py +31 -0
  205. app/eval/specs/atr_red_team.py +44 -0
  206. app/eval/specs/cobol_billing_scenario.py +11 -0
  207. app/eval/specs/cobol_lineage_scenario.py +10 -0
  208. app/eval/specs/edelweiss_investing_scenario.py +10 -0
  209. app/eval/specs/genai_red_team.py +51 -0
  210. app/eval/specs/healthcare_data_scenario.py +10 -0
  211. app/eval/specs/horizontal_agent_scenarios.py +99 -0
  212. app/eval/specs/huggingface_dataset_scenario.py +357 -0
  213. app/eval/specs/java_refactor_scenario.py +10 -0
  214. app/eval/specs/medical_red_team.py +128 -0
  215. app/eval/specs/sap_modernization_scenario.py +10 -0
  216. app/eval/specs/swe_bench_scenarios.py +92 -0
  217. app/eval/specs/tau_bench_scenarios.py +98 -0
  218. app/eval/specs/vertical_scenarios.py +317 -0
  219. app/eval/specs/wmdp_scenarios.py +106 -0
  220. app/eval/specs/yaml_spec_scenario.py +53 -0
  221. app/eval/trace.py +39 -0
  222. app/eval/trajectory.py +43 -0
  223. app/fitment.py +843 -0
  224. app/frameworks/__init__.py +40 -0
  225. app/frameworks/autogen_runner.py +40 -0
  226. app/frameworks/base.py +29 -0
  227. app/frameworks/crewai_runner.py +123 -0
  228. app/frameworks/datagol_runner.py +36 -0
  229. app/frameworks/direct_api.py +441 -0
  230. app/frameworks/galileo_runner.py +88 -0
  231. app/frameworks/langchain_runner.py +35 -0
  232. app/frameworks/langgraph_runner.py +38 -0
  233. app/frameworks/letta_runner.py +35 -0
  234. app/frameworks/llamaindex_runner.py +38 -0
  235. app/frameworks/mem0_runner.py +38 -0
  236. app/frameworks/n8n_runner.py +37 -0
  237. app/frameworks/openclawd_runner.py +84 -0
  238. app/frameworks/semantic_kernel_runner.py +35 -0
  239. app/frameworks/stackai_runner.py +35 -0
  240. app/frameworks/superagi_runner.py +35 -0
  241. app/frameworks/zep_runner.py +35 -0
  242. app/git_utils.py +53 -0
  243. app/health.py +132 -0
  244. app/knowledge.py +894 -0
  245. app/licensing.py +170 -0
  246. app/main.py +1765 -0
  247. app/models.py +151 -0
  248. app/onboarding.py +274 -0
  249. app/outputs.py +91 -0
  250. app/parsers/apex.py +45 -0
  251. app/parsers/asp.py +49 -0
  252. app/parsers/base.py +20 -0
  253. app/parsers/cobol.py +146 -0
  254. app/parsers/config.py +35 -0
  255. app/parsers/enterprise.py +101 -0
  256. app/parsers/factory.py +76 -0
  257. app/parsers/legacy_c.py +35 -0
  258. app/parsers/modern.py +53 -0
  259. app/parsers/sql.py +91 -0
  260. app/pricing.py +46 -0
  261. app/scoring.py +251 -0
  262. app/security.py +206 -0
  263. app/spec_loader.py +186 -0
  264. app/spec_v1.py +192 -0
  265. app/static/agent_catalog.json +1198 -0
  266. app/static/ai_inventory.html +111 -0
  267. app/static/app.js +2231 -0
  268. app/static/audit.js +89 -0
  269. app/static/auth.js +245 -0
  270. app/static/catalog.html +55 -0
  271. app/static/css/catalog.css +128 -0
  272. app/static/css/leaderboards.css +50 -0
  273. app/static/css/portfolio.css +218 -0
  274. app/static/css/prism.css +187 -0
  275. app/static/custom.html +137 -0
  276. app/static/discover.html +133 -0
  277. app/static/govern.html +183 -0
  278. app/static/index.html +698 -0
  279. app/static/instance_trace.html +18 -0
  280. app/static/js/catalog.js +267 -0
  281. app/static/js/custom.js +146 -0
  282. app/static/js/discover.js +112 -0
  283. app/static/js/govern.js +167 -0
  284. app/static/js/instance_trace.js +28 -0
  285. app/static/js/leaderboard_compare.js +31 -0
  286. app/static/js/leaderboard_detail.js +62 -0
  287. app/static/js/leaderboards.js +90 -0
  288. app/static/js/monitor.js +192 -0
  289. app/static/js/plan.js +180 -0
  290. app/static/js/portfolio.js +205 -0
  291. app/static/js/prism.js +399 -0
  292. app/static/leaderboard_compare.html +18 -0
  293. app/static/leaderboard_detail.html +22 -0
  294. app/static/leaderboards/.gitkeep +0 -0
  295. app/static/leaderboards/index.html +9 -0
  296. app/static/leaderboards/modernization_classic.html +29 -0
  297. app/static/leaderboards.html +17 -0
  298. app/static/login.html +266 -0
  299. app/static/logo_mark.png +0 -0
  300. app/static/monitor.html +147 -0
  301. app/static/onboarding.html +85 -0
  302. app/static/onboarding.js +96 -0
  303. app/static/plan.html +87 -0
  304. app/static/portfolio.html +107 -0
  305. app/static/prism.html +47 -0
  306. app/static/styles.css +2290 -0
  307. app/terraform.py +1028 -0
  308. app/vector_stores.py +303 -0
  309. touchstone_platform-1.0.2.dist-info/METADATA +512 -0
  310. touchstone_platform-1.0.2.dist-info/RECORD +313 -0
  311. touchstone_platform-1.0.2.dist-info/WHEEL +4 -0
  312. touchstone_platform-1.0.2.dist-info/entry_points.txt +2 -0
  313. touchstone_platform-1.0.2.dist-info/licenses/LICENSE.md +242 -0
app/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # HuggingFace Model Benchmarking Gym
app/agent_tasks.py ADDED
@@ -0,0 +1,34 @@
1
+ """Agent tasks — spec-driven v2. All content comes from Gym Spec v1.0."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from app.spec_loader import (
6
+ get_active_spec,
7
+ get_artifacts_info,
8
+ get_roles_info,
9
+ load_artifact_content,
10
+ )
11
+
12
+
13
def get_artifacts_list() -> list[dict]:
    """Return the artifact descriptors reported by ``get_artifacts_info()``.

    Thin pass-through to the spec loader; the result is returned unchanged.
    """
    return get_artifacts_info()
15
+
16
def get_artifact_content(name: str) -> str | None:
    """Return whatever ``load_artifact_content(name)`` yields for *name*.

    Thin pass-through to the spec loader. Per the annotation the result is a
    string or ``None``; presumably ``None`` means the artifact is unknown —
    confirm against ``app.spec_loader``.
    """
    return load_artifact_content(name)
18
+
19
def get_all_roles_info() -> list[dict]:
    """Return the role descriptors reported by ``get_roles_info()``.

    Thin pass-through to the spec loader; the result is returned unchanged.
    """
    return get_roles_info()
21
+
22
def get_tasks_for_role(agent_id: str) -> list:
    """Return the active spec's tasks whose ``agent_id`` equals *agent_id*.

    The tasks keep the order in which the active spec lists them; an empty
    list is returned when nothing matches.
    """
    return [
        candidate
        for candidate in get_active_spec().tasks
        if candidate.agent_id == agent_id
    ]
25
+
26
def build_prompt(task) -> str:
    """Render *task*'s prompt with its required artifacts substituted in.

    For each name in ``task.required_artifacts`` the ``{name}`` placeholder
    is replaced by the loaded artifact content. When the loader returns a
    falsy value (``None`` or an empty string) the placeholder is replaced by
    a ``[Artifact <name> not found]`` marker instead.

    Substitution happens one artifact at a time, in list order, so text
    inserted for an earlier artifact is visible to later replacements.
    """
    rendered = task.prompt
    for name in task.required_artifacts:
        placeholder = f"{{{name}}}"
        body = load_artifact_content(name)
        replacement = body if body else f"[Artifact {name} not found]"
        rendered = rendered.replace(placeholder, replacement)
    return rendered
app/audit/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ from .ledger import AuditLedger, audit_ledger
2
+ from .normalizer import InvocationRecord, normalize_artifact, normalize_batch
3
+ from .orchestrator import AuditOrchestrator
4
+ from .task_matcher import TaskMatcher
app/audit/ledger.py ADDED
@@ -0,0 +1,186 @@
1
+ """Immutable audit ledger for SOC2 compliance."""
2
+
3
+ import hashlib
4
+ import hmac
5
+ import json
6
+ import logging
7
+ import os
8
+ import time
9
+ from pathlib import Path
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
# Path of the JSON Lines ledger file. Overridable through the
# MODEL_GYM_AUDIT_LOG environment variable; defaults to a relative path
# under ./scratch/audit/.
AUDIT_LOG_FILE = os.environ.get(
    "MODEL_GYM_AUDIT_LOG",
    os.path.join("scratch", "audit", "audit_ledger.jsonl"),
)

# ---------------------------------------------------------------------------
# HMAC secret key
# The key is loaded from the AUDIT_HMAC_SECRET environment variable.
# In production, set this to a strong random value (e.g. 32+ random bytes,
# base64-encoded). The string is used as-is (UTF-8 encoded) as the key; it
# is NOT base64-decoded here.
#
# When the variable is absent (or empty):
#   * MODEL_GYM_ENV == "production" (case-insensitive): importing this
#     module raises RuntimeError.
#   * otherwise: a well-known dev-only placeholder key is used and a warning
#     is logged — never rely on this outside local development.
#
# NOTE(review): the warning text below mentions TOUCHSTONE_ENV, but only
# MODEL_GYM_ENV is read in this module. Presumably the TOUCHSTONE_* name is
# mapped elsewhere (e.g. app/env_compat.py) — confirm.
# ---------------------------------------------------------------------------
_SECRET_FROM_ENV: str | None = os.environ.get("AUDIT_HMAC_SECRET")
_IS_PRODUCTION: bool = os.environ.get("MODEL_GYM_ENV", "development").lower() == "production"

if _SECRET_FROM_ENV:
    AUDIT_HMAC_KEY: bytes = _SECRET_FROM_ENV.encode()
elif _IS_PRODUCTION:
    # Fail fast: a production ledger signed with a public key proves nothing.
    raise RuntimeError(
        "AUDIT_HMAC_SECRET must be set in production. "
        "Generate with: python -c \"import secrets,base64; print(base64.b64encode(secrets.token_bytes(32)).decode())\""
    )
else:
    logger.warning(
        "AUDIT_HMAC_SECRET env var not set — using insecure dev-only key. "
        "Set TOUCHSTONE_ENV=production (or MODEL_GYM_ENV=production) to enforce this at startup."
    )
    AUDIT_HMAC_KEY = b"dev-only-secret"
41
+
42
+
43
def _compute_hmac(previous_hash: str, event_json_bytes: bytes) -> str:
    """Sign one ledger entry and return the hex digest.

    The MAC is HMAC-SHA256 keyed with ``AUDIT_HMAC_KEY`` over the encoded
    *previous_hash* immediately followed by *event_json_bytes*, which is what
    links each entry to its predecessor.
    """
    mac = hmac.new(AUDIT_HMAC_KEY, digestmod=hashlib.sha256)
    # Feeding the two parts separately is equivalent to hashing their
    # concatenation.
    mac.update(previous_hash.encode())
    mac.update(event_json_bytes)
    return mac.hexdigest()
47
+
48
+
49
class AuditLedger:
    """Append-only, HMAC-signed, hash-linked audit log stored as JSON Lines.

    Every entry records the hash of the entry before it (``"GENESIS"`` for
    the first one) and is signed with ``_compute_hmac``; changing, removing
    or reordering a line therefore breaks the chain checked by
    ``verify_integrity()``.

    The hash and sequence number of the newest entry are cached in memory so
    that appending does not require re-reading the file. No locking is done
    here, so concurrent writers to the same file are not coordinated.
    """

    def __init__(self, log_file: str = AUDIT_LOG_FILE):
        """Open (creating if needed) the ledger at *log_file* and prime the cache."""
        self.log_file = Path(log_file)
        self._ensure_log_exists()
        # Cache: restored from the last line of the log on startup.
        self._last_hash: str = "GENESIS"
        self._last_seq: int = 0
        self._restore_cache()

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _ensure_log_exists(self):
        """Create the ledger file and its parent directories when missing."""
        self.log_file.parent.mkdir(parents=True, exist_ok=True)
        if not self.log_file.exists():
            self.log_file.touch()

    def _restore_cache(self):
        """Prime ``_last_hash`` / ``_last_seq`` from the last line of the log.

        Only the tail of the file is read: the scan walks backwards from the
        end until it meets the newline preceding the last line, so the cost
        is proportional to the length of that line, not to the file size.

        If the last line cannot be decoded or is not a JSON object, the
        defaults are kept and a warning is logged; ``verify_integrity()``
        is the authoritative corruption check.
        """
        if not self.log_file.exists() or self.log_file.stat().st_size == 0:
            return

        with open(self.log_file, "rb") as fh:
            # Seek to end; scan backwards for the previous newline.
            fh.seek(0, 2)
            size = fh.tell()
            if size == 0:
                return
            pos = size - 1
            while pos >= 0:
                fh.seek(pos)
                ch = fh.read(1)
                # The very last byte is the trailing newline of the final
                # entry, so a newline only counts when found before it.
                if ch == b"\n" and pos < size - 1:
                    break
                pos -= 1
            fh.seek(pos + 1)
            raw_tail = fh.read()

        try:
            last_line = raw_tail.decode("utf-8").strip()
            data = json.loads(last_line) if last_line else None
        except (UnicodeDecodeError, json.JSONDecodeError):
            logger.warning(
                "Last line of audit ledger %s is unreadable; starting from GENESIS defaults",
                self.log_file,
            )
            return

        if data is None:
            return
        if not isinstance(data, dict):
            logger.warning(
                "Last line of audit ledger %s is not a JSON object; starting from GENESIS defaults",
                self.log_file,
            )
            return

        self._last_hash = data.get("hash", "GENESIS")
        self._last_seq = data.get("seq", 0)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def log_action(self, action: str, actor: str, payload: dict, run_id: str | None = None):
        """Append an HMAC-signed, hash-linked entry and return its hash.

        Args:
            action: Name of the audited action.
            actor: Identifier of whoever performed it.
            payload: JSON-serializable details of the action.
            run_id: Optional identifier stored alongside the entry.

        Returns:
            The hex HMAC of the new entry, which becomes the
            ``previous_hash`` of the next one.

        Raises:
            TypeError: If *payload* cannot be serialized to JSON. Nothing is
                written in that case.
        """
        previous_hash = self._last_hash
        seq = self._last_seq + 1

        entry = {
            "seq": seq,
            "timestamp": time.time(),
            "action": action,
            "actor": actor,
            "run_id": run_id,
            "payload": payload,
            "previous_hash": previous_hash,
        }

        # The signature covers the canonical (sorted-keys) form so that
        # verify_integrity() can rebuild the same bytes from the parsed line.
        event_json_bytes = json.dumps(entry, sort_keys=True).encode()
        entry_hash = _compute_hmac(previous_hash, event_json_bytes)

        with open(self.log_file, "a", encoding="utf-8") as f:
            f.write(json.dumps({**entry, "hash": entry_hash}) + "\n")

        # Update cache only after the write succeeded.
        self._last_hash = entry_hash
        self._last_seq = seq

        return entry_hash

    # Keep the old method name as an alias for backward compatibility.
    def log_event(self, action: str, actor: str, payload: dict, run_id: str | None = None):
        """Backward-compatible alias for ``log_action``."""
        return self.log_action(action, actor, payload, run_id=run_id)

    def verify_integrity(self) -> bool:
        """Check the whole HMAC chain; return ``False`` on any sign of tampering.

        An empty or missing ledger is considered valid. Malformed entries
        (invalid JSON, a non-object line, a missing or non-string ``hash``,
        undecodable bytes) make the check fail rather than raise.
        """
        if not self.log_file.exists() or self.log_file.stat().st_size == 0:
            return True

        expected_prev_hash = "GENESIS"
        # errors="replace": invalid UTF-8 must surface as a failed check
        # (the altered text no longer matches its signature), not as an
        # exception.
        with open(self.log_file, encoding="utf-8", errors="replace") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    return False
                if not isinstance(data, dict):
                    return False

                stored_hash = data.pop("hash", None)
                if not isinstance(stored_hash, str):
                    return False

                # Verify the previous-hash link.
                if data.get("previous_hash") != expected_prev_hash:
                    return False

                # Recompute HMAC over the entry without its "hash" field.
                event_json_bytes = json.dumps(data, sort_keys=True).encode()
                recomputed_hash = _compute_hmac(expected_prev_hash, event_json_bytes)

                # Constant-time comparison to prevent timing attacks. Bytes
                # are compared because compare_digest rejects non-ASCII str.
                if not hmac.compare_digest(stored_hash.encode(), recomputed_hash.encode()):
                    return False

                expected_prev_hash = stored_hash

        return True

    def get_logs(self, limit: int = 100) -> list[dict]:
        """Return up to *limit* of the most recent entries, oldest first.

        A non-positive *limit* yields an empty list. Only the newest *limit*
        entries are kept in memory while the file is read.

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
        """
        from collections import deque  # local import keeps this block self-contained

        if limit <= 0 or not self.log_file.exists():
            return []

        recent = deque(maxlen=limit)
        with open(self.log_file, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line:
                    recent.append(json.loads(line))
        return list(recent)
184
+
185
+
186
# Shared module-level ledger bound to the default AUDIT_LOG_FILE. It is built
# at import time, so importing this module creates the log file (and its
# parent directory) if they do not exist yet.
audit_ledger = AuditLedger()
app/audit/normalizer.py ADDED
@@ -0,0 +1,62 @@
1
+ from dataclasses import dataclass, field
2
+ from typing import Any
3
+
4
+
5
@dataclass
class InvocationRecord:
    """Normalized representation of a single LLM invocation from any provider.

    The first six fields are required; the usage/cost fields default to zero.
    ``task_id``, ``task_title`` and ``role`` start empty and are filled in
    later by the task-matching step.
    """
    uid: str                # from connector artifact["uid"]
    provider: str           # "aws_bedrock" | "azure_openai" | "local"
    model_id: str
    prompt: str
    completion: str
    timestamp_iso: str      # timestamp string; used as the sort key for batches
    input_tokens: int = 0
    output_tokens: int = 0
    cost_usd: float = 0.0
    latency_ms: int = 0
    task_id: str = ""       # populated during matching
    task_title: str = ""    # populated during matching
    role: str = ""          # populated during matching
    metadata: dict[str, Any] = field(default_factory=dict)  # raw source dict
22
+
23
def normalize_artifact(artifact: dict[str, Any]) -> InvocationRecord | None:
    """
    Convert a connector artifact dict to an InvocationRecord.

    Returns None if artifact["type"] != "invocation_log" or if the prompt is
    missing, None, or blank after stripping whitespace.

    A non-string prompt or completion is coerced with ``str()``; a missing or
    None completion becomes an empty string. Prompt and completion are each
    truncated to 8000 characters. The original artifact dict is stored
    (by reference, not copied) as ``metadata``.

    Raises:
        KeyError: If an invocation-log artifact with a usable prompt lacks
            "uid", "model_id" or "timestamp_iso".
    """
    if artifact.get("type") != "invocation_log":
        return None

    # Coerce before stripping: connectors may hand over None or non-string
    # values, and ``.strip()`` on those would raise.
    raw_prompt = artifact.get("prompt")
    prompt = "" if raw_prompt is None else str(raw_prompt).strip()
    if not prompt:
        return None

    # A None completion must not turn into the literal text "None".
    raw_completion = artifact.get("completion")
    completion = "" if raw_completion is None else str(raw_completion)

    return InvocationRecord(
        uid=artifact["uid"],
        provider=artifact.get("provider", "unknown"),
        model_id=artifact["model_id"],
        prompt=prompt[:8000],
        completion=completion[:8000],
        timestamp_iso=artifact["timestamp_iso"],
        input_tokens=artifact.get("input_tokens", 0),
        output_tokens=artifact.get("output_tokens", 0),
        cost_usd=artifact.get("cost_usd", 0.0),
        latency_ms=artifact.get("latency_ms", 0),
        metadata=artifact,
    )
48
+
49
def normalize_batch(artifacts: list[dict[str, Any]]) -> list[InvocationRecord]:
    """
    Normalize every artifact that yields a record and drop the rest.

    The surviving records are returned sorted by their ``timestamp_iso``
    string in ascending order; records with equal timestamps keep their
    input order.
    """
    normalized = [
        rec
        for art in artifacts
        if (rec := normalize_artifact(art))
    ]
    # Plain string comparison of the timestamp field; the sort is stable.
    return sorted(normalized, key=lambda rec: rec.timestamp_iso)
app/audit/orchestrator.py ADDED
@@ -0,0 +1,117 @@
1
+ import datetime
2
+ import logging
3
+ from typing import Any
4
+
5
+ from app.audit.normalizer import InvocationRecord, normalize_batch
6
+ from app.audit.task_matcher import TaskMatcher
7
+ from app.connectors.factory import ConnectorFactory
8
+ from app.spec_v1 import GymSpecV1
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
class AuditOrchestrator:
    """
    Drives an Audit Mode run end to end.

    Ties together log ingestion, normalization, task matching and replay
    scoring of already-recorded completions, and assembles the final report.
    """

    def __init__(self, spec: GymSpecV1, scorer: Any = None):
        """Bind the orchestrator to *spec*; raise ValueError unless it is an audit spec."""
        self.spec = spec
        if spec.mode != "audit":
            raise ValueError("GymSpec is not in audit mode")

        self.matcher = TaskMatcher(spec)
        # Expected to expose score_task(task_id=..., prompt=..., completion=...).
        self.scorer = scorer
        self.incumbent_config = spec.audit.incumbent

    def run_audit(self) -> dict[str, Any]:
        """
        Run ingestion, normalization, matching and scoring; return the report dict.
        """
        logger.info(f"Starting audit for {self.spec.gym.name}")

        # Steps 1-2: pull raw artifacts and normalize them into records.
        records = normalize_batch(self._ingest_logs())

        # Step 3: attach task ids in place, then collect matching statistics.
        self.matcher.match_batch(records)
        stats = self.matcher.match_stats(records)

        # Step 4: replay-score every record that was matched to a task.
        results = []
        score_sum = 0.0
        for rec in records:
            if rec.task_id:
                outcome = self._score_record(rec)
                if outcome:
                    overall = outcome.get("overall_score", 0.0)
                    results.append({
                        "record_uid": rec.uid,
                        "task_id": rec.task_id,
                        "score": overall,
                        "cost": rec.cost_usd,
                        "dimensions": outcome.get("dimensions", {})
                    })
                    score_sum += overall

        scored = len(results)
        mean_score = score_sum / scored if scored > 0 else 0.0
        # Cost covers every normalized record, matched or not.
        spend = sum(r.cost_usd for r in records)

        summary = {
            "total_invocations": len(records),
            "matched_invocations": stats["matched"],
            "scored_invocations": scored,
            "average_score": round(mean_score, 4),
            "total_cost_usd": round(spend, 4),
            "cost_per_matched_outcome": round(spend / scored, 4) if scored > 0 else 0.0
        }

        return {
            "spec_id": self.spec.gym.id,
            "timestamp": datetime.datetime.now(datetime.UTC).isoformat(),
            "status": "completed",
            "summary": summary,
            "results": results,
            "tasks": stats["by_task"]
        }

    def _ingest_logs(self) -> list[dict[str, Any]]:
        """Fetch raw artifacts through a connector built from the incumbent settings."""
        incumbent = self.incumbent_config
        connector_id = incumbent.connection_id or "audit_source"

        # Connector configuration mirrors the incumbent section of the spec.
        settings = {
            "provider": incumbent.provider,
            "model_id": incumbent.model_id,
            "region_name": incumbent.region,
            "log_group_name": incumbent.log_group,
            "start_time_iso": incumbent.billing_start,
            "end_time_iso": incumbent.billing_end,
        }

        return ConnectorFactory.get_connector(connector_id, settings).list_artifacts()

    def _score_record(self, record: InvocationRecord) -> dict[str, Any] | None:
        """Score one invocation; return None when the scorer raises."""
        if self.scorer:
            try:
                return self.scorer.score_task(
                    task_id=record.task_id,
                    prompt=record.prompt,
                    completion=record.completion
                )
            except Exception as e:
                logger.error(f"Scoring failed for record {record.uid}: {e!s}")
                return None

        # No scorer configured: placeholder zero score (for testing/demo).
        return {"overall_score": 0.0, "dimensions": {}}
app/audit/task_matcher.py ADDED
@@ -0,0 +1,85 @@
1
+ import logging
2
+ from typing import Any
3
+
4
+ from app.audit.normalizer import InvocationRecord
5
+ from app.spec_v1 import GymSpecV1
6
+ from app.vector_stores import LexicalVectorStore
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
class TaskMatcher:
    """
    Matches raw LLM invocation prompts to GymSpec task definitions
    using lexical similarity.

    Every task of the spec is indexed once, at construction time, in a
    ``LexicalVectorStore`` under the ``"task_index"`` run id.
    """

    def __init__(self, spec: GymSpecV1):
        """Index ``"{task.name} {task.prompt}"`` for every task in *spec*."""
        self.spec = spec
        self.tasks = spec.tasks
        self.vector_store = LexicalVectorStore(chunk_size=200, overlap=20)

        # Each indexed document carries the ids needed to label a record
        # once it has been matched.
        for task in self.tasks:
            content = f"{task.name} {task.prompt}"
            self.vector_store.add_document(
                run_id="task_index",
                content=content,
                metadata={
                    "task_id": task.id,
                    "name": task.name,
                    "agent_id": task.agent_id
                }
            )

    def match(self, record: InvocationRecord, top_k: int = 1, min_score: float = 0.3) -> str | None:
        """
        Match a single InvocationRecord to a task by prompt similarity.

        Args:
            record: Record whose ``prompt`` is used as the search query. On a
                successful match its ``task_id``, ``task_title`` and ``role``
                are set in place; otherwise it is left untouched.
            top_k: Number of candidates requested from the store. Only the
                first result is considered.
            min_score: Minimum score the first result must reach. Results
                scoring below it are rejected to avoid false positives.

        Returns:
            The matched task id, or None when the store returns nothing or
            the first result scores below *min_score*.
        """
        results = self.vector_store.search(
            run_id="task_index",
            query=record.prompt,
            top_k=top_k
        )

        if not results:
            return None

        best_match = results[0]
        # A result without a "score" counts as 0; a score equal to the
        # threshold is accepted.
        if best_match.get("score", 0) < min_score:
            return None

        metadata = best_match["metadata"]
        task_id = metadata["task_id"]
        record.task_id = task_id
        record.task_title = metadata["name"]
        record.role = metadata["agent_id"]

        return task_id

    def match_batch(self, records: list[InvocationRecord], min_score: float = 0.3) -> None:
        """
        Match all records in place, logging a warning for each unmatched one.

        *min_score* is forwarded to ``match`` for every record.
        """
        for record in records:
            if not self.match(record, min_score=min_score):
                logger.warning(f"Unmatched prompt for record {record.uid}")

    def match_stats(self, records: list[InvocationRecord]) -> dict[str, Any]:
        """
        Returns matching statistics.

        The result has the keys ``total``, ``matched``, ``unmatched`` and
        ``by_task`` (a mapping of task id to number of matched records). A
        record counts as matched when its ``task_id`` is non-empty.
        """
        total = len(records)
        matched_records = [r for r in records if r.task_id]
        matched_count = len(matched_records)

        by_task = {}
        for r in matched_records:
            by_task[r.task_id] = by_task.get(r.task_id, 0) + 1

        return {
            "total": total,
            "matched": matched_count,
            "unmatched": total - matched_count,
            "by_task": by_task,
        }
+ }