synth-ai 0.2.14__py3-none-any.whl → 0.2.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic; see the registry's page for this release for more details.

Files changed (236):
  1. examples/README.md +1 -0
  2. examples/multi_step/SFT_README.md +147 -0
  3. examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +9 -9
  4. examples/multi_step/configs/crafter_sft_qwen30b_lora.toml +62 -0
  5. examples/multi_step/convert_traces_to_sft.py +84 -0
  6. examples/multi_step/run_sft_qwen30b.sh +45 -0
  7. examples/qwen_coder/configs/coder_lora_30b.toml +2 -1
  8. examples/qwen_coder/configs/coder_lora_4b.toml +2 -1
  9. examples/qwen_coder/configs/coder_lora_small.toml +2 -1
  10. examples/qwen_vl/BUGS_AND_FIXES.md +232 -0
  11. examples/qwen_vl/IMAGE_VALIDATION_COMPLETE.md +271 -0
  12. examples/qwen_vl/IMAGE_VALIDATION_SUMMARY.md +260 -0
  13. examples/qwen_vl/INFERENCE_SFT_TESTS.md +412 -0
  14. examples/qwen_vl/NEXT_STEPS_2B.md +325 -0
  15. examples/qwen_vl/QUICKSTART.md +327 -0
  16. examples/qwen_vl/QUICKSTART_RL_VISION.md +110 -0
  17. examples/qwen_vl/README.md +154 -0
  18. examples/qwen_vl/RL_VISION_COMPLETE.md +475 -0
  19. examples/qwen_vl/RL_VISION_TESTING.md +333 -0
  20. examples/qwen_vl/SDK_VISION_INTEGRATION.md +328 -0
  21. examples/qwen_vl/SETUP_COMPLETE.md +275 -0
  22. examples/qwen_vl/VISION_TESTS_COMPLETE.md +490 -0
  23. examples/qwen_vl/VLM_PIPELINE_COMPLETE.md +242 -0
  24. examples/qwen_vl/__init__.py +2 -0
  25. examples/qwen_vl/collect_data_via_cli.md +423 -0
  26. examples/qwen_vl/collect_vision_traces.py +368 -0
  27. examples/qwen_vl/configs/crafter_rl_vision_qwen3vl4b.toml +127 -0
  28. examples/qwen_vl/configs/crafter_vlm_sft_example.toml +60 -0
  29. examples/qwen_vl/configs/eval_gpt4o_mini_vision.toml +43 -0
  30. examples/qwen_vl/configs/eval_gpt4o_vision_proper.toml +29 -0
  31. examples/qwen_vl/configs/eval_gpt5nano_vision.toml +45 -0
  32. examples/qwen_vl/configs/eval_qwen2vl_vision.toml +44 -0
  33. examples/qwen_vl/configs/filter_qwen2vl_sft.toml +50 -0
  34. examples/qwen_vl/configs/filter_vision_sft.toml +53 -0
  35. examples/qwen_vl/configs/filter_vision_test.toml +8 -0
  36. examples/qwen_vl/configs/sft_qwen3_vl_2b_test.toml +54 -0
  37. examples/qwen_vl/crafter_gpt5nano_agent.py +308 -0
  38. examples/qwen_vl/crafter_qwen_vl_agent.py +300 -0
  39. examples/qwen_vl/run_vision_comparison.sh +62 -0
  40. examples/qwen_vl/run_vision_sft_pipeline.sh +175 -0
  41. examples/qwen_vl/test_image_validation.py +201 -0
  42. examples/qwen_vl/test_sft_vision_data.py +110 -0
  43. examples/rl/README.md +1 -1
  44. examples/rl/configs/eval_base_qwen.toml +17 -0
  45. examples/rl/configs/eval_rl_qwen.toml +13 -0
  46. examples/rl/configs/rl_from_base_qwen.toml +37 -0
  47. examples/rl/configs/rl_from_base_qwen17.toml +76 -0
  48. examples/rl/configs/rl_from_ft_qwen.toml +37 -0
  49. examples/rl/run_eval.py +436 -0
  50. examples/rl/run_rl_and_save.py +111 -0
  51. examples/rl/task_app/README.md +22 -0
  52. examples/rl/task_app/math_single_step.py +990 -0
  53. examples/rl/task_app/math_task_app.py +111 -0
  54. examples/sft/README.md +5 -5
  55. examples/sft/configs/crafter_fft_qwen0p6b.toml +4 -2
  56. examples/sft/configs/crafter_lora_qwen0p6b.toml +4 -3
  57. examples/sft/evaluate.py +2 -4
  58. examples/sft/export_dataset.py +7 -4
  59. examples/swe/task_app/README.md +1 -1
  60. examples/swe/task_app/grpo_swe_mini.py +0 -1
  61. examples/swe/task_app/grpo_swe_mini_task_app.py +0 -12
  62. examples/swe/task_app/hosted/envs/mini_swe/environment.py +13 -13
  63. examples/swe/task_app/hosted/policy_routes.py +0 -2
  64. examples/swe/task_app/hosted/rollout.py +0 -8
  65. examples/task_apps/crafter/task_app/grpo_crafter.py +4 -7
  66. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/policy.py +59 -1
  67. examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +30 -0
  68. examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +62 -31
  69. examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +16 -14
  70. examples/task_apps/enron/__init__.py +1 -0
  71. examples/vlm/README.md +3 -3
  72. examples/vlm/configs/crafter_vlm_gpt4o.toml +2 -0
  73. examples/vlm/crafter_openai_vlm_agent.py +3 -5
  74. examples/vlm/filter_image_rows.py +1 -1
  75. examples/vlm/run_crafter_vlm_benchmark.py +2 -2
  76. examples/warming_up_to_rl/_utils.py +92 -0
  77. examples/warming_up_to_rl/analyze_trace_db.py +1 -1
  78. examples/warming_up_to_rl/configs/crafter_fft.toml +2 -0
  79. examples/warming_up_to_rl/configs/crafter_fft_4b.toml +2 -0
  80. examples/warming_up_to_rl/configs/eval_fft_qwen4b.toml +2 -0
  81. examples/warming_up_to_rl/configs/eval_groq_qwen32b.toml +2 -0
  82. examples/warming_up_to_rl/configs/eval_modal_qwen4b.toml +2 -1
  83. examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +2 -1
  84. examples/warming_up_to_rl/configs/rl_from_ft.toml +2 -0
  85. examples/warming_up_to_rl/export_trace_sft.py +174 -60
  86. examples/warming_up_to_rl/readme.md +63 -132
  87. examples/warming_up_to_rl/run_fft_and_save.py +1 -1
  88. examples/warming_up_to_rl/run_rl_and_save.py +1 -1
  89. examples/warming_up_to_rl/task_app/README.md +42 -0
  90. examples/warming_up_to_rl/task_app/grpo_crafter.py +696 -0
  91. examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +135 -0
  92. examples/warming_up_to_rl/task_app/synth_envs_hosted/README.md +173 -0
  93. examples/warming_up_to_rl/task_app/synth_envs_hosted/__init__.py +5 -0
  94. examples/warming_up_to_rl/task_app/synth_envs_hosted/branching.py +143 -0
  95. examples/warming_up_to_rl/task_app/synth_envs_hosted/environment_routes.py +1226 -0
  96. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/__init__.py +1 -0
  97. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/__init__.py +6 -0
  98. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/app.py +1 -0
  99. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/environment.py +522 -0
  100. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +478 -0
  101. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +108 -0
  102. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/shared.py +305 -0
  103. examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/tools.py +47 -0
  104. examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +204 -0
  105. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/__init__.py +5 -0
  106. examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +618 -0
  107. examples/warming_up_to_rl/task_app/synth_envs_hosted/main.py +100 -0
  108. examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +1081 -0
  109. examples/warming_up_to_rl/task_app/synth_envs_hosted/registry.py +195 -0
  110. examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +1861 -0
  111. examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/__init__.py +5 -0
  112. examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/volume.py +211 -0
  113. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_agents.py +161 -0
  114. examples/warming_up_to_rl/task_app/synth_envs_hosted/test_service.py +137 -0
  115. examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +62 -0
  116. synth_ai/__init__.py +44 -30
  117. synth_ai/_utils/__init__.py +47 -0
  118. synth_ai/_utils/base_url.py +10 -0
  119. synth_ai/_utils/http.py +10 -0
  120. synth_ai/_utils/prompts.py +10 -0
  121. synth_ai/_utils/task_app_state.py +12 -0
  122. synth_ai/_utils/user_config.py +10 -0
  123. synth_ai/api/models/supported.py +144 -7
  124. synth_ai/api/train/__init__.py +13 -1
  125. synth_ai/api/train/cli.py +30 -7
  126. synth_ai/api/train/config_finder.py +18 -11
  127. synth_ai/api/train/env_resolver.py +13 -10
  128. synth_ai/cli/__init__.py +62 -78
  129. synth_ai/cli/_modal_wrapper.py +7 -5
  130. synth_ai/cli/_typer_patch.py +0 -2
  131. synth_ai/cli/_validate_task_app.py +22 -4
  132. synth_ai/cli/legacy_root_backup.py +3 -1
  133. synth_ai/cli/lib/__init__.py +10 -0
  134. synth_ai/cli/lib/task_app_discovery.py +7 -0
  135. synth_ai/cli/lib/task_app_env.py +518 -0
  136. synth_ai/cli/recent.py +2 -1
  137. synth_ai/cli/setup.py +266 -0
  138. synth_ai/cli/status.py +1 -1
  139. synth_ai/cli/task_app_deploy.py +16 -0
  140. synth_ai/cli/task_app_list.py +25 -0
  141. synth_ai/cli/task_app_modal_serve.py +16 -0
  142. synth_ai/cli/task_app_serve.py +18 -0
  143. synth_ai/cli/task_apps.py +71 -31
  144. synth_ai/cli/traces.py +1 -1
  145. synth_ai/cli/train.py +18 -0
  146. synth_ai/cli/tui.py +7 -2
  147. synth_ai/cli/turso.py +1 -1
  148. synth_ai/cli/watch.py +1 -1
  149. synth_ai/demos/__init__.py +10 -0
  150. synth_ai/demos/core/__init__.py +28 -1
  151. synth_ai/demos/crafter/__init__.py +1 -0
  152. synth_ai/demos/crafter/crafter_fft_4b.toml +55 -0
  153. synth_ai/demos/crafter/grpo_crafter_task_app.py +185 -0
  154. synth_ai/demos/crafter/rl_from_base_qwen4b.toml +74 -0
  155. synth_ai/demos/demo_registry.py +176 -0
  156. synth_ai/demos/math/__init__.py +1 -0
  157. synth_ai/demos/math/_common.py +16 -0
  158. synth_ai/demos/math/app.py +38 -0
  159. synth_ai/demos/math/config.toml +76 -0
  160. synth_ai/demos/math/deploy_modal.py +54 -0
  161. synth_ai/demos/math/modal_task_app.py +702 -0
  162. synth_ai/demos/math/task_app_entry.py +51 -0
  163. synth_ai/environments/environment/core.py +7 -1
  164. synth_ai/environments/examples/bandit/engine.py +0 -1
  165. synth_ai/environments/examples/bandit/environment.py +0 -1
  166. synth_ai/environments/examples/wordle/environment.py +0 -1
  167. synth_ai/evals/base.py +16 -5
  168. synth_ai/evals/client.py +1 -1
  169. synth_ai/inference/client.py +1 -1
  170. synth_ai/judge_schemas.py +8 -8
  171. synth_ai/learning/client.py +1 -1
  172. synth_ai/learning/health.py +1 -1
  173. synth_ai/learning/jobs.py +1 -1
  174. synth_ai/learning/rl/client.py +1 -1
  175. synth_ai/learning/rl/env_keys.py +1 -1
  176. synth_ai/learning/rl/secrets.py +1 -1
  177. synth_ai/learning/sft/client.py +1 -1
  178. synth_ai/learning/sft/data.py +407 -4
  179. synth_ai/learning/validators.py +4 -1
  180. synth_ai/task/apps/__init__.py +4 -2
  181. synth_ai/task/config.py +6 -4
  182. synth_ai/task/rubrics/__init__.py +1 -2
  183. synth_ai/task/rubrics/loaders.py +14 -10
  184. synth_ai/task/rubrics.py +219 -0
  185. synth_ai/task/trace_correlation_helpers.py +24 -11
  186. synth_ai/task/tracing_utils.py +14 -3
  187. synth_ai/task/validators.py +2 -3
  188. synth_ai/tracing_v3/abstractions.py +3 -3
  189. synth_ai/tracing_v3/config.py +15 -13
  190. synth_ai/tracing_v3/constants.py +21 -0
  191. synth_ai/tracing_v3/db_config.py +3 -1
  192. synth_ai/tracing_v3/decorators.py +10 -7
  193. synth_ai/tracing_v3/llm_call_record_helpers.py +5 -5
  194. synth_ai/tracing_v3/session_tracer.py +7 -7
  195. synth_ai/tracing_v3/storage/base.py +29 -29
  196. synth_ai/tracing_v3/storage/config.py +3 -3
  197. synth_ai/tracing_v3/turso/daemon.py +8 -9
  198. synth_ai/tracing_v3/turso/native_manager.py +80 -72
  199. synth_ai/tracing_v3/utils.py +2 -2
  200. synth_ai/tui/cli/query_experiments.py +4 -4
  201. synth_ai/tui/cli/query_experiments_v3.py +4 -4
  202. synth_ai/tui/dashboard.py +14 -9
  203. synth_ai/utils/__init__.py +101 -0
  204. synth_ai/utils/base_url.py +94 -0
  205. synth_ai/utils/cli.py +131 -0
  206. synth_ai/utils/env.py +287 -0
  207. synth_ai/utils/http.py +169 -0
  208. synth_ai/utils/modal.py +308 -0
  209. synth_ai/utils/process.py +212 -0
  210. synth_ai/utils/prompts.py +39 -0
  211. synth_ai/utils/sqld.py +122 -0
  212. synth_ai/utils/task_app_discovery.py +882 -0
  213. synth_ai/utils/task_app_env.py +186 -0
  214. synth_ai/utils/task_app_state.py +318 -0
  215. synth_ai/utils/user_config.py +137 -0
  216. synth_ai/v0/config/__init__.py +1 -5
  217. synth_ai/v0/config/base_url.py +1 -7
  218. synth_ai/v0/tracing/config.py +1 -1
  219. synth_ai/v0/tracing/decorators.py +1 -1
  220. synth_ai/v0/tracing/upload.py +1 -1
  221. synth_ai/v0/tracing_v1/config.py +1 -1
  222. synth_ai/v0/tracing_v1/decorators.py +1 -1
  223. synth_ai/v0/tracing_v1/upload.py +1 -1
  224. {synth_ai-0.2.14.dist-info → synth_ai-0.2.16.dist-info}/METADATA +85 -31
  225. {synth_ai-0.2.14.dist-info → synth_ai-0.2.16.dist-info}/RECORD +229 -117
  226. synth_ai/cli/man.py +0 -106
  227. synth_ai/compound/cais.py +0 -0
  228. synth_ai/core/experiment.py +0 -13
  229. synth_ai/core/system.py +0 -15
  230. synth_ai/demo_registry.py +0 -295
  231. synth_ai/handshake.py +0 -109
  232. synth_ai/http.py +0 -26
  233. {synth_ai-0.2.14.dist-info → synth_ai-0.2.16.dist-info}/WHEEL +0 -0
  234. {synth_ai-0.2.14.dist-info → synth_ai-0.2.16.dist-info}/entry_points.txt +0 -0
  235. {synth_ai-0.2.14.dist-info → synth_ai-0.2.16.dist-info}/licenses/LICENSE +0 -0
  236. {synth_ai-0.2.14.dist-info → synth_ai-0.2.16.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,219 @@
1
+ """Rubric schema, loading, and scoring helpers for Task Apps."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from collections.abc import Iterable
7
+ from pathlib import Path
8
+ from typing import Any
9
+
10
+ from pydantic import BaseModel, Field, field_validator
11
+
12
+
13
+ class Criterion(BaseModel):
14
+ id: str
15
+ description: str
16
+ weight: float = 1.0
17
+ required: bool = False
18
+
19
+ @field_validator("weight")
20
+ @classmethod
21
+ def _validate_weight(cls, value: float) -> float:
22
+ if value <= 0:
23
+ raise ValueError("criterion weight must be positive")
24
+ return value
25
+
26
+
27
+ class Rubric(BaseModel):
28
+ version: str
29
+ goal_text: str | None = None
30
+ criteria: list[Criterion] = Field(default_factory=list)
31
+ aggregation: str = "weighted_sum"
32
+
33
+ @field_validator("aggregation")
34
+ @classmethod
35
+ def _validate_aggregation(cls, value: str) -> str:
36
+ allowed = {"sum", "weighted_sum", "custom", "inherit"}
37
+ if value not in allowed:
38
+ raise ValueError(f"aggregation must be one of {sorted(allowed)}")
39
+ return value
40
+
41
+ @field_validator("criteria")
42
+ @classmethod
43
+ def _validate_criteria(cls, criteria: list[Criterion]) -> list[Criterion]:
44
+ seen = set()
45
+ for criterion in criteria:
46
+ if criterion.id in seen:
47
+ raise ValueError(f"duplicate criterion id: {criterion.id}")
48
+ seen.add(criterion.id)
49
+ return criteria
50
+
51
+
52
+ def _load_text(source: str) -> tuple[str, str | None]:
53
+ path = Path(source)
54
+ if path.exists():
55
+ return path.read_text(encoding="utf-8"), path.suffix.lower()
56
+ return source, None
57
+
58
+
59
+ def _parse_structured(text: str, suffix: str | None) -> dict[str, Any]:
60
+ text = text.strip()
61
+ if not text:
62
+ raise ValueError("Rubric source is empty")
63
+ if suffix in (".yaml", ".yml"):
64
+ try:
65
+ import yaml # type: ignore
66
+ except Exception as exc: # pragma: no cover - optional dependency
67
+ raise RuntimeError("PyYAML is required to load YAML rubrics") from exc
68
+ data = yaml.safe_load(text)
69
+ if not isinstance(data, dict):
70
+ raise ValueError("Rubric YAML must produce a mapping") from None
71
+ return data
72
+ if text.startswith("{"):
73
+ return json.loads(text)
74
+ if text.startswith("http://") or text.startswith("https://"):
75
+ import requests # type: ignore
76
+
77
+ response = requests.get(text, timeout=15)
78
+ response.raise_for_status()
79
+ return _parse_structured(response.text, suffix)
80
+ try:
81
+ return json.loads(text)
82
+ except json.JSONDecodeError:
83
+ try:
84
+ import yaml # type: ignore
85
+ except Exception as exc: # pragma: no cover - optional dependency
86
+ raise RuntimeError("PyYAML is required to load rubric text") from exc
87
+ data = yaml.safe_load(text)
88
+ if not isinstance(data, dict):
89
+ raise ValueError("Rubric text must decode to a mapping") from None
90
+ return data
91
+
92
+
93
+ def load_rubric(source: str | dict[str, Any] | Rubric | None) -> Rubric | None:
94
+ if source is None:
95
+ return None
96
+ if isinstance(source, Rubric):
97
+ return source
98
+ if isinstance(source, dict):
99
+ return Rubric.model_validate(source)
100
+ text, suffix = _load_text(str(source))
101
+ data = _parse_structured(text, suffix)
102
+ return Rubric.model_validate(data)
103
+
104
+
105
+ def _merge_weights(base: Criterion, override: Criterion) -> float:
106
+ if override.weight != 1.0 and base.weight != 1.0:
107
+ return base.weight * override.weight
108
+ if override.weight != 1.0:
109
+ return override.weight
110
+ return base.weight
111
+
112
+
113
+ def blend_rubrics(base: Rubric | None, override: Rubric | None) -> Rubric | None:
114
+ if override is None and base is None:
115
+ return None
116
+ if base is None:
117
+ return override
118
+ if override is None:
119
+ return base
120
+
121
+ base_map = {criterion.id: criterion for criterion in base.criteria}
122
+ merged: list[Criterion] = []
123
+
124
+ for ov in override.criteria:
125
+ if ov.id in base_map:
126
+ existing = base_map.pop(ov.id)
127
+ merged.append(
128
+ Criterion(
129
+ id=ov.id,
130
+ description=ov.description or existing.description,
131
+ weight=_merge_weights(existing, ov),
132
+ required=ov.required if ov.required is not None else existing.required,
133
+ )
134
+ )
135
+ else:
136
+ merged.append(ov)
137
+
138
+ merged.extend(base_map.values())
139
+
140
+ aggregation = override.aggregation
141
+ if aggregation == "inherit":
142
+ aggregation = base.aggregation
143
+
144
+ return Rubric(
145
+ version=override.version or base.version,
146
+ goal_text=override.goal_text or base.goal_text,
147
+ criteria=merged,
148
+ aggregation=aggregation,
149
+ )
150
+
151
+
152
+ def _as_float(value: Any) -> float | None:
153
+ try:
154
+ return float(value)
155
+ except Exception:
156
+ return None
157
+
158
+
159
+ def _score(
160
+ criteria: Iterable[Criterion], values: dict[str, float], aggregation: str
161
+ ) -> dict[str, Any]:
162
+ if aggregation == "inherit":
163
+ aggregation = "weighted_sum"
164
+ per_criterion: dict[str, dict[str, Any]] = {}
165
+ total = 0.0
166
+ total_weight = 0.0
167
+ for criterion in criteria:
168
+ score = values.get(criterion.id, 0.0)
169
+ per_criterion[criterion.id] = {
170
+ "score": score,
171
+ "weight": criterion.weight,
172
+ "required": criterion.required,
173
+ }
174
+ if aggregation == "sum":
175
+ total += score
176
+ elif aggregation == "weighted_sum":
177
+ total += score * criterion.weight
178
+ total_weight += criterion.weight
179
+ if aggregation == "weighted_sum" and total_weight > 0:
180
+ total = total / total_weight
181
+ if aggregation == "custom":
182
+ total = None # type: ignore[assignment]
183
+ return {
184
+ "aggregation": aggregation,
185
+ "score": total,
186
+ "per_criterion": per_criterion,
187
+ }
188
+
189
+
190
+ def score_events_against_rubric(
191
+ events: list[dict[str, Any]], rubric: Rubric | None
192
+ ) -> dict[str, Any]:
193
+ if rubric is None:
194
+ return {"aggregation": "none", "score": None, "per_criterion": {}}
195
+ values: dict[str, float] = {}
196
+ for event in events or []:
197
+ if not isinstance(event, dict):
198
+ continue
199
+ cid = event.get("criterion_id") or event.get("id") or event.get("criterion")
200
+ score = _as_float(event.get("score"))
201
+ if cid and score is not None:
202
+ values[str(cid)] = score
203
+ return _score(rubric.criteria, values, rubric.aggregation)
204
+
205
+
206
+ def score_outcome_against_rubric(outcome: dict[str, Any], rubric: Rubric | None) -> dict[str, Any]:
207
+ if rubric is None:
208
+ return {"aggregation": "none", "score": None, "per_criterion": {}}
209
+ values: dict[str, float] = {}
210
+ if isinstance(outcome, dict):
211
+ candidates = (
212
+ outcome.get("criteria") if isinstance(outcome.get("criteria"), dict) else outcome
213
+ )
214
+ if isinstance(candidates, dict):
215
+ for key, value in candidates.items():
216
+ score = _as_float(value)
217
+ if score is not None:
218
+ values[str(key)] = score
219
+ return _score(rubric.criteria, values, rubric.aggregation)
@@ -7,8 +7,9 @@ This module provides utilities for task apps to:
7
7
  See monorepo/trace_creation_and_judgement.txt "Fatal Guards" section for requirements.
8
8
  """
9
9
 
10
+ import importlib
10
11
  import logging
11
- from typing import Any
12
+ from typing import Any, cast
12
13
  from urllib.parse import parse_qs, urlparse
13
14
 
14
15
  logger = logging.getLogger(__name__)
@@ -63,13 +64,25 @@ def extract_trace_correlation_id(
63
64
  return stripped
64
65
 
65
66
  # Determine if we're in EVAL mode (trace_correlation_id not required for eval)
67
+ rollout_mode_cls: Any | None = None
66
68
  try:
67
- from synth_ai.task.contracts import RolloutMode
68
- is_eval_mode = (mode == "eval" or mode == RolloutMode.EVAL or
69
- (hasattr(mode, 'value') and mode.value == "eval"))
70
- except ImportError:
71
- # If RolloutMode not available, fall back to string comparison
72
- is_eval_mode = (mode == "eval")
69
+ contracts_module = importlib.import_module("synth_ai.task.contracts")
70
+ rollout_mode_cls = getattr(contracts_module, "RolloutMode", None)
71
+ except Exception:
72
+ rollout_mode_cls = None
73
+
74
+ is_eval_mode = False
75
+ if rollout_mode_cls is not None:
76
+ try:
77
+ is_eval_mode = (
78
+ mode == "eval"
79
+ or mode == rollout_mode_cls.EVAL
80
+ or getattr(mode, "value", None) == "eval"
81
+ )
82
+ except Exception:
83
+ is_eval_mode = mode == "eval"
84
+ else:
85
+ is_eval_mode = mode == "eval" or getattr(mode, "value", None) == "eval"
73
86
 
74
87
  # Fallback: try to extract from inference_url query params
75
88
  if not inference_url or not isinstance(inference_url, str):
@@ -87,10 +100,12 @@ def extract_trace_correlation_id(
87
100
 
88
101
  try:
89
102
  parsed = urlparse(inference_url)
90
- query_params = parse_qs(parsed.query or "")
103
+ query_params = cast(dict[str, list[str]], parse_qs(parsed.query or ""))
91
104
  # Try multiple possible query param names
92
105
  for param_name in ["cid", "trace_correlation_id", "trace"]:
93
- values = query_params.get(param_name, [])
106
+ values = query_params.get(param_name)
107
+ if not values:
108
+ continue
94
109
  for value in values:
95
110
  if isinstance(value, str) and value.strip():
96
111
  correlation_id = value.strip()
@@ -311,5 +326,3 @@ def verify_trace_correlation_id_in_response(
311
326
  expected_correlation_id
312
327
  )
313
328
  return True
314
-
315
-
@@ -4,9 +4,12 @@ from __future__ import annotations
4
4
 
5
5
  import os
6
6
  from collections.abc import Callable
7
+ from datetime import datetime
7
8
  from pathlib import Path
8
9
  from typing import Any
9
10
 
11
+ from synth_ai.tracing_v3.constants import TRACE_DB_DIR, canonical_trace_db_name
12
+
10
13
 
11
14
  def tracing_env_enabled(default: bool = False) -> bool:
12
15
  """Return True when tracing is enabled for task apps via environment variable."""
@@ -40,9 +43,17 @@ def resolve_tracing_db_url() -> str | None:
40
43
  path.parent.mkdir(parents=True, exist_ok=True)
41
44
  return f"sqlite+aiosqlite:///{path}"
42
45
 
43
- fallback_path = Path("traces/v3/synth_ai.db").expanduser()
44
- fallback_path.parent.mkdir(parents=True, exist_ok=True)
45
- return f"sqlite+aiosqlite:///{fallback_path}"
46
+ existing = os.getenv("TASKAPP_TRACE_DB_PATH")
47
+ if existing:
48
+ path = Path(existing).expanduser()
49
+ else:
50
+ base_dir = TRACE_DB_DIR.expanduser()
51
+ base_dir.mkdir(parents=True, exist_ok=True)
52
+ path = base_dir / canonical_trace_db_name(timestamp=datetime.now())
53
+ os.environ["TASKAPP_TRACE_DB_PATH"] = str(path)
54
+ os.environ.setdefault("SQLD_DB_PATH", str(path))
55
+ path.parent.mkdir(parents=True, exist_ok=True)
56
+ return f"sqlite+aiosqlite:///{path}"
46
57
 
47
58
 
48
59
  def build_tracer_factory(
@@ -3,12 +3,11 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  import re
6
- from typing import Any
6
+ from typing import Any, cast
7
7
  from urllib.parse import urlparse, urlunparse
8
8
 
9
9
  import click
10
10
  import httpx
11
-
12
11
  from synth_ai.task.contracts import TaskAppEndpoints # type: ignore[attr-defined]
13
12
 
14
13
 
@@ -152,7 +151,7 @@ def normalize_inference_url(url: str | None, *, default: str = "https://api.open
152
151
  new_path = f"{path}/v1/chat/completions" if path else "/v1/chat/completions"
153
152
 
154
153
  # Reconstruct URL with new path and original query/fragment
155
- return urlunparse(parsed._replace(path=new_path))
154
+ return cast(str, urlunparse(parsed._replace(path=new_path)))
156
155
 
157
156
 
158
157
  def validate_task_app_url(url: str | None) -> str:
@@ -37,7 +37,7 @@ Concepts:
37
37
  from __future__ import annotations
38
38
 
39
39
  from dataclasses import asdict, dataclass, field
40
- from datetime import datetime, timezone
40
+ from datetime import UTC, datetime
41
41
  from typing import Any
42
42
 
43
43
  from .lm_call_record_abstractions import LLMCallRecord
@@ -249,7 +249,7 @@ class SessionTimeStep:
249
249
 
250
250
  step_id: str = ""
251
251
  step_index: int = 0
252
- timestamp: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
252
+ timestamp: datetime = field(default_factory=lambda: datetime.now(UTC))
253
253
  turn_number: int | None = None
254
254
  events: list[BaseEvent] = field(default_factory=list)
255
255
  markov_blanket_messages: list[SessionEventMarkovBlanketMessage] = field(default_factory=list)
@@ -283,7 +283,7 @@ class SessionTrace:
283
283
  """
284
284
 
285
285
  session_id: str = ""
286
- created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
286
+ created_at: datetime = field(default_factory=lambda: datetime.now(UTC))
287
287
  session_time_steps: list[SessionTimeStep] = field(default_factory=list)
288
288
  event_history: list[BaseEvent] = field(default_factory=list)
289
289
  markov_blanket_message_history: list[SessionEventMarkovBlanketMessage] = field(
@@ -3,27 +3,29 @@
3
3
  import os
4
4
  from dataclasses import dataclass
5
5
 
6
+ from synth_ai.tracing_v3.constants import canonical_trace_db_path
7
+
8
+ DEFAULT_DB_FILE = str(canonical_trace_db_path())
9
+
10
+
11
+ def _default_sqlite_url() -> str:
12
+ base_path = os.path.abspath(os.getenv("SQLD_DB_PATH", DEFAULT_DB_FILE))
13
+ candidate = os.path.join(base_path, "dbs", "default", "data")
14
+ if os.path.isdir(base_path) and os.path.exists(candidate):
15
+ return f"sqlite+aiosqlite:///{candidate}"
16
+ return f"sqlite+aiosqlite:///{base_path}"
17
+
6
18
 
7
19
  @dataclass
8
20
  class TursoConfig:
9
21
  """Configuration for Turso/sqld connection."""
10
22
 
11
23
  # Default values matching serve.sh
12
- DEFAULT_DB_FILE = "traces/v3/synth_ai.db"
24
+ DEFAULT_DB_FILE = DEFAULT_DB_FILE
13
25
  DEFAULT_HTTP_PORT = 8080
14
26
 
15
- # Local embedded database for async SQLAlchemy
16
- # Resolve to the actual SQLite file used by sqld if the base path is a directory
17
- def _resolve_sqlite_db_url() -> str: # type: ignore[no-redef]
18
- base_path = os.path.abspath(os.getenv("SQLD_DB_PATH", "traces/v3/synth_ai.db"))
19
- # If sqld is managing this DB, the real SQLite file lives under dbs/default/data
20
- candidate = os.path.join(base_path, "dbs", "default", "data")
21
- if os.path.isdir(base_path) and os.path.exists(candidate):
22
- return f"sqlite+aiosqlite:///{candidate}"
23
- return f"sqlite+aiosqlite:///{base_path}"
24
-
25
27
  # Use env override if provided; otherwise resolve based on SQLD layout
26
- db_url: str = os.getenv("TURSO_LOCAL_DB_URL", _resolve_sqlite_db_url())
28
+ db_url: str = os.getenv("TURSO_LOCAL_DB_URL", _default_sqlite_url())
27
29
 
28
30
  # Remote database sync configuration
29
31
  sync_url: str = os.getenv("TURSO_DATABASE_URL", "")
@@ -48,7 +50,7 @@ class TursoConfig:
48
50
 
49
51
  # Daemon settings (for local sqld) - match serve.sh defaults
50
52
  sqld_binary: str = os.getenv("SQLD_BINARY", "sqld")
51
- sqld_db_path: str = os.getenv("SQLD_DB_PATH", "traces/v3/synth_ai.db")
53
+ sqld_db_path: str = os.getenv("SQLD_DB_PATH", DEFAULT_DB_FILE)
52
54
  sqld_http_port: int = int(os.getenv("SQLD_HTTP_PORT", "8080"))
53
55
  sqld_idle_shutdown: int = int(os.getenv("SQLD_IDLE_SHUTDOWN", "0")) # 0 = no idle shutdown
54
56
 
@@ -0,0 +1,21 @@
1
+ from __future__ import annotations
2
+
3
+ from datetime import datetime
4
+ from pathlib import Path
5
+
6
+ TRACE_DB_DIR = Path("traces")
7
+ TRACE_DB_BASENAME = "task_app_traces"
8
+
9
+
10
+ def canonical_trace_db_name(*, timestamp: datetime | None = None) -> str:
11
+ """Return the canonical trace database filename (with optional timestamp suffix)."""
12
+
13
+ if timestamp is None:
14
+ return f"{TRACE_DB_BASENAME}.db"
15
+ return f"{TRACE_DB_BASENAME}_{timestamp.strftime('%Y-%m-%d_%H-%M-%S')}.db"
16
+
17
+
18
+ def canonical_trace_db_path(*, timestamp: datetime | None = None) -> Path:
19
+ """Return the canonical trace database path within the default trace directory."""
20
+
21
+ return TRACE_DB_DIR / canonical_trace_db_name(timestamp=timestamp)
@@ -7,6 +7,8 @@ import os
7
7
  import shutil
8
8
  from typing import TYPE_CHECKING, Optional
9
9
 
10
+ from synth_ai.tracing_v3.constants import canonical_trace_db_path
11
+
10
12
  if TYPE_CHECKING:
11
13
  from .turso.daemon import SqldDaemon
12
14
 
@@ -17,7 +19,7 @@ class DatabaseConfig:
17
19
  """Centralized database configuration management."""
18
20
 
19
21
  # Default values from serve.sh
20
- DEFAULT_DB_FILE = "traces/v3/synth_ai.db"
22
+ DEFAULT_DB_FILE = str(canonical_trace_db_path())
21
23
  DEFAULT_HTTP_PORT = 8080
22
24
 
23
25
  def __init__(
@@ -29,6 +29,7 @@ import contextvars
29
29
  import functools
30
30
  import time
31
31
  from collections.abc import Awaitable, Callable, Mapping
32
+ from contextvars import Token
32
33
  from typing import Any, TypeVar, cast, overload
33
34
 
34
35
  from .abstractions import LMCAISEvent, TimeRecord
@@ -367,11 +368,11 @@ class SessionContext:
367
368
  ```
368
369
  """
369
370
 
370
- def __init__(self, session_id: str, tracer=None):
371
+ def __init__(self, session_id: str, tracer: Any | None = None):
371
372
  self.session_id = session_id
372
373
  self.tracer = tracer
373
- self._token = None
374
- self._tracer_token = None
374
+ self._token: Token[str | None] | None = None
375
+ self._tracer_token: Token[Any] | None = None
375
376
 
376
377
  def __enter__(self):
377
378
  # Store tokens to restore previous context on exit
@@ -382,8 +383,9 @@ class SessionContext:
382
383
 
383
384
  def __exit__(self, exc_type, exc_val, exc_tb):
384
385
  # Restore previous context - this is crucial for proper isolation
385
- _session_id_ctx.reset(self._token)
386
- if self._tracer_token:
386
+ if self._token is not None:
387
+ _session_id_ctx.reset(self._token)
388
+ if self._tracer_token is not None:
387
389
  _session_tracer_ctx.reset(self._tracer_token)
388
390
 
389
391
  async def __aenter__(self):
@@ -393,6 +395,7 @@ class SessionContext:
393
395
  return self
394
396
 
395
397
  async def __aexit__(self, exc_type, exc_val, exc_tb):
396
- _session_id_ctx.reset(self._token)
397
- if self._tracer_token:
398
+ if self._token is not None:
399
+ _session_id_ctx.reset(self._token)
400
+ if self._tracer_token is not None:
398
401
  _session_tracer_ctx.reset(self._tracer_token)
@@ -8,7 +8,7 @@ from __future__ import annotations
8
8
 
9
9
  import uuid
10
10
  from dataclasses import dataclass, field
11
- from datetime import datetime, timezone
11
+ from datetime import UTC, datetime
12
12
  from typing import Any, TypedDict, cast
13
13
 
14
14
  from .lm_call_record_abstractions import (
@@ -180,8 +180,8 @@ def create_llm_call_record_from_response(
180
180
  api_type=api_type,
181
181
  provider=provider,
182
182
  model_name=model_name,
183
- started_at=started_at or datetime.now(timezone.utc),
184
- completed_at=completed_at or datetime.now(timezone.utc),
183
+ started_at=started_at or datetime.now(UTC),
184
+ completed_at=completed_at or datetime.now(UTC),
185
185
  latency_ms=latency_ms,
186
186
  request_params=params,
187
187
  input_messages=input_messages,
@@ -376,8 +376,8 @@ def create_llm_call_record_from_streaming(
376
376
  api_type="responses", # Streaming typically from Responses API
377
377
  provider=provider,
378
378
  model_name=model_name,
379
- started_at=started_at or datetime.now(timezone.utc),
380
- completed_at=completed_at or datetime.now(timezone.utc),
379
+ started_at=started_at or datetime.now(UTC),
380
+ completed_at=completed_at or datetime.now(UTC),
381
381
  latency_ms=latency_ms,
382
382
  request_params=params,
383
383
  input_messages=input_messages,
@@ -5,7 +5,7 @@ from __future__ import annotations
5
5
  import asyncio
6
6
  import json
7
7
  from contextlib import asynccontextmanager
8
- from datetime import datetime, timezone
8
+ from datetime import UTC, datetime
9
9
  from typing import Any
10
10
 
11
11
  from .abstractions import (
@@ -106,7 +106,7 @@ class SessionTracer:
106
106
 
107
107
  self._current_trace = SessionTrace(
108
108
  session_id=session_id,
109
- created_at=datetime.now(timezone.utc),
109
+ created_at=datetime.now(UTC),
110
110
  session_time_steps=[],
111
111
  event_history=[],
112
112
  markov_blanket_message_history=[],
@@ -152,7 +152,7 @@ class SessionTracer:
152
152
  step = SessionTimeStep(
153
153
  step_id=step_id,
154
154
  step_index=len(self._current_trace.session_time_steps),
155
- timestamp=datetime.now(timezone.utc),
155
+ timestamp=datetime.now(UTC),
156
156
  turn_number=turn_number,
157
157
  step_metadata=metadata or {},
158
158
  )
@@ -197,7 +197,7 @@ class SessionTracer:
197
197
  step = self._current_step
198
198
 
199
199
  if step and step.completed_at is None:
200
- step.completed_at = datetime.now(timezone.utc)
200
+ step.completed_at = datetime.now(UTC)
201
201
 
202
202
  # Trigger hooks
203
203
  await self.hooks.trigger(
@@ -294,7 +294,7 @@ class SessionTracer:
294
294
  content=normalised_content,
295
295
  message_type=message_type,
296
296
  time_record=TimeRecord(
297
- event_time=event_time or datetime.now(timezone.utc).timestamp(), message_time=message_time
297
+ event_time=event_time or datetime.now(UTC).timestamp(), message_time=message_time
298
298
  ),
299
299
  metadata=metadata or {},
300
300
  )
@@ -368,7 +368,7 @@ class SessionTracer:
368
368
  # End any open timesteps
369
369
  for step in self._current_trace.session_time_steps:
370
370
  if step.completed_at is None:
371
- step.completed_at = datetime.now(timezone.utc)
371
+ step.completed_at = datetime.now(UTC)
372
372
 
373
373
  # Trigger pre-save hooks
374
374
  await self.hooks.trigger("before_save", session=self._current_trace)
@@ -384,7 +384,7 @@ class SessionTracer:
384
384
  if should_save and self.db:
385
385
  _logger.info(f"[TRACE_DEBUG] Calling insert_session_trace with {len(self._current_trace.markov_blanket_message_history)} messages")
386
386
  await self.db.insert_session_trace(self._current_trace)
387
- _logger.info(f"[TRACE_DEBUG] insert_session_trace completed")
387
+ _logger.info("[TRACE_DEBUG] insert_session_trace completed")
388
388
 
389
389
  # Trigger post-save hooks
390
390
  await self.hooks.trigger("after_save", session=self._current_trace)