synth-ai 0.2.14__py3-none-any.whl → 0.2.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of synth-ai might be problematic. Click here for more details.
- examples/README.md +1 -0
- examples/multi_step/SFT_README.md +147 -0
- examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +9 -9
- examples/multi_step/configs/crafter_sft_qwen30b_lora.toml +62 -0
- examples/multi_step/convert_traces_to_sft.py +84 -0
- examples/multi_step/run_sft_qwen30b.sh +45 -0
- examples/qwen_coder/configs/coder_lora_30b.toml +2 -1
- examples/qwen_coder/configs/coder_lora_4b.toml +2 -1
- examples/qwen_coder/configs/coder_lora_small.toml +2 -1
- examples/qwen_vl/BUGS_AND_FIXES.md +232 -0
- examples/qwen_vl/IMAGE_VALIDATION_COMPLETE.md +271 -0
- examples/qwen_vl/IMAGE_VALIDATION_SUMMARY.md +260 -0
- examples/qwen_vl/INFERENCE_SFT_TESTS.md +412 -0
- examples/qwen_vl/NEXT_STEPS_2B.md +325 -0
- examples/qwen_vl/QUICKSTART.md +327 -0
- examples/qwen_vl/QUICKSTART_RL_VISION.md +110 -0
- examples/qwen_vl/README.md +154 -0
- examples/qwen_vl/RL_VISION_COMPLETE.md +475 -0
- examples/qwen_vl/RL_VISION_TESTING.md +333 -0
- examples/qwen_vl/SDK_VISION_INTEGRATION.md +328 -0
- examples/qwen_vl/SETUP_COMPLETE.md +275 -0
- examples/qwen_vl/VISION_TESTS_COMPLETE.md +490 -0
- examples/qwen_vl/VLM_PIPELINE_COMPLETE.md +242 -0
- examples/qwen_vl/__init__.py +2 -0
- examples/qwen_vl/collect_data_via_cli.md +423 -0
- examples/qwen_vl/collect_vision_traces.py +368 -0
- examples/qwen_vl/configs/crafter_rl_vision_qwen3vl4b.toml +127 -0
- examples/qwen_vl/configs/crafter_vlm_sft_example.toml +60 -0
- examples/qwen_vl/configs/eval_gpt4o_mini_vision.toml +43 -0
- examples/qwen_vl/configs/eval_gpt4o_vision_proper.toml +29 -0
- examples/qwen_vl/configs/eval_gpt5nano_vision.toml +45 -0
- examples/qwen_vl/configs/eval_qwen2vl_vision.toml +44 -0
- examples/qwen_vl/configs/filter_qwen2vl_sft.toml +50 -0
- examples/qwen_vl/configs/filter_vision_sft.toml +53 -0
- examples/qwen_vl/configs/filter_vision_test.toml +8 -0
- examples/qwen_vl/configs/sft_qwen3_vl_2b_test.toml +54 -0
- examples/qwen_vl/crafter_gpt5nano_agent.py +308 -0
- examples/qwen_vl/crafter_qwen_vl_agent.py +300 -0
- examples/qwen_vl/run_vision_comparison.sh +62 -0
- examples/qwen_vl/run_vision_sft_pipeline.sh +175 -0
- examples/qwen_vl/test_image_validation.py +201 -0
- examples/qwen_vl/test_sft_vision_data.py +110 -0
- examples/rl/README.md +1 -1
- examples/rl/configs/eval_base_qwen.toml +17 -0
- examples/rl/configs/eval_rl_qwen.toml +13 -0
- examples/rl/configs/rl_from_base_qwen.toml +37 -0
- examples/rl/configs/rl_from_base_qwen17.toml +76 -0
- examples/rl/configs/rl_from_ft_qwen.toml +37 -0
- examples/rl/run_eval.py +436 -0
- examples/rl/run_rl_and_save.py +111 -0
- examples/rl/task_app/README.md +22 -0
- examples/rl/task_app/math_single_step.py +990 -0
- examples/rl/task_app/math_task_app.py +111 -0
- examples/sft/README.md +5 -5
- examples/sft/configs/crafter_fft_qwen0p6b.toml +4 -2
- examples/sft/configs/crafter_lora_qwen0p6b.toml +4 -3
- examples/sft/evaluate.py +2 -4
- examples/sft/export_dataset.py +7 -4
- examples/swe/task_app/README.md +1 -1
- examples/swe/task_app/grpo_swe_mini.py +0 -1
- examples/swe/task_app/grpo_swe_mini_task_app.py +0 -12
- examples/swe/task_app/hosted/envs/mini_swe/environment.py +13 -13
- examples/swe/task_app/hosted/policy_routes.py +0 -2
- examples/swe/task_app/hosted/rollout.py +0 -8
- examples/task_apps/crafter/task_app/grpo_crafter.py +4 -7
- examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/policy.py +59 -1
- examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +30 -0
- examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +62 -31
- examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +16 -14
- examples/task_apps/enron/__init__.py +1 -0
- examples/vlm/README.md +3 -3
- examples/vlm/configs/crafter_vlm_gpt4o.toml +2 -0
- examples/vlm/crafter_openai_vlm_agent.py +3 -5
- examples/vlm/filter_image_rows.py +1 -1
- examples/vlm/run_crafter_vlm_benchmark.py +2 -2
- examples/warming_up_to_rl/_utils.py +92 -0
- examples/warming_up_to_rl/analyze_trace_db.py +1 -1
- examples/warming_up_to_rl/configs/crafter_fft.toml +2 -0
- examples/warming_up_to_rl/configs/crafter_fft_4b.toml +2 -0
- examples/warming_up_to_rl/configs/eval_fft_qwen4b.toml +2 -0
- examples/warming_up_to_rl/configs/eval_groq_qwen32b.toml +2 -0
- examples/warming_up_to_rl/configs/eval_modal_qwen4b.toml +2 -1
- examples/warming_up_to_rl/configs/rl_from_base_qwen4b.toml +2 -1
- examples/warming_up_to_rl/configs/rl_from_ft.toml +2 -0
- examples/warming_up_to_rl/export_trace_sft.py +174 -60
- examples/warming_up_to_rl/readme.md +63 -132
- examples/warming_up_to_rl/run_fft_and_save.py +1 -1
- examples/warming_up_to_rl/run_rl_and_save.py +1 -1
- examples/warming_up_to_rl/task_app/README.md +42 -0
- examples/warming_up_to_rl/task_app/grpo_crafter.py +696 -0
- examples/warming_up_to_rl/task_app/grpo_crafter_task_app.py +135 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/README.md +173 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/__init__.py +5 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/branching.py +143 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/environment_routes.py +1226 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/__init__.py +1 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/__init__.py +6 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/app.py +1 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/environment.py +522 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/policy.py +478 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/react_agent.py +108 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/shared.py +305 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/envs/crafter/tools.py +47 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/hosted_app.py +204 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/__init__.py +5 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/inference/openai_client.py +618 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/main.py +100 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/policy_routes.py +1081 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/registry.py +195 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/rollout.py +1861 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/__init__.py +5 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/storage/volume.py +211 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/test_agents.py +161 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/test_service.py +137 -0
- examples/warming_up_to_rl/task_app/synth_envs_hosted/utils.py +62 -0
- synth_ai/__init__.py +44 -30
- synth_ai/_utils/__init__.py +47 -0
- synth_ai/_utils/base_url.py +10 -0
- synth_ai/_utils/http.py +10 -0
- synth_ai/_utils/prompts.py +10 -0
- synth_ai/_utils/task_app_state.py +12 -0
- synth_ai/_utils/user_config.py +10 -0
- synth_ai/api/models/supported.py +144 -7
- synth_ai/api/train/__init__.py +13 -1
- synth_ai/api/train/cli.py +30 -7
- synth_ai/api/train/config_finder.py +18 -11
- synth_ai/api/train/env_resolver.py +13 -10
- synth_ai/cli/__init__.py +62 -78
- synth_ai/cli/_modal_wrapper.py +7 -5
- synth_ai/cli/_typer_patch.py +0 -2
- synth_ai/cli/_validate_task_app.py +22 -4
- synth_ai/cli/legacy_root_backup.py +3 -1
- synth_ai/cli/lib/__init__.py +10 -0
- synth_ai/cli/lib/task_app_discovery.py +7 -0
- synth_ai/cli/lib/task_app_env.py +518 -0
- synth_ai/cli/recent.py +2 -1
- synth_ai/cli/setup.py +266 -0
- synth_ai/cli/status.py +1 -1
- synth_ai/cli/task_app_deploy.py +16 -0
- synth_ai/cli/task_app_list.py +25 -0
- synth_ai/cli/task_app_modal_serve.py +16 -0
- synth_ai/cli/task_app_serve.py +18 -0
- synth_ai/cli/task_apps.py +71 -31
- synth_ai/cli/traces.py +1 -1
- synth_ai/cli/train.py +18 -0
- synth_ai/cli/tui.py +7 -2
- synth_ai/cli/turso.py +1 -1
- synth_ai/cli/watch.py +1 -1
- synth_ai/demos/__init__.py +10 -0
- synth_ai/demos/core/__init__.py +28 -1
- synth_ai/demos/crafter/__init__.py +1 -0
- synth_ai/demos/crafter/crafter_fft_4b.toml +55 -0
- synth_ai/demos/crafter/grpo_crafter_task_app.py +185 -0
- synth_ai/demos/crafter/rl_from_base_qwen4b.toml +74 -0
- synth_ai/demos/demo_registry.py +176 -0
- synth_ai/demos/math/__init__.py +1 -0
- synth_ai/demos/math/_common.py +16 -0
- synth_ai/demos/math/app.py +38 -0
- synth_ai/demos/math/config.toml +76 -0
- synth_ai/demos/math/deploy_modal.py +54 -0
- synth_ai/demos/math/modal_task_app.py +702 -0
- synth_ai/demos/math/task_app_entry.py +51 -0
- synth_ai/environments/environment/core.py +7 -1
- synth_ai/environments/examples/bandit/engine.py +0 -1
- synth_ai/environments/examples/bandit/environment.py +0 -1
- synth_ai/environments/examples/wordle/environment.py +0 -1
- synth_ai/evals/base.py +16 -5
- synth_ai/evals/client.py +1 -1
- synth_ai/inference/client.py +1 -1
- synth_ai/judge_schemas.py +8 -8
- synth_ai/learning/client.py +1 -1
- synth_ai/learning/health.py +1 -1
- synth_ai/learning/jobs.py +1 -1
- synth_ai/learning/rl/client.py +1 -1
- synth_ai/learning/rl/env_keys.py +1 -1
- synth_ai/learning/rl/secrets.py +1 -1
- synth_ai/learning/sft/client.py +1 -1
- synth_ai/learning/sft/data.py +407 -4
- synth_ai/learning/validators.py +4 -1
- synth_ai/task/apps/__init__.py +4 -2
- synth_ai/task/config.py +6 -4
- synth_ai/task/rubrics/__init__.py +1 -2
- synth_ai/task/rubrics/loaders.py +14 -10
- synth_ai/task/rubrics.py +219 -0
- synth_ai/task/trace_correlation_helpers.py +24 -11
- synth_ai/task/tracing_utils.py +14 -3
- synth_ai/task/validators.py +2 -3
- synth_ai/tracing_v3/abstractions.py +3 -3
- synth_ai/tracing_v3/config.py +15 -13
- synth_ai/tracing_v3/constants.py +21 -0
- synth_ai/tracing_v3/db_config.py +3 -1
- synth_ai/tracing_v3/decorators.py +10 -7
- synth_ai/tracing_v3/llm_call_record_helpers.py +5 -5
- synth_ai/tracing_v3/session_tracer.py +7 -7
- synth_ai/tracing_v3/storage/base.py +29 -29
- synth_ai/tracing_v3/storage/config.py +3 -3
- synth_ai/tracing_v3/turso/daemon.py +8 -9
- synth_ai/tracing_v3/turso/native_manager.py +80 -72
- synth_ai/tracing_v3/utils.py +2 -2
- synth_ai/tui/cli/query_experiments.py +4 -4
- synth_ai/tui/cli/query_experiments_v3.py +4 -4
- synth_ai/tui/dashboard.py +14 -9
- synth_ai/utils/__init__.py +101 -0
- synth_ai/utils/base_url.py +94 -0
- synth_ai/utils/cli.py +131 -0
- synth_ai/utils/env.py +287 -0
- synth_ai/utils/http.py +169 -0
- synth_ai/utils/modal.py +308 -0
- synth_ai/utils/process.py +212 -0
- synth_ai/utils/prompts.py +39 -0
- synth_ai/utils/sqld.py +122 -0
- synth_ai/utils/task_app_discovery.py +882 -0
- synth_ai/utils/task_app_env.py +186 -0
- synth_ai/utils/task_app_state.py +318 -0
- synth_ai/utils/user_config.py +137 -0
- synth_ai/v0/config/__init__.py +1 -5
- synth_ai/v0/config/base_url.py +1 -7
- synth_ai/v0/tracing/config.py +1 -1
- synth_ai/v0/tracing/decorators.py +1 -1
- synth_ai/v0/tracing/upload.py +1 -1
- synth_ai/v0/tracing_v1/config.py +1 -1
- synth_ai/v0/tracing_v1/decorators.py +1 -1
- synth_ai/v0/tracing_v1/upload.py +1 -1
- {synth_ai-0.2.14.dist-info → synth_ai-0.2.16.dist-info}/METADATA +85 -31
- {synth_ai-0.2.14.dist-info → synth_ai-0.2.16.dist-info}/RECORD +229 -117
- synth_ai/cli/man.py +0 -106
- synth_ai/compound/cais.py +0 -0
- synth_ai/core/experiment.py +0 -13
- synth_ai/core/system.py +0 -15
- synth_ai/demo_registry.py +0 -295
- synth_ai/handshake.py +0 -109
- synth_ai/http.py +0 -26
- {synth_ai-0.2.14.dist-info → synth_ai-0.2.16.dist-info}/WHEEL +0 -0
- {synth_ai-0.2.14.dist-info → synth_ai-0.2.16.dist-info}/entry_points.txt +0 -0
- {synth_ai-0.2.14.dist-info → synth_ai-0.2.16.dist-info}/licenses/LICENSE +0 -0
- {synth_ai-0.2.14.dist-info → synth_ai-0.2.16.dist-info}/top_level.txt +0 -0
synth_ai/task/rubrics.py
ADDED
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
"""Rubric schema, loading, and scoring helpers for Task Apps."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from collections.abc import Iterable
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from pydantic import BaseModel, Field, field_validator
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Criterion(BaseModel):
    """A single rubric criterion: identifier, description, and scoring metadata."""

    id: str
    description: str
    # Relative importance of this criterion; must be a finite number > 0.
    weight: float = 1.0
    required: bool = False

    @field_validator("weight")
    @classmethod
    def _validate_weight(cls, value: float) -> float:
        """Reject weights that are NaN, infinite, zero, or negative.

        Raises:
            ValueError: if ``value`` is not a finite number greater than zero.
        """
        import math

        # NaN compares False against everything, so ``value <= 0`` alone would
        # let it through; an infinite weight is equally unusable as a divisor.
        if not math.isfinite(value):
            raise ValueError("criterion weight must be a finite number")
        if value <= 0:
            raise ValueError("criterion weight must be positive")
        return value
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class Rubric(BaseModel):
    """A versioned set of criteria plus the rule used to combine their scores."""

    version: str
    goal_text: str | None = None
    criteria: list[Criterion] = Field(default_factory=list)
    aggregation: str = "weighted_sum"

    @field_validator("aggregation")
    @classmethod
    def _validate_aggregation(cls, value: str) -> str:
        """Accept only the known aggregation modes; raise ``ValueError`` otherwise."""
        allowed = {"sum", "weighted_sum", "custom", "inherit"}
        if value in allowed:
            return value
        raise ValueError(f"aggregation must be one of {sorted(allowed)}")

    @field_validator("criteria")
    @classmethod
    def _validate_criteria(cls, criteria: list[Criterion]) -> list[Criterion]:
        """Ensure each criterion id appears at most once; raise on the first repeat."""
        known_ids: set[str] = set()
        for item in criteria:
            if item.id in known_ids:
                raise ValueError(f"duplicate criterion id: {item.id}")
            known_ids.add(item.id)
        return criteria
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _load_text(source: str) -> tuple[str, str | None]:
|
|
53
|
+
path = Path(source)
|
|
54
|
+
if path.exists():
|
|
55
|
+
return path.read_text(encoding="utf-8"), path.suffix.lower()
|
|
56
|
+
return source, None
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _parse_structured(text: str, suffix: str | None) -> dict[str, Any]:
|
|
60
|
+
text = text.strip()
|
|
61
|
+
if not text:
|
|
62
|
+
raise ValueError("Rubric source is empty")
|
|
63
|
+
if suffix in (".yaml", ".yml"):
|
|
64
|
+
try:
|
|
65
|
+
import yaml # type: ignore
|
|
66
|
+
except Exception as exc: # pragma: no cover - optional dependency
|
|
67
|
+
raise RuntimeError("PyYAML is required to load YAML rubrics") from exc
|
|
68
|
+
data = yaml.safe_load(text)
|
|
69
|
+
if not isinstance(data, dict):
|
|
70
|
+
raise ValueError("Rubric YAML must produce a mapping") from None
|
|
71
|
+
return data
|
|
72
|
+
if text.startswith("{"):
|
|
73
|
+
return json.loads(text)
|
|
74
|
+
if text.startswith("http://") or text.startswith("https://"):
|
|
75
|
+
import requests # type: ignore
|
|
76
|
+
|
|
77
|
+
response = requests.get(text, timeout=15)
|
|
78
|
+
response.raise_for_status()
|
|
79
|
+
return _parse_structured(response.text, suffix)
|
|
80
|
+
try:
|
|
81
|
+
return json.loads(text)
|
|
82
|
+
except json.JSONDecodeError:
|
|
83
|
+
try:
|
|
84
|
+
import yaml # type: ignore
|
|
85
|
+
except Exception as exc: # pragma: no cover - optional dependency
|
|
86
|
+
raise RuntimeError("PyYAML is required to load rubric text") from exc
|
|
87
|
+
data = yaml.safe_load(text)
|
|
88
|
+
if not isinstance(data, dict):
|
|
89
|
+
raise ValueError("Rubric text must decode to a mapping") from None
|
|
90
|
+
return data
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def load_rubric(source: str | dict[str, Any] | Rubric | None) -> Rubric | None:
    """Coerce ``source`` into a ``Rubric``.

    ``None`` and existing ``Rubric`` instances are returned as-is. A dict is
    validated directly. Anything else is stringified, resolved through
    ``_load_text`` and ``_parse_structured``, and then validated.
    """
    if source is None or isinstance(source, Rubric):
        return source
    if isinstance(source, dict):
        payload = source
    else:
        raw_text, file_suffix = _load_text(str(source))
        payload = _parse_structured(raw_text, file_suffix)
    return Rubric.model_validate(payload)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def _merge_weights(base: Criterion, override: Criterion) -> float:
|
|
106
|
+
if override.weight != 1.0 and base.weight != 1.0:
|
|
107
|
+
return base.weight * override.weight
|
|
108
|
+
if override.weight != 1.0:
|
|
109
|
+
return override.weight
|
|
110
|
+
return base.weight
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def blend_rubrics(base: Rubric | None, override: Rubric | None) -> Rubric | None:
    """Overlay ``override`` on top of ``base`` and return the combined rubric.

    If either argument is ``None`` the other is returned unchanged (``None``
    when both are). Otherwise criteria are matched by id: override criteria
    come first, in override order, merged with their base counterpart when one
    exists; base-only criteria follow. ``aggregation="inherit"`` on the
    override takes the base's aggregation.

    Args:
        base: The rubric providing defaults.
        override: The rubric whose values take precedence.

    Returns:
        A new ``Rubric`` when both inputs are given, else whichever input is
        not ``None``.
    """
    if base is None or override is None:
        return override if base is None else base

    base_map = {criterion.id: criterion for criterion in base.criteria}
    merged: list[Criterion] = []

    for ov in override.criteria:
        existing = base_map.pop(ov.id, None)
        if existing is None:
            merged.append(ov)
            continue
        # ``required`` is a plain bool and can never be None, so the previous
        # ``is not None`` test always chose the override and silently reset a
        # base ``required=True`` to the default. Only let the override win when
        # the field was explicitly provided.
        if "required" in ov.model_fields_set:
            required = ov.required
        else:
            required = existing.required
        merged.append(
            Criterion(
                id=ov.id,
                description=ov.description or existing.description,
                weight=_merge_weights(existing, ov),
                required=required,
            )
        )

    # Whatever is left in base_map had no override counterpart.
    merged.extend(base_map.values())

    aggregation = override.aggregation
    if aggregation == "inherit":
        aggregation = base.aggregation

    return Rubric(
        version=override.version or base.version,
        goal_text=override.goal_text or base.goal_text,
        criteria=merged,
        aggregation=aggregation,
    )
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def _as_float(value: Any) -> float | None:
|
|
153
|
+
try:
|
|
154
|
+
return float(value)
|
|
155
|
+
except Exception:
|
|
156
|
+
return None
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def _score(
|
|
160
|
+
criteria: Iterable[Criterion], values: dict[str, float], aggregation: str
|
|
161
|
+
) -> dict[str, Any]:
|
|
162
|
+
if aggregation == "inherit":
|
|
163
|
+
aggregation = "weighted_sum"
|
|
164
|
+
per_criterion: dict[str, dict[str, Any]] = {}
|
|
165
|
+
total = 0.0
|
|
166
|
+
total_weight = 0.0
|
|
167
|
+
for criterion in criteria:
|
|
168
|
+
score = values.get(criterion.id, 0.0)
|
|
169
|
+
per_criterion[criterion.id] = {
|
|
170
|
+
"score": score,
|
|
171
|
+
"weight": criterion.weight,
|
|
172
|
+
"required": criterion.required,
|
|
173
|
+
}
|
|
174
|
+
if aggregation == "sum":
|
|
175
|
+
total += score
|
|
176
|
+
elif aggregation == "weighted_sum":
|
|
177
|
+
total += score * criterion.weight
|
|
178
|
+
total_weight += criterion.weight
|
|
179
|
+
if aggregation == "weighted_sum" and total_weight > 0:
|
|
180
|
+
total = total / total_weight
|
|
181
|
+
if aggregation == "custom":
|
|
182
|
+
total = None # type: ignore[assignment]
|
|
183
|
+
return {
|
|
184
|
+
"aggregation": aggregation,
|
|
185
|
+
"score": total,
|
|
186
|
+
"per_criterion": per_criterion,
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def score_events_against_rubric(
    events: list[dict[str, Any]], rubric: Rubric | None
) -> dict[str, Any]:
    """Score a list of event dicts against ``rubric``.

    Each dict event contributes the float value of its ``"score"`` under the
    first truthy key among ``"criterion_id"``, ``"id"``, ``"criterion"``.
    Non-dict events, events without an id, and events whose score cannot be
    converted are ignored; later events overwrite earlier ones for the same
    id. Without a rubric, a placeholder result with aggregation ``"none"`` is
    returned.
    """
    if rubric is None:
        return {"aggregation": "none", "score": None, "per_criterion": {}}
    observed: dict[str, float] = {}
    for entry in events or []:
        if not isinstance(entry, dict):
            continue
        identifier = entry.get("criterion_id") or entry.get("id") or entry.get("criterion")
        parsed = _as_float(entry.get("score"))
        if not identifier or parsed is None:
            continue
        observed[str(identifier)] = parsed
    return _score(rubric.criteria, observed, rubric.aggregation)
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def score_outcome_against_rubric(outcome: dict[str, Any], rubric: Rubric | None) -> dict[str, Any]:
    """Score an outcome mapping against ``rubric``.

    Scores are read from ``outcome["criteria"]`` when that is a dict, otherwise
    from ``outcome`` itself; entries whose value cannot be converted to a float
    are ignored. A non-dict ``outcome`` contributes no scores. Without a
    rubric, a placeholder result with aggregation ``"none"`` is returned.
    """
    if rubric is None:
        return {"aggregation": "none", "score": None, "per_criterion": {}}
    observed: dict[str, float] = {}
    if isinstance(outcome, dict):
        nested = outcome.get("criteria")
        score_source = nested if isinstance(nested, dict) else outcome
        for name, raw in score_source.items():
            parsed = _as_float(raw)
            if parsed is not None:
                observed[str(name)] = parsed
    return _score(rubric.criteria, observed, rubric.aggregation)
|
|
@@ -7,8 +7,9 @@ This module provides utilities for task apps to:
|
|
|
7
7
|
See monorepo/trace_creation_and_judgement.txt "Fatal Guards" section for requirements.
|
|
8
8
|
"""
|
|
9
9
|
|
|
10
|
+
import importlib
|
|
10
11
|
import logging
|
|
11
|
-
from typing import Any
|
|
12
|
+
from typing import Any, cast
|
|
12
13
|
from urllib.parse import parse_qs, urlparse
|
|
13
14
|
|
|
14
15
|
logger = logging.getLogger(__name__)
|
|
@@ -63,13 +64,25 @@ def extract_trace_correlation_id(
|
|
|
63
64
|
return stripped
|
|
64
65
|
|
|
65
66
|
# Determine if we're in EVAL mode (trace_correlation_id not required for eval)
|
|
67
|
+
rollout_mode_cls: Any | None = None
|
|
66
68
|
try:
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
69
|
+
contracts_module = importlib.import_module("synth_ai.task.contracts")
|
|
70
|
+
rollout_mode_cls = getattr(contracts_module, "RolloutMode", None)
|
|
71
|
+
except Exception:
|
|
72
|
+
rollout_mode_cls = None
|
|
73
|
+
|
|
74
|
+
is_eval_mode = False
|
|
75
|
+
if rollout_mode_cls is not None:
|
|
76
|
+
try:
|
|
77
|
+
is_eval_mode = (
|
|
78
|
+
mode == "eval"
|
|
79
|
+
or mode == rollout_mode_cls.EVAL
|
|
80
|
+
or getattr(mode, "value", None) == "eval"
|
|
81
|
+
)
|
|
82
|
+
except Exception:
|
|
83
|
+
is_eval_mode = mode == "eval"
|
|
84
|
+
else:
|
|
85
|
+
is_eval_mode = mode == "eval" or getattr(mode, "value", None) == "eval"
|
|
73
86
|
|
|
74
87
|
# Fallback: try to extract from inference_url query params
|
|
75
88
|
if not inference_url or not isinstance(inference_url, str):
|
|
@@ -87,10 +100,12 @@ def extract_trace_correlation_id(
|
|
|
87
100
|
|
|
88
101
|
try:
|
|
89
102
|
parsed = urlparse(inference_url)
|
|
90
|
-
query_params = parse_qs(parsed.query or "")
|
|
103
|
+
query_params = cast(dict[str, list[str]], parse_qs(parsed.query or ""))
|
|
91
104
|
# Try multiple possible query param names
|
|
92
105
|
for param_name in ["cid", "trace_correlation_id", "trace"]:
|
|
93
|
-
values = query_params.get(param_name
|
|
106
|
+
values = query_params.get(param_name)
|
|
107
|
+
if not values:
|
|
108
|
+
continue
|
|
94
109
|
for value in values:
|
|
95
110
|
if isinstance(value, str) and value.strip():
|
|
96
111
|
correlation_id = value.strip()
|
|
@@ -311,5 +326,3 @@ def verify_trace_correlation_id_in_response(
|
|
|
311
326
|
expected_correlation_id
|
|
312
327
|
)
|
|
313
328
|
return True
|
|
314
|
-
|
|
315
|
-
|
synth_ai/task/tracing_utils.py
CHANGED
|
@@ -4,9 +4,12 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
import os
|
|
6
6
|
from collections.abc import Callable
|
|
7
|
+
from datetime import datetime
|
|
7
8
|
from pathlib import Path
|
|
8
9
|
from typing import Any
|
|
9
10
|
|
|
11
|
+
from synth_ai.tracing_v3.constants import TRACE_DB_DIR, canonical_trace_db_name
|
|
12
|
+
|
|
10
13
|
|
|
11
14
|
def tracing_env_enabled(default: bool = False) -> bool:
|
|
12
15
|
"""Return True when tracing is enabled for task apps via environment variable."""
|
|
@@ -40,9 +43,17 @@ def resolve_tracing_db_url() -> str | None:
|
|
|
40
43
|
path.parent.mkdir(parents=True, exist_ok=True)
|
|
41
44
|
return f"sqlite+aiosqlite:///{path}"
|
|
42
45
|
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
+
existing = os.getenv("TASKAPP_TRACE_DB_PATH")
|
|
47
|
+
if existing:
|
|
48
|
+
path = Path(existing).expanduser()
|
|
49
|
+
else:
|
|
50
|
+
base_dir = TRACE_DB_DIR.expanduser()
|
|
51
|
+
base_dir.mkdir(parents=True, exist_ok=True)
|
|
52
|
+
path = base_dir / canonical_trace_db_name(timestamp=datetime.now())
|
|
53
|
+
os.environ["TASKAPP_TRACE_DB_PATH"] = str(path)
|
|
54
|
+
os.environ.setdefault("SQLD_DB_PATH", str(path))
|
|
55
|
+
path.parent.mkdir(parents=True, exist_ok=True)
|
|
56
|
+
return f"sqlite+aiosqlite:///{path}"
|
|
46
57
|
|
|
47
58
|
|
|
48
59
|
def build_tracer_factory(
|
synth_ai/task/validators.py
CHANGED
|
@@ -3,12 +3,11 @@
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
import re
|
|
6
|
-
from typing import Any
|
|
6
|
+
from typing import Any, cast
|
|
7
7
|
from urllib.parse import urlparse, urlunparse
|
|
8
8
|
|
|
9
9
|
import click
|
|
10
10
|
import httpx
|
|
11
|
-
|
|
12
11
|
from synth_ai.task.contracts import TaskAppEndpoints # type: ignore[attr-defined]
|
|
13
12
|
|
|
14
13
|
|
|
@@ -152,7 +151,7 @@ def normalize_inference_url(url: str | None, *, default: str = "https://api.open
|
|
|
152
151
|
new_path = f"{path}/v1/chat/completions" if path else "/v1/chat/completions"
|
|
153
152
|
|
|
154
153
|
# Reconstruct URL with new path and original query/fragment
|
|
155
|
-
return urlunparse(parsed._replace(path=new_path))
|
|
154
|
+
return cast(str, urlunparse(parsed._replace(path=new_path)))
|
|
156
155
|
|
|
157
156
|
|
|
158
157
|
def validate_task_app_url(url: str | None) -> str:
|
|
@@ -37,7 +37,7 @@ Concepts:
|
|
|
37
37
|
from __future__ import annotations
|
|
38
38
|
|
|
39
39
|
from dataclasses import asdict, dataclass, field
|
|
40
|
-
from datetime import
|
|
40
|
+
from datetime import UTC, datetime
|
|
41
41
|
from typing import Any
|
|
42
42
|
|
|
43
43
|
from .lm_call_record_abstractions import LLMCallRecord
|
|
@@ -249,7 +249,7 @@ class SessionTimeStep:
|
|
|
249
249
|
|
|
250
250
|
step_id: str = ""
|
|
251
251
|
step_index: int = 0
|
|
252
|
-
timestamp: datetime = field(default_factory=lambda: datetime.now(
|
|
252
|
+
timestamp: datetime = field(default_factory=lambda: datetime.now(UTC))
|
|
253
253
|
turn_number: int | None = None
|
|
254
254
|
events: list[BaseEvent] = field(default_factory=list)
|
|
255
255
|
markov_blanket_messages: list[SessionEventMarkovBlanketMessage] = field(default_factory=list)
|
|
@@ -283,7 +283,7 @@ class SessionTrace:
|
|
|
283
283
|
"""
|
|
284
284
|
|
|
285
285
|
session_id: str = ""
|
|
286
|
-
created_at: datetime = field(default_factory=lambda: datetime.now(
|
|
286
|
+
created_at: datetime = field(default_factory=lambda: datetime.now(UTC))
|
|
287
287
|
session_time_steps: list[SessionTimeStep] = field(default_factory=list)
|
|
288
288
|
event_history: list[BaseEvent] = field(default_factory=list)
|
|
289
289
|
markov_blanket_message_history: list[SessionEventMarkovBlanketMessage] = field(
|
synth_ai/tracing_v3/config.py
CHANGED
|
@@ -3,27 +3,29 @@
|
|
|
3
3
|
import os
|
|
4
4
|
from dataclasses import dataclass
|
|
5
5
|
|
|
6
|
+
from synth_ai.tracing_v3.constants import canonical_trace_db_path
|
|
7
|
+
|
|
8
|
+
DEFAULT_DB_FILE = str(canonical_trace_db_path())
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _default_sqlite_url() -> str:
    """Build the default ``sqlite+aiosqlite`` URL for the local trace database.

    The base path comes from ``SQLD_DB_PATH`` (falling back to
    ``DEFAULT_DB_FILE``) and is made absolute. When that path is a directory
    containing ``dbs/default/data``, the nested file is used instead.
    """
    root = os.path.abspath(os.getenv("SQLD_DB_PATH", DEFAULT_DB_FILE))
    nested = os.path.join(root, "dbs", "default", "data")
    # NOTE(review): presumably the layout sqld creates when it manages the DB - confirm.
    target = nested if os.path.isdir(root) and os.path.exists(nested) else root
    return f"sqlite+aiosqlite:///{target}"
|
|
17
|
+
|
|
6
18
|
|
|
7
19
|
@dataclass
|
|
8
20
|
class TursoConfig:
|
|
9
21
|
"""Configuration for Turso/sqld connection."""
|
|
10
22
|
|
|
11
23
|
# Default values matching serve.sh
|
|
12
|
-
DEFAULT_DB_FILE =
|
|
24
|
+
DEFAULT_DB_FILE = DEFAULT_DB_FILE
|
|
13
25
|
DEFAULT_HTTP_PORT = 8080
|
|
14
26
|
|
|
15
|
-
# Local embedded database for async SQLAlchemy
|
|
16
|
-
# Resolve to the actual SQLite file used by sqld if the base path is a directory
|
|
17
|
-
def _resolve_sqlite_db_url() -> str: # type: ignore[no-redef]
|
|
18
|
-
base_path = os.path.abspath(os.getenv("SQLD_DB_PATH", "traces/v3/synth_ai.db"))
|
|
19
|
-
# If sqld is managing this DB, the real SQLite file lives under dbs/default/data
|
|
20
|
-
candidate = os.path.join(base_path, "dbs", "default", "data")
|
|
21
|
-
if os.path.isdir(base_path) and os.path.exists(candidate):
|
|
22
|
-
return f"sqlite+aiosqlite:///{candidate}"
|
|
23
|
-
return f"sqlite+aiosqlite:///{base_path}"
|
|
24
|
-
|
|
25
27
|
# Use env override if provided; otherwise resolve based on SQLD layout
|
|
26
|
-
db_url: str = os.getenv("TURSO_LOCAL_DB_URL",
|
|
28
|
+
db_url: str = os.getenv("TURSO_LOCAL_DB_URL", _default_sqlite_url())
|
|
27
29
|
|
|
28
30
|
# Remote database sync configuration
|
|
29
31
|
sync_url: str = os.getenv("TURSO_DATABASE_URL", "")
|
|
@@ -48,7 +50,7 @@ class TursoConfig:
|
|
|
48
50
|
|
|
49
51
|
# Daemon settings (for local sqld) - match serve.sh defaults
|
|
50
52
|
sqld_binary: str = os.getenv("SQLD_BINARY", "sqld")
|
|
51
|
-
sqld_db_path: str = os.getenv("SQLD_DB_PATH",
|
|
53
|
+
sqld_db_path: str = os.getenv("SQLD_DB_PATH", DEFAULT_DB_FILE)
|
|
52
54
|
sqld_http_port: int = int(os.getenv("SQLD_HTTP_PORT", "8080"))
|
|
53
55
|
sqld_idle_shutdown: int = int(os.getenv("SQLD_IDLE_SHUTDOWN", "0")) # 0 = no idle shutdown
|
|
54
56
|
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
# Default directory and base filename for task-app trace databases.
TRACE_DB_DIR = Path("traces")
TRACE_DB_BASENAME = "task_app_traces"


def canonical_trace_db_name(*, timestamp: datetime | None = None) -> str:
    """Build the trace database filename, appending a timestamp when one is given."""
    if timestamp is None:
        stamp = ""
    else:
        stamp = "_" + timestamp.strftime("%Y-%m-%d_%H-%M-%S")
    return f"{TRACE_DB_BASENAME}{stamp}.db"


def canonical_trace_db_path(*, timestamp: datetime | None = None) -> Path:
    """Join the default trace directory with the canonical database filename."""
    filename = canonical_trace_db_name(timestamp=timestamp)
    return TRACE_DB_DIR / filename
|
synth_ai/tracing_v3/db_config.py
CHANGED
|
@@ -7,6 +7,8 @@ import os
|
|
|
7
7
|
import shutil
|
|
8
8
|
from typing import TYPE_CHECKING, Optional
|
|
9
9
|
|
|
10
|
+
from synth_ai.tracing_v3.constants import canonical_trace_db_path
|
|
11
|
+
|
|
10
12
|
if TYPE_CHECKING:
|
|
11
13
|
from .turso.daemon import SqldDaemon
|
|
12
14
|
|
|
@@ -17,7 +19,7 @@ class DatabaseConfig:
|
|
|
17
19
|
"""Centralized database configuration management."""
|
|
18
20
|
|
|
19
21
|
# Default values from serve.sh
|
|
20
|
-
DEFAULT_DB_FILE =
|
|
22
|
+
DEFAULT_DB_FILE = str(canonical_trace_db_path())
|
|
21
23
|
DEFAULT_HTTP_PORT = 8080
|
|
22
24
|
|
|
23
25
|
def __init__(
|
|
@@ -29,6 +29,7 @@ import contextvars
|
|
|
29
29
|
import functools
|
|
30
30
|
import time
|
|
31
31
|
from collections.abc import Awaitable, Callable, Mapping
|
|
32
|
+
from contextvars import Token
|
|
32
33
|
from typing import Any, TypeVar, cast, overload
|
|
33
34
|
|
|
34
35
|
from .abstractions import LMCAISEvent, TimeRecord
|
|
@@ -367,11 +368,11 @@ class SessionContext:
|
|
|
367
368
|
```
|
|
368
369
|
"""
|
|
369
370
|
|
|
370
|
-
def __init__(self, session_id: str, tracer=None):
|
|
371
|
+
def __init__(self, session_id: str, tracer: Any | None = None):
|
|
371
372
|
self.session_id = session_id
|
|
372
373
|
self.tracer = tracer
|
|
373
|
-
self._token = None
|
|
374
|
-
self._tracer_token = None
|
|
374
|
+
self._token: Token[str | None] | None = None
|
|
375
|
+
self._tracer_token: Token[Any] | None = None
|
|
375
376
|
|
|
376
377
|
def __enter__(self):
|
|
377
378
|
# Store tokens to restore previous context on exit
|
|
@@ -382,8 +383,9 @@ class SessionContext:
|
|
|
382
383
|
|
|
383
384
|
def __exit__(self, exc_type, exc_val, exc_tb):
|
|
384
385
|
# Restore previous context - this is crucial for proper isolation
|
|
385
|
-
|
|
386
|
-
|
|
386
|
+
if self._token is not None:
|
|
387
|
+
_session_id_ctx.reset(self._token)
|
|
388
|
+
if self._tracer_token is not None:
|
|
387
389
|
_session_tracer_ctx.reset(self._tracer_token)
|
|
388
390
|
|
|
389
391
|
async def __aenter__(self):
|
|
@@ -393,6 +395,7 @@ class SessionContext:
|
|
|
393
395
|
return self
|
|
394
396
|
|
|
395
397
|
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
|
396
|
-
|
|
397
|
-
|
|
398
|
+
if self._token is not None:
|
|
399
|
+
_session_id_ctx.reset(self._token)
|
|
400
|
+
if self._tracer_token is not None:
|
|
398
401
|
_session_tracer_ctx.reset(self._tracer_token)
|
|
@@ -8,7 +8,7 @@ from __future__ import annotations
|
|
|
8
8
|
|
|
9
9
|
import uuid
|
|
10
10
|
from dataclasses import dataclass, field
|
|
11
|
-
from datetime import
|
|
11
|
+
from datetime import UTC, datetime
|
|
12
12
|
from typing import Any, TypedDict, cast
|
|
13
13
|
|
|
14
14
|
from .lm_call_record_abstractions import (
|
|
@@ -180,8 +180,8 @@ def create_llm_call_record_from_response(
|
|
|
180
180
|
api_type=api_type,
|
|
181
181
|
provider=provider,
|
|
182
182
|
model_name=model_name,
|
|
183
|
-
started_at=started_at or datetime.now(
|
|
184
|
-
completed_at=completed_at or datetime.now(
|
|
183
|
+
started_at=started_at or datetime.now(UTC),
|
|
184
|
+
completed_at=completed_at or datetime.now(UTC),
|
|
185
185
|
latency_ms=latency_ms,
|
|
186
186
|
request_params=params,
|
|
187
187
|
input_messages=input_messages,
|
|
@@ -376,8 +376,8 @@ def create_llm_call_record_from_streaming(
|
|
|
376
376
|
api_type="responses", # Streaming typically from Responses API
|
|
377
377
|
provider=provider,
|
|
378
378
|
model_name=model_name,
|
|
379
|
-
started_at=started_at or datetime.now(
|
|
380
|
-
completed_at=completed_at or datetime.now(
|
|
379
|
+
started_at=started_at or datetime.now(UTC),
|
|
380
|
+
completed_at=completed_at or datetime.now(UTC),
|
|
381
381
|
latency_ms=latency_ms,
|
|
382
382
|
request_params=params,
|
|
383
383
|
input_messages=input_messages,
|
|
@@ -5,7 +5,7 @@ from __future__ import annotations
|
|
|
5
5
|
import asyncio
|
|
6
6
|
import json
|
|
7
7
|
from contextlib import asynccontextmanager
|
|
8
|
-
from datetime import
|
|
8
|
+
from datetime import UTC, datetime
|
|
9
9
|
from typing import Any
|
|
10
10
|
|
|
11
11
|
from .abstractions import (
|
|
@@ -106,7 +106,7 @@ class SessionTracer:
|
|
|
106
106
|
|
|
107
107
|
self._current_trace = SessionTrace(
|
|
108
108
|
session_id=session_id,
|
|
109
|
-
created_at=datetime.now(
|
|
109
|
+
created_at=datetime.now(UTC),
|
|
110
110
|
session_time_steps=[],
|
|
111
111
|
event_history=[],
|
|
112
112
|
markov_blanket_message_history=[],
|
|
@@ -152,7 +152,7 @@ class SessionTracer:
|
|
|
152
152
|
step = SessionTimeStep(
|
|
153
153
|
step_id=step_id,
|
|
154
154
|
step_index=len(self._current_trace.session_time_steps),
|
|
155
|
-
timestamp=datetime.now(
|
|
155
|
+
timestamp=datetime.now(UTC),
|
|
156
156
|
turn_number=turn_number,
|
|
157
157
|
step_metadata=metadata or {},
|
|
158
158
|
)
|
|
@@ -197,7 +197,7 @@ class SessionTracer:
|
|
|
197
197
|
step = self._current_step
|
|
198
198
|
|
|
199
199
|
if step and step.completed_at is None:
|
|
200
|
-
step.completed_at = datetime.now(
|
|
200
|
+
step.completed_at = datetime.now(UTC)
|
|
201
201
|
|
|
202
202
|
# Trigger hooks
|
|
203
203
|
await self.hooks.trigger(
|
|
@@ -294,7 +294,7 @@ class SessionTracer:
|
|
|
294
294
|
content=normalised_content,
|
|
295
295
|
message_type=message_type,
|
|
296
296
|
time_record=TimeRecord(
|
|
297
|
-
event_time=event_time or datetime.now(
|
|
297
|
+
event_time=event_time or datetime.now(UTC).timestamp(), message_time=message_time
|
|
298
298
|
),
|
|
299
299
|
metadata=metadata or {},
|
|
300
300
|
)
|
|
@@ -368,7 +368,7 @@ class SessionTracer:
|
|
|
368
368
|
# End any open timesteps
|
|
369
369
|
for step in self._current_trace.session_time_steps:
|
|
370
370
|
if step.completed_at is None:
|
|
371
|
-
step.completed_at = datetime.now(
|
|
371
|
+
step.completed_at = datetime.now(UTC)
|
|
372
372
|
|
|
373
373
|
# Trigger pre-save hooks
|
|
374
374
|
await self.hooks.trigger("before_save", session=self._current_trace)
|
|
@@ -384,7 +384,7 @@ class SessionTracer:
|
|
|
384
384
|
if should_save and self.db:
|
|
385
385
|
_logger.info(f"[TRACE_DEBUG] Calling insert_session_trace with {len(self._current_trace.markov_blanket_message_history)} messages")
|
|
386
386
|
await self.db.insert_session_trace(self._current_trace)
|
|
387
|
-
_logger.info(
|
|
387
|
+
_logger.info("[TRACE_DEBUG] insert_session_trace completed")
|
|
388
388
|
|
|
389
389
|
# Trigger post-save hooks
|
|
390
390
|
await self.hooks.trigger("after_save", session=self._current_trace)
|