synth-ai 0.2.13.dev2__py3-none-any.whl → 0.2.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic. Click here for more details.

Files changed (110)
  1. examples/multi_step/configs/README_verilog_rl.md +77 -0
  2. examples/multi_step/configs/VERILOG_REWARDS.md +90 -0
  3. examples/multi_step/configs/VERILOG_RL_CHECKLIST.md +183 -0
  4. examples/multi_step/configs/crafter_eval_synth_qwen4b.toml +35 -0
  5. examples/multi_step/configs/crafter_eval_text_only_groq_qwen32b.toml +36 -0
  6. examples/multi_step/configs/crafter_rl_stepwise_hosted_judge.toml +5 -4
  7. examples/multi_step/configs/crafter_synth_backend.md +40 -0
  8. examples/multi_step/configs/verilog_eval_groq_qwen32b.toml +31 -0
  9. examples/multi_step/configs/verilog_eval_synth_qwen8b.toml +33 -0
  10. examples/multi_step/configs/verilog_rl_lora.toml +190 -0
  11. examples/multi_step/judges/crafter_backend_judge.py +220 -0
  12. examples/multi_step/judges/verilog_backend_judge.py +234 -0
  13. examples/multi_step/readme.md +48 -0
  14. examples/multi_step/verilog_rl_lora.md +218 -0
  15. examples/qwen_coder/configs/coder_lora_30b.toml +1 -1
  16. examples/sft/evaluate.py +2 -0
  17. examples/sft/generate_traces.py +2 -0
  18. examples/swe/task_app/grpo_swe_mini.py +1 -0
  19. examples/swe/task_app/hosted/rollout.py +2 -0
  20. examples/task_apps/IMAGE_ONLY_EVAL_QUICKSTART.md +258 -0
  21. examples/task_apps/crafter/CREATE_SFT_DATASET.md +273 -0
  22. examples/task_apps/crafter/EVAL_IMAGE_ONLY_RESULTS.md +152 -0
  23. examples/task_apps/crafter/FILTER_COMMAND_STATUS.md +174 -0
  24. examples/task_apps/crafter/FILTER_COMMAND_SUCCESS.md +268 -0
  25. examples/task_apps/crafter/QUERY_EXAMPLES.md +203 -0
  26. examples/task_apps/crafter/README_IMAGE_ONLY_EVAL.md +316 -0
  27. examples/task_apps/crafter/eval_image_only_gpt4o.toml +28 -0
  28. examples/task_apps/crafter/eval_text_only_groq_llama.toml +36 -0
  29. examples/task_apps/crafter/filter_sft_dataset.toml +16 -0
  30. examples/task_apps/crafter/task_app/__init__.py +3 -0
  31. examples/task_apps/crafter/task_app/grpo_crafter.py +306 -8
  32. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/environment.py +10 -0
  33. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/policy.py +16 -3
  34. examples/task_apps/crafter/task_app/synth_envs_hosted/envs/crafter/react_agent.py +17 -2
  35. examples/task_apps/crafter/task_app/synth_envs_hosted/inference/openai_client.py +25 -3
  36. examples/task_apps/crafter/task_app/synth_envs_hosted/policy_routes.py +52 -1
  37. examples/task_apps/crafter/task_app/synth_envs_hosted/rollout.py +111 -13
  38. examples/task_apps/crafter/task_app/synth_envs_hosted/utils.py +156 -0
  39. examples/task_apps/enron/filter_sft.toml +5 -0
  40. examples/task_apps/enron/tests/__init__.py +2 -0
  41. examples/task_apps/enron/tests/integration/__init__.py +2 -0
  42. examples/task_apps/enron/tests/integration/test_enron_eval.py +2 -0
  43. examples/task_apps/enron/tests/unit/__init__.py +2 -0
  44. examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_COMPLETE.md +283 -0
  45. examples/task_apps/pokemon_red/EVAL_IMAGE_ONLY_STATUS.md +155 -0
  46. examples/task_apps/pokemon_red/README_IMAGE_ONLY_EVAL.md +415 -0
  47. examples/task_apps/pokemon_red/eval_image_only_gpt4o.toml +29 -0
  48. examples/task_apps/pokemon_red/pallet_town_rl_config.toml +2 -0
  49. examples/task_apps/pokemon_red/task_app.py +199 -6
  50. examples/task_apps/pokemon_red/test_pallet_town_rewards.py +2 -0
  51. examples/task_apps/sokoban/filter_sft.toml +5 -0
  52. examples/task_apps/sokoban/tests/__init__.py +2 -0
  53. examples/task_apps/sokoban/tests/integration/__init__.py +2 -0
  54. examples/task_apps/sokoban/tests/unit/__init__.py +2 -0
  55. examples/task_apps/verilog/eval_groq_qwen32b.toml +8 -4
  56. examples/task_apps/verilog/filter_sft.toml +5 -0
  57. examples/task_apps/verilog/task_app/grpo_verilog.py +258 -23
  58. examples/task_apps/verilog/tests/__init__.py +2 -0
  59. examples/task_apps/verilog/tests/integration/__init__.py +2 -0
  60. examples/task_apps/verilog/tests/integration/test_verilog_eval.py +2 -0
  61. examples/task_apps/verilog/tests/unit/__init__.py +2 -0
  62. examples/warming_up_to_rl/groq_test.py +2 -0
  63. examples/warming_up_to_rl/run_local_rollout.py +2 -0
  64. examples/warming_up_to_rl/run_local_rollout_modal.py +2 -0
  65. examples/warming_up_to_rl/run_local_rollout_parallel.py +2 -0
  66. examples/warming_up_to_rl/run_local_rollout_traced.py +2 -0
  67. examples/warming_up_to_rl/run_rollout_remote.py +2 -0
  68. synth_ai/api/models/supported.py +1 -0
  69. synth_ai/cli/__init__.py +46 -13
  70. synth_ai/cli/_modal_wrapper.py +3 -2
  71. synth_ai/cli/recent.py +1 -1
  72. synth_ai/cli/status.py +1 -1
  73. synth_ai/cli/task_apps.py +354 -143
  74. synth_ai/cli/traces.py +1 -1
  75. synth_ai/cli/tui.py +57 -0
  76. synth_ai/cli/turso.py +1 -1
  77. synth_ai/cli/watch.py +1 -1
  78. synth_ai/demos/demo_task_apps/crafter/grpo_crafter_task_app.py +1 -1
  79. synth_ai/environments/examples/crafter_classic/environment.py +1 -1
  80. synth_ai/environments/examples/verilog/engine.py +76 -10
  81. synth_ai/judge_schemas.py +8 -8
  82. synth_ai/task/__init__.py +11 -1
  83. synth_ai/task/apps/__init__.py +1 -0
  84. synth_ai/task/config.py +257 -0
  85. synth_ai/task/contracts.py +15 -2
  86. synth_ai/task/rubrics/__init__.py +3 -0
  87. synth_ai/task/rubrics/loaders.py +22 -3
  88. synth_ai/task/rubrics/scoring.py +3 -0
  89. synth_ai/task/trace_correlation_helpers.py +315 -0
  90. synth_ai/task/validators.py +144 -0
  91. synth_ai/tracing_v3/abstractions.py +3 -3
  92. synth_ai/tracing_v3/llm_call_record_helpers.py +5 -5
  93. synth_ai/tracing_v3/session_tracer.py +16 -6
  94. synth_ai/tracing_v3/storage/base.py +29 -29
  95. synth_ai/tracing_v3/storage/config.py +3 -3
  96. synth_ai/tracing_v3/turso/daemon.py +8 -7
  97. synth_ai/tracing_v3/turso/native_manager.py +63 -40
  98. synth_ai/tracing_v3/utils.py +3 -3
  99. synth_ai/tui/__init__.py +5 -0
  100. synth_ai/tui/__main__.py +13 -0
  101. synth_ai/tui/cli/__init__.py +1 -0
  102. synth_ai/tui/cli/query_experiments.py +164 -0
  103. synth_ai/tui/cli/query_experiments_v3.py +164 -0
  104. synth_ai/tui/dashboard.py +906 -0
  105. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/METADATA +1 -1
  106. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/RECORD +110 -71
  107. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/WHEEL +0 -0
  108. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/entry_points.txt +0 -0
  109. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/licenses/LICENSE +0 -0
  110. {synth_ai-0.2.13.dev2.dist-info → synth_ai-0.2.14.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import logging
3
4
  from typing import Any, Dict, Iterable, Mapping, Sequence
4
5
 
5
6
  from fastapi import HTTPException, Request
@@ -21,6 +22,15 @@ from synth_ai.task.contracts import (
21
22
  TaskInfo,
22
23
  )
23
24
  from synth_ai.task.server import ProxyConfig, TaskAppConfig
25
+ from synth_ai.task.tracing_utils import (
26
+ build_tracer_factory,
27
+ resolve_sft_output_dir,
28
+ resolve_tracing_db_url,
29
+ tracing_env_enabled,
30
+ )
31
+ from synth_ai.tracing_v3.session_tracer import SessionTracer
32
+
33
+ logger = logging.getLogger(__name__)
24
34
 
25
35
 
26
36
  def _base_task_info() -> TaskInfo:
@@ -182,7 +192,70 @@ def _calculate_outcome_score(final_state: dict[str, Any], total_reward: float) -
182
192
 
183
193
 
184
194
  async def rollout_executor(request: RolloutRequest, fastapi_request: Request) -> RolloutResponse:
195
+ # Initialize SessionTracer for this rollout
196
+ tracer_factory = getattr(fastapi_request.app.state, "session_tracer_factory", None)
197
+ tracer_instance: SessionTracer | None = None
198
+ if callable(tracer_factory):
199
+ try:
200
+ inst = tracer_factory()
201
+ tracer_instance = inst if isinstance(inst, SessionTracer) else None
202
+ except Exception as exc:
203
+ logger.debug(f"TRACER_FACTORY_FAIL: {exc}")
204
+
205
+ # Start tracing session
206
+ if tracer_instance is not None:
207
+ try:
208
+ await tracer_instance.initialize()
209
+ await tracer_instance.start_session(
210
+ session_id=request.run_id,
211
+ metadata={
212
+ "run_id": request.run_id,
213
+ "env_name": "pokemon_red",
214
+ "policy_name": request.policy.policy_name or "default",
215
+ "seed": request.env.seed,
216
+ }
217
+ )
218
+ logger.info(f"[pokemon_red] tracing enabled for run_id={request.run_id}")
219
+ except Exception as exc:
220
+ logger.warning(f"[pokemon_red] tracing init failed: {exc}")
221
+ tracer_instance = None
222
+
185
223
  async def _call_inference(policy_cfg: Mapping[str, Any], observation: Mapping[str, Any]) -> Mapping[str, Any]:
224
+ # Check if vision mode is enabled
225
+ use_vision = bool(policy_cfg.get("use_vision", False))
226
+ image_only_mode = bool(policy_cfg.get("image_only_mode", False))
227
+
228
+ # Build user message content
229
+ if use_vision and "observation_image_data_url" in observation:
230
+ # Extract image data URL
231
+ image_data_url = observation["observation_image_data_url"]
232
+
233
+ # Build state summary (text observation)
234
+ state_summary = "State summary: " + str({
235
+ k: observation.get(k)
236
+ for k in observation.keys()
237
+ if k not in ["error", "observation_image_base64", "observation_image_data_url",
238
+ "observation_image_format", "observation_image_width", "observation_image_height"]
239
+ })
240
+
241
+ # Image-only mode: only send image, no text
242
+ if image_only_mode:
243
+ user_content = [
244
+ {"type": "image_url", "image_url": {"url": image_data_url}}
245
+ ]
246
+ else:
247
+ # Vision mode with text: send both text and image
248
+ user_content = [
249
+ {"type": "text", "text": state_summary},
250
+ {"type": "image_url", "image_url": {"url": image_data_url}}
251
+ ]
252
+ else:
253
+ # Text-only mode (default)
254
+ state_summary = "State summary: " + str({
255
+ k: observation.get(k) for k in observation.keys() if k != "error"
256
+ })
257
+ user_content = state_summary
258
+
186
259
  messages = [
187
260
  {
188
261
  "role": "system",
@@ -193,9 +266,7 @@ async def rollout_executor(request: RolloutRequest, fastapi_request: Request) ->
193
266
  },
194
267
  {
195
268
  "role": "user",
196
- "content": (
197
- "State summary: " + str({k: observation.get(k) for k in observation.keys() if k != "error"})
198
- ),
269
+ "content": user_content,
199
270
  },
200
271
  ]
201
272
  payload = {
@@ -262,6 +333,10 @@ async def rollout_executor(request: RolloutRequest, fastapi_request: Request) ->
262
333
  "max_tokens": int(policy_cfg.get("max_tokens") or 500),
263
334
  }
264
335
  inference_url = str(policy_cfg.get("inference_url") or "").rstrip("/")
336
+
337
+ # Determine if this is an external URL or internal proxy
338
+ is_external = inference_url.startswith("http://") or inference_url.startswith("https://")
339
+
265
340
  if not inference_url:
266
341
  # Prefer built-in proxy endpoints from app if no external URL
267
342
  provider = (policy_cfg.get("provider") or "").lower()
@@ -269,8 +344,31 @@ async def rollout_executor(request: RolloutRequest, fastapi_request: Request) ->
269
344
  inference_url = "/proxy/groq/v1/chat/completions"
270
345
  else:
271
346
  inference_url = "/proxy/v1/chat/completions"
272
- async with httpx.AsyncClient(base_url="http://127.0.0.1:" + str(fastapi_request.url.port or 8913), timeout=httpx.Timeout(60.0)) as client: # best-effort
273
- resp = await client.post(inference_url, json=payload)
347
+ is_external = False
348
+ elif is_external:
349
+ # Add /v1/chat/completions if using OpenAI directly
350
+ if "api.openai.com" in inference_url and not inference_url.endswith("/chat/completions"):
351
+ inference_url = inference_url + "/v1/chat/completions"
352
+
353
+ if is_external:
354
+ # External API: use direct HTTP client with auth header
355
+ headers = {}
356
+ if "api.openai.com" in inference_url:
357
+ import os
358
+ api_key = os.getenv("OPENAI_API_KEY")
359
+ if api_key:
360
+ headers["Authorization"] = f"Bearer {api_key}"
361
+
362
+ async with httpx.AsyncClient(timeout=httpx.Timeout(60.0)) as client:
363
+ resp = await client.post(inference_url, json=payload, headers=headers)
364
+ else:
365
+ # Internal proxy: use local base_url
366
+ async with httpx.AsyncClient(
367
+ base_url="http://127.0.0.1:" + str(fastapi_request.url.port or 8913),
368
+ timeout=httpx.Timeout(60.0)
369
+ ) as client:
370
+ resp = await client.post(inference_url, json=payload)
371
+
274
372
  resp.raise_for_status()
275
373
  data = resp.json()
276
374
  # Extract first tool call
@@ -555,6 +653,72 @@ async def rollout_executor(request: RolloutRequest, fastapi_request: Request) ->
555
653
  inference_url=inference_url, # NEW: Required for trace correlation
556
654
  )
557
655
 
656
+ # Record outcome rewards and end session
657
+ trace_payload = None
658
+ if tracer_instance is not None:
659
+ try:
660
+ # Count achievements (milestones)
661
+ achievements_count = len(milestone_events)
662
+
663
+ # Build metadata with all relevant info
664
+ reward_metadata = {
665
+ "run_id": request.run_id,
666
+ "env_name": "pokemon_red",
667
+ "final_map": final_state.get("map_id", -1),
668
+ "party_count": final_state.get("party_count", 0),
669
+ "badges": final_state.get("badges", 0),
670
+ "steps": len(steps),
671
+ "milestone_events": milestone_events,
672
+ "reward_components": all_reward_components,
673
+ }
674
+
675
+ # Record outcome reward to Turso
676
+ await tracer_instance.record_outcome_reward(
677
+ total_reward=int(total_reward),
678
+ achievements_count=achievements_count,
679
+ total_steps=len(steps),
680
+ reward_metadata=reward_metadata,
681
+ )
682
+ logger.info(f"[pokemon_red] recorded outcome: reward={total_reward}, achievements={achievements_count}")
683
+
684
+ # End session and get trace
685
+ session_trace = await tracer_instance.end_session()
686
+
687
+ # Build trace payload if requested
688
+ record_config = getattr(request, 'record', None)
689
+ if record_config and getattr(record_config, 'return_trace', False) and session_trace:
690
+ trace_payload = {
691
+ "session_id": session_trace.session_id,
692
+ "created_at": session_trace.created_at.isoformat() if session_trace.created_at else None,
693
+ "metadata": dict(session_trace.metadata or {}),
694
+ "num_timesteps": session_trace.num_timesteps,
695
+ "num_events": session_trace.num_events,
696
+ "num_messages": session_trace.num_messages,
697
+ }
698
+ except Exception as exc:
699
+ logger.warning(f"[pokemon_red] tracing finalization failed: {exc}")
700
+
701
+ # Fallback trace payload if no tracer but CLI needs it
702
+ if trace_payload is None:
703
+ record_config = getattr(request, 'record', None)
704
+ if record_config and getattr(record_config, 'return_trace', False):
705
+ trace_payload = {
706
+ "session_id": request.run_id,
707
+ "created_at": import_datetime().now().isoformat(),
708
+ "metadata": {
709
+ "run_id": request.run_id,
710
+ "env_name": "pokemon_red",
711
+ "total_reward": int(total_reward),
712
+ "final_map": final_state.get("map_id", -1),
713
+ "party_count": final_state.get("party_count", 0),
714
+ "badges": final_state.get("badges", 0),
715
+ "steps": len(steps),
716
+ },
717
+ "num_timesteps": len(steps),
718
+ "num_events": len(steps),
719
+ "num_messages": len(steps) * 2,
720
+ }
721
+
558
722
  return RolloutResponse(
559
723
  run_id=request.run_id,
560
724
  trajectories=[trajectory],
@@ -562,11 +726,40 @@ async def rollout_executor(request: RolloutRequest, fastapi_request: Request) ->
562
726
  metrics=metrics,
563
727
  aborted=False,
564
728
  ops_executed=len(request.ops or []),
729
+ trace=trace_payload,
565
730
  )
566
731
 
567
732
 
733
+ def import_datetime():
734
+ """Helper to import datetime for trace timestamps."""
735
+ from datetime import datetime
736
+ return datetime
737
+
738
+
568
739
  def build_config() -> TaskAppConfig:
569
740
  base_info = _base_task_info()
741
+
742
+ # Set up tracing
743
+ tracing_enabled = tracing_env_enabled()
744
+ tracing_db_url = resolve_tracing_db_url()
745
+ tracer_factory = build_tracer_factory(
746
+ SessionTracer, enabled=tracing_enabled, db_url=tracing_db_url
747
+ )
748
+ sft_output_dir = resolve_sft_output_dir()
749
+
750
+ app_state: dict[str, Any] = {
751
+ "tracing_enabled": tracing_enabled,
752
+ }
753
+ if tracer_factory is not None:
754
+ app_state["session_tracer_factory"] = tracer_factory
755
+ if sft_output_dir:
756
+ app_state["sft_output_dir"] = sft_output_dir
757
+
758
+ if tracing_enabled:
759
+ status_msg = f"[task:tracing] enabled (db={tracing_db_url or 'default'})"
760
+ logger.info(status_msg)
761
+ print(status_msg, flush=True)
762
+
570
763
  return TaskAppConfig(
571
764
  app_id="pokemon_red",
572
765
  name="Pokémon Red Task App",
@@ -585,7 +778,7 @@ def build_config() -> TaskAppConfig:
585
778
  "Example: {\"tool\": \"execute_sequence\", \"args\": {\"actions\": [{\"button\": \"DOWN\", \"frames\": 30}, ...]}}"
586
779
  ),
587
780
  ),
588
- app_state={},
781
+ app_state=app_state,
589
782
  require_api_key=False,
590
783
  expose_debug_env=True,
591
784
  cors_origins=["*"],
@@ -189,3 +189,5 @@ async def main():
189
189
  if __name__ == "__main__":
190
190
  asyncio.run(main())
191
191
 
192
+
193
+
@@ -0,0 +1,5 @@
1
+ [filter]
2
+ db = "traces/v3/synth_ai.db"
3
+ output = "ft_data/sokoban_sft.jsonl"
4
+ min_official_score = 0.01
5
+
@@ -1,2 +1,4 @@
1
1
  # Sokoban task app tests
2
2
 
3
+
4
+
@@ -1,2 +1,4 @@
1
1
  # Integration tests for Sokoban task app
2
2
 
3
+
4
+
@@ -1,2 +1,4 @@
1
1
  # Unit tests for Sokoban task app
2
2
 
3
+
4
+
@@ -1,12 +1,14 @@
1
1
  # Verilog Eval Config for Groq Qwen3-32B
2
+ # Quick eval to test Verilog task app before RL training
2
3
 
3
4
  [task_app]
4
- url = "http://localhost:8103" # Verilog task app port
5
+ # Update this with your Modal URL after deployment
6
+ url = "https://synth-laboratories--grpo-verilog-task-app-fastapi-app-dev.modal.run"
5
7
 
6
8
  [eval]
7
- num_episodes = 5
8
- seeds = [0, 1, 2, 3, 4]
9
- max_steps = 10
9
+ num_episodes = 3 # Quick test with 3 seeds
10
+ seeds = [0, 1, 2]
11
+ max_steps = 15 # More steps for Verilog compilation chains
10
12
 
11
13
  [policy]
12
14
  provider = "groq"
@@ -18,3 +20,5 @@ inference_url = "https://api.groq.com/openai/v1/chat/completions"
18
20
  [env]
19
21
  difficulty = "medium" # Can be "easy", "medium", or "hard"
20
22
 
23
+
24
+
@@ -0,0 +1,5 @@
1
+ [filter]
2
+ db = "traces/v3/synth_ai.db"
3
+ output = "ft_data/verilog_sft.jsonl"
4
+ min_official_score = 0.01
5
+