synth-ai 0.4.1__py3-none-any.whl → 0.4.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of synth-ai might be problematic.

Files changed (153)
  1. synth_ai/__init__.py +13 -13
  2. synth_ai/cli/__init__.py +6 -15
  3. synth_ai/cli/commands/eval/__init__.py +6 -15
  4. synth_ai/cli/commands/eval/config.py +338 -0
  5. synth_ai/cli/commands/eval/core.py +236 -1091
  6. synth_ai/cli/commands/eval/runner.py +704 -0
  7. synth_ai/cli/commands/eval/validation.py +44 -117
  8. synth_ai/cli/commands/filter/core.py +7 -7
  9. synth_ai/cli/commands/filter/validation.py +2 -2
  10. synth_ai/cli/commands/smoke/core.py +7 -17
  11. synth_ai/cli/commands/status/__init__.py +1 -64
  12. synth_ai/cli/commands/status/client.py +50 -151
  13. synth_ai/cli/commands/status/config.py +3 -83
  14. synth_ai/cli/commands/status/errors.py +4 -13
  15. synth_ai/cli/commands/status/subcommands/__init__.py +2 -8
  16. synth_ai/cli/commands/status/subcommands/config.py +13 -0
  17. synth_ai/cli/commands/status/subcommands/files.py +18 -63
  18. synth_ai/cli/commands/status/subcommands/jobs.py +28 -311
  19. synth_ai/cli/commands/status/subcommands/models.py +18 -62
  20. synth_ai/cli/commands/status/subcommands/runs.py +16 -63
  21. synth_ai/cli/commands/status/subcommands/session.py +67 -172
  22. synth_ai/cli/commands/status/subcommands/summary.py +24 -32
  23. synth_ai/cli/commands/status/subcommands/utils.py +41 -0
  24. synth_ai/cli/commands/status/utils.py +16 -107
  25. synth_ai/cli/commands/train/__init__.py +18 -20
  26. synth_ai/cli/commands/train/errors.py +3 -3
  27. synth_ai/cli/commands/train/prompt_learning_validation.py +15 -16
  28. synth_ai/cli/commands/train/validation.py +7 -7
  29. synth_ai/cli/commands/train/{judge_schemas.py → verifier_schemas.py} +33 -34
  30. synth_ai/cli/commands/train/verifier_validation.py +235 -0
  31. synth_ai/cli/demo_apps/demo_task_apps/math/config.toml +0 -1
  32. synth_ai/cli/demo_apps/demo_task_apps/math/modal_task_app.py +2 -6
  33. synth_ai/cli/demo_apps/math/config.toml +0 -1
  34. synth_ai/cli/demo_apps/math/modal_task_app.py +2 -6
  35. synth_ai/cli/demo_apps/mipro/task_app.py +25 -47
  36. synth_ai/cli/lib/apps/task_app.py +12 -13
  37. synth_ai/cli/lib/task_app_discovery.py +6 -6
  38. synth_ai/cli/lib/train_cfgs.py +10 -10
  39. synth_ai/cli/task_apps/__init__.py +11 -0
  40. synth_ai/cli/task_apps/commands.py +7 -15
  41. synth_ai/core/env.py +12 -1
  42. synth_ai/core/errors.py +1 -2
  43. synth_ai/core/integrations/cloudflare.py +209 -33
  44. synth_ai/core/tracing_v3/abstractions.py +46 -0
  45. synth_ai/data/__init__.py +3 -30
  46. synth_ai/data/enums.py +1 -20
  47. synth_ai/data/rewards.py +100 -3
  48. synth_ai/products/graph_evolve/__init__.py +1 -2
  49. synth_ai/products/graph_evolve/config.py +16 -16
  50. synth_ai/products/graph_evolve/converters/__init__.py +3 -3
  51. synth_ai/products/graph_evolve/converters/openai_sft.py +7 -7
  52. synth_ai/products/graph_evolve/examples/hotpotqa/config.toml +1 -1
  53. synth_ai/products/graph_gepa/__init__.py +23 -0
  54. synth_ai/products/graph_gepa/converters/__init__.py +19 -0
  55. synth_ai/products/graph_gepa/converters/openai_sft.py +29 -0
  56. synth_ai/sdk/__init__.py +45 -35
  57. synth_ai/sdk/api/eval/__init__.py +33 -0
  58. synth_ai/sdk/api/eval/job.py +732 -0
  59. synth_ai/sdk/api/research_agent/__init__.py +276 -66
  60. synth_ai/sdk/api/train/builders.py +181 -0
  61. synth_ai/sdk/api/train/cli.py +41 -33
  62. synth_ai/sdk/api/train/configs/__init__.py +6 -4
  63. synth_ai/sdk/api/train/configs/prompt_learning.py +127 -33
  64. synth_ai/sdk/api/train/configs/rl.py +264 -16
  65. synth_ai/sdk/api/train/configs/sft.py +165 -1
  66. synth_ai/sdk/api/train/graph_validators.py +12 -12
  67. synth_ai/sdk/api/train/graphgen.py +169 -51
  68. synth_ai/sdk/api/train/graphgen_models.py +95 -45
  69. synth_ai/sdk/api/train/local_api.py +10 -0
  70. synth_ai/sdk/api/train/pollers.py +36 -0
  71. synth_ai/sdk/api/train/prompt_learning.py +390 -60
  72. synth_ai/sdk/api/train/rl.py +41 -5
  73. synth_ai/sdk/api/train/sft.py +2 -0
  74. synth_ai/sdk/api/train/task_app.py +20 -0
  75. synth_ai/sdk/api/train/validators.py +17 -17
  76. synth_ai/sdk/graphs/completions.py +239 -33
  77. synth_ai/sdk/{judging/schemas.py → graphs/verifier_schemas.py} +23 -23
  78. synth_ai/sdk/learning/__init__.py +35 -5
  79. synth_ai/sdk/learning/context_learning_client.py +531 -0
  80. synth_ai/sdk/learning/context_learning_types.py +294 -0
  81. synth_ai/sdk/learning/prompt_learning_client.py +1 -1
  82. synth_ai/sdk/learning/prompt_learning_types.py +2 -1
  83. synth_ai/sdk/learning/rl/__init__.py +0 -4
  84. synth_ai/sdk/learning/rl/contracts.py +0 -4
  85. synth_ai/sdk/localapi/__init__.py +40 -0
  86. synth_ai/sdk/localapi/apps/__init__.py +28 -0
  87. synth_ai/sdk/localapi/client.py +10 -0
  88. synth_ai/sdk/localapi/contracts.py +10 -0
  89. synth_ai/sdk/localapi/helpers.py +519 -0
  90. synth_ai/sdk/localapi/rollouts.py +93 -0
  91. synth_ai/sdk/localapi/server.py +29 -0
  92. synth_ai/sdk/localapi/template.py +49 -0
  93. synth_ai/sdk/streaming/handlers.py +6 -6
  94. synth_ai/sdk/streaming/streamer.py +10 -6
  95. synth_ai/sdk/task/__init__.py +18 -5
  96. synth_ai/sdk/task/apps/__init__.py +37 -1
  97. synth_ai/sdk/task/client.py +9 -1
  98. synth_ai/sdk/task/config.py +6 -11
  99. synth_ai/sdk/task/contracts.py +137 -95
  100. synth_ai/sdk/task/in_process.py +32 -22
  101. synth_ai/sdk/task/in_process_runner.py +9 -4
  102. synth_ai/sdk/task/rubrics/__init__.py +2 -3
  103. synth_ai/sdk/task/rubrics/loaders.py +4 -4
  104. synth_ai/sdk/task/rubrics/strict.py +3 -4
  105. synth_ai/sdk/task/server.py +76 -16
  106. synth_ai/sdk/task/trace_correlation_helpers.py +190 -139
  107. synth_ai/sdk/task/validators.py +34 -49
  108. synth_ai/sdk/training/__init__.py +7 -16
  109. synth_ai/sdk/tunnels/__init__.py +118 -0
  110. synth_ai/sdk/tunnels/cleanup.py +83 -0
  111. synth_ai/sdk/tunnels/ports.py +120 -0
  112. synth_ai/sdk/tunnels/tunneled_api.py +363 -0
  113. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/METADATA +71 -4
  114. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/RECORD +118 -128
  115. synth_ai/cli/commands/baseline/__init__.py +0 -12
  116. synth_ai/cli/commands/baseline/core.py +0 -636
  117. synth_ai/cli/commands/baseline/list.py +0 -94
  118. synth_ai/cli/commands/eval/errors.py +0 -81
  119. synth_ai/cli/commands/status/formatters.py +0 -164
  120. synth_ai/cli/commands/status/subcommands/pricing.py +0 -23
  121. synth_ai/cli/commands/status/subcommands/usage.py +0 -203
  122. synth_ai/cli/commands/train/judge_validation.py +0 -305
  123. synth_ai/cli/usage.py +0 -159
  124. synth_ai/data/specs.py +0 -36
  125. synth_ai/sdk/api/research_agent/cli.py +0 -428
  126. synth_ai/sdk/api/research_agent/config.py +0 -357
  127. synth_ai/sdk/api/research_agent/job.py +0 -717
  128. synth_ai/sdk/baseline/__init__.py +0 -25
  129. synth_ai/sdk/baseline/config.py +0 -209
  130. synth_ai/sdk/baseline/discovery.py +0 -216
  131. synth_ai/sdk/baseline/execution.py +0 -154
  132. synth_ai/sdk/judging/__init__.py +0 -15
  133. synth_ai/sdk/judging/base.py +0 -24
  134. synth_ai/sdk/judging/client.py +0 -191
  135. synth_ai/sdk/judging/types.py +0 -42
  136. synth_ai/sdk/research_agent/__init__.py +0 -34
  137. synth_ai/sdk/research_agent/container_builder.py +0 -328
  138. synth_ai/sdk/research_agent/container_spec.py +0 -198
  139. synth_ai/sdk/research_agent/defaults.py +0 -34
  140. synth_ai/sdk/research_agent/results_collector.py +0 -69
  141. synth_ai/sdk/specs/__init__.py +0 -46
  142. synth_ai/sdk/specs/dataclasses.py +0 -149
  143. synth_ai/sdk/specs/loader.py +0 -144
  144. synth_ai/sdk/specs/serializer.py +0 -199
  145. synth_ai/sdk/specs/validation.py +0 -250
  146. synth_ai/sdk/tracing/__init__.py +0 -39
  147. synth_ai/sdk/usage/__init__.py +0 -37
  148. synth_ai/sdk/usage/client.py +0 -171
  149. synth_ai/sdk/usage/models.py +0 -261
  150. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/WHEEL +0 -0
  151. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/entry_points.txt +0 -0
  152. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/licenses/LICENSE +0 -0
  153. {synth_ai-0.4.1.dist-info → synth_ai-0.4.4.dist-info}/top_level.txt +0 -0
@@ -196,11 +196,11 @@ class ParetoFloorsConfig(BaseModel):
196
196
 
197
197
 
198
198
  # ============================================================================
199
- # ADAS Dataset Format Models
199
+ # Graph Opt Dataset Format Models
200
200
  # ============================================================================
201
201
 
202
202
  class TaskInput(BaseModel):
203
- """A single task/example in an ADAS dataset.
203
+ """A single task/example in a Graph Opt dataset.
204
204
 
205
205
  For POLICY graphs: Contains the problem to solve.
206
206
  For VERIFIER graphs: Contains a trace to evaluate.
@@ -267,8 +267,8 @@ class GoldOutput(BaseModel):
267
267
  score: Optional[float] = Field(default=None, ge=0.0, le=1.0, description="Gold score (0.0-1.0)")
268
268
 
269
269
 
270
- class ADASDatasetMetadata(BaseModel):
271
- """Metadata about an ADAS dataset.
270
+ class GraphOptDatasetMetadata(BaseModel):
271
+ """Metadata about a Graph Opt dataset.
272
272
 
273
273
  Provides context for graph generation and optimization.
274
274
  """
@@ -279,8 +279,8 @@ class ADASDatasetMetadata(BaseModel):
279
279
  domain: Optional[str] = Field(default=None, description="Domain (qa, code, games, etc.)")
280
280
 
281
281
 
282
- class ADASDataset(BaseModel):
283
- """Complete ADAS dataset format for inline upload.
282
+ class GraphOptDataset(BaseModel):
283
+ """Complete Graph Opt dataset format for inline upload.
284
284
 
285
285
  This is the schema for the `dataset` field in GraphOptimizationConfig
286
286
  when uploading data directly instead of using a pre-registered dataset.
@@ -303,7 +303,7 @@ class ADASDataset(BaseModel):
303
303
  """
304
304
  tasks: List[TaskInput] = Field(..., min_length=1, description="List of tasks/examples")
305
305
  gold_outputs: List[GoldOutput] = Field(..., min_length=1, description="Ground truth for each task")
306
- metadata: ADASDatasetMetadata = Field(default_factory=ADASDatasetMetadata)
306
+ metadata: GraphOptDatasetMetadata = Field(default_factory=GraphOptDatasetMetadata)
307
307
 
308
308
  @field_validator("tasks", mode="before")
309
309
  @classmethod
@@ -443,7 +443,7 @@ class GraphOptimizationConfig(BaseModel):
443
443
  # Format: {"name": str, "task_description": str, "examples": [...]}
444
444
  dataset: Optional[Dict[str, Any]] = Field(
445
445
  default=None,
446
- description="Inline dataset for upload (ADAS format). If provided, dataset_name is used as identifier."
446
+ description="Inline dataset for upload (GraphGen format). If provided, dataset_name is used as identifier."
447
447
  )
448
448
 
449
449
  # Task context for initial graph generation (when dataset doesn't provide it)
@@ -464,8 +464,8 @@ class GraphOptimizationConfig(BaseModel):
464
464
  )
465
465
 
466
466
  # Scoring configuration
467
- scoring_strategy: str = Field(default="rubric", description="Scoring strategy: 'default', 'rubric', 'mae'")
468
- judge_model: str = Field(default="gpt-4o-mini", description="Model for LLM judge scoring")
467
+ verifier_mode: str = Field(default="rubric", description="Verifier mode: 'rubric', 'contrastive', 'fewshot'")
468
+ verifier_model: str = Field(default="gpt-4o-mini", description="Model for LLM verifier scoring")
469
469
 
470
470
  @field_validator("graph_type", mode="before")
471
471
  @classmethod
@@ -529,8 +529,8 @@ class GraphOptimizationConfig(BaseModel):
529
529
  "graph_structure": self.graph_structure.value,
530
530
  "allowed_policy_models": self.allowed_policy_models,
531
531
  "dataset_config": self.dataset_config,
532
- "scoring_strategy": self.scoring_strategy,
533
- "judge_model": self.judge_model,
532
+ "verifier_mode": self.verifier_mode,
533
+ "verifier_model": self.verifier_model,
534
534
  }
535
535
 
536
536
  if self.max_llm_calls_per_run is not None:
@@ -551,19 +551,19 @@ class GraphOptimizationConfig(BaseModel):
551
551
  if self.dataset:
552
552
  # Validate dataset structure using Pydantic model
553
553
  try:
554
- validated = ADASDataset(**self.dataset)
554
+ validated = GraphOptDataset(**self.dataset)
555
555
  # Check for task ID consistency (non-fatal warnings)
556
556
  warnings = validated.validate_task_ids()
557
557
  if warnings:
558
558
  import logging
559
559
  logger = logging.getLogger(__name__)
560
560
  for w in warnings:
561
- logger.warning(f"[ADASDataset] {w}")
561
+ logger.warning(f"[GraphOptDataset] {w}")
562
562
  except Exception as e:
563
563
  raise ValueError(
564
- f"Invalid ADAS dataset format: {e}\n"
564
+ f"Invalid Graph Opt dataset format: {e}\n"
565
565
  f"Expected format: {{'tasks': [...], 'gold_outputs': [...], 'metadata': {{...}}}}\n"
566
- f"See ADASDataset model for full schema.\n"
566
+ f"See GraphOptDataset model for full schema.\n"
567
567
  f"Got keys: {list(self.dataset.keys())}"
568
568
  )
569
569
  request["dataset"] = self.dataset
@@ -1,7 +1,7 @@
1
1
  """Dataset converters for Graph GEPA.
2
2
 
3
3
  This module provides converters to transform common dataset formats
4
- into ADAS format for use with Graph GEPA optimization.
4
+ into Graph Opt format for use with Graph GEPA optimization.
5
5
 
6
6
  Supported formats:
7
7
  - OpenAI SFT: JSONL with messages array (system, user, assistant roles)
@@ -11,13 +11,13 @@ Example:
11
11
  >>>
12
12
  >>> # Convert from file
13
13
  >>> result = convert_openai_sft("training_data.jsonl")
14
- >>> adas_dataset = result.dataset
14
+ >>> graph_opt_dataset = result.dataset
15
15
  >>>
16
16
  >>> # Use in GraphOptimizationConfig
17
17
  >>> from synth_ai.products.graph_gepa import GraphOptimizationConfig
18
18
  >>> config = GraphOptimizationConfig(
19
19
  ... dataset_name="my_qa_task",
20
- ... dataset=adas_dataset,
20
+ ... dataset=graph_opt_dataset,
21
21
  ... graph_type="policy",
22
22
  ... ...
23
23
  ... )
@@ -1,6 +1,6 @@
1
- """OpenAI SFT format to ADAS dataset converter.
1
+ """OpenAI SFT format to Graph Opt dataset converter.
2
2
 
3
- This module converts OpenAI SFT format (JSONL with messages array) to ADAS format
3
+ This module converts OpenAI SFT format (JSONL with messages array) to Graph Opt format
4
4
  for use with Graph GEPA optimization.
5
5
 
6
6
  Example OpenAI SFT format:
@@ -10,7 +10,7 @@ Example OpenAI SFT format:
10
10
  {"role": "assistant", "content": "Paris"}
11
11
  ]}
12
12
 
13
- Example ADAS output:
13
+ Example Graph Opt output:
14
14
  {
15
15
  "tasks": [{"task_id": "sft_0000", "input": {"user_message": "..."}}],
16
16
  "gold_outputs": [{"task_id": "sft_0000", "output": {"response": "..."}, "score": 1.0}],
@@ -59,10 +59,10 @@ class ConversionWarning:
59
59
 
60
60
  @dataclass
61
61
  class ConversionResult:
62
- """Result of converting SFT to ADAS.
62
+ """Result of converting SFT to Graph Opt.
63
63
 
64
64
  Attributes:
65
- dataset: The ADAS dataset dict
65
+ dataset: The Graph Opt dataset dict
66
66
  warnings: Non-fatal issues encountered
67
67
  stats: Conversion statistics
68
68
  """
@@ -343,7 +343,7 @@ def convert_openai_sft(
343
343
  detect_template: bool = True,
344
344
  max_examples: int | None = None,
345
345
  ) -> ConversionResult:
346
- """Convert OpenAI SFT format to ADAS dataset.
346
+ """Convert OpenAI SFT format to Graph Opt dataset.
347
347
 
348
348
  Args:
349
349
  source: Path to JSONL file, or list of SFT example dicts
@@ -352,7 +352,7 @@ def convert_openai_sft(
352
352
  max_examples: Maximum number of examples to include (None for all)
353
353
 
354
354
  Returns:
355
- ConversionResult containing the ADAS dataset, warnings, and stats
355
+ ConversionResult containing the Graph Opt dataset, warnings, and stats
356
356
 
357
357
  Raises:
358
358
  ConversionError: If no valid examples found
@@ -7,7 +7,7 @@ algorithm = "graph_gepa"
7
7
 
8
8
  # What we're optimizing
9
9
  dataset_name = "hotpotqa"
10
- graph_type = "policy" # "policy" (solves tasks), "verifier" (judges results), or "rlm" (massive context via tools)
10
+ graph_type = "policy" # "policy" (solves tasks), "verifier" (verifies results), or "rlm" (massive context via tools)
11
11
  graph_structure = "dag" # "single_prompt", "dag", or "conditional"
12
12
 
13
13
  # Custom topology guidance (optional - adds detail to graph_structure)
@@ -0,0 +1,23 @@
1
+ """Backward-compatible Graph GEPA package alias."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from synth_ai.products.graph_evolve import (
6
+ GraphOptimizationClient,
7
+ GraphOptimizationConfig,
8
+ ConversionError,
9
+ ConversionResult,
10
+ ConversionWarning,
11
+ convert_openai_sft,
12
+ preview_conversion,
13
+ )
14
+
15
+ __all__ = [
16
+ "GraphOptimizationConfig",
17
+ "GraphOptimizationClient",
18
+ "convert_openai_sft",
19
+ "preview_conversion",
20
+ "ConversionResult",
21
+ "ConversionWarning",
22
+ "ConversionError",
23
+ ]
@@ -0,0 +1,19 @@
1
+ """Graph GEPA converters (compatibility layer)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from synth_ai.products.graph_evolve.converters import (
6
+ ConversionError,
7
+ ConversionResult,
8
+ ConversionWarning,
9
+ convert_openai_sft,
10
+ preview_conversion,
11
+ )
12
+
13
+ __all__ = [
14
+ "convert_openai_sft",
15
+ "preview_conversion",
16
+ "ConversionResult",
17
+ "ConversionWarning",
18
+ "ConversionError",
19
+ ]
@@ -0,0 +1,29 @@
1
+ """Compatibility wrapper for OpenAI SFT converters."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from synth_ai.products.graph_evolve.converters.openai_sft import (
6
+ ConversionError,
7
+ ConversionResult,
8
+ ConversionWarning,
9
+ convert_openai_sft,
10
+ detect_system_prompt,
11
+ extract_fields,
12
+ infer_template,
13
+ parse_sft_example,
14
+ preview_conversion,
15
+ validate_sft_examples,
16
+ )
17
+
18
+ __all__ = [
19
+ "ConversionError",
20
+ "ConversionResult",
21
+ "ConversionWarning",
22
+ "convert_openai_sft",
23
+ "detect_system_prompt",
24
+ "extract_fields",
25
+ "infer_template",
26
+ "parse_sft_example",
27
+ "preview_conversion",
28
+ "validate_sft_examples",
29
+ ]
synth_ai/sdk/__init__.py CHANGED
@@ -1,19 +1,16 @@
1
1
  """Synth AI SDK Layer.
2
2
 
3
3
  This module provides the user-facing programmatic API for:
4
- - Training (prompt learning, SFT, RL)
4
+ - Training (prompt learning, SFT, RL, graph generation)
5
5
  - Task apps (in-process, deployed, Modal)
6
- - Judging (LLM-based evaluation)
6
+ - Graphs (verifiers, completions)
7
7
  - Inference (model inference via Synth)
8
- - Tracing (session traces)
9
- - Specs (system specifications)
10
- - Research agents (scaffold tuning, evaluation)
11
8
 
12
9
  Usage:
13
10
  from synth_ai.sdk import (
14
11
  PromptLearningJob,
15
12
  InProcessTaskApp,
16
- JudgeClient,
13
+ VerifierClient,
17
14
  InferenceClient,
18
15
  )
19
16
 
@@ -24,32 +21,22 @@ Dependency rules:
24
21
 
25
22
  from __future__ import annotations
26
23
 
27
- # Research Agent
28
- from synth_ai.sdk.api.research_agent import ResearchAgentJob, ResearchAgentJobConfig
29
-
30
24
  # Inference
31
25
  from synth_ai.sdk.inference import InferenceClient
32
26
 
33
27
  # Jobs API Client
34
28
  from synth_ai.sdk.jobs import JobsClient
35
29
 
36
- # Judging
37
- from synth_ai.sdk.judging import JudgeClient, JudgeOptions, JudgeScoreResponse
30
+ # Verifier types and graph clients
38
31
  from synth_ai.sdk.graphs import GraphCompletionsClient, GraphTarget, VerifierClient
39
-
40
- # Specs
41
- from synth_ai.sdk.specs import (
42
- load_spec_from_dict,
43
- load_spec_from_file,
44
- spec_to_prompt_context,
45
- validate_spec_dict,
46
- validate_spec_file,
47
- )
32
+ from synth_ai.sdk.graphs.verifier_schemas import VerifierOptions, VerifierScoreResponse
48
33
 
49
34
  # Task Apps
50
35
  from synth_ai.sdk.task import (
51
36
  InProcessJobResult,
52
37
  InProcessTaskApp,
38
+ LocalAPIClient,
39
+ LocalAPIConfig,
53
40
  TaskAppConfig,
54
41
  create_task_app,
55
42
  merge_dot_overrides,
@@ -69,10 +56,27 @@ from synth_ai.sdk.training import (
69
56
  GraphGenTask,
70
57
  GraphGenGoldOutput,
71
58
  GraphGenRubric,
72
- GraphGenJudgeConfig,
59
+ GraphGenVerifierConfig,
73
60
  load_graphgen_taskset,
74
61
  )
75
62
 
63
+ # Evaluation
64
+ from synth_ai.sdk.api.eval import EvalJob, EvalJobConfig
65
+
66
+ # Tunnels - commonly used functions for notebook/script usage
67
+ from synth_ai.sdk.tunnels import (
68
+ rotate_tunnel,
69
+ open_managed_tunnel,
70
+ stop_tunnel,
71
+ track_process,
72
+ cleanup_all,
73
+ verify_tunnel_dns_resolution,
74
+ wait_for_health_check,
75
+ kill_port,
76
+ is_port_available,
77
+ find_available_port,
78
+ )
79
+
76
80
  __all__ = [
77
81
  # Training
78
82
  "PromptLearningJob",
@@ -85,8 +89,11 @@ __all__ = [
85
89
  "GraphGenTask",
86
90
  "GraphGenGoldOutput",
87
91
  "GraphGenRubric",
88
- "GraphGenJudgeConfig",
92
+ "GraphGenVerifierConfig",
89
93
  "load_graphgen_taskset",
94
+ # Evaluation
95
+ "EvalJob",
96
+ "EvalJobConfig",
90
97
  # Task Apps
91
98
  "InProcessTaskApp",
92
99
  "InProcessJobResult",
@@ -94,26 +101,29 @@ __all__ = [
94
101
  "resolve_backend_api_base",
95
102
  "run_in_process_job",
96
103
  "run_in_process_job_sync",
104
+ "LocalAPIClient",
105
+ "LocalAPIConfig",
97
106
  "TaskAppConfig",
98
107
  "create_task_app",
99
- # Judging
100
- "JudgeClient",
108
+ # Graphs / Verifier
101
109
  "VerifierClient",
102
- "JudgeOptions",
103
- "JudgeScoreResponse",
110
+ "VerifierOptions",
111
+ "VerifierScoreResponse",
104
112
  "GraphCompletionsClient",
105
113
  "GraphTarget",
106
114
  # Inference
107
115
  "InferenceClient",
108
- # Specs
109
- "load_spec_from_dict",
110
- "load_spec_from_file",
111
- "spec_to_prompt_context",
112
- "validate_spec_dict",
113
- "validate_spec_file",
114
- # Research Agent
115
- "ResearchAgentJob",
116
- "ResearchAgentJobConfig",
117
116
  # Jobs API Client
118
117
  "JobsClient",
118
+ # Tunnels
119
+ "rotate_tunnel",
120
+ "open_managed_tunnel",
121
+ "stop_tunnel",
122
+ "track_process",
123
+ "cleanup_all",
124
+ "verify_tunnel_dns_resolution",
125
+ "wait_for_health_check",
126
+ "kill_port",
127
+ "is_port_available",
128
+ "find_available_port",
119
129
  ]
@@ -0,0 +1,33 @@
1
+ """First-class SDK API for evaluation jobs.
2
+
3
+ This module provides high-level abstractions for running evaluation jobs
4
+ both via CLI and programmatically in Python scripts.
5
+
6
+ Example CLI usage:
7
+ python -m synth_ai.cli eval --config banking77_eval.toml --backend http://localhost:8000
8
+
9
+ Example SDK usage:
10
+ from synth_ai.sdk.api.eval import EvalJob, EvalResult
11
+
12
+ job = EvalJob(config)
13
+ job.submit()
14
+
15
+ # progress=True provides built-in status printing:
16
+ # [00:05] running | 3/10 completed
17
+ # [00:10] running | 7/10 completed
18
+ # [00:15] completed | mean_score: 0.85
19
+ result = job.poll_until_complete(progress=True)
20
+
21
+ # Typed result access (not raw dict)
22
+ if result.succeeded:
23
+ print(f"Mean score: {result.mean_score}")
24
+ print(f"Total cost: ${result.total_cost_usd:.4f}")
25
+
26
+ See Also:
27
+ - `synth_ai.cli.commands.eval`: CLI implementation
28
+ - Backend API: POST /api/eval/jobs
29
+ """
30
+
31
+ from .job import EvalJob, EvalJobConfig, EvalResult, EvalStatus
32
+
33
+ __all__ = ["EvalJob", "EvalJobConfig", "EvalResult", "EvalStatus"]