@agentv/sdk 4.41.5-next.1 → 4.41.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -6483,6 +6483,201 @@ declare const PromptTemplateInputSchema: z.ZodObject<{
6483
6483
  }>;
6484
6484
  type PromptTemplateInput = CodeGraderInput;
6485
6485
 
6486
+ declare const EVAL_SUITE_SYMBOL: unique symbol;
6487
+ declare const TO_EVAL_YAML_OBJECT_SYMBOL: unique symbol;
6488
+ declare const KNOWN_SNAKE_CASE_KEYS: {
6489
+ readonly afterAll: "after_all";
6490
+ readonly afterEach: "after_each";
6491
+ readonly argsMatch: "args_match";
6492
+ readonly baseCommit: "base_commit";
6493
+ readonly beforeAll: "before_all";
6494
+ readonly beforeEach: "before_each";
6495
+ readonly budgetUsd: "budget_usd";
6496
+ readonly conversationId: "conversation_id";
6497
+ readonly costLimitUsd: "cost_limit_usd";
6498
+ readonly dependsOn: "depends_on";
6499
+ readonly expectedOutput: "expected_output";
6500
+ readonly explorationTolerance: "exploration_tolerance";
6501
+ readonly failOnError: "fail_on_error";
6502
+ readonly inputFiles: "input_files";
6503
+ readonly keepWorkspaces: "keep_workspaces";
6504
+ readonly maxCostUsd: "max_cost_usd";
6505
+ readonly maxDurationMs: "max_duration_ms";
6506
+ readonly maxInput: "max_input";
6507
+ readonly maxLlmCalls: "max_llm_calls";
6508
+ readonly maxOutput: "max_output";
6509
+ readonly maxTokens: "max_tokens";
6510
+ readonly maxToolCalls: "max_tool_calls";
6511
+ readonly minScore: "min_score";
6512
+ readonly onDependencyFailure: "on_dependency_failure";
6513
+ readonly onTurnFailure: "on_turn_failure";
6514
+ readonly outputPath: "output_path";
6515
+ readonly scoreRanges: "score_ranges";
6516
+ readonly skipDefaults: "skip_defaults";
6517
+ readonly targetExplorationRatio: "target_exploration_ratio";
6518
+ readonly timeoutMs: "timeout_ms";
6519
+ readonly useTarget: "use_target";
6520
+ readonly windowSize: "window_size";
6521
+ };
6522
+ type KnownSnakeCaseKeyMap = typeof KNOWN_SNAKE_CASE_KEYS;
6523
+ type LowerEvalKey<Key extends string> = Key extends keyof KnownSnakeCaseKeyMap ? KnownSnakeCaseKeyMap[Key] : Key;
6524
+ type LowerEvalYamlValue<Value> = Value extends readonly (infer Item)[] ? LowerEvalYamlValue<Item>[] : Value extends object ? {
6525
+ [Key in keyof Value as Key extends string ? LowerEvalKey<Key> : never]: LowerEvalYamlValue<Value[Key]>;
6526
+ } : Value;
6527
+ type EvalMessageContent = string | Readonly<Record<string, unknown>> | readonly (string | Readonly<Record<string, unknown>>)[];
6528
+ interface EvalMessage {
6529
+ readonly role: 'system' | 'user' | 'assistant' | 'tool';
6530
+ readonly content: EvalMessageContent;
6531
+ readonly [key: string]: unknown;
6532
+ }
6533
+ interface EvalAssertionConfig {
6534
+ readonly type: string;
6535
+ readonly [key: string]: unknown;
6536
+ }
6537
+ interface EvalPreprocessor {
6538
+ readonly type: string;
6539
+ readonly command: string | readonly string[];
6540
+ readonly [key: string]: unknown;
6541
+ }
6542
+ interface EvalWorkspaceHook {
6543
+ readonly command?: string | readonly string[];
6544
+ readonly script?: string | readonly string[];
6545
+ readonly timeoutMs?: number;
6546
+ readonly cwd?: string;
6547
+ readonly reset?: 'none' | 'fast' | 'strict';
6548
+ readonly [key: string]: unknown;
6549
+ }
6550
+ interface EvalWorkspaceHooks {
6551
+ readonly enabled?: boolean;
6552
+ readonly beforeAll?: EvalWorkspaceHook;
6553
+ readonly beforeEach?: EvalWorkspaceHook;
6554
+ readonly afterEach?: EvalWorkspaceHook;
6555
+ readonly afterAll?: EvalWorkspaceHook;
6556
+ }
6557
+ interface EvalWorkspaceRepo {
6558
+ readonly path?: string;
6559
+ readonly repo?: string;
6560
+ readonly commit?: string;
6561
+ readonly baseCommit?: string;
6562
+ readonly ancestor?: number;
6563
+ readonly sparse?: readonly string[];
6564
+ }
6565
+ interface EvalDockerWorkspace {
6566
+ readonly image: string;
6567
+ readonly timeout?: number;
6568
+ readonly memory?: string;
6569
+ readonly cpus?: number;
6570
+ }
6571
+ interface EvalWorkspace {
6572
+ readonly template?: string;
6573
+ readonly isolation?: 'shared' | 'per_test';
6574
+ readonly repos?: readonly EvalWorkspaceRepo[];
6575
+ readonly hooks?: EvalWorkspaceHooks;
6576
+ readonly mode?: 'pooled' | 'temp' | 'static';
6577
+ readonly path?: string;
6578
+ readonly docker?: EvalDockerWorkspace;
6579
+ }
6580
+ interface EvalTargetRef {
6581
+ readonly name: string;
6582
+ readonly useTarget?: string;
6583
+ readonly hooks?: EvalWorkspaceHooks;
6584
+ }
6585
+ interface EvalTrials {
6586
+ readonly count: number;
6587
+ readonly strategy?: 'pass_at_k' | 'mean' | 'confidence_interval';
6588
+ readonly costLimitUsd?: number;
6589
+ }
6590
+ interface EvalExecution {
6591
+ readonly target?: string;
6592
+ readonly targets?: readonly (string | EvalTargetRef)[];
6593
+ readonly workers?: number;
6594
+ readonly assertions?: readonly EvalAssertionConfig[];
6595
+ readonly skipDefaults?: boolean;
6596
+ readonly cache?: boolean;
6597
+ readonly trials?: EvalTrials;
6598
+ readonly budgetUsd?: number;
6599
+ readonly failOnError?: boolean;
6600
+ readonly threshold?: number;
6601
+ readonly [key: string]: unknown;
6602
+ }
6603
+ interface EvalTurn {
6604
+ readonly input: EvalMessageContent;
6605
+ readonly expectedOutput?: EvalMessageContent;
6606
+ readonly assertions?: readonly (string | EvalAssertionConfig)[];
6607
+ }
6608
+ interface EvalTest {
6609
+ readonly id: string;
6610
+ readonly vars?: Readonly<Record<string, unknown>>;
6611
+ readonly criteria?: string;
6612
+ readonly input?: string | readonly EvalMessage[];
6613
+ readonly inputFiles?: readonly string[];
6614
+ readonly expectedOutput?: string | Readonly<Record<string, unknown>> | readonly EvalMessage[];
6615
+ readonly assertions?: readonly EvalAssertionConfig[];
6616
+ readonly execution?: EvalExecution;
6617
+ readonly workspace?: EvalWorkspace;
6618
+ readonly metadata?: Readonly<Record<string, unknown>>;
6619
+ readonly conversationId?: string;
6620
+ readonly suite?: string;
6621
+ readonly dependsOn?: readonly string[];
6622
+ readonly onDependencyFailure?: 'skip' | 'fail' | 'run';
6623
+ readonly mode?: 'conversation';
6624
+ readonly turns?: readonly EvalTurn[];
6625
+ readonly aggregation?: 'mean' | 'min' | 'max';
6626
+ readonly onTurnFailure?: 'continue' | 'stop';
6627
+ readonly windowSize?: number;
6628
+ }
6629
+ interface EvalRequires {
6630
+ readonly agentv?: string;
6631
+ readonly [key: string]: unknown;
6632
+ }
6633
+ interface EvalDefinition {
6634
+ readonly $schema?: string;
6635
+ readonly name?: string;
6636
+ readonly description?: string;
6637
+ readonly category?: string;
6638
+ readonly version?: string;
6639
+ readonly author?: string;
6640
+ readonly tags?: readonly string[];
6641
+ readonly license?: string;
6642
+ readonly requires?: EvalRequires;
6643
+ readonly input?: string | readonly EvalMessage[];
6644
+ readonly inputFiles?: readonly string[];
6645
+ readonly tests: readonly EvalTest[] | string;
6646
+ readonly target?: string;
6647
+ readonly execution?: EvalExecution;
6648
+ readonly assertions?: readonly EvalAssertionConfig[];
6649
+ readonly preprocessors?: readonly EvalPreprocessor[];
6650
+ readonly workspace?: EvalWorkspace | string;
6651
+ }
6652
+ interface DefinedEvalSuite {
6653
+ readonly [EVAL_SUITE_SYMBOL]: true;
6654
+ readonly [TO_EVAL_YAML_OBJECT_SYMBOL]: () => Record<string, unknown>;
6655
+ }
6656
+ /**
6657
+ * Define a YAML-aligned eval suite in TypeScript.
6658
+ *
6659
+ * The returned object preserves the TypeScript authoring shape and carries a
6660
+ * non-enumerable lowering hook so AgentV can materialize the canonical
6661
+ * snake_case eval contract when the suite is loaded from a `.eval.ts` file.
6662
+ */
6663
+ declare function defineEval<T extends EvalDefinition>(definition: T): T & DefinedEvalSuite;
6664
+ /**
6665
+ * Alias for `defineEval()` when a suite reads more clearly as a plain object.
6666
+ */
6667
+ declare function evalSuite<T extends EvalDefinition>(definition: T): T & DefinedEvalSuite;
6668
+ /**
6669
+ * Lower a TypeScript-authored eval suite into the canonical snake_case object
6670
+ * contract used by YAML files and the runtime loader.
6671
+ *
6672
+ * Only known AgentV wire keys are converted. Unknown keys are preserved as-is
6673
+ * so opaque assertion, provider, and metadata payloads are not corrupted.
6674
+ */
6675
+ declare function toEvalYamlObject<T extends EvalDefinition | DefinedEvalSuite>(definition: T): LowerEvalYamlValue<T>;
6676
+ /**
6677
+ * Serialize an eval suite to canonical YAML.
6678
+ */
6679
+ declare function serializeEvalYaml<T extends EvalDefinition | DefinedEvalSuite>(definition: T): string;
6680
+
6486
6681
  /**
6487
6682
  * Client for invoking configured targets from code-grader scripts.
6488
6683
  *
@@ -6827,4 +7022,4 @@ declare function definePromptTemplate(handler: PromptTemplateHandler): void;
6827
7022
  */
6828
7023
  declare function defineAssertion(handler: AssertionHandler): void;
6829
7024
 
6830
- export { type AssertionContext, type AssertionHandler, type AssertionScore, type AssertionType, type CodeGraderHandler, type CodeGraderInput, CodeGraderInputSchema, type CodeGraderResult, CodeGraderResultSchema, type Content, type ContentFile, ContentFileSchema, type ContentImage, ContentImageSchema, ContentSchema, type ContentText, ContentTextSchema, type Message, MessageSchema, type PromptTemplateHandler, type PromptTemplateInput, PromptTemplateInputSchema, TRACE_EVENT_TYPES, TRACE_REDACTION_LEVELS, TRACE_SOURCE_KINDS, TRACE_TOOL_STATUSES, type TargetClient, type TargetInfo, TargetInvocationError, type TargetInvokeRequest, type TargetInvokeResponse, TargetNotAvailableError, type TokenUsage, TokenUsageSchema, type ToolCall, ToolCallSchema, type Trace, type TraceArtifact, TraceArtifactSchema, type TraceBranch, TraceBranchSchema, type TraceError, TraceErrorSchema, type TraceEvent, TraceEventSchema, type TraceMessage, TraceMessageSchema, type TraceModel, TraceModelSchema, type TraceRawEvidence, TraceRawEvidenceSchema, type TraceRedactionState, TraceRedactionStateSchema, TraceSchema, type TraceSession, TraceSessionSchema, type TraceSource, type TraceSourceRef, TraceSourceRefSchema, TraceSourceSchema, type TraceSummary, TraceSummarySchema, type TraceTool, TraceToolSchema, createTargetClient, defineAssertion, defineCodeGrader, definePromptTemplate };
7025
+ export { type AssertionContext, type AssertionHandler, type AssertionScore, type AssertionType, type CodeGraderHandler, type CodeGraderInput, CodeGraderInputSchema, type CodeGraderResult, CodeGraderResultSchema, type Content, type ContentFile, ContentFileSchema, type ContentImage, ContentImageSchema, ContentSchema, type ContentText, ContentTextSchema, type DefinedEvalSuite, type EvalAssertionConfig, type EvalDefinition, type EvalDockerWorkspace, type EvalExecution, type EvalMessage, type EvalMessageContent, type EvalPreprocessor, type EvalRequires, type EvalTargetRef, type EvalTest, type EvalTrials, type EvalTurn, type EvalWorkspace, type EvalWorkspaceHook, type EvalWorkspaceHooks, type EvalWorkspaceRepo, type LowerEvalYamlValue, type Message, MessageSchema, type PromptTemplateHandler, type PromptTemplateInput, PromptTemplateInputSchema, TRACE_EVENT_TYPES, TRACE_REDACTION_LEVELS, TRACE_SOURCE_KINDS, TRACE_TOOL_STATUSES, type TargetClient, type TargetInfo, TargetInvocationError, type TargetInvokeRequest, type TargetInvokeResponse, TargetNotAvailableError, type TokenUsage, TokenUsageSchema, type ToolCall, ToolCallSchema, type Trace, type TraceArtifact, TraceArtifactSchema, type TraceBranch, TraceBranchSchema, type TraceError, TraceErrorSchema, type TraceEvent, TraceEventSchema, type TraceMessage, TraceMessageSchema, type TraceModel, TraceModelSchema, type TraceRawEvidence, TraceRawEvidenceSchema, type TraceRedactionState, TraceRedactionStateSchema, TraceSchema, type TraceSession, TraceSessionSchema, type TraceSource, type TraceSourceRef, TraceSourceRefSchema, TraceSourceSchema, type TraceSummary, TraceSummarySchema, type TraceTool, TraceToolSchema, createTargetClient, defineAssertion, defineCodeGrader, defineEval, definePromptTemplate, evalSuite, serializeEvalYaml, toEvalYamlObject };
package/dist/index.d.ts CHANGED
@@ -6483,6 +6483,201 @@ declare const PromptTemplateInputSchema: z.ZodObject<{
6483
6483
  }>;
6484
6484
  type PromptTemplateInput = CodeGraderInput;
6485
6485
 
6486
+ declare const EVAL_SUITE_SYMBOL: unique symbol;
6487
+ declare const TO_EVAL_YAML_OBJECT_SYMBOL: unique symbol;
6488
+ declare const KNOWN_SNAKE_CASE_KEYS: {
6489
+ readonly afterAll: "after_all";
6490
+ readonly afterEach: "after_each";
6491
+ readonly argsMatch: "args_match";
6492
+ readonly baseCommit: "base_commit";
6493
+ readonly beforeAll: "before_all";
6494
+ readonly beforeEach: "before_each";
6495
+ readonly budgetUsd: "budget_usd";
6496
+ readonly conversationId: "conversation_id";
6497
+ readonly costLimitUsd: "cost_limit_usd";
6498
+ readonly dependsOn: "depends_on";
6499
+ readonly expectedOutput: "expected_output";
6500
+ readonly explorationTolerance: "exploration_tolerance";
6501
+ readonly failOnError: "fail_on_error";
6502
+ readonly inputFiles: "input_files";
6503
+ readonly keepWorkspaces: "keep_workspaces";
6504
+ readonly maxCostUsd: "max_cost_usd";
6505
+ readonly maxDurationMs: "max_duration_ms";
6506
+ readonly maxInput: "max_input";
6507
+ readonly maxLlmCalls: "max_llm_calls";
6508
+ readonly maxOutput: "max_output";
6509
+ readonly maxTokens: "max_tokens";
6510
+ readonly maxToolCalls: "max_tool_calls";
6511
+ readonly minScore: "min_score";
6512
+ readonly onDependencyFailure: "on_dependency_failure";
6513
+ readonly onTurnFailure: "on_turn_failure";
6514
+ readonly outputPath: "output_path";
6515
+ readonly scoreRanges: "score_ranges";
6516
+ readonly skipDefaults: "skip_defaults";
6517
+ readonly targetExplorationRatio: "target_exploration_ratio";
6518
+ readonly timeoutMs: "timeout_ms";
6519
+ readonly useTarget: "use_target";
6520
+ readonly windowSize: "window_size";
6521
+ };
6522
+ type KnownSnakeCaseKeyMap = typeof KNOWN_SNAKE_CASE_KEYS;
6523
+ type LowerEvalKey<Key extends string> = Key extends keyof KnownSnakeCaseKeyMap ? KnownSnakeCaseKeyMap[Key] : Key;
6524
+ type LowerEvalYamlValue<Value> = Value extends readonly (infer Item)[] ? LowerEvalYamlValue<Item>[] : Value extends object ? {
6525
+ [Key in keyof Value as Key extends string ? LowerEvalKey<Key> : never]: LowerEvalYamlValue<Value[Key]>;
6526
+ } : Value;
6527
+ type EvalMessageContent = string | Readonly<Record<string, unknown>> | readonly (string | Readonly<Record<string, unknown>>)[];
6528
+ interface EvalMessage {
6529
+ readonly role: 'system' | 'user' | 'assistant' | 'tool';
6530
+ readonly content: EvalMessageContent;
6531
+ readonly [key: string]: unknown;
6532
+ }
6533
+ interface EvalAssertionConfig {
6534
+ readonly type: string;
6535
+ readonly [key: string]: unknown;
6536
+ }
6537
+ interface EvalPreprocessor {
6538
+ readonly type: string;
6539
+ readonly command: string | readonly string[];
6540
+ readonly [key: string]: unknown;
6541
+ }
6542
+ interface EvalWorkspaceHook {
6543
+ readonly command?: string | readonly string[];
6544
+ readonly script?: string | readonly string[];
6545
+ readonly timeoutMs?: number;
6546
+ readonly cwd?: string;
6547
+ readonly reset?: 'none' | 'fast' | 'strict';
6548
+ readonly [key: string]: unknown;
6549
+ }
6550
+ interface EvalWorkspaceHooks {
6551
+ readonly enabled?: boolean;
6552
+ readonly beforeAll?: EvalWorkspaceHook;
6553
+ readonly beforeEach?: EvalWorkspaceHook;
6554
+ readonly afterEach?: EvalWorkspaceHook;
6555
+ readonly afterAll?: EvalWorkspaceHook;
6556
+ }
6557
+ interface EvalWorkspaceRepo {
6558
+ readonly path?: string;
6559
+ readonly repo?: string;
6560
+ readonly commit?: string;
6561
+ readonly baseCommit?: string;
6562
+ readonly ancestor?: number;
6563
+ readonly sparse?: readonly string[];
6564
+ }
6565
+ interface EvalDockerWorkspace {
6566
+ readonly image: string;
6567
+ readonly timeout?: number;
6568
+ readonly memory?: string;
6569
+ readonly cpus?: number;
6570
+ }
6571
+ interface EvalWorkspace {
6572
+ readonly template?: string;
6573
+ readonly isolation?: 'shared' | 'per_test';
6574
+ readonly repos?: readonly EvalWorkspaceRepo[];
6575
+ readonly hooks?: EvalWorkspaceHooks;
6576
+ readonly mode?: 'pooled' | 'temp' | 'static';
6577
+ readonly path?: string;
6578
+ readonly docker?: EvalDockerWorkspace;
6579
+ }
6580
+ interface EvalTargetRef {
6581
+ readonly name: string;
6582
+ readonly useTarget?: string;
6583
+ readonly hooks?: EvalWorkspaceHooks;
6584
+ }
6585
+ interface EvalTrials {
6586
+ readonly count: number;
6587
+ readonly strategy?: 'pass_at_k' | 'mean' | 'confidence_interval';
6588
+ readonly costLimitUsd?: number;
6589
+ }
6590
+ interface EvalExecution {
6591
+ readonly target?: string;
6592
+ readonly targets?: readonly (string | EvalTargetRef)[];
6593
+ readonly workers?: number;
6594
+ readonly assertions?: readonly EvalAssertionConfig[];
6595
+ readonly skipDefaults?: boolean;
6596
+ readonly cache?: boolean;
6597
+ readonly trials?: EvalTrials;
6598
+ readonly budgetUsd?: number;
6599
+ readonly failOnError?: boolean;
6600
+ readonly threshold?: number;
6601
+ readonly [key: string]: unknown;
6602
+ }
6603
+ interface EvalTurn {
6604
+ readonly input: EvalMessageContent;
6605
+ readonly expectedOutput?: EvalMessageContent;
6606
+ readonly assertions?: readonly (string | EvalAssertionConfig)[];
6607
+ }
6608
+ interface EvalTest {
6609
+ readonly id: string;
6610
+ readonly vars?: Readonly<Record<string, unknown>>;
6611
+ readonly criteria?: string;
6612
+ readonly input?: string | readonly EvalMessage[];
6613
+ readonly inputFiles?: readonly string[];
6614
+ readonly expectedOutput?: string | Readonly<Record<string, unknown>> | readonly EvalMessage[];
6615
+ readonly assertions?: readonly EvalAssertionConfig[];
6616
+ readonly execution?: EvalExecution;
6617
+ readonly workspace?: EvalWorkspace;
6618
+ readonly metadata?: Readonly<Record<string, unknown>>;
6619
+ readonly conversationId?: string;
6620
+ readonly suite?: string;
6621
+ readonly dependsOn?: readonly string[];
6622
+ readonly onDependencyFailure?: 'skip' | 'fail' | 'run';
6623
+ readonly mode?: 'conversation';
6624
+ readonly turns?: readonly EvalTurn[];
6625
+ readonly aggregation?: 'mean' | 'min' | 'max';
6626
+ readonly onTurnFailure?: 'continue' | 'stop';
6627
+ readonly windowSize?: number;
6628
+ }
6629
+ interface EvalRequires {
6630
+ readonly agentv?: string;
6631
+ readonly [key: string]: unknown;
6632
+ }
6633
+ interface EvalDefinition {
6634
+ readonly $schema?: string;
6635
+ readonly name?: string;
6636
+ readonly description?: string;
6637
+ readonly category?: string;
6638
+ readonly version?: string;
6639
+ readonly author?: string;
6640
+ readonly tags?: readonly string[];
6641
+ readonly license?: string;
6642
+ readonly requires?: EvalRequires;
6643
+ readonly input?: string | readonly EvalMessage[];
6644
+ readonly inputFiles?: readonly string[];
6645
+ readonly tests: readonly EvalTest[] | string;
6646
+ readonly target?: string;
6647
+ readonly execution?: EvalExecution;
6648
+ readonly assertions?: readonly EvalAssertionConfig[];
6649
+ readonly preprocessors?: readonly EvalPreprocessor[];
6650
+ readonly workspace?: EvalWorkspace | string;
6651
+ }
6652
+ interface DefinedEvalSuite {
6653
+ readonly [EVAL_SUITE_SYMBOL]: true;
6654
+ readonly [TO_EVAL_YAML_OBJECT_SYMBOL]: () => Record<string, unknown>;
6655
+ }
6656
+ /**
6657
+ * Define a YAML-aligned eval suite in TypeScript.
6658
+ *
6659
+ * The returned object preserves the TypeScript authoring shape and carries a
6660
+ * non-enumerable lowering hook so AgentV can materialize the canonical
6661
+ * snake_case eval contract when the suite is loaded from a `.eval.ts` file.
6662
+ */
6663
+ declare function defineEval<T extends EvalDefinition>(definition: T): T & DefinedEvalSuite;
6664
+ /**
6665
+ * Alias for `defineEval()` when a suite reads more clearly as a plain object.
6666
+ */
6667
+ declare function evalSuite<T extends EvalDefinition>(definition: T): T & DefinedEvalSuite;
6668
+ /**
6669
+ * Lower a TypeScript-authored eval suite into the canonical snake_case object
6670
+ * contract used by YAML files and the runtime loader.
6671
+ *
6672
+ * Only known AgentV wire keys are converted. Unknown keys are preserved as-is
6673
+ * so opaque assertion, provider, and metadata payloads are not corrupted.
6674
+ */
6675
+ declare function toEvalYamlObject<T extends EvalDefinition | DefinedEvalSuite>(definition: T): LowerEvalYamlValue<T>;
6676
+ /**
6677
+ * Serialize an eval suite to canonical YAML.
6678
+ */
6679
+ declare function serializeEvalYaml<T extends EvalDefinition | DefinedEvalSuite>(definition: T): string;
6680
+
6486
6681
  /**
6487
6682
  * Client for invoking configured targets from code-grader scripts.
6488
6683
  *
@@ -6827,4 +7022,4 @@ declare function definePromptTemplate(handler: PromptTemplateHandler): void;
6827
7022
  */
6828
7023
  declare function defineAssertion(handler: AssertionHandler): void;
6829
7024
 
6830
- export { type AssertionContext, type AssertionHandler, type AssertionScore, type AssertionType, type CodeGraderHandler, type CodeGraderInput, CodeGraderInputSchema, type CodeGraderResult, CodeGraderResultSchema, type Content, type ContentFile, ContentFileSchema, type ContentImage, ContentImageSchema, ContentSchema, type ContentText, ContentTextSchema, type Message, MessageSchema, type PromptTemplateHandler, type PromptTemplateInput, PromptTemplateInputSchema, TRACE_EVENT_TYPES, TRACE_REDACTION_LEVELS, TRACE_SOURCE_KINDS, TRACE_TOOL_STATUSES, type TargetClient, type TargetInfo, TargetInvocationError, type TargetInvokeRequest, type TargetInvokeResponse, TargetNotAvailableError, type TokenUsage, TokenUsageSchema, type ToolCall, ToolCallSchema, type Trace, type TraceArtifact, TraceArtifactSchema, type TraceBranch, TraceBranchSchema, type TraceError, TraceErrorSchema, type TraceEvent, TraceEventSchema, type TraceMessage, TraceMessageSchema, type TraceModel, TraceModelSchema, type TraceRawEvidence, TraceRawEvidenceSchema, type TraceRedactionState, TraceRedactionStateSchema, TraceSchema, type TraceSession, TraceSessionSchema, type TraceSource, type TraceSourceRef, TraceSourceRefSchema, TraceSourceSchema, type TraceSummary, TraceSummarySchema, type TraceTool, TraceToolSchema, createTargetClient, defineAssertion, defineCodeGrader, definePromptTemplate };
7025
+ export { type AssertionContext, type AssertionHandler, type AssertionScore, type AssertionType, type CodeGraderHandler, type CodeGraderInput, CodeGraderInputSchema, type CodeGraderResult, CodeGraderResultSchema, type Content, type ContentFile, ContentFileSchema, type ContentImage, ContentImageSchema, ContentSchema, type ContentText, ContentTextSchema, type DefinedEvalSuite, type EvalAssertionConfig, type EvalDefinition, type EvalDockerWorkspace, type EvalExecution, type EvalMessage, type EvalMessageContent, type EvalPreprocessor, type EvalRequires, type EvalTargetRef, type EvalTest, type EvalTrials, type EvalTurn, type EvalWorkspace, type EvalWorkspaceHook, type EvalWorkspaceHooks, type EvalWorkspaceRepo, type LowerEvalYamlValue, type Message, MessageSchema, type PromptTemplateHandler, type PromptTemplateInput, PromptTemplateInputSchema, TRACE_EVENT_TYPES, TRACE_REDACTION_LEVELS, TRACE_SOURCE_KINDS, TRACE_TOOL_STATUSES, type TargetClient, type TargetInfo, TargetInvocationError, type TargetInvokeRequest, type TargetInvokeResponse, TargetNotAvailableError, type TokenUsage, TokenUsageSchema, type ToolCall, ToolCallSchema, type Trace, type TraceArtifact, TraceArtifactSchema, type TraceBranch, TraceBranchSchema, type TraceError, TraceErrorSchema, type TraceEvent, TraceEventSchema, type TraceMessage, TraceMessageSchema, type TraceModel, TraceModelSchema, type TraceRawEvidence, TraceRawEvidenceSchema, type TraceRedactionState, TraceRedactionStateSchema, TraceSchema, type TraceSession, TraceSessionSchema, type TraceSource, type TraceSourceRef, TraceSourceRefSchema, TraceSourceSchema, type TraceSummary, TraceSummarySchema, type TraceTool, TraceToolSchema, createTargetClient, defineAssertion, defineCodeGrader, defineEval, definePromptTemplate, evalSuite, serializeEvalYaml, toEvalYamlObject };
package/dist/index.js CHANGED
@@ -226,6 +226,92 @@ var CodeGraderResultSchema = z.object({
226
226
  });
227
227
  var PromptTemplateInputSchema = CodeGraderInputSchema;
228
228
 
229
+ // src/eval.ts
230
+ import { stringify as stringifyYaml } from "yaml";
231
+ var EVAL_SUITE_SYMBOL = Symbol.for("@agentv/sdk/eval-suite");
232
+ var TO_EVAL_YAML_OBJECT_SYMBOL = Symbol.for("@agentv/sdk/to-eval-yaml-object");
233
+ var KNOWN_SNAKE_CASE_KEYS = {
234
+ afterAll: "after_all",
235
+ afterEach: "after_each",
236
+ argsMatch: "args_match",
237
+ baseCommit: "base_commit",
238
+ beforeAll: "before_all",
239
+ beforeEach: "before_each",
240
+ budgetUsd: "budget_usd",
241
+ conversationId: "conversation_id",
242
+ costLimitUsd: "cost_limit_usd",
243
+ dependsOn: "depends_on",
244
+ expectedOutput: "expected_output",
245
+ explorationTolerance: "exploration_tolerance",
246
+ failOnError: "fail_on_error",
247
+ inputFiles: "input_files",
248
+ keepWorkspaces: "keep_workspaces",
249
+ maxCostUsd: "max_cost_usd",
250
+ maxDurationMs: "max_duration_ms",
251
+ maxInput: "max_input",
252
+ maxLlmCalls: "max_llm_calls",
253
+ maxOutput: "max_output",
254
+ maxTokens: "max_tokens",
255
+ maxToolCalls: "max_tool_calls",
256
+ minScore: "min_score",
257
+ onDependencyFailure: "on_dependency_failure",
258
+ onTurnFailure: "on_turn_failure",
259
+ outputPath: "output_path",
260
+ scoreRanges: "score_ranges",
261
+ skipDefaults: "skip_defaults",
262
+ targetExplorationRatio: "target_exploration_ratio",
263
+ timeoutMs: "timeout_ms",
264
+ useTarget: "use_target",
265
+ windowSize: "window_size"
266
+ };
267
+ function lowerEvalYamlValue(value) {
268
+ if (Array.isArray(value)) {
269
+ return value.map((item) => lowerEvalYamlValue(item));
270
+ }
271
+ if (value && typeof value === "object") {
272
+ const result = {};
273
+ for (const [key, nestedValue] of Object.entries(value)) {
274
+ const loweredKey = KNOWN_SNAKE_CASE_KEYS[key] ?? key;
275
+ result[loweredKey] = lowerEvalYamlValue(nestedValue);
276
+ }
277
+ return result;
278
+ }
279
+ return value;
280
+ }
281
+ function attachEvalSuiteBrand(definition) {
282
+ const branded = definition;
283
+ if (branded[EVAL_SUITE_SYMBOL] === true) {
284
+ return branded;
285
+ }
286
+ Object.defineProperties(branded, {
287
+ [EVAL_SUITE_SYMBOL]: {
288
+ value: true,
289
+ enumerable: false,
290
+ configurable: false,
291
+ writable: false
292
+ },
293
+ [TO_EVAL_YAML_OBJECT_SYMBOL]: {
294
+ value: () => toEvalYamlObject(definition),
295
+ enumerable: false,
296
+ configurable: false,
297
+ writable: false
298
+ }
299
+ });
300
+ return branded;
301
+ }
302
+ function defineEval(definition) {
303
+ return attachEvalSuiteBrand(definition);
304
+ }
305
+ function evalSuite(definition) {
306
+ return defineEval(definition);
307
+ }
308
+ function toEvalYamlObject(definition) {
309
+ return lowerEvalYamlValue(definition);
310
+ }
311
+ function serializeEvalYaml(definition) {
312
+ return stringifyYaml(toEvalYamlObject(definition), { lineWidth: 0 }).trimEnd();
313
+ }
314
+
229
315
  // src/target-client.ts
230
316
  var TargetNotAvailableError = class extends Error {
231
317
  constructor(message) {
@@ -556,7 +642,11 @@ export {
556
642
  createTargetClient,
557
643
  defineAssertion,
558
644
  defineCodeGrader,
645
+ defineEval,
559
646
  definePromptTemplate,
647
+ evalSuite,
648
+ serializeEvalYaml,
649
+ toEvalYamlObject,
560
650
  z2 as z
561
651
  };
562
652
  //# sourceMappingURL=index.js.map