evalgate-sdk 3.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. evalgate_sdk/__init__.py +707 -0
  2. evalgate_sdk/_version.py +3 -0
  3. evalgate_sdk/assertions.py +1362 -0
  4. evalgate_sdk/auto.py +247 -0
  5. evalgate_sdk/batch.py +174 -0
  6. evalgate_sdk/cache.py +111 -0
  7. evalgate_sdk/ci_context.py +123 -0
  8. evalgate_sdk/cli/__init__.py +111 -0
  9. evalgate_sdk/cli/api.py +261 -0
  10. evalgate_sdk/cli/cli_constants.py +20 -0
  11. evalgate_sdk/cli/commands.py +1041 -0
  12. evalgate_sdk/cli/config.py +228 -0
  13. evalgate_sdk/cli/env.py +43 -0
  14. evalgate_sdk/cli/formatters/types.py +132 -0
  15. evalgate_sdk/cli/golden_commands.py +322 -0
  16. evalgate_sdk/cli/manifest.py +301 -0
  17. evalgate_sdk/cli/new_commands.py +435 -0
  18. evalgate_sdk/cli/policy_packs.py +103 -0
  19. evalgate_sdk/cli/profiles.py +12 -0
  20. evalgate_sdk/cli/regression_gate.py +312 -0
  21. evalgate_sdk/cli/render/__init__.py +1 -0
  22. evalgate_sdk/cli/render/snippet.py +18 -0
  23. evalgate_sdk/cli/render/sort.py +29 -0
  24. evalgate_sdk/cli/report/__init__.py +1 -0
  25. evalgate_sdk/cli/report/build_check_report.py +209 -0
  26. evalgate_sdk/cli/traces.py +186 -0
  27. evalgate_sdk/cli/workspace.py +63 -0
  28. evalgate_sdk/client.py +609 -0
  29. evalgate_sdk/cluster.py +359 -0
  30. evalgate_sdk/collector.py +161 -0
  31. evalgate_sdk/constants.py +6 -0
  32. evalgate_sdk/context.py +151 -0
  33. evalgate_sdk/errors.py +236 -0
  34. evalgate_sdk/export.py +238 -0
  35. evalgate_sdk/formatters/__init__.py +11 -0
  36. evalgate_sdk/formatters/github.py +51 -0
  37. evalgate_sdk/formatters/human.py +68 -0
  38. evalgate_sdk/formatters/json_fmt.py +11 -0
  39. evalgate_sdk/formatters/pr_comment.py +80 -0
  40. evalgate_sdk/golden.py +426 -0
  41. evalgate_sdk/integrations/__init__.py +1 -0
  42. evalgate_sdk/integrations/anthropic.py +99 -0
  43. evalgate_sdk/integrations/autogen.py +62 -0
  44. evalgate_sdk/integrations/crewai.py +61 -0
  45. evalgate_sdk/integrations/langchain.py +100 -0
  46. evalgate_sdk/integrations/openai.py +155 -0
  47. evalgate_sdk/integrations/openai_eval.py +221 -0
  48. evalgate_sdk/local.py +144 -0
  49. evalgate_sdk/logger.py +123 -0
  50. evalgate_sdk/matchers.py +62 -0
  51. evalgate_sdk/otel.py +256 -0
  52. evalgate_sdk/pagination.py +145 -0
  53. evalgate_sdk/py.typed +0 -0
  54. evalgate_sdk/pytest_plugin.py +96 -0
  55. evalgate_sdk/reason_codes.py +103 -0
  56. evalgate_sdk/regression.py +196 -0
  57. evalgate_sdk/replay_decision.py +115 -0
  58. evalgate_sdk/runtime/__init__.py +50 -0
  59. evalgate_sdk/runtime/adapters/__init__.py +1 -0
  60. evalgate_sdk/runtime/adapters/config_to_dsl.py +270 -0
  61. evalgate_sdk/runtime/adapters/testsuite_to_dsl.py +213 -0
  62. evalgate_sdk/runtime/context.py +68 -0
  63. evalgate_sdk/runtime/eval.py +318 -0
  64. evalgate_sdk/runtime/execution_mode.py +170 -0
  65. evalgate_sdk/runtime/executor.py +92 -0
  66. evalgate_sdk/runtime/registry.py +125 -0
  67. evalgate_sdk/runtime/run_report.py +249 -0
  68. evalgate_sdk/runtime/types.py +143 -0
  69. evalgate_sdk/snapshot.py +219 -0
  70. evalgate_sdk/streaming.py +124 -0
  71. evalgate_sdk/synthesize.py +226 -0
  72. evalgate_sdk/testing.py +128 -0
  73. evalgate_sdk/types.py +666 -0
  74. evalgate_sdk/utils/__init__.py +1 -0
  75. evalgate_sdk/utils/input_hash.py +42 -0
  76. evalgate_sdk/workflows.py +264 -0
  77. evalgate_sdk-3.3.1.dist-info/METADATA +608 -0
  78. evalgate_sdk-3.3.1.dist-info/RECORD +80 -0
  79. evalgate_sdk-3.3.1.dist-info/WHEEL +4 -0
  80. evalgate_sdk-3.3.1.dist-info/entry_points.txt +2 -0
evalgate_sdk/types.py ADDED
@@ -0,0 +1,666 @@
1
+ """Data models for the EvalGate SDK, matching the TypeScript SDK's types.ts."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from datetime import datetime
6
+ from enum import Enum
7
+ from typing import Any, Literal, TypeVar
8
+
9
+ from pydantic import BaseModel, ConfigDict, Field
10
+
11
# Generic placeholder for metadata payloads; bounded to ``dict[str, Any]``.
TMetadata = TypeVar(
    "TMetadata",
    bound=dict[str, Any],
)
12
+
13
+
14
def to_camel(s: str) -> str:
    """Return the camelCase form of a snake_case identifier.

    The text before the first underscore is kept unchanged. Every later
    underscore-separated segment goes through ``str.capitalize`` (first
    character upper-cased, remaining characters lower-cased) and is appended.
    """
    head, *tail = s.split("_")
    return head + "".join(map(str.capitalize, tail))
18
+
19
+
20
class CamelModel(BaseModel):
    """Base model that serializes to/from camelCase for API compatibility.

    Aliases are generated from the snake_case field names with ``to_camel``.
    ``populate_by_name=True`` lets instances be built from either the
    snake_case field name or the generated camelCase alias.
    """

    model_config = ConfigDict(
        alias_generator=to_camel,
        populate_by_name=True,
        # Several subclasses declare a ``model_settings`` field. With
        # pydantic's default protected namespace (``model_``) that emits a
        # UserWarning at class-creation time on pydantic 2.0-2.9, so the
        # namespace check is disabled for this model family.
        protected_namespaces=(),
    )
27
+
28
+
29
+ # ── Client config ────────────────────────────────────────────────────
30
+
31
+
32
class RetryConfig(BaseModel):
    """Retry settings: attempt count, backoff strategy and retryable error codes.

    Defaults to 3 attempts with ``"exponential"`` backoff. The error-code list
    is produced by a factory, so each instance receives its own list object.
    """

    max_attempts: int = 3
    backoff: Literal["exponential", "linear", "fixed"] = "exponential"
    retryable_errors: list[str] = Field(
        default_factory=lambda: ["RATE_LIMIT_EXCEEDED", "TIMEOUT", "NETWORK_ERROR"],
    )
36
+
37
+
38
class ClientConfig(BaseModel):
    """Client configuration; every field is optional and has a default."""

    # Connection / identity
    api_key: str | None = None
    base_url: str | None = None
    organization_id: int | None = None
    timeout: int = 30_000  # NOTE(review): presumably milliseconds — confirm against the client

    # Diagnostics
    debug: bool = False
    log_level: Literal["trace", "debug", "info", "warn", "error"] = "info"

    # Retry policy (a fresh RetryConfig is created per instance)
    retry: RetryConfig = Field(default_factory=RetryConfig)

    # Caching
    enable_caching: bool = True
    cache_size: int = 1000

    # Batching
    enable_batching: bool = True
    batch_size: int = 10
    batch_delay: int = 50  # NOTE(review): unit not shown here — presumably milliseconds

    keep_alive: bool = True
52
+
53
+
54
+ # ── Evaluation templates ─────────────────────────────────────────────
55
+
56
+
57
class EvaluationTemplates(str, Enum):
    """String-valued enum of evaluation template identifiers (kebab-case values)."""

    UNIT_TESTING = "unit-testing"
    OUTPUT_QUALITY = "output-quality"
    PROMPT_OPTIMIZATION = "prompt-optimization"
    CHAIN_OF_THOUGHT = "chain-of-thought"
    LONG_CONTEXT_TESTING = "long-context-testing"
    MODEL_STEERING = "model-steering"
    REGRESSION_TESTING = "regression-testing"
    CONFIDENCE_CALIBRATION = "confidence-calibration"
    SAFETY_COMPLIANCE = "safety-compliance"
    RAG_EVALUATION = "rag-evaluation"
    CODE_GENERATION = "code-generation"
    SUMMARIZATION = "summarization"
70
+
71
+
72
+ # ── Feature usage ────────────────────────────────────────────────────
73
+
74
+
75
class FeatureUsage(CamelModel):
    """Usage figures for one feature.

    ``feature_id``, ``unlimited`` and ``interval`` are required; the numeric
    counters are optional and default to ``None``.
    """

    feature_id: str
    unlimited: bool
    interval: str
    # Optional counters
    remaining: int | None = None
    limit: int | None = None
    used: int | None = None
82
+
83
+
84
class OrganizationLimits(CamelModel):
    """An organization's plan name together with its per-feature usage entries."""

    organization_id: int
    plan: str
    features: list[FeatureUsage]
88
+
89
+
90
class Organization(CamelModel):
    """Organization record: required ``id`` and ``name``, optional ``slug`` and ``plan``."""

    id: int
    name: str
    slug: str | None = None
    plan: str | None = None
95
+
96
+
97
+ # ── Traces & Spans ───────────────────────────────────────────────────
98
+
99
+
100
class Trace(CamelModel):
    """Trace record.

    Only the integer ``id`` and the string ``trace_id`` are required; all
    remaining fields default to ``None``.
    """

    # Identity
    id: int
    trace_id: str
    name: str | None = None
    organization_id: int | None = None
    status: str | None = None

    # Payload
    input: str | None = None
    output: str | None = None
    metadata: dict[str, Any] | None = None

    # Timing
    start_time: datetime | None = None
    end_time: datetime | None = None
    duration_ms: int | None = None

    # Bookkeeping timestamps
    created_at: datetime | None = None
    updated_at: datetime | None = None
114
+
115
+
116
class CreateTraceParams(CamelModel):
    """Parameters for creating a trace; only ``name`` is required."""

    name: str
    trace_id: str | None = None
    input: str | None = None
    output: str | None = None
    metadata: dict[str, Any] | None = None
    organization_id: int | None = None
123
+
124
+
125
class UpdateTraceParams(CamelModel):
    """Parameters for updating a trace; every field is optional (default ``None``)."""

    name: str | None = None
    output: str | None = None
    status: str | None = None
    metadata: dict[str, Any] | None = None
130
+
131
+
132
class ListTracesParams(CamelModel):
    """Paging and filter parameters for listing traces (``limit`` 20, ``offset`` 0)."""

    # Paging
    limit: int = 20
    offset: int = 0
    # Optional filters
    organization_id: int | None = None
    status: str | None = None
137
+
138
+
139
class Span(CamelModel):
    """Span record.

    ``id``, ``span_id`` and ``trace_id`` are required. ``trace_id`` is an
    ``int`` here, unlike the string ``Trace.trace_id``.
    """

    # Identity
    id: int
    span_id: str
    trace_id: int
    name: str | None = None
    type: str | None = None

    # Payload
    input: str | None = None
    output: str | None = None
    metadata: dict[str, Any] | None = None

    # Timing
    start_time: datetime | None = None
    end_time: datetime | None = None
    duration_ms: int | None = None
151
+
152
+
153
class CreateSpanParams(CamelModel):
    """Parameters for creating a span; only ``name`` is required."""

    name: str
    span_id: str | None = None
    type: str | None = None
    input: str | None = None
    output: str | None = None
    metadata: dict[str, Any] | None = None
160
+
161
+
162
+ # ── Evaluations ──────────────────────────────────────────────────────
163
+
164
+
165
class Evaluation(CamelModel):
    """Evaluation record: required ``id`` and ``name``; everything else optional."""

    # Identity
    id: int
    name: str
    description: str | None = None
    type: str | None = None
    status: str | None = None
    organization_id: int | None = None
    created_by: str | None = None

    # Free-form settings blobs
    model_settings: dict[str, Any] | None = None
    execution_settings: dict[str, Any] | None = None
    custom_metrics: list[dict[str, Any]] | None = None

    # Bookkeeping timestamps
    created_at: datetime | None = None
    updated_at: datetime | None = None
178
+
179
+
180
class CreateEvaluationParams(CamelModel):
    """Parameters for creating an evaluation; only ``name`` is required."""

    name: str
    description: str | None = None
    type: str | None = None
    organization_id: int | None = None

    # Free-form settings blobs
    model_settings: dict[str, Any] | None = None
    execution_settings: dict[str, Any] | None = None

    # Optional lists of untyped dictionaries
    assertions: list[dict[str, Any]] | None = None
    test_cases: list[dict[str, Any]] | None = None
189
+
190
+
191
class UpdateEvaluationParams(CamelModel):
    """Parameters for updating an evaluation; every field is optional."""

    name: str | None = None
    description: str | None = None
    status: str | None = None
    model_settings: dict[str, Any] | None = None
    execution_settings: dict[str, Any] | None = None
197
+
198
+
199
class ListEvaluationsParams(CamelModel):
    """Paging parameters (``limit`` 20, ``offset`` 0) plus an optional ``status`` filter."""

    limit: int = 20
    offset: int = 0
    status: str | None = None
203
+
204
+
205
+ # ── Test Cases ───────────────────────────────────────────────────────
206
+
207
+
208
class TestCase(CamelModel):
    """Test-case record: required ``id`` and ``evaluation_id``; the rest optional."""

    # The class name starts with "Test", so pytest would try to collect it
    # (and emit a PytestCollectionWarning) whenever it is imported into a
    # test module. Opt out of collection explicitly.
    __test__ = False

    id: int
    evaluation_id: int
    name: str | None = None
    input: str | None = None
    expected_output: str | None = None
    metadata: dict[str, Any] | None = None
215
+
216
+
217
class CreateTestCaseParams(CamelModel):
    """Parameters for creating a test case; only ``input`` is required."""

    name: str | None = None
    input: str
    expected_output: str | None = None
    metadata: dict[str, Any] | None = None
222
+
223
+
224
+ # ── Evaluation Runs ──────────────────────────────────────────────────
225
+
226
+
227
class EvaluationRun(CamelModel):
    """Evaluation-run record: required ``id`` and ``evaluation_id``; the rest optional."""

    # Identity
    id: int
    evaluation_id: int
    status: str | None = None

    # Case counters and score
    total_cases: int | None = None
    passed_cases: int | None = None
    failed_cases: int | None = None
    score: float | None = None

    trace_log: dict[str, Any] | None = None

    # Timestamps
    started_at: datetime | None = None
    completed_at: datetime | None = None
    created_at: datetime | None = None
239
+
240
+
241
class CreateRunParams(CamelModel):
    """Parameters for creating a run: two optional free-form settings dictionaries."""

    model_settings: dict[str, Any] | None = None
    execution_settings: dict[str, Any] | None = None
244
+
245
+
246
+ # ── LLM Judge ────────────────────────────────────────────────────────
247
+
248
+
249
class LLMJudgeConfig(CamelModel):
    """LLM-judge configuration record: required ``id`` and ``name``; the rest optional."""

    id: int
    name: str
    model: str | None = None
    criteria: dict[str, Any] | None = None
    settings: dict[str, Any] | None = None
255
+
256
+
257
class CreateLLMJudgeConfigParams(CamelModel):
    """Parameters for creating an LLM-judge configuration.

    ``name`` is required; ``model`` defaults to ``"gpt-4"``.
    """

    name: str
    model: str = "gpt-4"
    criteria: dict[str, Any] | None = None
    settings: dict[str, Any] | None = None
    organization_id: int | None = None
263
+
264
+
265
class LLMJudgeResult(CamelModel):
    """LLM-judge result record: required ``id``; score, reasoning and the rest optional."""

    id: int
    config_id: int | None = None
    score: float | None = None
    reasoning: str | None = None
    metadata: dict[str, Any] | None = None
    created_at: datetime | None = None
272
+
273
+
274
class RunLLMJudgeParams(CamelModel):
    """Parameters for running an LLM judge.

    ``config_id``, ``input`` and ``output`` are required; ``expected_output``
    and ``context`` are optional.
    """

    config_id: int
    input: str
    output: str
    expected_output: str | None = None
    context: str | None = None
280
+
281
+
282
class ListLLMJudgeConfigsParams(CamelModel):
    """Paging parameters for listing LLM-judge configurations (``limit`` 20, ``offset`` 0)."""

    limit: int = 20
    offset: int = 0
285
+
286
+
287
class ListLLMJudgeResultsParams(CamelModel):
    """Optional ``config_id`` filter plus paging (``limit`` 20, ``offset`` 0)."""

    config_id: int | None = None
    limit: int = 20
    offset: int = 0
291
+
292
+
293
class LLMJudgeAlignment(CamelModel):
    """Alignment summary: an optional score and an optional free-form details dict."""

    alignment_score: float | None = None
    details: dict[str, Any] | None = None
296
+
297
+
298
class GetLLMJudgeAlignmentParams(CamelModel):
    """Parameters for an alignment lookup; the single ``config_id`` field is required."""

    config_id: int
300
+
301
+
302
+ # ── Annotations ──────────────────────────────────────────────────────
303
+
304
+
305
class Annotation(CamelModel):
    """Annotation record: required ``id``; all other fields optional."""

    id: int

    # References
    evaluation_run_id: int | None = None
    test_case_id: int | None = None
    annotator_id: str | None = None

    # Annotation content
    rating: int | None = None
    feedback: str | None = None
    labels: dict[str, Any] | None = None
    metadata: dict[str, Any] | None = None

    created_at: datetime | None = None
315
+
316
+
317
class CreateAnnotationParams(CamelModel):
    """Parameters for creating an annotation.

    ``evaluation_run_id`` and ``test_case_id`` are required; the content
    fields are optional.
    """

    evaluation_run_id: int
    test_case_id: int
    rating: int | None = None
    feedback: str | None = None
    labels: dict[str, Any] | None = None
    metadata: dict[str, Any] | None = None
324
+
325
+
326
class ListAnnotationsParams(CamelModel):
    """Optional run / test-case filters plus paging (``limit`` 20, ``offset`` 0)."""

    # Optional filters
    evaluation_run_id: int | None = None
    test_case_id: int | None = None
    # Paging
    limit: int = 20
    offset: int = 0
331
+
332
+
333
class AnnotationTask(CamelModel):
    """Annotation-task record: required ``id``; the rest optional."""

    id: int
    name: str | None = None
    status: str | None = None
    settings: dict[str, Any] | None = None
    created_at: datetime | None = None
339
+
340
+
341
class CreateAnnotationTaskParams(CamelModel):
    """Parameters for creating an annotation task; ``name`` and ``evaluation_id`` are required."""

    name: str
    evaluation_id: int
    settings: dict[str, Any] | None = None
    organization_id: int | None = None
346
+
347
+
348
class ListAnnotationTasksParams(CamelModel):
    """Paging parameters for listing annotation tasks (``limit`` 20, ``offset`` 0)."""

    limit: int = 20
    offset: int = 0
351
+
352
+
353
class AnnotationItem(CamelModel):
    """Annotation-item record: required ``id`` and ``task_id``; content and status optional."""

    id: int
    task_id: int
    content: dict[str, Any] | None = None
    status: str | None = None
358
+
359
+
360
class CreateAnnotationItemParams(CamelModel):
    """Parameters for creating an annotation item; the ``content`` dictionary is required."""

    content: dict[str, Any]
362
+
363
+
364
class ListAnnotationItemsParams(CamelModel):
    """Optional ``status`` filter plus paging (``limit`` 20, ``offset`` 0)."""

    status: str | None = None
    limit: int = 20
    offset: int = 0
368
+
369
+
370
+ # ── API Keys ─────────────────────────────────────────────────────────
371
+
372
+
373
class APIKey(CamelModel):
    """API-key record without the secret: required ``id`` and ``name``; the rest optional."""

    id: int
    name: str
    key_prefix: str | None = None
    scopes: list[str] | None = None

    # Timestamps
    last_used_at: datetime | None = None
    expires_at: datetime | None = None
    created_at: datetime | None = None
381
+
382
+
383
class APIKeyWithSecret(APIKey):
    """``APIKey`` extended with one additional required string field, ``key``."""

    key: str
385
+
386
+
387
class CreateAPIKeyParams(CamelModel):
    """Parameters for creating an API key; only ``name`` is required.

    ``expires_at`` is typed as a string here, whereas ``APIKey.expires_at`` is
    a ``datetime``.
    """

    name: str
    scopes: list[str] | None = None
    expires_at: str | None = None
    organization_id: int | None = None
392
+
393
+
394
class UpdateAPIKeyParams(CamelModel):
    """Parameters for updating an API key; ``name`` and ``scopes`` are both optional."""

    name: str | None = None
    scopes: list[str] | None = None
397
+
398
+
399
class ListAPIKeysParams(CamelModel):
    """Parameters for listing API keys: a single optional ``organization_id`` filter."""

    organization_id: int | None = None
401
+
402
+
403
class APIKeyUsage(CamelModel):
    """Request counters for an API key (both default to 0) and an optional last-use time."""

    total_requests: int = 0
    requests_today: int = 0
    last_used_at: datetime | None = None
407
+
408
+
409
+ # ── Webhooks ─────────────────────────────────────────────────────────
410
+
411
+
412
class Webhook(CamelModel):
    """Webhook record: required ``id`` and ``url``; ``active`` defaults to ``True``."""

    id: int
    url: str
    events: list[str] | None = None
    active: bool = True
    created_at: datetime | None = None
418
+
419
+
420
class CreateWebhookParams(CamelModel):
    """Parameters for creating a webhook; ``url`` and ``events`` are required."""

    url: str
    events: list[str]
    organization_id: int | None = None
424
+
425
+
426
class UpdateWebhookParams(CamelModel):
    """Parameters for updating a webhook; every field is optional (default ``None``)."""

    url: str | None = None
    events: list[str] | None = None
    active: bool | None = None
430
+
431
+
432
class ListWebhooksParams(CamelModel):
    """Parameters for listing webhooks: a single optional ``organization_id`` filter."""

    organization_id: int | None = None
434
+
435
+
436
class WebhookDelivery(CamelModel):
    """Webhook-delivery record: required ``id`` and ``webhook_id``; the rest optional."""

    id: int
    webhook_id: int
    event: str | None = None
    status_code: int | None = None
    response_body: str | None = None
    created_at: datetime | None = None
443
+
444
+
445
class ListWebhookDeliveriesParams(CamelModel):
    """Paging parameters for listing webhook deliveries (``limit`` 20, ``offset`` 0)."""

    limit: int = 20
    offset: int = 0
448
+
449
+
450
+ # ── Usage ────────────────────────────────────────────────────────────
451
+
452
+
453
class UsageStats(CamelModel):
    """Usage totals (each defaults to 0) with optional period start/end datetimes."""

    # Totals
    total_requests: int = 0
    total_evaluations: int = 0
    total_traces: int = 0
    # Period bounds
    period_start: datetime | None = None
    period_end: datetime | None = None
459
+
460
+
461
class GetUsageParams(CamelModel):
    """Parameters for a usage query.

    ``organization_id`` is required; ``start_date`` and ``end_date`` are
    optional strings.
    """

    organization_id: int
    start_date: str | None = None
    end_date: str | None = None
465
+
466
+
467
class UsageSummary(CamelModel):
    """Four integer usage counters, each defaulting to 0."""

    evaluations: int = 0
    traces: int = 0
    test_cases: int = 0
    api_calls: int = 0
472
+
473
+
474
+ # ── Quality Score ────────────────────────────────────────────────────
475
+
476
+
477
class QualityBreakdown(CamelModel):
    """Optional float components of a quality score: pass rate, safety and judge."""

    pass_rate: float | None = None
    safety: float | None = None
    judge: float | None = None
481
+
482
+
483
class QualityScore(CamelModel):
    """Quality-score record; every field is optional and defaults to ``None``."""

    # Headline figures
    score: float | None = None
    total: int | None = None
    evidence_level: str | None = None

    # Baseline comparison
    baseline_score: float | None = None
    regression_delta: float | None = None
    baseline_missing: bool | None = None

    # Detail
    breakdown: QualityBreakdown | None = None
    flags: list[str] | None = None

    # References
    evaluation_run_id: int | None = None
    evaluation_id: int | None = None

    # Latency and cost
    avg_latency_ms: float | None = None
    cost_usd: float | None = None
    baseline_cost_usd: float | None = None
    baseline_run_id: int | None = None
498
+
499
+
500
+ # ── Test Suite ───────────────────────────────────────────────────────
501
+
502
+
503
class TestSuiteCase(BaseModel):
    """One case of a test suite: required ``name`` and ``input``; the rest optional.

    Derives from plain ``BaseModel``, so no camelCase aliases are generated.
    """

    # The class name starts with "Test", so pytest would try to collect it
    # (and emit a PytestCollectionWarning) whenever it is imported into a
    # test module. Opt out of collection explicitly.
    __test__ = False

    name: str
    input: str
    expected_output: str | None = None
    assertions: list[dict[str, Any]] | None = None
    metadata: dict[str, Any] | None = None
    tags: list[str] | None = None
510
+
511
+
512
class TestSuiteConfig(BaseModel):
    """Test-suite configuration; every field is optional and has a default.

    Derives from plain ``BaseModel``, so no camelCase aliases are generated.
    """

    # The class name starts with "Test", so pytest would try to collect it
    # (and emit a PytestCollectionWarning) whenever it is imported into a
    # test module. Opt out of collection explicitly.
    __test__ = False

    # Model selection / generation settings
    model: str | None = None
    provider: str | None = None
    temperature: float | None = None
    max_tokens: int | None = None
    system_prompt: str | None = None

    # Untyped evaluator object; its expected shape is not visible here.
    evaluator: Any | None = None

    # Each instance gets its own empty list by default.
    test_cases: list[TestSuiteCase] = Field(default_factory=list)

    # Execution behaviour
    timeout: int = 30_000  # NOTE(review): presumably milliseconds — confirm
    retries: int = 0
    retry_delay_ms: int = 1000
    retry_jitter: bool = False
    seed: int | None = None
    strict: bool = False
    stop_on_failure: bool = False
527
+
528
+
529
class TestSuiteCaseResult(BaseModel):
    """Result of one suite case: required ``name``, ``passed`` and ``input``.

    Arbitrary (non-pydantic) types are allowed in the model's fields.
    """

    # The class name starts with "Test", so pytest would try to collect it
    # (and emit a PytestCollectionWarning) whenever it is imported into a
    # test module. Opt out of collection explicitly.
    __test__ = False

    # Same setting as before, expressed with ConfigDict like CamelModel does.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    name: str
    passed: bool
    duration_ms: int = 0
    input: str
    output: str | None = None
    expected_output: str | None = None
    # Each instance gets its own empty list by default.
    assertions: list[Any] = Field(default_factory=list)
    error: str | None = None
540
+
541
+
542
class TestSuiteResult(BaseModel):
    """Result of a whole suite: required ``suite_name`` and ``passed``.

    The counters default to 0 and ``results`` defaults to a new empty list.
    """

    # The class name starts with "Test", so pytest would try to collect it
    # (and emit a PytestCollectionWarning) whenever it is imported into a
    # test module. Opt out of collection explicitly.
    __test__ = False

    suite_name: str
    passed: bool

    # Counters
    total: int = 0
    passed_count: int = 0
    failed_count: int = 0
    duration_ms: int = 0

    # Per-case results
    results: list[TestSuiteCaseResult] = Field(default_factory=list)
550
+
551
+
552
+ # ── Workflow types ───────────────────────────────────────────────────
553
+
554
+
555
class WorkflowNode(CamelModel):
    """Workflow node: required string ``id`` and ``type``; ``name`` and ``config`` optional."""

    id: str
    type: str
    name: str | None = None
    config: dict[str, Any] | None = None
560
+
561
+
562
class WorkflowEdge(CamelModel):
    """Edge between two workflow nodes.

    ``source`` and ``target`` carry the explicit aliases ``"from"`` and
    ``"to"`` instead of generated camelCase aliases.
    """

    source: str = Field(alias="from")  # "from" is a Python keyword, hence the alias
    target: str = Field(alias="to")
    condition: str | None = None
    label: str | None = None
567
+
568
+
569
class WorkflowDefinition(CamelModel):
    """Workflow graph: required node and edge lists, optional entrypoint and metadata."""

    nodes: list[WorkflowNode]
    edges: list[WorkflowEdge]
    entrypoint: str | None = None
    metadata: dict[str, Any] | None = None
574
+
575
+
576
class WorkflowStatus(str, Enum):
    """String-valued enum of workflow states."""

    RUNNING = "running"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"
581
+
582
+
583
class HandoffType(str, Enum):
    """String-valued enum of agent handoff kinds."""

    DELEGATION = "delegation"
    ESCALATION = "escalation"
    COLLABORATION = "collaboration"
    FALLBACK = "fallback"
588
+
589
+
590
class AgentHandoff(CamelModel):
    """Handoff to another agent.

    Only ``to_agent`` is required; ``handoff_type`` defaults to
    ``HandoffType.DELEGATION``.
    """

    from_agent: str | None = None
    to_agent: str
    context: dict[str, Any] | None = None
    handoff_type: HandoffType = HandoffType.DELEGATION
    timestamp: datetime | None = None
596
+
597
+
598
class DecisionType(str, Enum):
    """String-valued enum of decision kinds."""

    ROUTING = "routing"
    SELECTION = "selection"
    FILTERING = "filtering"
    PRIORITIZATION = "prioritization"
603
+
604
+
605
class DecisionAlternative(CamelModel):
    """One alternative of a decision: required ``name``; score and reasoning optional."""

    name: str
    score: float | None = None
    reasoning: str | None = None
609
+
610
+
611
class RecordDecisionParams(CamelModel):
    """Parameters for recording a decision.

    ``agent_name`` and ``chosen`` are required; ``decision_type`` defaults to
    ``DecisionType.ROUTING`` and ``alternatives`` to a new empty list.
    """

    agent_name: str
    decision_type: DecisionType = DecisionType.ROUTING
    chosen: str
    alternatives: list[DecisionAlternative] = Field(default_factory=list)

    # Optional detail
    reasoning: str | None = None
    confidence: float | None = None
    input_context: dict[str, Any] | None = None
619
+
620
+
621
class CostCategory(str, Enum):
    """String-valued enum of cost categories (snake_case values)."""

    LLM_INPUT = "llm_input"
    LLM_OUTPUT = "llm_output"
    EMBEDDING = "embedding"
    TOOL_CALL = "tool_call"
    OTHER = "other"
627
+
628
+
629
class RecordCostParams(CamelModel):
    """Parameters for recording a cost.

    ``agent_name``, ``category`` and ``amount`` are required; ``currency``
    defaults to ``"USD"``.
    """

    agent_name: str
    category: CostCategory
    amount: float
    currency: str = "USD"

    # Optional detail
    model: str | None = None
    tokens: int | None = None
    metadata: dict[str, Any] | None = None
637
+
638
+
639
class CostRecord(CamelModel):
    """Cost record: the same fields as ``RecordCostParams`` plus an optional ``timestamp``."""

    agent_name: str
    category: CostCategory
    amount: float
    currency: str = "USD"

    # Optional detail
    model: str | None = None
    tokens: int | None = None
    metadata: dict[str, Any] | None = None
    timestamp: datetime | None = None
648
+
649
+
650
class WorkflowContext(CamelModel):
    """Workflow context.

    Only ``name`` is required; ``status`` defaults to
    ``WorkflowStatus.RUNNING``.
    """

    workflow_id: str | None = None
    trace_id: int | None = None
    name: str
    status: WorkflowStatus = WorkflowStatus.RUNNING
    definition: WorkflowDefinition | None = None
    metadata: dict[str, Any] | None = None
    started_at: datetime | None = None
658
+
659
+
660
class AgentSpanContext(CamelModel):
    """Agent span context: only ``agent_name`` is required; ids and times optional."""

    span_id: str | None = None
    agent_name: str
    trace_id: int | None = None
    parent_span_id: str | None = None

    # Timing
    started_at: datetime | None = None
    ended_at: datetime | None = None
@@ -0,0 +1 @@
1
+ """EvalGate SDK utilities."""
@@ -0,0 +1,42 @@
1
+ """Input normalization and hashing for deterministic matching.
2
+
3
+ Must match platform's input-hash logic for reportToEvalGate.
4
+ Port of ``utils/input-hash.ts``.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import hashlib
10
+ import json
11
+ import re
12
+ from typing import Any
13
+
14
+
15
def _sort_keys(obj: dict[str, Any]) -> dict[str, Any]:
    """Return a copy of *obj* with its keys in sorted order, recursively.

    Only nested dictionaries are recursed into. Other values, including
    lists and any dictionaries inside them, are carried over unchanged.
    """
    return {
        key: _sort_keys(obj[key]) if isinstance(obj[key], dict) else obj[key]
        for key in sorted(obj)
    }


def _escape_lone_surrogates(text: str) -> str:
    """Replace unpaired surrogate code points with ``\\uXXXX`` escapes."""
    return re.sub(
        "[\ud800-\udfff]",
        lambda match: f"\\u{ord(match.group()):04x}",
        text,
    )


def normalize_input(input_str: str) -> str:
    """Normalize input for stable matching (whitespace, JSON key order).

    The input is stripped. If it parses as JSON it is re-serialized compactly
    (``","`` / ``":"`` separators), with dictionary keys sorted when the
    top-level value is an object. Otherwise every run of whitespace is
    collapsed to a single space.

    Non-ASCII characters are emitted verbatim rather than as ``\\uXXXX``
    escapes. The module is documented as a port of ``input-hash.ts`` whose
    hashes must match the platform's, and ``JSON.stringify`` does not escape
    non-ASCII text; Python's default ``ensure_ascii=True`` would therefore
    hash such input differently.

    Args:
        input_str: Raw input text.

    Returns:
        The normalized string.
    """
    s = input_str.strip()
    try:
        obj = json.loads(s)
    except (json.JSONDecodeError, TypeError):
        # Not JSON: only whitespace is normalized.
        return re.sub(r"\s+", " ", s)

    if isinstance(obj, dict):
        obj = _sort_keys(obj)
    compact = json.dumps(obj, separators=(",", ":"), ensure_ascii=False)
    # With ensure_ascii=False an unpaired surrogate would be written raw and
    # later fail to encode as UTF-8, so those stay escaped.
    return _escape_lone_surrogates(compact)


def sha256_input(s: str) -> str:
    """Return the hex SHA-256 digest of ``normalize_input(s)`` encoded as UTF-8."""
    return hashlib.sha256(normalize_input(s).encode("utf-8")).hexdigest()