judgeval 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. judgeval/__init__.py +139 -12
  2. judgeval/api/__init__.py +501 -0
  3. judgeval/api/api_types.py +344 -0
  4. judgeval/cli.py +2 -4
  5. judgeval/constants.py +10 -26
  6. judgeval/data/evaluation_run.py +49 -26
  7. judgeval/data/example.py +2 -2
  8. judgeval/data/judgment_types.py +266 -82
  9. judgeval/data/result.py +4 -5
  10. judgeval/data/scorer_data.py +4 -2
  11. judgeval/data/tool.py +2 -2
  12. judgeval/data/trace.py +7 -50
  13. judgeval/data/trace_run.py +7 -4
  14. judgeval/{dataset.py → dataset/__init__.py} +43 -28
  15. judgeval/env.py +67 -0
  16. judgeval/{run_evaluation.py → evaluation/__init__.py} +29 -95
  17. judgeval/exceptions.py +27 -0
  18. judgeval/integrations/langgraph/__init__.py +788 -0
  19. judgeval/judges/__init__.py +2 -2
  20. judgeval/judges/litellm_judge.py +75 -15
  21. judgeval/judges/together_judge.py +86 -18
  22. judgeval/judges/utils.py +7 -21
  23. judgeval/{common/logger.py → logger.py} +8 -6
  24. judgeval/scorers/__init__.py +0 -4
  25. judgeval/scorers/agent_scorer.py +3 -7
  26. judgeval/scorers/api_scorer.py +8 -13
  27. judgeval/scorers/base_scorer.py +52 -32
  28. judgeval/scorers/example_scorer.py +1 -3
  29. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -14
  30. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +45 -20
  31. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +2 -2
  32. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +3 -3
  33. judgeval/scorers/score.py +21 -31
  34. judgeval/scorers/trace_api_scorer.py +5 -0
  35. judgeval/scorers/utils.py +1 -103
  36. judgeval/tracer/__init__.py +1075 -2
  37. judgeval/tracer/constants.py +1 -0
  38. judgeval/tracer/exporters/__init__.py +37 -0
  39. judgeval/tracer/exporters/s3.py +119 -0
  40. judgeval/tracer/exporters/store.py +43 -0
  41. judgeval/tracer/exporters/utils.py +32 -0
  42. judgeval/tracer/keys.py +67 -0
  43. judgeval/tracer/llm/__init__.py +1233 -0
  44. judgeval/{common/tracer → tracer/llm}/providers.py +5 -10
  45. judgeval/{local_eval_queue.py → tracer/local_eval_queue.py} +15 -10
  46. judgeval/tracer/managers.py +188 -0
  47. judgeval/tracer/processors/__init__.py +181 -0
  48. judgeval/tracer/utils.py +20 -0
  49. judgeval/trainer/__init__.py +5 -0
  50. judgeval/{common/trainer → trainer}/config.py +12 -9
  51. judgeval/{common/trainer → trainer}/console.py +2 -9
  52. judgeval/{common/trainer → trainer}/trainable_model.py +12 -7
  53. judgeval/{common/trainer → trainer}/trainer.py +119 -17
  54. judgeval/utils/async_utils.py +2 -3
  55. judgeval/utils/decorators.py +24 -0
  56. judgeval/utils/file_utils.py +37 -4
  57. judgeval/utils/guards.py +32 -0
  58. judgeval/utils/meta.py +14 -0
  59. judgeval/{common/api/json_encoder.py → utils/serialize.py} +7 -1
  60. judgeval/utils/testing.py +88 -0
  61. judgeval/utils/url.py +10 -0
  62. judgeval/{version_check.py → utils/version_check.py} +3 -3
  63. judgeval/version.py +5 -0
  64. judgeval/warnings.py +4 -0
  65. {judgeval-0.8.0.dist-info → judgeval-0.9.0.dist-info}/METADATA +12 -14
  66. judgeval-0.9.0.dist-info/RECORD +80 -0
  67. judgeval/clients.py +0 -35
  68. judgeval/common/__init__.py +0 -13
  69. judgeval/common/api/__init__.py +0 -3
  70. judgeval/common/api/api.py +0 -375
  71. judgeval/common/api/constants.py +0 -186
  72. judgeval/common/exceptions.py +0 -27
  73. judgeval/common/storage/__init__.py +0 -6
  74. judgeval/common/storage/s3_storage.py +0 -97
  75. judgeval/common/tracer/__init__.py +0 -31
  76. judgeval/common/tracer/constants.py +0 -22
  77. judgeval/common/tracer/core.py +0 -2427
  78. judgeval/common/tracer/otel_exporter.py +0 -108
  79. judgeval/common/tracer/otel_span_processor.py +0 -188
  80. judgeval/common/tracer/span_processor.py +0 -37
  81. judgeval/common/tracer/span_transformer.py +0 -207
  82. judgeval/common/tracer/trace_manager.py +0 -101
  83. judgeval/common/trainer/__init__.py +0 -5
  84. judgeval/common/utils.py +0 -948
  85. judgeval/integrations/langgraph.py +0 -844
  86. judgeval/judges/mixture_of_judges.py +0 -287
  87. judgeval/judgment_client.py +0 -267
  88. judgeval/rules.py +0 -521
  89. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
  90. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
  91. judgeval/utils/alerts.py +0 -93
  92. judgeval/utils/requests.py +0 -50
  93. judgeval-0.8.0.dist-info/RECORD +0 -82
  94. {judgeval-0.8.0.dist-info → judgeval-0.9.0.dist-info}/WHEEL +0 -0
  95. {judgeval-0.8.0.dist-info → judgeval-0.9.0.dist-info}/entry_points.txt +0 -0
  96. {judgeval-0.8.0.dist-info → judgeval-0.9.0.dist-info}/licenses/LICENSE.md +0 -0
@@ -1,57 +1,129 @@
1
1
  # generated by datamodel-codegen:
2
- # filename: openapi_new.json
3
- # timestamp: 2025-08-08T18:50:51+00:00
2
+ # filename: .openapi.json
3
+ # timestamp: 2025-08-29T04:49:38+00:00
4
4
 
5
5
  from __future__ import annotations
6
-
7
6
  from typing import Annotated, Any, Dict, List, Optional, Union
7
+ from pydantic import AwareDatetime, BaseModel, ConfigDict, Field, RootModel
8
8
 
9
- from pydantic import BaseModel, ConfigDict, Field
10
9
 
10
+ class TraceAndSpanId(RootModel[List]):
11
+ root: Annotated[List, Field(max_length=2, min_length=2)]
12
+
13
+
14
+ class EvalResultsFetch(BaseModel):
15
+ experiment_run_id: Annotated[str, Field(title="Experiment Run Id")]
16
+ project_name: Annotated[str, Field(title="Project Name")]
11
17
 
12
- class ValidationErrorJudgmentType(BaseModel):
13
- loc: Annotated[List[Union[str, int]], Field(title="Location")]
14
- msg: Annotated[str, Field(title="Message")]
15
- type: Annotated[str, Field(title="Error Type")]
16
18
 
19
+ class DatasetFetch(BaseModel):
20
+ dataset_alias: Annotated[str, Field(title="Dataset Alias")]
21
+ project_name: Annotated[str, Field(title="Project Name")]
17
22
 
18
- class ScorerDataJudgmentType(BaseModel):
23
+
24
+ class TraceSave(BaseModel):
25
+ project_name: Annotated[str, Field(title="Project Name")]
26
+ trace_id: Annotated[str, Field(title="Trace Id")]
19
27
  name: Annotated[str, Field(title="Name")]
28
+ created_at: Annotated[str, Field(title="Created At")]
29
+ duration: Annotated[float, Field(title="Duration")]
30
+ offline_mode: Annotated[Optional[bool], Field(title="Offline Mode")] = False
31
+ has_notification: Annotated[Optional[bool], Field(title="Has Notification")] = False
32
+ customer_id: Annotated[Optional[str], Field(title="Customer Id")] = None
33
+ tags: Annotated[Optional[List[str]], Field(title="Tags")] = None
34
+ metadata: Annotated[Optional[Dict[str, Any]], Field(title="Metadata")] = None
35
+ update_id: Annotated[Optional[int], Field(title="Update Id")] = 1
36
+
37
+
38
+ class TraceFetch(BaseModel):
39
+ trace_id: Annotated[str, Field(title="Trace Id")]
40
+
41
+
42
+ class TraceAddToDataset(BaseModel):
43
+ trace_id: Annotated[str, Field(title="Trace Id")]
44
+ trace_span_id: Annotated[str, Field(title="Trace Span Id")]
45
+ dataset_alias: Annotated[str, Field(title="Dataset Alias")]
46
+ project_name: Annotated[str, Field(title="Project Name")]
47
+
48
+
49
+ class EvaluationRunsBatchRequest(BaseModel):
50
+ organization_id: Annotated[str, Field(title="Organization Id")]
51
+ evaluation_entries: Annotated[
52
+ List[Dict[str, Any]], Field(title="Evaluation Entries")
53
+ ]
54
+
55
+
56
+ class ProjectAdd(BaseModel):
57
+ project_name: Annotated[str, Field(title="Project Name")]
58
+
59
+
60
+ class ProjectAddResponse(BaseModel):
61
+ project_id: Annotated[str, Field(title="Project Id")]
62
+
63
+
64
+ class ProjectDeleteFromJudgevalResponse(BaseModel):
65
+ project_name: Annotated[str, Field(title="Project Name")]
66
+
67
+
68
+ class ProjectDeleteResponse(BaseModel):
69
+ message: Annotated[str, Field(title="Message")]
70
+
71
+
72
+ class ScorerExistsRequest(BaseModel):
73
+ name: Annotated[str, Field(title="Name")]
74
+
75
+
76
+ class ScorerExistsResponse(BaseModel):
77
+ exists: Annotated[bool, Field(title="Exists")]
78
+
79
+
80
+ class SavePromptScorerRequest(BaseModel):
81
+ name: Annotated[str, Field(title="Name")]
82
+ prompt: Annotated[str, Field(title="Prompt")]
20
83
  threshold: Annotated[float, Field(title="Threshold")]
21
- success: Annotated[bool, Field(title="Success")]
22
- score: Annotated[Optional[float], Field(title="Score")] = None
23
- reason: Annotated[Optional[str], Field(title="Reason")] = None
24
- strict_mode: Annotated[Optional[bool], Field(title="Strict Mode")] = None
25
- evaluation_model: Annotated[
26
- Optional[Union[List[str], str]], Field(title="Evaluation Model")
27
- ] = None
28
- error: Annotated[Optional[str], Field(title="Error")] = None
29
- additional_metadata: Annotated[
30
- Optional[Dict[str, Any]], Field(title="Additional Metadata")
31
- ] = None
84
+ options: Annotated[Optional[Dict[str, float]], Field(title="Options")] = None
85
+ is_trace: Annotated[Optional[bool], Field(title="Is Trace")] = False
32
86
 
33
87
 
34
- class ExampleJudgmentType(BaseModel):
35
- model_config = ConfigDict(
36
- extra="allow",
37
- )
38
- example_id: Annotated[str, Field(title="Example Id")]
39
- created_at: Annotated[str, Field(title="Created At")]
40
- name: Annotated[Optional[str], Field(title="Name")] = None
88
+ class SavePromptScorerResponse(BaseModel):
89
+ message: Annotated[str, Field(title="Message")]
90
+ name: Annotated[str, Field(title="Name")]
41
91
 
42
92
 
43
- class ScorerConfigJudgmentType(BaseModel):
44
- score_type: Annotated[str, Field(title="Score Type")]
45
- name: Annotated[Optional[str], Field(title="Name")] = None
46
- threshold: Annotated[Optional[float], Field(title="Threshold")] = 0.5
47
- strict_mode: Annotated[Optional[bool], Field(title="Strict Mode")] = False
48
- required_params: Annotated[Optional[List[str]], Field(title="Required Params")] = (
49
- Field(default_factory=list)
50
- )
51
- kwargs: Annotated[Optional[Dict[str, Any]], Field(title="Kwargs")] = None
93
+ class FetchPromptScorerRequest(BaseModel):
94
+ name: Annotated[str, Field(title="Name")]
95
+
96
+
97
+ class CustomScorerUploadPayload(BaseModel):
98
+ scorer_name: Annotated[str, Field(title="Scorer Name")]
99
+ scorer_code: Annotated[str, Field(title="Scorer Code")]
100
+ requirements_text: Annotated[str, Field(title="Requirements Text")]
101
+
52
102
 
103
+ class CustomScorerTemplateResponse(BaseModel):
104
+ scorer_name: Annotated[str, Field(title="Scorer Name")]
105
+ status: Annotated[str, Field(title="Status")]
106
+ message: Annotated[str, Field(title="Message")]
53
107
 
54
- class BaseScorerJudgmentType(BaseModel):
108
+
109
+ class ResolveProjectNameRequest(BaseModel):
110
+ project_name: Annotated[str, Field(title="Project Name")]
111
+
112
+
113
+ class ResolveProjectNameResponse(BaseModel):
114
+ project_id: Annotated[str, Field(title="Project Id")]
115
+
116
+
117
+ class TraceIdRequest(BaseModel):
118
+ trace_id: Annotated[str, Field(title="Trace Id")]
119
+
120
+
121
+ class SpanScoreRequest(BaseModel):
122
+ span_id: Annotated[str, Field(title="Span Id")]
123
+ trace_id: Annotated[str, Field(title="Trace Id")]
124
+
125
+
126
+ class BaseScorer(BaseModel):
55
127
  score_type: Annotated[str, Field(title="Score Type")]
56
128
  threshold: Annotated[Optional[float], Field(title="Threshold")] = 0.5
57
129
  name: Annotated[Optional[str], Field(title="Name")] = None
@@ -76,7 +148,87 @@ class BaseScorerJudgmentType(BaseModel):
76
148
  server_hosted: Annotated[Optional[bool], Field(title="Server Hosted")] = False
77
149
 
78
150
 
79
- class TraceUsageJudgmentType(BaseModel):
151
+ class ScorerConfig(BaseModel):
152
+ score_type: Annotated[str, Field(title="Score Type")]
153
+ name: Annotated[Optional[str], Field(title="Name")] = None
154
+ threshold: Annotated[Optional[float], Field(title="Threshold")] = 0.5
155
+ strict_mode: Annotated[Optional[bool], Field(title="Strict Mode")] = False
156
+ required_params: Annotated[Optional[List[str]], Field(title="Required Params")] = []
157
+ kwargs: Annotated[Optional[Dict[str, Any]], Field(title="Kwargs")] = None
158
+
159
+
160
+ class Example(BaseModel):
161
+ model_config = ConfigDict(
162
+ extra="allow",
163
+ )
164
+ example_id: Annotated[str, Field(title="Example Id")]
165
+ created_at: Annotated[str, Field(title="Created At")]
166
+ name: Annotated[Optional[str], Field(title="Name")] = None
167
+
168
+
169
+ class ValidationError(BaseModel):
170
+ loc: Annotated[List[Union[str, int]], Field(title="Location")]
171
+ msg: Annotated[str, Field(title="Message")]
172
+ type: Annotated[str, Field(title="Error Type")]
173
+
174
+
175
+ class SpanBatchItem(BaseModel):
176
+ span_id: Annotated[str, Field(title="Span Id")]
177
+ trace_id: Annotated[str, Field(title="Trace Id")]
178
+ function: Annotated[str, Field(title="Function")]
179
+ created_at: Annotated[Any, Field(title="Created At")] = None
180
+ parent_span_id: Annotated[Optional[str], Field(title="Parent Span Id")] = None
181
+ span_type: Annotated[Optional[str], Field(title="Span Type")] = "span"
182
+ inputs: Annotated[Optional[Dict[str, Any]], Field(title="Inputs")] = None
183
+ output: Annotated[Any, Field(title="Output")] = None
184
+ error: Annotated[Optional[Dict[str, Any]], Field(title="Error")] = None
185
+ usage: Annotated[Optional[Dict[str, Any]], Field(title="Usage")] = None
186
+ duration: Annotated[Optional[float], Field(title="Duration")] = None
187
+ expected_tools: Annotated[
188
+ Optional[List[Dict[str, Any]]], Field(title="Expected Tools")
189
+ ] = None
190
+ additional_metadata: Annotated[
191
+ Optional[Dict[str, Any]], Field(title="Additional Metadata")
192
+ ] = None
193
+ has_evaluation: Annotated[Optional[bool], Field(title="Has Evaluation")] = False
194
+ agent_name: Annotated[Optional[str], Field(title="Agent Name")] = None
195
+ class_name: Annotated[Optional[str], Field(title="Class Name")] = None
196
+ state_before: Annotated[Optional[Dict[str, Any]], Field(title="State Before")] = (
197
+ None
198
+ )
199
+ state_after: Annotated[Optional[Dict[str, Any]], Field(title="State After")] = None
200
+ span_state: Annotated[str, Field(title="Span State")]
201
+ update_id: Annotated[Optional[int], Field(title="Update Id")] = 1
202
+ queued_at: Annotated[float, Field(title="Queued At")]
203
+
204
+
205
+ class PromptScorer(BaseModel):
206
+ name: Annotated[str, Field(title="Name")]
207
+ prompt: Annotated[str, Field(title="Prompt")]
208
+ threshold: Annotated[float, Field(title="Threshold")]
209
+ options: Annotated[Optional[Dict[str, float]], Field(title="Options")] = None
210
+ created_at: Annotated[Optional[AwareDatetime], Field(title="Created At")] = None
211
+ updated_at: Annotated[Optional[AwareDatetime], Field(title="Updated At")] = None
212
+ is_trace: Annotated[Optional[bool], Field(title="Is Trace")] = False
213
+
214
+
215
+ class ScorerData(BaseModel):
216
+ name: Annotated[str, Field(title="Name")]
217
+ threshold: Annotated[float, Field(title="Threshold")]
218
+ success: Annotated[bool, Field(title="Success")]
219
+ score: Annotated[Optional[float], Field(title="Score")] = None
220
+ reason: Annotated[Optional[str], Field(title="Reason")] = None
221
+ strict_mode: Annotated[Optional[bool], Field(title="Strict Mode")] = None
222
+ evaluation_model: Annotated[
223
+ Optional[Union[List[str], str]], Field(title="Evaluation Model")
224
+ ] = None
225
+ error: Annotated[Optional[str], Field(title="Error")] = None
226
+ additional_metadata: Annotated[
227
+ Optional[Dict[str, Any]], Field(title="Additional Metadata")
228
+ ] = None
229
+
230
+
231
+ class TraceUsage(BaseModel):
80
232
  prompt_tokens: Annotated[Optional[int], Field(title="Prompt Tokens")] = None
81
233
  completion_tokens: Annotated[Optional[int], Field(title="Completion Tokens")] = None
82
234
  cache_creation_input_tokens: Annotated[
@@ -96,7 +248,7 @@ class TraceUsageJudgmentType(BaseModel):
96
248
  model_name: Annotated[Optional[str], Field(title="Model Name")] = None
97
249
 
98
250
 
99
- class ToolJudgmentType(BaseModel):
251
+ class Tool(BaseModel):
100
252
  tool_name: Annotated[str, Field(title="Tool Name")]
101
253
  parameters: Annotated[Optional[Dict[str, Any]], Field(title="Parameters")] = None
102
254
  agent_name: Annotated[Optional[str], Field(title="Agent Name")] = None
@@ -109,45 +261,75 @@ class ToolJudgmentType(BaseModel):
109
261
  require_all: Annotated[Optional[bool], Field(title="Require All")] = None
110
262
 
111
263
 
112
- class HTTPValidationErrorJudgmentType(BaseModel):
113
- detail: Annotated[
114
- Optional[List[ValidationErrorJudgmentType]], Field(title="Detail")
115
- ] = None
116
-
117
-
118
- class EvaluationRunJudgmentType(BaseModel):
264
+ class ExampleEvaluationRun(BaseModel):
119
265
  id: Annotated[Optional[str], Field(title="Id")] = None
120
266
  project_name: Annotated[Optional[str], Field(title="Project Name")] = None
121
267
  eval_name: Annotated[Optional[str], Field(title="Eval Name")] = None
122
- examples: Annotated[List[ExampleJudgmentType], Field(title="Examples")]
123
268
  custom_scorers: Annotated[
124
- Optional[List[BaseScorerJudgmentType]], Field(title="Custom Scorers")
125
- ] = Field(default_factory=list)
269
+ Optional[List[BaseScorer]], Field(title="Custom Scorers")
270
+ ] = []
126
271
  judgment_scorers: Annotated[
127
- Optional[List[ScorerConfigJudgmentType]], Field(title="Judgment Scorers")
128
- ] = Field(default_factory=list)
272
+ Optional[List[ScorerConfig]], Field(title="Judgment Scorers")
273
+ ] = []
129
274
  model: Annotated[str, Field(title="Model")]
275
+ created_at: Annotated[Optional[str], Field(title="Created At")] = None
276
+ examples: Annotated[List[Example], Field(title="Examples")]
130
277
  trace_span_id: Annotated[Optional[str], Field(title="Trace Span Id")] = None
131
278
  trace_id: Annotated[Optional[str], Field(title="Trace Id")] = None
279
+
280
+
281
+ class HTTPValidationError(BaseModel):
282
+ detail: Annotated[Optional[List[ValidationError]], Field(title="Detail")] = None
283
+
284
+
285
+ class TraceEvaluationRun(BaseModel):
286
+ id: Annotated[Optional[str], Field(title="Id")] = None
287
+ project_name: Annotated[Optional[str], Field(title="Project Name")] = None
288
+ eval_name: Annotated[Optional[str], Field(title="Eval Name")] = None
289
+ custom_scorers: Annotated[
290
+ Optional[List[BaseScorer]], Field(title="Custom Scorers")
291
+ ] = []
292
+ judgment_scorers: Annotated[
293
+ Optional[List[ScorerConfig]], Field(title="Judgment Scorers")
294
+ ] = []
295
+ model: Annotated[str, Field(title="Model")]
132
296
  created_at: Annotated[Optional[str], Field(title="Created At")] = None
297
+ trace_and_span_ids: Annotated[
298
+ List[TraceAndSpanId], Field(title="Trace And Span Ids")
299
+ ]
300
+ is_offline: Annotated[Optional[bool], Field(title="Is Offline")] = False
301
+
302
+
303
+ class DatasetInsertExamples(BaseModel):
304
+ dataset_alias: Annotated[str, Field(title="Dataset Alias")]
305
+ examples: Annotated[List[Example], Field(title="Examples")]
306
+ project_name: Annotated[str, Field(title="Project Name")]
307
+
133
308
 
309
+ class SpansBatchRequest(BaseModel):
310
+ spans: Annotated[List[SpanBatchItem], Field(title="Spans")]
311
+ organization_id: Annotated[str, Field(title="Organization Id")]
134
312
 
135
- class TraceSpanJudgmentType(BaseModel):
313
+
314
+ class FetchPromptScorerResponse(BaseModel):
315
+ scorer: PromptScorer
316
+
317
+
318
+ class TraceSpan(BaseModel):
136
319
  span_id: Annotated[str, Field(title="Span Id")]
137
320
  trace_id: Annotated[str, Field(title="Trace Id")]
138
321
  function: Annotated[str, Field(title="Function")]
139
- depth: Annotated[int, Field(title="Depth")]
140
322
  created_at: Annotated[Any, Field(title="Created At")] = None
141
323
  parent_span_id: Annotated[Optional[str], Field(title="Parent Span Id")] = None
142
324
  span_type: Annotated[Optional[str], Field(title="Span Type")] = "span"
143
325
  inputs: Annotated[Optional[Dict[str, Any]], Field(title="Inputs")] = None
144
326
  error: Annotated[Optional[Dict[str, Any]], Field(title="Error")] = None
145
327
  output: Annotated[Any, Field(title="Output")] = None
146
- usage: Optional[TraceUsageJudgmentType] = None
328
+ usage: Optional[TraceUsage] = None
147
329
  duration: Annotated[Optional[float], Field(title="Duration")] = None
148
- expected_tools: Annotated[
149
- Optional[List[ToolJudgmentType]], Field(title="Expected Tools")
150
- ] = None
330
+ expected_tools: Annotated[Optional[List[Tool]], Field(title="Expected Tools")] = (
331
+ None
332
+ )
151
333
  additional_metadata: Annotated[
152
334
  Optional[Dict[str, Any]], Field(title="Additional Metadata")
153
335
  ] = None
@@ -161,54 +343,56 @@ class TraceSpanJudgmentType(BaseModel):
161
343
  update_id: Annotated[Optional[int], Field(title="Update Id")] = 1
162
344
 
163
345
 
164
- class TraceJudgmentType(BaseModel):
346
+ class Trace(BaseModel):
165
347
  trace_id: Annotated[str, Field(title="Trace Id")]
166
348
  name: Annotated[str, Field(title="Name")]
167
349
  created_at: Annotated[str, Field(title="Created At")]
168
350
  duration: Annotated[float, Field(title="Duration")]
169
- trace_spans: Annotated[List[TraceSpanJudgmentType], Field(title="Trace Spans")]
351
+ trace_spans: Annotated[List[TraceSpan], Field(title="Trace Spans")]
170
352
  offline_mode: Annotated[Optional[bool], Field(title="Offline Mode")] = False
171
- rules: Annotated[Optional[Dict[str, Any]], Field(title="Rules")] = Field(
172
- default_factory=dict
173
- )
353
+ rules: Annotated[Optional[Dict[str, Any]], Field(title="Rules")] = {}
174
354
  has_notification: Annotated[Optional[bool], Field(title="Has Notification")] = False
175
355
  customer_id: Annotated[Optional[str], Field(title="Customer Id")] = None
176
- tags: Annotated[Optional[List[str]], Field(title="Tags")] = Field(
177
- default_factory=list
178
- )
179
- metadata: Annotated[Optional[Dict[str, Any]], Field(title="Metadata")] = Field(
180
- default_factory=dict
181
- )
356
+ tags: Annotated[Optional[List[str]], Field(title="Tags")] = []
357
+ metadata: Annotated[Optional[Dict[str, Any]], Field(title="Metadata")] = {}
182
358
  update_id: Annotated[Optional[int], Field(title="Update Id")] = 1
183
359
 
184
360
 
185
- class ScoringResultJudgmentType(BaseModel):
361
+ class ScoringResult(BaseModel):
186
362
  success: Annotated[bool, Field(title="Success")]
187
- scorers_data: Annotated[
188
- Optional[List[ScorerDataJudgmentType]], Field(title="Scorers Data")
189
- ] = None
363
+ scorers_data: Annotated[Optional[List[ScorerData]], Field(title="Scorers Data")] = (
364
+ None
365
+ )
190
366
  name: Annotated[Optional[str], Field(title="Name")] = None
191
367
  data_object: Annotated[
192
- Optional[Union[TraceSpanJudgmentType, ExampleJudgmentType]],
193
- Field(title="Data Object"),
368
+ Optional[Union[TraceSpan, Example]], Field(title="Data Object")
194
369
  ] = None
195
370
  trace_id: Annotated[Optional[str], Field(title="Trace Id")] = None
196
371
  run_duration: Annotated[Optional[float], Field(title="Run Duration")] = None
197
372
  evaluation_cost: Annotated[Optional[float], Field(title="Evaluation Cost")] = None
198
373
 
199
374
 
200
- class TraceRunJudgmentType(BaseModel):
375
+ class TraceRun(BaseModel):
201
376
  project_name: Annotated[Optional[str], Field(title="Project Name")] = None
202
377
  eval_name: Annotated[Optional[str], Field(title="Eval Name")] = None
203
- traces: Annotated[List[TraceJudgmentType], Field(title="Traces")]
204
- scorers: Annotated[List[ScorerConfigJudgmentType], Field(title="Scorers")]
378
+ traces: Annotated[List[Trace], Field(title="Traces")]
379
+ scorers: Annotated[List[ScorerConfig], Field(title="Scorers")]
205
380
  model: Annotated[str, Field(title="Model")]
206
381
  trace_span_id: Annotated[Optional[str], Field(title="Trace Span Id")] = None
207
382
  tools: Annotated[Optional[List[Dict[str, Any]]], Field(title="Tools")] = None
208
383
 
209
384
 
210
- class EvalResultsJudgmentType(BaseModel):
211
- results: Annotated[List[ScoringResultJudgmentType], Field(title="Results")]
212
- run: Annotated[
213
- Union[TraceRunJudgmentType, EvaluationRunJudgmentType], Field(title="Run")
214
- ]
385
+ class EvalResults(BaseModel):
386
+ results: Annotated[List[ScoringResult], Field(title="Results")]
387
+ run: Annotated[Union[ExampleEvaluationRun, TraceEvaluationRun], Field(title="Run")]
388
+
389
+
390
+ class DatasetPush(BaseModel):
391
+ dataset_alias: Annotated[str, Field(title="Dataset Alias")]
392
+ comments: Annotated[Optional[str], Field(title="Comments")] = None
393
+ source_file: Annotated[Optional[str], Field(title="Source File")] = None
394
+ examples: Annotated[Optional[List[Example]], Field(title="Examples")] = None
395
+ traces: Annotated[Optional[List[Trace]], Field(title="Traces")] = None
396
+ is_trace: Annotated[Optional[bool], Field(title="Is Trace")] = False
397
+ project_name: Annotated[str, Field(title="Project Name")]
398
+ overwrite: Annotated[Optional[bool], Field(title="Overwrite")] = False
judgeval/data/result.py CHANGED
@@ -1,10 +1,10 @@
1
1
  from typing import List, Union
2
2
  from judgeval.data import ScorerData, Example
3
3
  from judgeval.data.trace import TraceSpan
4
- from judgeval.data.judgment_types import ScoringResultJudgmentType
4
+ from judgeval.data.judgment_types import ScoringResult as JudgmentScoringResult
5
5
 
6
6
 
7
- class ScoringResult(ScoringResultJudgmentType):
7
+ class ScoringResult(JudgmentScoringResult):
8
8
  """
9
9
  A ScoringResult contains the output of one or more scorers applied to a single example.
10
10
  Ie: One input, one actual_output, one expected_output, etc..., and 1+ scorer (Faithfulness, Hallucination, Summarization, etc...)
@@ -17,9 +17,8 @@ class ScoringResult(ScoringResultJudgmentType):
17
17
 
18
18
  """
19
19
 
20
- data_object: (
21
- Example # Need to override this so that it uses this repo's Example class
22
- )
20
+ # Need to override this so that it uses this repo's Example class
21
+ data_object: Example
23
22
 
24
23
  def model_dump(self, **kwargs):
25
24
  data = super().model_dump(**kwargs)
@@ -4,12 +4,14 @@ Implementation of the ScorerData class.
4
4
  ScorerData holds the information related to a single, completed Scorer evaluation run.
5
5
  """
6
6
 
7
- from judgeval.data.judgment_types import ScorerDataJudgmentType
7
+ from __future__ import annotations
8
+
9
+ from judgeval.data.judgment_types import ScorerData as JudgmentScorerData
8
10
  from judgeval.scorers import BaseScorer
9
11
  from typing import List
10
12
 
11
13
 
12
- class ScorerData(ScorerDataJudgmentType):
14
+ class ScorerData(JudgmentScorerData):
13
15
  """
14
16
  ScorerData holds the information related to a single, completed Scorer evaluation run.
15
17
 
judgeval/data/tool.py CHANGED
@@ -1,5 +1,5 @@
1
- from judgeval.data.judgment_types import ToolJudgmentType
1
+ from judgeval.data.judgment_types import Tool as JudgmentTool
2
2
 
3
3
 
4
- class Tool(ToolJudgmentType):
4
+ class Tool(JudgmentTool):
5
5
  pass
judgeval/data/trace.py CHANGED
@@ -1,24 +1,21 @@
1
- import threading
2
1
  from datetime import datetime, timezone
3
2
  from judgeval.data.judgment_types import (
4
- TraceUsageJudgmentType,
5
- TraceSpanJudgmentType,
6
- TraceJudgmentType,
3
+ TraceUsage as JudgmentTraceUsage,
4
+ TraceSpan as JudgmentTraceSpan,
5
+ Trace as JudgmentTrace,
7
6
  )
8
- from judgeval.constants import SPAN_LIFECYCLE_END_UPDATE_ID
9
- from judgeval.common.api.json_encoder import json_encoder
7
+ from judgeval.utils.serialize import json_encoder
10
8
 
11
9
 
12
- class TraceUsage(TraceUsageJudgmentType):
10
+ class TraceUsage(JudgmentTraceUsage):
13
11
  pass
14
12
 
15
13
 
16
- class TraceSpan(TraceSpanJudgmentType):
14
+ class TraceSpan(JudgmentTraceSpan):
17
15
  def model_dump(self, **kwargs):
18
16
  return {
19
17
  "span_id": self.span_id,
20
18
  "trace_id": self.trace_id,
21
- "depth": self.depth,
22
19
  "created_at": datetime.fromtimestamp(
23
20
  self.created_at, tz=timezone.utc
24
21
  ).isoformat(),
@@ -32,52 +29,12 @@ class TraceSpan(TraceSpanJudgmentType):
32
29
  "usage": self.usage.model_dump() if self.usage else None,
33
30
  "has_evaluation": self.has_evaluation,
34
31
  "agent_name": self.agent_name,
35
- "class_name": self.class_name,
36
32
  "state_before": self.state_before,
37
33
  "state_after": self.state_after,
38
34
  "additional_metadata": json_encoder(self.additional_metadata),
39
35
  "update_id": self.update_id,
40
36
  }
41
37
 
42
- def __init__(self, **data):
43
- super().__init__(**data)
44
- # Initialize thread lock for thread-safe update_id increment
45
- self._update_id_lock = threading.Lock()
46
38
 
47
- def increment_update_id(self) -> int:
48
- """
49
- Thread-safe method to increment the update_id counter.
50
- Returns:
51
- int: The new update_id value after incrementing
52
- """
53
- with self._update_id_lock:
54
- self.update_id += 1
55
- return self.update_id
56
-
57
- def set_update_id_to_ending_number(
58
- self, ending_number: int = SPAN_LIFECYCLE_END_UPDATE_ID
59
- ) -> int:
60
- """
61
- Thread-safe method to set the update_id to a predetermined ending number.
62
-
63
- Args:
64
- ending_number (int): The number to set update_id to. Defaults to SPAN_LIFECYCLE_END_UPDATE_ID.
65
-
66
- Returns:
67
- int: The new update_id value after setting
68
- """
69
- with self._update_id_lock:
70
- self.update_id = ending_number
71
- return self.update_id
72
-
73
- def print_span(self):
74
- """Print the span with proper formatting and parent relationship information."""
75
- indent = " " * self.depth
76
- parent_info = (
77
- f" (parent_id: {self.parent_span_id})" if self.parent_span_id else ""
78
- )
79
- print(f"{indent}→ {self.function} (id: {self.span_id}){parent_info}")
80
-
81
-
82
- class Trace(TraceJudgmentType):
39
+ class Trace(JudgmentTrace):
83
40
  pass
@@ -2,8 +2,7 @@ from pydantic import BaseModel
2
2
  from typing import List, Optional, Dict, Any, Union
3
3
  from judgeval.data import Trace
4
4
  from judgeval.scorers import APIScorerConfig, BaseScorer
5
- from judgeval.rules import Rule
6
- from judgeval.constants import DEFAULT_GPT_MODEL
5
+ from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
7
6
 
8
7
 
9
8
  class TraceRun(BaseModel):
@@ -27,9 +26,13 @@ class TraceRun(BaseModel):
27
26
  eval_name: Optional[str] = None
28
27
  traces: Optional[List[Trace]] = None
29
28
  scorers: List[Union[APIScorerConfig, BaseScorer]]
30
- model: Optional[str] = DEFAULT_GPT_MODEL
29
+ model: Optional[str] = JUDGMENT_DEFAULT_GPT_MODEL
31
30
  trace_span_id: Optional[str] = None
32
- rules: Optional[List[Rule]] = None
31
+ append: Optional[bool] = False
32
+ override: Optional[bool] = False
33
+
34
+ # TODO: ?
35
+ rules: Any = None
33
36
  tools: Optional[List[Dict[str, Any]]] = None
34
37
 
35
38
  class Config: