judgeval 0.0.11__py3-none-any.whl → 0.22.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of judgeval might be problematic.

Files changed (171)
  1. judgeval/__init__.py +177 -12
  2. judgeval/api/__init__.py +519 -0
  3. judgeval/api/api_types.py +407 -0
  4. judgeval/cli.py +79 -0
  5. judgeval/constants.py +76 -47
  6. judgeval/data/__init__.py +3 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +15 -56
  9. judgeval/data/judgment_types.py +450 -0
  10. judgeval/data/result.py +29 -73
  11. judgeval/data/scorer_data.py +29 -62
  12. judgeval/data/scripts/fix_default_factory.py +23 -0
  13. judgeval/data/scripts/openapi_transform.py +123 -0
  14. judgeval/data/trace.py +121 -0
  15. judgeval/dataset/__init__.py +264 -0
  16. judgeval/env.py +52 -0
  17. judgeval/evaluation/__init__.py +344 -0
  18. judgeval/exceptions.py +27 -0
  19. judgeval/integrations/langgraph/__init__.py +13 -0
  20. judgeval/integrations/openlit/__init__.py +50 -0
  21. judgeval/judges/__init__.py +2 -3
  22. judgeval/judges/base_judge.py +2 -3
  23. judgeval/judges/litellm_judge.py +100 -20
  24. judgeval/judges/together_judge.py +101 -20
  25. judgeval/judges/utils.py +20 -24
  26. judgeval/logger.py +62 -0
  27. judgeval/prompt/__init__.py +330 -0
  28. judgeval/scorers/__init__.py +18 -25
  29. judgeval/scorers/agent_scorer.py +17 -0
  30. judgeval/scorers/api_scorer.py +45 -41
  31. judgeval/scorers/base_scorer.py +83 -38
  32. judgeval/scorers/example_scorer.py +17 -0
  33. judgeval/scorers/exceptions.py +1 -0
  34. judgeval/scorers/judgeval_scorers/__init__.py +0 -148
  35. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +19 -17
  36. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +13 -19
  37. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +12 -19
  38. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +13 -19
  39. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +15 -0
  40. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +327 -0
  41. judgeval/scorers/score.py +77 -306
  42. judgeval/scorers/utils.py +4 -199
  43. judgeval/tracer/__init__.py +1122 -2
  44. judgeval/tracer/constants.py +1 -0
  45. judgeval/tracer/exporters/__init__.py +40 -0
  46. judgeval/tracer/exporters/s3.py +119 -0
  47. judgeval/tracer/exporters/store.py +59 -0
  48. judgeval/tracer/exporters/utils.py +32 -0
  49. judgeval/tracer/keys.py +63 -0
  50. judgeval/tracer/llm/__init__.py +7 -0
  51. judgeval/tracer/llm/config.py +78 -0
  52. judgeval/tracer/llm/constants.py +9 -0
  53. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  54. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  55. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  56. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  57. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  58. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  59. judgeval/tracer/llm/llm_google/config.py +6 -0
  60. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  61. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  62. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  63. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  64. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  65. judgeval/tracer/llm/llm_openai/config.py +6 -0
  66. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  67. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  68. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  69. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  70. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  71. judgeval/tracer/llm/llm_together/config.py +6 -0
  72. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  73. judgeval/tracer/llm/providers.py +19 -0
  74. judgeval/tracer/managers.py +167 -0
  75. judgeval/tracer/processors/__init__.py +220 -0
  76. judgeval/tracer/utils.py +19 -0
  77. judgeval/trainer/__init__.py +14 -0
  78. judgeval/trainer/base_trainer.py +122 -0
  79. judgeval/trainer/config.py +128 -0
  80. judgeval/trainer/console.py +144 -0
  81. judgeval/trainer/fireworks_trainer.py +396 -0
  82. judgeval/trainer/trainable_model.py +243 -0
  83. judgeval/trainer/trainer.py +70 -0
  84. judgeval/utils/async_utils.py +39 -0
  85. judgeval/utils/decorators/__init__.py +0 -0
  86. judgeval/utils/decorators/dont_throw.py +37 -0
  87. judgeval/utils/decorators/use_once.py +13 -0
  88. judgeval/utils/file_utils.py +97 -0
  89. judgeval/utils/guards.py +36 -0
  90. judgeval/utils/meta.py +27 -0
  91. judgeval/utils/project.py +15 -0
  92. judgeval/utils/serialize.py +253 -0
  93. judgeval/utils/testing.py +70 -0
  94. judgeval/utils/url.py +10 -0
  95. judgeval/utils/version_check.py +28 -0
  96. judgeval/utils/wrappers/README.md +3 -0
  97. judgeval/utils/wrappers/__init__.py +15 -0
  98. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  99. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  100. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  101. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  102. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  103. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  104. judgeval/utils/wrappers/py.typed +0 -0
  105. judgeval/utils/wrappers/utils.py +35 -0
  106. judgeval/version.py +5 -0
  107. judgeval/warnings.py +4 -0
  108. judgeval-0.22.2.dist-info/METADATA +265 -0
  109. judgeval-0.22.2.dist-info/RECORD +112 -0
  110. judgeval-0.22.2.dist-info/entry_points.txt +2 -0
  111. judgeval/clients.py +0 -39
  112. judgeval/common/__init__.py +0 -8
  113. judgeval/common/exceptions.py +0 -28
  114. judgeval/common/logger.py +0 -189
  115. judgeval/common/tracer.py +0 -798
  116. judgeval/common/utils.py +0 -763
  117. judgeval/data/api_example.py +0 -111
  118. judgeval/data/datasets/__init__.py +0 -5
  119. judgeval/data/datasets/dataset.py +0 -286
  120. judgeval/data/datasets/eval_dataset_client.py +0 -193
  121. judgeval/data/datasets/ground_truth.py +0 -54
  122. judgeval/data/datasets/utils.py +0 -74
  123. judgeval/evaluation_run.py +0 -132
  124. judgeval/judges/mixture_of_judges.py +0 -248
  125. judgeval/judgment_client.py +0 -354
  126. judgeval/run_evaluation.py +0 -439
  127. judgeval/scorers/judgeval_scorer.py +0 -140
  128. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -19
  129. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -19
  130. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -22
  131. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -19
  132. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -32
  133. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -20
  134. judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -19
  135. judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
  136. judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
  137. judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -54
  138. judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -24
  139. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
  140. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -277
  141. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
  142. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
  143. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
  144. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
  145. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
  146. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
  147. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
  148. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
  149. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
  150. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
  151. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
  152. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
  153. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
  154. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
  155. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -325
  156. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
  157. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
  158. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -263
  159. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
  160. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
  161. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
  162. judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
  163. judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
  164. judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -550
  165. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -3
  166. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -157
  167. judgeval/scorers/prompt_scorer.py +0 -439
  168. judgeval-0.0.11.dist-info/METADATA +0 -36
  169. judgeval-0.0.11.dist-info/RECORD +0 -84
  170. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
  171. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0
judgeval/api/api_types.py ADDED
@@ -0,0 +1,407 @@
+ # generated by datamodel-codegen:
+ #   filename: .openapi.json
+ #   timestamp: 2025-10-25T22:30:20+00:00
+
+ from __future__ import annotations
+ from typing import Any, Dict, List, Literal, Optional, TypedDict, Union
+ from typing_extensions import NotRequired
+
+
+ TraceAndSpanId = List
+
+
+ class EvalResultsFetch(TypedDict):
+     experiment_run_id: str
+     project_name: str
+
+
+ class DatasetFetch(TypedDict):
+     dataset_name: str
+     project_name: str
+
+
+ class DatasetsFetch(TypedDict):
+     project_name: str
+
+
+ class ProjectAdd(TypedDict):
+     project_name: str
+
+
+ class ProjectAddResponse(TypedDict):
+     project_id: str
+
+
+ class ProjectDeleteFromJudgevalResponse(TypedDict):
+     project_name: str
+
+
+ class ProjectDeleteResponse(TypedDict):
+     message: str
+
+
+ class ScorerExistsRequest(TypedDict):
+     name: str
+
+
+ class ScorerExistsResponse(TypedDict):
+     exists: bool
+
+
+ class SavePromptScorerRequest(TypedDict):
+     name: str
+     prompt: str
+     threshold: float
+     model: NotRequired[str]
+     is_trace: NotRequired[bool]
+     options: NotRequired[Optional[Dict[str, float]]]
+     description: NotRequired[Optional[str]]
+
+
+ class FetchPromptScorersRequest(TypedDict):
+     names: NotRequired[Optional[List[str]]]
+     is_trace: NotRequired[Optional[bool]]
+
+
+ class CustomScorerUploadPayload(TypedDict):
+     scorer_name: str
+     scorer_code: str
+     requirements_text: str
+     overwrite: NotRequired[bool]
+
+
+ class CustomScorerTemplateResponse(TypedDict):
+     scorer_name: str
+     status: str
+     message: str
+
+
+ class PromptInsertRequest(TypedDict):
+     project_id: str
+     name: str
+     prompt: str
+     tags: List[str]
+
+
+ class PromptInsertResponse(TypedDict):
+     commit_id: str
+     parent_commit_id: NotRequired[Optional[str]]
+     created_at: str
+
+
+ class PromptTagRequest(TypedDict):
+     project_id: str
+     name: str
+     commit_id: str
+     tags: List[str]
+
+
+ class PromptTagResponse(TypedDict):
+     commit_id: str
+
+
+ class PromptUntagRequest(TypedDict):
+     project_id: str
+     name: str
+     tags: List[str]
+
+
+ class PromptUntagResponse(TypedDict):
+     commit_ids: List[str]
+
+
+ class ResolveProjectNameRequest(TypedDict):
+     project_name: str
+
+
+ class ResolveProjectNameResponse(TypedDict):
+     project_id: str
+
+
+ class TraceIdRequest(TypedDict):
+     trace_id: str
+
+
+ class SpanScoreRequest(TypedDict):
+     span_id: str
+     trace_id: str
+
+
+ class BaseScorer(TypedDict):
+     score_type: str
+     threshold: NotRequired[float]
+     name: NotRequired[Optional[str]]
+     class_name: NotRequired[Optional[str]]
+     score: NotRequired[Optional[float]]
+     score_breakdown: NotRequired[Optional[Dict[str, Any]]]
+     reason: NotRequired[Optional[str]]
+     using_native_model: NotRequired[Optional[bool]]
+     success: NotRequired[Optional[bool]]
+     model: NotRequired[Optional[str]]
+     model_client: NotRequired[Any]
+     strict_mode: NotRequired[bool]
+     error: NotRequired[Optional[str]]
+     additional_metadata: NotRequired[Optional[Dict[str, Any]]]
+     user: NotRequired[Optional[str]]
+     server_hosted: NotRequired[bool]
+
+
+ class ScorerConfig(TypedDict):
+     score_type: str
+     name: NotRequired[Optional[str]]
+     threshold: NotRequired[float]
+     model: NotRequired[Optional[str]]
+     strict_mode: NotRequired[bool]
+     required_params: NotRequired[List[str]]
+     kwargs: NotRequired[Optional[Dict[str, Any]]]
+
+
+ class Example(TypedDict):
+     example_id: NotRequired[str]
+     created_at: NotRequired[str]
+     name: NotRequired[Optional[str]]
+
+
+ class ValidationError(TypedDict):
+     loc: List[Union[str, int]]
+     msg: str
+     type: str
+
+
+ class UsageInfo(TypedDict):
+     total_judgees: int
+     regular_use: int
+     pay_as_you_go_use: int
+     remaining_regular: int
+     remaining_after: int
+
+
+ DatasetKind = Literal["trace", "example"]
+
+
+ class PromptScorer(TypedDict):
+     id: str
+     user_id: str
+     organization_id: str
+     name: str
+     prompt: str
+     threshold: float
+     model: NotRequired[str]
+     options: NotRequired[Optional[Dict[str, float]]]
+     description: NotRequired[Optional[str]]
+     created_at: NotRequired[Optional[str]]
+     updated_at: NotRequired[Optional[str]]
+     is_trace: NotRequired[Optional[bool]]
+     is_bucket_rubric: NotRequired[Optional[bool]]
+
+
+ class PromptCommitInfo(TypedDict):
+     name: str
+     prompt: str
+     tags: List[str]
+     commit_id: str
+     parent_commit_id: NotRequired[Optional[str]]
+     created_at: str
+     first_name: str
+     last_name: str
+     user_email: str
+
+
+ class ScorerData(TypedDict):
+     id: NotRequired[str]
+     name: str
+     threshold: float
+     success: bool
+     score: NotRequired[Optional[float]]
+     reason: NotRequired[Optional[str]]
+     strict_mode: NotRequired[Optional[bool]]
+     evaluation_model: NotRequired[Optional[str]]
+     error: NotRequired[Optional[str]]
+     additional_metadata: NotRequired[Optional[Dict[str, Any]]]
+
+
+ class OtelTraceSpan(TypedDict):
+     organization_id: str
+     project_id: NotRequired[Optional[str]]
+     user_id: str
+     timestamp: str
+     trace_id: str
+     span_id: str
+     parent_span_id: NotRequired[Optional[str]]
+     trace_state: NotRequired[Optional[str]]
+     span_name: NotRequired[Optional[str]]
+     span_kind: NotRequired[Optional[str]]
+     service_name: NotRequired[Optional[str]]
+     resource_attributes: NotRequired[Optional[Dict[str, Any]]]
+     span_attributes: NotRequired[Optional[Dict[str, Any]]]
+     duration: NotRequired[Optional[int]]
+     status_code: NotRequired[Optional[int]]
+     status_message: NotRequired[Optional[str]]
+     events: NotRequired[Optional[List[Dict[str, Any]]]]
+     links: NotRequired[Optional[List[Dict[str, Any]]]]
+
+
+ class OtelSpanListItemScores(TypedDict):
+     success: bool
+     score: float
+     reason: NotRequired[Optional[str]]
+     name: str
+
+
+ class OtelSpanDetailScores(TypedDict):
+     success: bool
+     score: float
+     reason: NotRequired[Optional[str]]
+     name: str
+     example_id: NotRequired[Optional[str]]
+
+
+ class ExampleEvaluationRun(TypedDict):
+     id: NotRequired[str]
+     project_name: str
+     eval_name: str
+     custom_scorers: NotRequired[List[BaseScorer]]
+     judgment_scorers: NotRequired[List[ScorerConfig]]
+     model: NotRequired[Optional[str]]
+     created_at: NotRequired[str]
+     examples: List[Example]
+     trace_span_id: NotRequired[Optional[str]]
+     trace_id: NotRequired[Optional[str]]
+
+
+ class HTTPValidationError(TypedDict):
+     detail: NotRequired[List[ValidationError]]
+
+
+ class TraceEvaluationRun(TypedDict):
+     id: NotRequired[str]
+     project_name: str
+     eval_name: str
+     custom_scorers: NotRequired[List[BaseScorer]]
+     judgment_scorers: NotRequired[List[ScorerConfig]]
+     model: NotRequired[Optional[str]]
+     created_at: NotRequired[str]
+     trace_and_span_ids: List[TraceAndSpanId]
+     is_offline: NotRequired[bool]
+     is_bucket_run: NotRequired[bool]
+
+
+ class DatasetInsertExamples(TypedDict):
+     dataset_name: str
+     examples: List[Example]
+     project_name: str
+
+
+ class DatasetInfo(TypedDict):
+     dataset_id: str
+     name: str
+     created_at: str
+     kind: DatasetKind
+     entries: int
+     creator: str
+
+
+ class DatasetCreate(TypedDict):
+     name: str
+     dataset_kind: DatasetKind
+     project_name: str
+     examples: List[Example]
+     overwrite: bool
+
+
+ class SavePromptScorerResponse(TypedDict):
+     scorer_response: PromptScorer
+
+
+ class FetchPromptScorersResponse(TypedDict):
+     scorers: List[PromptScorer]
+
+
+ class PromptFetchResponse(TypedDict):
+     commit: NotRequired[Optional[PromptCommitInfo]]
+
+
+ class PromptVersionsResponse(TypedDict):
+     versions: List[PromptCommitInfo]
+
+
+ class ScoringResult(TypedDict):
+     success: bool
+     scorers_data: List[ScorerData]
+     name: NotRequired[Optional[str]]
+     data_object: NotRequired[Optional[Union[OtelTraceSpan, Example]]]
+     trace_id: NotRequired[Optional[str]]
+     run_duration: NotRequired[Optional[float]]
+     evaluation_cost: NotRequired[Optional[float]]
+
+
+ class OtelTraceListItem(TypedDict):
+     organization_id: str
+     project_id: str
+     trace_id: str
+     created_at: str
+     duration: NotRequired[Optional[int]]
+     tags: NotRequired[Optional[List[str]]]
+     experiment_run_id: NotRequired[Optional[str]]
+     span_name: NotRequired[Optional[str]]
+     llm_cost: NotRequired[Optional[float]]
+     error: NotRequired[str]
+     scores: NotRequired[List[OtelSpanListItemScores]]
+     rules_invoked: NotRequired[List[str]]
+     customer_id: NotRequired[Optional[str]]
+     input: NotRequired[Optional[str]]
+     output: NotRequired[Optional[str]]
+     input_preview: NotRequired[Optional[str]]
+     output_preview: NotRequired[Optional[str]]
+     annotation_count: NotRequired[int]
+     span_id: str
+     rule_id: NotRequired[Optional[str]]
+
+
+ class OtelSpanDetail(TypedDict):
+     organization_id: str
+     project_id: str
+     timestamp: str
+     trace_id: str
+     span_id: str
+     parent_span_id: NotRequired[Optional[str]]
+     trace_state: NotRequired[Optional[str]]
+     span_name: NotRequired[Optional[str]]
+     span_kind: NotRequired[Optional[str]]
+     service_name: NotRequired[Optional[str]]
+     resource_attributes: NotRequired[Optional[Dict[str, Any]]]
+     span_attributes: NotRequired[Optional[Dict[str, Any]]]
+     duration: NotRequired[Optional[int]]
+     status_code: NotRequired[Optional[int]]
+     status_message: NotRequired[Optional[str]]
+     events: NotRequired[Optional[List[Dict[str, Any]]]]
+     links: NotRequired[Optional[Union[List[Dict[str, Any]], Dict[str, Any]]]]
+     llm_cost: NotRequired[Optional[float]]
+     prompt_tokens: NotRequired[Optional[int]]
+     completion_tokens: NotRequired[Optional[int]]
+     scores: NotRequired[Optional[List[OtelSpanDetailScores]]]
+
+
+ class EvaluateResponse(TypedDict):
+     status: str
+     results: List[ScoringResult]
+     resource_usage: NotRequired[Optional[UsageInfo]]
+
+
+ class EvalResults(TypedDict):
+     results: List[ScoringResult]
+     run: Union[ExampleEvaluationRun, TraceEvaluationRun]
+
+
+ class DatasetTraceWithSpans(TypedDict):
+     dataset_id: str
+     trace_detail: OtelTraceListItem
+     spans: List[OtelSpanDetail]
+
+
+ class DatasetReturn(TypedDict):
+     name: str
+     project_name: str
+     dataset_kind: DatasetKind
+     examples: NotRequired[List[Example]]
+     traces: NotRequired[Optional[List[DatasetTraceWithSpans]]]
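
Since the generated types above are plain TypedDicts, request payloads for the new judgeval.api client can be written as ordinary dicts and checked statically; NotRequired keys may simply be omitted. A minimal sketch under that assumption (the values are illustrative; only names defined in the diff above are used):

    from judgeval.api.api_types import DatasetCreate, Example

    # Every field of Example is NotRequired, so a sparse dict is valid.
    example: Example = {"name": "greeting-check"}

    # All five DatasetCreate fields are required; "dataset_kind" must be one
    # of the DatasetKind literals ("trace" or "example").
    payload: DatasetCreate = {
        "name": "smoke-test-dataset",
        "dataset_kind": "example",
        "project_name": "demo-project",
        "examples": [example],
        "overwrite": False,
    }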
judgeval/cli.py ADDED
@@ -0,0 +1,79 @@
+ #!/usr/bin/env python3
+
+ import typer
+ from pathlib import Path
+ from dotenv import load_dotenv
+ from judgeval.logger import judgeval_logger
+ from judgeval import JudgmentClient
+ from judgeval.version import get_version
+ from judgeval.exceptions import JudgmentAPIError
+
+ load_dotenv()
+
+ app = typer.Typer(
+     no_args_is_help=True,
+     rich_markup_mode=None,
+     rich_help_panel=None,
+     pretty_exceptions_enable=False,
+     pretty_exceptions_show_locals=False,
+     pretty_exceptions_short=False,
+ )
+
+
+ @app.command("upload_scorer")
+ def upload_scorer(
+     scorer_file_path: str,
+     requirements_file_path: str,
+     unique_name: str = typer.Option(
+         None, help="Custom name for the scorer (auto-detected if not provided)"
+     ),
+     overwrite: bool = typer.Option(
+         False,
+         "--overwrite",
+         "-o",
+         help="Overwrite existing scorer if it already exists",
+     ),
+ ):
+     # Validate file paths
+     if not Path(scorer_file_path).exists():
+         judgeval_logger.error(f"Scorer file not found: {scorer_file_path}")
+         raise typer.Exit(1)
+
+     if not Path(requirements_file_path).exists():
+         judgeval_logger.error(f"Requirements file not found: {requirements_file_path}")
+         raise typer.Exit(1)
+
+     try:
+         client = JudgmentClient()
+
+         result = client.upload_custom_scorer(
+             scorer_file_path=scorer_file_path,
+             requirements_file_path=requirements_file_path,
+             unique_name=unique_name,
+             overwrite=overwrite,
+         )
+
+         if not result:
+             judgeval_logger.error("Failed to upload custom scorer")
+             raise typer.Exit(1)
+
+         judgeval_logger.info("Custom scorer uploaded successfully!")
+         raise typer.Exit(0)
+     except Exception as e:
+         if isinstance(e, JudgmentAPIError) and e.status_code == 409:
+             judgeval_logger.error(
+                 "Duplicate scorer detected. Use --overwrite flag to replace the existing scorer"
+             )
+             raise typer.Exit(1)
+         # Re-raise other exceptions
+         raise
+
+
+ @app.command()
+ def version():
+     """Show version info"""
+     judgeval_logger.info(f"Judgeval CLI v{get_version()}")
+
+
+ if __name__ == "__main__":
+     app()
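
Because app is a standard Typer application, the commands above can be exercised without knowing the installed console-script name, for example through Typer's bundled test runner. A minimal sketch (assumes judgeval 0.22.2 and its dependencies are installed and JUDGMENT_* credentials are available; the scorer and requirements file names are hypothetical):

    from typer.testing import CliRunner
    from judgeval.cli import app

    runner = CliRunner()

    # "version" only logs the CLI version, so it needs no credentials.
    result = runner.invoke(app, ["version"])
    assert result.exit_code == 0

    # upload_scorer validates both paths before constructing a JudgmentClient,
    # so a missing file exits with code 1 without making any network call.
    result = runner.invoke(
        app, ["upload_scorer", "my_scorer.py", "requirements.txt", "--overwrite"]
    )
    print(result.exit_code)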
judgeval/constants.py CHANGED
@@ -1,68 +1,97 @@
- """
- Constant variables used throughout source code
- """
+ from __future__ import annotations
 
  from enum import Enum
+ from typing import Set
  import litellm
- import os
 
- class APIScorer(str, Enum):
+
+ class APIScorerType(str, Enum):
      """
      Collection of proprietary scorers implemented by Judgment.
 
      These are ready-made evaluation scorers that can be used to evaluate
      Examples via the Judgment API.
      """
-     FAITHFULNESS = "faithfulness"
-     ANSWER_RELEVANCY = "answer_relevancy"
-     ANSWER_CORRECTNESS = "answer_correctness"
-     HALLUCINATION = "hallucination"
-     SUMMARIZATION = "summarization"
-     CONTEXTUAL_RECALL = "contextual_recall"
-     CONTEXTUAL_RELEVANCY = "contextual_relevancy"
-     CONTEXTUAL_PRECISION = "contextual_precision"
-     TOOL_CORRECTNESS = "tool_correctness"
-     JSON_CORRECTNESS = "json_correctness"
+
+     PROMPT_SCORER = "Prompt Scorer"
+     TRACE_PROMPT_SCORER = "Trace Prompt Scorer"
+     FAITHFULNESS = "Faithfulness"
+     ANSWER_RELEVANCY = "Answer Relevancy"
+     ANSWER_CORRECTNESS = "Answer Correctness"
+     INSTRUCTION_ADHERENCE = "Instruction Adherence"
+     EXECUTION_ORDER = "Execution Order"
+     CUSTOM = "Custom"
 
      @classmethod
-     def _missing_(cls, value):
-         # Handle case-insensitive lookup
+     def __missing__(cls, value: str) -> APIScorerType:
          for member in cls:
             if member.value == value.lower():
                 return member
 
- ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
- # API URLs
- JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
- JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
- JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull/"
- JUDGMENT_DATASETS_PULL_ALL_API_URL = f"{ROOT_API}/datasets/get_all_stats/"
- JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
- JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_eval_results/"
- JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_name/"
- JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
- JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
- JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
- JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
+         raise ValueError(f"Invalid scorer type: {value}")
+
 
- # RabbitMQ
- RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com")
- RABBITMQ_PORT = os.getenv("RABBITMQ_PORT", 5672)
- RABBITMQ_QUEUE = os.getenv("RABBITMQ_QUEUE", "task_queue")
+ LITELLM_SUPPORTED_MODELS: Set[str] = set(litellm.model_list)
 
- # Models
- TOGETHER_SUPPORTED_MODELS = {
-     "QWEN": "Qwen/Qwen2-72B-Instruct",
-     "LLAMA3_70B_INSTRUCT_TURBO": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
-     "LLAMA3_405B_INSTRUCT_TURBO": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
-     "LLAMA3_8B_INSTRUCT_TURBO": "meta-llama/Llama-3.2-3B-Instruct-Turbo",
-     "MISTRAL_8x22B_INSTRUCT": "mistralai/Mixtral-8x22B-Instruct-v0.1",
-     "MISTRAL_8x7B_INSTRUCT": "mistralai/Mixtral-8x7B-Instruct-v0.1",
- }
 
- JUDGMENT_SUPPORTED_MODELS = {"osiris-large", "osiris-mini"}
+ TOGETHER_SUPPORTED_MODELS = [
+     "meta-llama/Meta-Llama-3-70B-Instruct-Turbo",
+     "Qwen/Qwen2-VL-72B-Instruct",
+     "meta-llama/Llama-Vision-Free",
+     "Gryphe/MythoMax-L2-13b",
+     "Qwen/Qwen2.5-72B-Instruct-Turbo",
+     "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
+     "deepseek-ai/DeepSeek-R1",
+     "meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo",
+     "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
+     "google/gemma-2-27b-it",
+     "mistralai/Mistral-Small-24B-Instruct-2501",
+     "mistralai/Mixtral-8x22B-Instruct-v0.1",
+     "meta-llama/Meta-Llama-3-8B-Instruct-Turbo",
+     "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO",
+     "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-classifier",
+     "deepseek-ai/DeepSeek-V3",
+     "Qwen/Qwen2-72B-Instruct",
+     "meta-llama/Meta-Llama-3-8B-Instruct-Lite",
+     "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
+     "upstage/SOLAR-10.7B-Instruct-v1.0",
+     "togethercomputer/MoA-1",
+     "Qwen/QwQ-32B-Preview",
+     "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
+     "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
+     "mistralai/Mistral-7B-Instruct-v0.2",
+     "databricks/dbrx-instruct",
+     "meta-llama/Llama-3-8b-chat-hf",
+     "google/gemma-2b-it",
+     "meta-llama/Meta-Llama-3-70B-Instruct-Lite",
+     "google/gemma-2-9b-it",
+     "meta-llama/Llama-3.3-70B-Instruct-Turbo",
+     "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo-p",
+     "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
+     "Gryphe/MythoMax-L2-13b-Lite",
+     "meta-llama/Llama-2-7b-chat-hf",
+     "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
+     "meta-llama/Llama-2-13b-chat-hf",
+     "scb10x/scb10x-llama3-typhoon-v1-5-8b-instruct",
+     "scb10x/scb10x-llama3-typhoon-v1-5x-4f316",
+     "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
+     "Qwen/Qwen2.5-Coder-32B-Instruct",
+     "microsoft/WizardLM-2-8x22B",
+     "mistralai/Mistral-7B-Instruct-v0.3",
+     "scb10x/scb10x-llama3-1-typhoon2-60256",
+     "Qwen/Qwen2.5-7B-Instruct-Turbo",
+     "scb10x/scb10x-llama3-1-typhoon-18370",
+     "meta-llama/Llama-3.2-3B-Instruct-Turbo",
+     "meta-llama/Llama-3-70b-chat-hf",
+     "mistralai/Mixtral-8x7B-Instruct-v0.1",
+     "togethercomputer/MoA-1-Turbo",
+     "deepseek-ai/DeepSeek-R1-Distill-Llama-70B-free",
+     "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
+     "mistralai/Mistral-7B-Instruct-v0.1",
+ ]
 
- ACCEPTABLE_MODELS = set(litellm.model_list) | set(TOGETHER_SUPPORTED_MODELS.keys()) | JUDGMENT_SUPPORTED_MODELS
+ JUDGMENT_SUPPORTED_MODELS = {"osiris-large", "osiris-mini", "osiris"}
 
- ## System settings
- MAX_WORKER_THREADS = 10
+ ACCEPTABLE_MODELS = (
+     set(litellm.model_list) | set(TOGETHER_SUPPORTED_MODELS) | JUDGMENT_SUPPORTED_MODELS
+ )
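
The renamed APIScorerType enum now carries display-style values, and model gating reduces to set membership over a union of three collections. A short sketch of how these constants compose (assumes judgeval 0.22.2 with litellm installed; only names from the diff above are used):

    from judgeval.constants import (
        ACCEPTABLE_MODELS,
        APIScorerType,
        JUDGMENT_SUPPORTED_MODELS,
        TOGETHER_SUPPORTED_MODELS,
    )

    # Enum values are now display strings ("Faithfulness"), not snake_case.
    assert APIScorerType.FAITHFULNESS.value == "Faithfulness"
    assert APIScorerType("Faithfulness") is APIScorerType.FAITHFULNESS

    # ACCEPTABLE_MODELS is the union of litellm's model list, the Together
    # list, and Judgment's own models, so one check covers all three sources.
    assert "osiris" in JUDGMENT_SUPPORTED_MODELS
    assert "mistralai/Mixtral-8x7B-Instruct-v0.1" in TOGETHER_SUPPORTED_MODELS
    assert "osiris" in ACCEPTABLE_MODELS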
judgeval/data/__init__.py CHANGED
@@ -1,15 +1,15 @@
  from judgeval.data.example import Example, ExampleParams
- from judgeval.data.api_example import ProcessExample, create_process_example
  from judgeval.data.scorer_data import ScorerData, create_scorer_data
  from judgeval.data.result import ScoringResult, generate_scoring_result
+ from judgeval.data.trace import TraceUsage
+
 
  __all__ = [
      "Example",
      "ExampleParams",
-     "ProcessExample",
-     "create_process_example",
      "ScorerData",
      "create_scorer_data",
      "ScoringResult",
      "generate_scoring_result",
+     "TraceUsage",
  ]
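
The practical effect of this re-export change is that ProcessExample and create_process_example leave the public surface of judgeval.data while TraceUsage joins it. A minimal sketch of the 0.22.2-era imports (only names re-exported in the diff above):

    # Valid in 0.22.2: TraceUsage is now re-exported from judgeval.data.
    from judgeval.data import Example, ScorerData, ScoringResult, TraceUsage

    # No longer valid after this change; this import would raise ImportError:
    # from judgeval.data import ProcessExample, create_process_example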