judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (234) hide show
  1. judgeval/__init__.py +173 -10
  2. judgeval/api/__init__.py +523 -0
  3. judgeval/api/api_types.py +413 -0
  4. judgeval/cli.py +112 -0
  5. judgeval/constants.py +7 -30
  6. judgeval/data/__init__.py +1 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +14 -40
  9. judgeval/data/judgment_types.py +396 -146
  10. judgeval/data/result.py +11 -18
  11. judgeval/data/scorer_data.py +3 -26
  12. judgeval/data/scripts/openapi_transform.py +5 -5
  13. judgeval/data/trace.py +115 -194
  14. judgeval/dataset/__init__.py +335 -0
  15. judgeval/env.py +55 -0
  16. judgeval/evaluation/__init__.py +346 -0
  17. judgeval/exceptions.py +28 -0
  18. judgeval/integrations/langgraph/__init__.py +13 -0
  19. judgeval/integrations/openlit/__init__.py +51 -0
  20. judgeval/judges/__init__.py +2 -2
  21. judgeval/judges/litellm_judge.py +77 -16
  22. judgeval/judges/together_judge.py +88 -17
  23. judgeval/judges/utils.py +7 -20
  24. judgeval/judgment_attribute_keys.py +55 -0
  25. judgeval/{common/logger.py → logger.py} +24 -8
  26. judgeval/prompt/__init__.py +330 -0
  27. judgeval/scorers/__init__.py +11 -11
  28. judgeval/scorers/agent_scorer.py +15 -19
  29. judgeval/scorers/api_scorer.py +21 -23
  30. judgeval/scorers/base_scorer.py +54 -36
  31. judgeval/scorers/example_scorer.py +1 -3
  32. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -24
  33. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
  34. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
  35. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
  36. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
  37. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +171 -59
  38. judgeval/scorers/score.py +64 -47
  39. judgeval/scorers/utils.py +2 -107
  40. judgeval/tracer/__init__.py +1111 -2
  41. judgeval/tracer/constants.py +1 -0
  42. judgeval/tracer/exporters/__init__.py +40 -0
  43. judgeval/tracer/exporters/s3.py +119 -0
  44. judgeval/tracer/exporters/store.py +59 -0
  45. judgeval/tracer/exporters/utils.py +32 -0
  46. judgeval/tracer/keys.py +63 -0
  47. judgeval/tracer/llm/__init__.py +7 -0
  48. judgeval/tracer/llm/config.py +78 -0
  49. judgeval/tracer/llm/constants.py +9 -0
  50. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  51. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  52. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  53. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  54. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  55. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  56. judgeval/tracer/llm/llm_google/config.py +6 -0
  57. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  58. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  59. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  60. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  61. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  62. judgeval/tracer/llm/llm_openai/config.py +6 -0
  63. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  64. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  65. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  66. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  67. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  68. judgeval/tracer/llm/llm_together/config.py +6 -0
  69. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  70. judgeval/tracer/llm/providers.py +19 -0
  71. judgeval/tracer/managers.py +167 -0
  72. judgeval/tracer/processors/__init__.py +220 -0
  73. judgeval/tracer/utils.py +19 -0
  74. judgeval/trainer/__init__.py +14 -0
  75. judgeval/trainer/base_trainer.py +122 -0
  76. judgeval/trainer/config.py +123 -0
  77. judgeval/trainer/console.py +144 -0
  78. judgeval/trainer/fireworks_trainer.py +392 -0
  79. judgeval/trainer/trainable_model.py +252 -0
  80. judgeval/trainer/trainer.py +70 -0
  81. judgeval/utils/async_utils.py +39 -0
  82. judgeval/utils/decorators/__init__.py +0 -0
  83. judgeval/utils/decorators/dont_throw.py +37 -0
  84. judgeval/utils/decorators/use_once.py +13 -0
  85. judgeval/utils/file_utils.py +74 -28
  86. judgeval/utils/guards.py +36 -0
  87. judgeval/utils/meta.py +27 -0
  88. judgeval/utils/project.py +15 -0
  89. judgeval/utils/serialize.py +253 -0
  90. judgeval/utils/testing.py +70 -0
  91. judgeval/utils/url.py +10 -0
  92. judgeval/{version_check.py → utils/version_check.py} +5 -3
  93. judgeval/utils/wrappers/README.md +3 -0
  94. judgeval/utils/wrappers/__init__.py +15 -0
  95. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  96. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  97. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  98. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  99. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  100. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  101. judgeval/utils/wrappers/py.typed +0 -0
  102. judgeval/utils/wrappers/utils.py +35 -0
  103. judgeval/v1/__init__.py +88 -0
  104. judgeval/v1/data/__init__.py +7 -0
  105. judgeval/v1/data/example.py +44 -0
  106. judgeval/v1/data/scorer_data.py +42 -0
  107. judgeval/v1/data/scoring_result.py +44 -0
  108. judgeval/v1/datasets/__init__.py +6 -0
  109. judgeval/v1/datasets/dataset.py +214 -0
  110. judgeval/v1/datasets/dataset_factory.py +94 -0
  111. judgeval/v1/evaluation/__init__.py +6 -0
  112. judgeval/v1/evaluation/evaluation.py +182 -0
  113. judgeval/v1/evaluation/evaluation_factory.py +17 -0
  114. judgeval/v1/instrumentation/__init__.py +6 -0
  115. judgeval/v1/instrumentation/llm/__init__.py +7 -0
  116. judgeval/v1/instrumentation/llm/config.py +78 -0
  117. judgeval/v1/instrumentation/llm/constants.py +11 -0
  118. judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
  119. judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
  120. judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
  121. judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
  122. judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
  123. judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
  124. judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
  125. judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
  126. judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
  127. judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
  128. judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
  129. judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
  130. judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
  131. judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
  132. judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
  133. judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
  134. judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
  135. judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
  136. judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
  137. judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
  138. judgeval/v1/instrumentation/llm/providers.py +19 -0
  139. judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
  140. judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
  141. judgeval/v1/integrations/langgraph/__init__.py +13 -0
  142. judgeval/v1/integrations/openlit/__init__.py +47 -0
  143. judgeval/v1/internal/api/__init__.py +525 -0
  144. judgeval/v1/internal/api/api_types.py +413 -0
  145. judgeval/v1/prompts/__init__.py +6 -0
  146. judgeval/v1/prompts/prompt.py +29 -0
  147. judgeval/v1/prompts/prompt_factory.py +189 -0
  148. judgeval/v1/py.typed +0 -0
  149. judgeval/v1/scorers/__init__.py +6 -0
  150. judgeval/v1/scorers/api_scorer.py +82 -0
  151. judgeval/v1/scorers/base_scorer.py +17 -0
  152. judgeval/v1/scorers/built_in/__init__.py +17 -0
  153. judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
  154. judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
  155. judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
  156. judgeval/v1/scorers/built_in/faithfulness.py +28 -0
  157. judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
  158. judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
  159. judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
  160. judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
  161. judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
  162. judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
  163. judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
  164. judgeval/v1/scorers/scorers_factory.py +49 -0
  165. judgeval/v1/tracer/__init__.py +7 -0
  166. judgeval/v1/tracer/base_tracer.py +520 -0
  167. judgeval/v1/tracer/exporters/__init__.py +14 -0
  168. judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
  169. judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
  170. judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
  171. judgeval/v1/tracer/exporters/span_store.py +50 -0
  172. judgeval/v1/tracer/judgment_tracer_provider.py +70 -0
  173. judgeval/v1/tracer/processors/__init__.py +6 -0
  174. judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
  175. judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
  176. judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
  177. judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
  178. judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
  179. judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
  180. judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
  181. judgeval/v1/tracer/tracer.py +67 -0
  182. judgeval/v1/tracer/tracer_factory.py +38 -0
  183. judgeval/v1/trainers/__init__.py +5 -0
  184. judgeval/v1/trainers/base_trainer.py +62 -0
  185. judgeval/v1/trainers/config.py +123 -0
  186. judgeval/v1/trainers/console.py +144 -0
  187. judgeval/v1/trainers/fireworks_trainer.py +392 -0
  188. judgeval/v1/trainers/trainable_model.py +252 -0
  189. judgeval/v1/trainers/trainers_factory.py +37 -0
  190. judgeval/v1/utils.py +18 -0
  191. judgeval/version.py +5 -0
  192. judgeval/warnings.py +4 -0
  193. judgeval-0.23.0.dist-info/METADATA +266 -0
  194. judgeval-0.23.0.dist-info/RECORD +201 -0
  195. judgeval-0.23.0.dist-info/entry_points.txt +2 -0
  196. judgeval/clients.py +0 -34
  197. judgeval/common/__init__.py +0 -13
  198. judgeval/common/api/__init__.py +0 -3
  199. judgeval/common/api/api.py +0 -352
  200. judgeval/common/api/constants.py +0 -165
  201. judgeval/common/exceptions.py +0 -27
  202. judgeval/common/storage/__init__.py +0 -6
  203. judgeval/common/storage/s3_storage.py +0 -98
  204. judgeval/common/tracer/__init__.py +0 -31
  205. judgeval/common/tracer/constants.py +0 -22
  206. judgeval/common/tracer/core.py +0 -1916
  207. judgeval/common/tracer/otel_exporter.py +0 -108
  208. judgeval/common/tracer/otel_span_processor.py +0 -234
  209. judgeval/common/tracer/span_processor.py +0 -37
  210. judgeval/common/tracer/span_transformer.py +0 -211
  211. judgeval/common/tracer/trace_manager.py +0 -92
  212. judgeval/common/utils.py +0 -940
  213. judgeval/data/datasets/__init__.py +0 -4
  214. judgeval/data/datasets/dataset.py +0 -341
  215. judgeval/data/datasets/eval_dataset_client.py +0 -214
  216. judgeval/data/tool.py +0 -5
  217. judgeval/data/trace_run.py +0 -37
  218. judgeval/evaluation_run.py +0 -75
  219. judgeval/integrations/langgraph.py +0 -843
  220. judgeval/judges/mixture_of_judges.py +0 -286
  221. judgeval/judgment_client.py +0 -369
  222. judgeval/rules.py +0 -521
  223. judgeval/run_evaluation.py +0 -684
  224. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
  225. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
  226. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
  227. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
  228. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
  229. judgeval/utils/alerts.py +0 -93
  230. judgeval/utils/requests.py +0 -50
  231. judgeval-0.1.0.dist-info/METADATA +0 -202
  232. judgeval-0.1.0.dist-info/RECORD +0 -73
  233. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/WHEEL +0 -0
  234. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/licenses/LICENSE.md +0 -0
@@ -0,0 +1,413 @@
1
+ # generated by datamodel-codegen:
2
+ # filename: .openapi.json
3
+ # timestamp: 2025-11-18T18:52:11+00:00
4
+
5
+ from __future__ import annotations
6
+ from typing import Any, Dict, List, Literal, Optional, TypedDict, Union
7
+ from typing_extensions import NotRequired
8
+
9
+
10
+ TraceAndSpanId = List
11
+
12
+
13
+ class LogEvalResultsResponse(TypedDict):
14
+ ui_results_url: str
15
+
16
+
17
+ class EvalResultsFetch(TypedDict):
18
+ experiment_run_id: str
19
+ project_name: str
20
+
21
+
22
+ class FetchExperimentRunResponse(TypedDict):
23
+ results: NotRequired[Optional[List]]
24
+ ui_results_url: NotRequired[Optional[str]]
25
+
26
+
27
+ class DatasetFetch(TypedDict):
28
+ dataset_name: str
29
+ project_name: str
30
+
31
+
32
+ class DatasetsFetch(TypedDict):
33
+ project_name: str
34
+
35
+
36
+ class ProjectAdd(TypedDict):
37
+ project_name: str
38
+
39
+
40
+ class ProjectAddResponse(TypedDict):
41
+ project_id: str
42
+
43
+
44
+ class ProjectDeleteFromJudgevalResponse(TypedDict):
45
+ project_name: str
46
+
47
+
48
+ class ProjectDeleteResponse(TypedDict):
49
+ message: str
50
+
51
+
52
+ class ScorerExistsRequest(TypedDict):
53
+ name: str
54
+
55
+
56
+ class ScorerExistsResponse(TypedDict):
57
+ exists: bool
58
+
59
+
60
+ class SavePromptScorerRequest(TypedDict):
61
+ name: str
62
+ prompt: str
63
+ threshold: float
64
+ model: NotRequired[str]
65
+ is_trace: NotRequired[bool]
66
+ options: NotRequired[Optional[Dict[str, float]]]
67
+ description: NotRequired[Optional[str]]
68
+
69
+
70
+ class FetchPromptScorersRequest(TypedDict):
71
+ names: NotRequired[Optional[List[str]]]
72
+ is_trace: NotRequired[Optional[bool]]
73
+
74
+
75
+ class CustomScorerUploadPayload(TypedDict):
76
+ scorer_name: str
77
+ scorer_code: str
78
+ requirements_text: str
79
+ overwrite: NotRequired[bool]
80
+
81
+
82
+ class CustomScorerTemplateResponse(TypedDict):
83
+ scorer_name: str
84
+ status: str
85
+ message: str
86
+
87
+
88
+ class PromptInsertRequest(TypedDict):
89
+ project_id: str
90
+ name: str
91
+ prompt: str
92
+ tags: List[str]
93
+
94
+
95
+ class PromptInsertResponse(TypedDict):
96
+ commit_id: str
97
+ parent_commit_id: NotRequired[Optional[str]]
98
+ created_at: str
99
+
100
+
101
+ class PromptTagRequest(TypedDict):
102
+ project_id: str
103
+ name: str
104
+ commit_id: str
105
+ tags: List[str]
106
+
107
+
108
+ class PromptTagResponse(TypedDict):
109
+ commit_id: str
110
+
111
+
112
+ class PromptUntagRequest(TypedDict):
113
+ project_id: str
114
+ name: str
115
+ tags: List[str]
116
+
117
+
118
+ class PromptUntagResponse(TypedDict):
119
+ commit_ids: List[str]
120
+
121
+
122
+ class ResolveProjectNameRequest(TypedDict):
123
+ project_name: str
124
+
125
+
126
+ class ResolveProjectNameResponse(TypedDict):
127
+ project_id: str
128
+
129
+
130
+ class TraceIdRequest(TypedDict):
131
+ trace_id: str
132
+
133
+
134
+ class SpanScoreRequest(TypedDict):
135
+ span_id: str
136
+ trace_id: str
137
+
138
+
139
+ class BaseScorer(TypedDict):
140
+ score_type: str
141
+ threshold: NotRequired[float]
142
+ name: NotRequired[Optional[str]]
143
+ class_name: NotRequired[Optional[str]]
144
+ score: NotRequired[Optional[float]]
145
+ score_breakdown: NotRequired[Optional[Dict[str, Any]]]
146
+ reason: NotRequired[Optional[str]]
147
+ using_native_model: NotRequired[Optional[bool]]
148
+ success: NotRequired[Optional[bool]]
149
+ model: NotRequired[Optional[str]]
150
+ model_client: NotRequired[Any]
151
+ strict_mode: NotRequired[bool]
152
+ error: NotRequired[Optional[str]]
153
+ additional_metadata: NotRequired[Optional[Dict[str, Any]]]
154
+ user: NotRequired[Optional[str]]
155
+ server_hosted: NotRequired[bool]
156
+
157
+
158
+ class ScorerConfig(TypedDict):
159
+ score_type: str
160
+ name: NotRequired[Optional[str]]
161
+ threshold: NotRequired[float]
162
+ strict_mode: NotRequired[bool]
163
+ required_params: NotRequired[List[str]]
164
+ kwargs: NotRequired[Optional[Dict[str, Any]]]
165
+
166
+
167
+ class Example(TypedDict):
168
+ example_id: NotRequired[str]
169
+ created_at: NotRequired[str]
170
+ name: NotRequired[Optional[str]]
171
+
172
+
173
+ class ValidationError(TypedDict):
174
+ loc: List[Union[str, int]]
175
+ msg: str
176
+ type: str
177
+
178
+
179
+ class UsageInfo(TypedDict):
180
+ total_judgees: int
181
+ regular_use: int
182
+ pay_as_you_go_use: int
183
+ remaining_regular: int
184
+ remaining_after: int
185
+
186
+
187
+ DatasetKind = Literal["trace", "example"]
188
+
189
+
190
+ class PromptScorer(TypedDict):
191
+ id: str
192
+ user_id: str
193
+ organization_id: str
194
+ name: str
195
+ prompt: str
196
+ threshold: float
197
+ model: NotRequired[str]
198
+ options: NotRequired[Optional[Dict[str, float]]]
199
+ description: NotRequired[Optional[str]]
200
+ created_at: NotRequired[Optional[str]]
201
+ updated_at: NotRequired[Optional[str]]
202
+ is_trace: NotRequired[Optional[bool]]
203
+ is_bucket_rubric: NotRequired[Optional[bool]]
204
+
205
+
206
+ class PromptCommitInfo(TypedDict):
207
+ name: str
208
+ prompt: str
209
+ tags: List[str]
210
+ commit_id: str
211
+ parent_commit_id: NotRequired[Optional[str]]
212
+ created_at: str
213
+ first_name: str
214
+ last_name: str
215
+ user_email: str
216
+
217
+
218
+ class ScorerData(TypedDict):
219
+ id: NotRequired[str]
220
+ name: str
221
+ threshold: float
222
+ success: bool
223
+ score: NotRequired[Optional[float]]
224
+ reason: NotRequired[Optional[str]]
225
+ strict_mode: NotRequired[Optional[bool]]
226
+ evaluation_model: NotRequired[Optional[str]]
227
+ error: NotRequired[Optional[str]]
228
+ additional_metadata: NotRequired[Optional[Dict[str, Any]]]
229
+
230
+
231
+ class OtelTraceSpan(TypedDict):
232
+ organization_id: str
233
+ project_id: NotRequired[Optional[str]]
234
+ user_id: str
235
+ timestamp: str
236
+ trace_id: str
237
+ span_id: str
238
+ parent_span_id: NotRequired[Optional[str]]
239
+ trace_state: NotRequired[Optional[str]]
240
+ span_name: NotRequired[Optional[str]]
241
+ span_kind: NotRequired[Optional[str]]
242
+ service_name: NotRequired[Optional[str]]
243
+ resource_attributes: NotRequired[Optional[Dict[str, Any]]]
244
+ span_attributes: NotRequired[Optional[Dict[str, Any]]]
245
+ duration: NotRequired[Optional[int]]
246
+ status_code: NotRequired[Optional[int]]
247
+ status_message: NotRequired[Optional[str]]
248
+ events: NotRequired[Optional[List[Dict[str, Any]]]]
249
+ links: NotRequired[Optional[List[Dict[str, Any]]]]
250
+
251
+
252
+ class OtelSpanListItemScores(TypedDict):
253
+ success: bool
254
+ score: float
255
+ reason: NotRequired[Optional[str]]
256
+ name: str
257
+
258
+
259
+ class OtelSpanDetailScores(TypedDict):
260
+ success: bool
261
+ score: float
262
+ reason: NotRequired[Optional[str]]
263
+ name: str
264
+ example_id: NotRequired[Optional[str]]
265
+
266
+
267
+ class ExampleEvaluationRun(TypedDict):
268
+ id: NotRequired[str]
269
+ project_name: str
270
+ eval_name: str
271
+ custom_scorers: NotRequired[List[BaseScorer]]
272
+ judgment_scorers: NotRequired[List[ScorerConfig]]
273
+ created_at: NotRequired[str]
274
+ examples: List[Example]
275
+ trace_span_id: NotRequired[Optional[str]]
276
+ trace_id: NotRequired[Optional[str]]
277
+
278
+
279
+ class HTTPValidationError(TypedDict):
280
+ detail: NotRequired[List[ValidationError]]
281
+
282
+
283
+ class TraceEvaluationRun(TypedDict):
284
+ id: NotRequired[str]
285
+ project_name: str
286
+ eval_name: str
287
+ custom_scorers: NotRequired[List[BaseScorer]]
288
+ judgment_scorers: NotRequired[List[ScorerConfig]]
289
+ created_at: NotRequired[str]
290
+ trace_and_span_ids: List[TraceAndSpanId]
291
+ is_offline: NotRequired[bool]
292
+ is_bucket_run: NotRequired[bool]
293
+
294
+
295
+ class DatasetInsertExamples(TypedDict):
296
+ dataset_name: str
297
+ examples: List[Example]
298
+ project_name: str
299
+
300
+
301
+ class DatasetInfo(TypedDict):
302
+ dataset_id: str
303
+ name: str
304
+ created_at: str
305
+ kind: DatasetKind
306
+ entries: int
307
+ creator: str
308
+
309
+
310
+ class DatasetCreate(TypedDict):
311
+ name: str
312
+ dataset_kind: DatasetKind
313
+ project_name: str
314
+ examples: List[Example]
315
+ overwrite: bool
316
+
317
+
318
+ class SavePromptScorerResponse(TypedDict):
319
+ scorer_response: PromptScorer
320
+
321
+
322
+ class FetchPromptScorersResponse(TypedDict):
323
+ scorers: List[PromptScorer]
324
+
325
+
326
+ class PromptFetchResponse(TypedDict):
327
+ commit: NotRequired[Optional[PromptCommitInfo]]
328
+
329
+
330
+ class PromptVersionsResponse(TypedDict):
331
+ versions: List[PromptCommitInfo]
332
+
333
+
334
+ class ScoringResult(TypedDict):
335
+ success: bool
336
+ scorers_data: List[ScorerData]
337
+ name: NotRequired[Optional[str]]
338
+ data_object: NotRequired[Optional[Union[OtelTraceSpan, Example]]]
339
+ trace_id: NotRequired[Optional[str]]
340
+ run_duration: NotRequired[Optional[float]]
341
+ evaluation_cost: NotRequired[Optional[float]]
342
+
343
+
344
+ class OtelTraceListItem(TypedDict):
345
+ organization_id: str
346
+ project_id: str
347
+ trace_id: str
348
+ created_at: str
349
+ duration: NotRequired[Optional[int]]
350
+ tags: NotRequired[Optional[List[str]]]
351
+ experiment_run_id: NotRequired[Optional[str]]
352
+ span_name: NotRequired[Optional[str]]
353
+ llm_cost: NotRequired[Optional[float]]
354
+ error: NotRequired[str]
355
+ scores: NotRequired[List[OtelSpanListItemScores]]
356
+ rules_invoked: NotRequired[List[str]]
357
+ customer_id: NotRequired[Optional[str]]
358
+ input: NotRequired[Optional[str]]
359
+ output: NotRequired[Optional[str]]
360
+ input_preview: NotRequired[Optional[str]]
361
+ output_preview: NotRequired[Optional[str]]
362
+ annotation_count: NotRequired[int]
363
+ span_id: str
364
+ rule_id: NotRequired[Optional[str]]
365
+
366
+
367
+ class OtelSpanDetail(TypedDict):
368
+ organization_id: str
369
+ project_id: str
370
+ timestamp: str
371
+ trace_id: str
372
+ span_id: str
373
+ parent_span_id: NotRequired[Optional[str]]
374
+ trace_state: NotRequired[Optional[str]]
375
+ span_name: NotRequired[Optional[str]]
376
+ span_kind: NotRequired[Optional[str]]
377
+ service_name: NotRequired[Optional[str]]
378
+ resource_attributes: NotRequired[Optional[Dict[str, Any]]]
379
+ span_attributes: NotRequired[Optional[Dict[str, Any]]]
380
+ duration: NotRequired[Optional[int]]
381
+ status_code: NotRequired[Optional[int]]
382
+ status_message: NotRequired[Optional[str]]
383
+ events: NotRequired[Optional[List[Dict[str, Any]]]]
384
+ links: NotRequired[Optional[Union[List[Dict[str, Any]], Dict[str, Any]]]]
385
+ llm_cost: NotRequired[Optional[float]]
386
+ prompt_tokens: NotRequired[Optional[int]]
387
+ completion_tokens: NotRequired[Optional[int]]
388
+ scores: NotRequired[Optional[List[OtelSpanDetailScores]]]
389
+
390
+
391
+ class EvaluateResponse(TypedDict):
392
+ status: str
393
+ results: List[ScoringResult]
394
+ resource_usage: NotRequired[Optional[UsageInfo]]
395
+
396
+
397
+ class EvalResults(TypedDict):
398
+ results: List[ScoringResult]
399
+ run: Union[ExampleEvaluationRun, TraceEvaluationRun]
400
+
401
+
402
+ class DatasetTraceWithSpans(TypedDict):
403
+ dataset_id: str
404
+ trace_detail: OtelTraceListItem
405
+ spans: List[OtelSpanDetail]
406
+
407
+
408
+ class DatasetReturn(TypedDict):
409
+ name: str
410
+ project_name: str
411
+ dataset_kind: DatasetKind
412
+ examples: NotRequired[List[Example]]
413
+ traces: NotRequired[Optional[List[DatasetTraceWithSpans]]]
judgeval/cli.py ADDED
@@ -0,0 +1,112 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import os
4
+ import subprocess
5
+ import sys
6
+ import typer
7
+ from pathlib import Path
8
+ from dotenv import load_dotenv
9
+ from judgeval.logger import judgeval_logger
10
+ from judgeval import JudgmentClient
11
+ from judgeval.version import get_version
12
+ from judgeval.exceptions import JudgmentAPIError
13
+ from judgeval.utils.project import _resolve_project_id
14
+ from judgeval.utils.url import url_for
15
+
16
+ load_dotenv()
17
+
18
+ app = typer.Typer(
19
+ no_args_is_help=True,
20
+ pretty_exceptions_enable=False,
21
+ pretty_exceptions_show_locals=False,
22
+ pretty_exceptions_short=False,
23
+ rich_help_panel=None,
24
+ rich_markup_mode=None,
25
+ )
26
+
27
+
28
+ @app.command(
29
+ context_settings={"allow_extra_args": True, "ignore_unknown_options": True}
30
+ )
31
+ def load_otel_env(
32
+ ctx: typer.Context,
33
+ project_name: str = typer.Argument(help="Project name to send telemetry to"),
34
+ api_key: str = typer.Option(None, envvar="JUDGMENT_API_KEY"),
35
+ organization_id: str = typer.Option(None, envvar="JUDGMENT_ORG_ID"),
36
+ ):
37
+ """Run command with OpenTelemetry environment variables configured for Judgment."""
38
+ if not api_key or not organization_id:
39
+ raise typer.BadParameter("JUDGMENT_API_KEY and JUDGMENT_ORG_ID required")
40
+
41
+ project_id = _resolve_project_id(project_name, api_key, organization_id)
42
+ if not project_id:
43
+ raise typer.BadParameter(f"Project '{project_name}' not found")
44
+
45
+ if not ctx.args:
46
+ raise typer.BadParameter(
47
+ "No command provided. Usage: judgeval load_otel_env PROJECT_NAME -- COMMAND"
48
+ )
49
+
50
+ env = os.environ.copy()
51
+ env["OTEL_TRACES_EXPORTER"] = "otlp"
52
+ env["OTEL_EXPORTER_OTLP_TRACES_PROTOCOL"] = "http/protobuf"
53
+ env["OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"] = url_for("/otel/v1/traces")
54
+ env["OTEL_EXPORTER_OTLP_HEADERS"] = (
55
+ f"Authorization=Bearer {api_key},X-Organization-Id={organization_id},X-Project-Id={project_id}"
56
+ )
57
+
58
+ result = subprocess.run(ctx.args, env=env)
59
+ sys.exit(result.returncode)
60
+
61
+
62
+ @app.command()
63
+ def upload_scorer(
64
+ scorer_file_path: str = typer.Argument(help="Path to scorer Python file"),
65
+ requirements_file_path: str = typer.Argument(help="Path to requirements.txt file"),
66
+ unique_name: str = typer.Option(
67
+ None, help="Custom scorer name (auto-detected if not provided)"
68
+ ),
69
+ overwrite: bool = typer.Option(
70
+ False, "--overwrite", "-o", help="Overwrite if exists"
71
+ ),
72
+ api_key: str = typer.Option(None, envvar="JUDGMENT_API_KEY"),
73
+ organization_id: str = typer.Option(None, envvar="JUDGMENT_ORG_ID"),
74
+ ):
75
+ """Upload custom scorer to Judgment."""
76
+ scorer_path = Path(scorer_file_path)
77
+ requirements_path = Path(requirements_file_path)
78
+
79
+ if not scorer_path.exists():
80
+ raise typer.BadParameter(f"Scorer file not found: {scorer_file_path}")
81
+ if not requirements_path.exists():
82
+ raise typer.BadParameter(
83
+ f"Requirements file not found: {requirements_file_path}"
84
+ )
85
+
86
+ client = JudgmentClient(api_key=api_key, organization_id=organization_id)
87
+
88
+ try:
89
+ result = client.upload_custom_scorer(
90
+ scorer_file_path=scorer_file_path,
91
+ requirements_file_path=requirements_file_path,
92
+ unique_name=unique_name,
93
+ overwrite=overwrite,
94
+ )
95
+ if not result:
96
+ raise typer.Abort()
97
+ judgeval_logger.info("Custom scorer uploaded successfully!")
98
+ except JudgmentAPIError as e:
99
+ if e.status_code == 409:
100
+ judgeval_logger.error("Scorer exists. Use --overwrite to replace")
101
+ raise typer.Exit(1)
102
+ raise
103
+
104
+
105
+ @app.command()
106
+ def version():
107
+ """Show Judgeval CLI version."""
108
+ typer.echo(f"Judgeval CLI v{get_version()}")
109
+
110
+
111
+ if __name__ == "__main__":
112
+ app()
judgeval/constants.py CHANGED
@@ -1,10 +1,8 @@
1
- """
2
- Constant variables used throughout source code
3
- """
1
+ from __future__ import annotations
4
2
 
5
3
  from enum import Enum
4
+ from typing import Set
6
5
  import litellm
7
- import os
8
6
 
9
7
 
10
8
  class APIScorerType(str, Enum):
@@ -16,37 +14,25 @@ class APIScorerType(str, Enum):
16
14
  """
17
15
 
18
16
  PROMPT_SCORER = "Prompt Scorer"
17
+ TRACE_PROMPT_SCORER = "Trace Prompt Scorer"
19
18
  FAITHFULNESS = "Faithfulness"
20
19
  ANSWER_RELEVANCY = "Answer Relevancy"
21
20
  ANSWER_CORRECTNESS = "Answer Correctness"
22
21
  INSTRUCTION_ADHERENCE = "Instruction Adherence"
23
22
  EXECUTION_ORDER = "Execution Order"
24
- DERAILMENT = "Derailment"
25
- TOOL_ORDER = "Tool Order"
26
- CLASSIFIER = "Classifier"
27
- TOOL_DEPENDENCY = "Tool Dependency"
28
23
  CUSTOM = "Custom"
29
24
 
30
25
  @classmethod
31
- def _missing_(cls, value):
32
- # Handle case-insensitive lookup
26
+ def __missing__(cls, value: str) -> APIScorerType:
33
27
  for member in cls:
34
28
  if member.value == value.lower():
35
29
  return member
36
30
 
31
+ raise ValueError(f"Invalid scorer type: {value}")
37
32
 
38
- UNBOUNDED_SCORERS: set[APIScorerType] = (
39
- set()
40
- ) # scorers whose scores are not bounded between 0-1
41
33
 
42
- # RabbitMQ
43
- RABBITMQ_HOST = os.getenv(
44
- "RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com"
45
- )
46
- RABBITMQ_PORT = os.getenv("RABBITMQ_PORT", 5672)
47
- RABBITMQ_QUEUE = os.getenv("RABBITMQ_QUEUE", "task_queue")
48
- # Models
49
- LITELLM_SUPPORTED_MODELS = set(litellm.model_list)
34
+ LITELLM_SUPPORTED_MODELS: Set[str] = set(litellm.model_list)
35
+
50
36
 
51
37
  TOGETHER_SUPPORTED_MODELS = [
52
38
  "meta-llama/Meta-Llama-3-70B-Instruct-Turbo",
@@ -109,12 +95,3 @@ JUDGMENT_SUPPORTED_MODELS = {"osiris-large", "osiris-mini", "osiris"}
109
95
  ACCEPTABLE_MODELS = (
110
96
  set(litellm.model_list) | set(TOGETHER_SUPPORTED_MODELS) | JUDGMENT_SUPPORTED_MODELS
111
97
  )
112
-
113
- ## System settings
114
- MAX_WORKER_THREADS = 10
115
-
116
- # Maximum number of concurrent operations for evaluation runs
117
- MAX_CONCURRENT_EVALUATIONS = 50 # Adjust based on system capabilities
118
-
119
- # Span lifecycle management
120
- SPAN_LIFECYCLE_END_UPDATE_ID = 20 # Default ending number for completed spans
judgeval/data/__init__.py CHANGED
@@ -1,7 +1,7 @@
1
1
  from judgeval.data.example import Example, ExampleParams
2
2
  from judgeval.data.scorer_data import ScorerData, create_scorer_data
3
3
  from judgeval.data.result import ScoringResult, generate_scoring_result
4
- from judgeval.data.trace import Trace, TraceSpan, TraceUsage
4
+ from judgeval.data.trace import TraceUsage
5
5
 
6
6
 
7
7
  __all__ = [
@@ -11,7 +11,5 @@ __all__ = [
11
11
  "create_scorer_data",
12
12
  "ScoringResult",
13
13
  "generate_scoring_result",
14
- "Trace",
15
- "TraceSpan",
16
14
  "TraceUsage",
17
15
  ]