judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (234)
  1. judgeval/__init__.py +173 -10
  2. judgeval/api/__init__.py +523 -0
  3. judgeval/api/api_types.py +413 -0
  4. judgeval/cli.py +112 -0
  5. judgeval/constants.py +7 -30
  6. judgeval/data/__init__.py +1 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +14 -40
  9. judgeval/data/judgment_types.py +396 -146
  10. judgeval/data/result.py +11 -18
  11. judgeval/data/scorer_data.py +3 -26
  12. judgeval/data/scripts/openapi_transform.py +5 -5
  13. judgeval/data/trace.py +115 -194
  14. judgeval/dataset/__init__.py +335 -0
  15. judgeval/env.py +55 -0
  16. judgeval/evaluation/__init__.py +346 -0
  17. judgeval/exceptions.py +28 -0
  18. judgeval/integrations/langgraph/__init__.py +13 -0
  19. judgeval/integrations/openlit/__init__.py +51 -0
  20. judgeval/judges/__init__.py +2 -2
  21. judgeval/judges/litellm_judge.py +77 -16
  22. judgeval/judges/together_judge.py +88 -17
  23. judgeval/judges/utils.py +7 -20
  24. judgeval/judgment_attribute_keys.py +55 -0
  25. judgeval/{common/logger.py → logger.py} +24 -8
  26. judgeval/prompt/__init__.py +330 -0
  27. judgeval/scorers/__init__.py +11 -11
  28. judgeval/scorers/agent_scorer.py +15 -19
  29. judgeval/scorers/api_scorer.py +21 -23
  30. judgeval/scorers/base_scorer.py +54 -36
  31. judgeval/scorers/example_scorer.py +1 -3
  32. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -24
  33. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
  34. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
  35. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
  36. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
  37. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +171 -59
  38. judgeval/scorers/score.py +64 -47
  39. judgeval/scorers/utils.py +2 -107
  40. judgeval/tracer/__init__.py +1111 -2
  41. judgeval/tracer/constants.py +1 -0
  42. judgeval/tracer/exporters/__init__.py +40 -0
  43. judgeval/tracer/exporters/s3.py +119 -0
  44. judgeval/tracer/exporters/store.py +59 -0
  45. judgeval/tracer/exporters/utils.py +32 -0
  46. judgeval/tracer/keys.py +63 -0
  47. judgeval/tracer/llm/__init__.py +7 -0
  48. judgeval/tracer/llm/config.py +78 -0
  49. judgeval/tracer/llm/constants.py +9 -0
  50. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  51. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  52. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  53. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  54. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  55. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  56. judgeval/tracer/llm/llm_google/config.py +6 -0
  57. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  58. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  59. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  60. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  61. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  62. judgeval/tracer/llm/llm_openai/config.py +6 -0
  63. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  64. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  65. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  66. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  67. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  68. judgeval/tracer/llm/llm_together/config.py +6 -0
  69. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  70. judgeval/tracer/llm/providers.py +19 -0
  71. judgeval/tracer/managers.py +167 -0
  72. judgeval/tracer/processors/__init__.py +220 -0
  73. judgeval/tracer/utils.py +19 -0
  74. judgeval/trainer/__init__.py +14 -0
  75. judgeval/trainer/base_trainer.py +122 -0
  76. judgeval/trainer/config.py +123 -0
  77. judgeval/trainer/console.py +144 -0
  78. judgeval/trainer/fireworks_trainer.py +392 -0
  79. judgeval/trainer/trainable_model.py +252 -0
  80. judgeval/trainer/trainer.py +70 -0
  81. judgeval/utils/async_utils.py +39 -0
  82. judgeval/utils/decorators/__init__.py +0 -0
  83. judgeval/utils/decorators/dont_throw.py +37 -0
  84. judgeval/utils/decorators/use_once.py +13 -0
  85. judgeval/utils/file_utils.py +74 -28
  86. judgeval/utils/guards.py +36 -0
  87. judgeval/utils/meta.py +27 -0
  88. judgeval/utils/project.py +15 -0
  89. judgeval/utils/serialize.py +253 -0
  90. judgeval/utils/testing.py +70 -0
  91. judgeval/utils/url.py +10 -0
  92. judgeval/{version_check.py → utils/version_check.py} +5 -3
  93. judgeval/utils/wrappers/README.md +3 -0
  94. judgeval/utils/wrappers/__init__.py +15 -0
  95. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  96. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  97. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  98. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  99. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  100. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  101. judgeval/utils/wrappers/py.typed +0 -0
  102. judgeval/utils/wrappers/utils.py +35 -0
  103. judgeval/v1/__init__.py +88 -0
  104. judgeval/v1/data/__init__.py +7 -0
  105. judgeval/v1/data/example.py +44 -0
  106. judgeval/v1/data/scorer_data.py +42 -0
  107. judgeval/v1/data/scoring_result.py +44 -0
  108. judgeval/v1/datasets/__init__.py +6 -0
  109. judgeval/v1/datasets/dataset.py +214 -0
  110. judgeval/v1/datasets/dataset_factory.py +94 -0
  111. judgeval/v1/evaluation/__init__.py +6 -0
  112. judgeval/v1/evaluation/evaluation.py +182 -0
  113. judgeval/v1/evaluation/evaluation_factory.py +17 -0
  114. judgeval/v1/instrumentation/__init__.py +6 -0
  115. judgeval/v1/instrumentation/llm/__init__.py +7 -0
  116. judgeval/v1/instrumentation/llm/config.py +78 -0
  117. judgeval/v1/instrumentation/llm/constants.py +11 -0
  118. judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
  119. judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
  120. judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
  121. judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
  122. judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
  123. judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
  124. judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
  125. judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
  126. judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
  127. judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
  128. judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
  129. judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
  130. judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
  131. judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
  132. judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
  133. judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
  134. judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
  135. judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
  136. judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
  137. judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
  138. judgeval/v1/instrumentation/llm/providers.py +19 -0
  139. judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
  140. judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
  141. judgeval/v1/integrations/langgraph/__init__.py +13 -0
  142. judgeval/v1/integrations/openlit/__init__.py +47 -0
  143. judgeval/v1/internal/api/__init__.py +525 -0
  144. judgeval/v1/internal/api/api_types.py +413 -0
  145. judgeval/v1/prompts/__init__.py +6 -0
  146. judgeval/v1/prompts/prompt.py +29 -0
  147. judgeval/v1/prompts/prompt_factory.py +189 -0
  148. judgeval/v1/py.typed +0 -0
  149. judgeval/v1/scorers/__init__.py +6 -0
  150. judgeval/v1/scorers/api_scorer.py +82 -0
  151. judgeval/v1/scorers/base_scorer.py +17 -0
  152. judgeval/v1/scorers/built_in/__init__.py +17 -0
  153. judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
  154. judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
  155. judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
  156. judgeval/v1/scorers/built_in/faithfulness.py +28 -0
  157. judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
  158. judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
  159. judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
  160. judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
  161. judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
  162. judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
  163. judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
  164. judgeval/v1/scorers/scorers_factory.py +49 -0
  165. judgeval/v1/tracer/__init__.py +7 -0
  166. judgeval/v1/tracer/base_tracer.py +520 -0
  167. judgeval/v1/tracer/exporters/__init__.py +14 -0
  168. judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
  169. judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
  170. judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
  171. judgeval/v1/tracer/exporters/span_store.py +50 -0
  172. judgeval/v1/tracer/judgment_tracer_provider.py +70 -0
  173. judgeval/v1/tracer/processors/__init__.py +6 -0
  174. judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
  175. judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
  176. judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
  177. judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
  178. judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
  179. judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
  180. judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
  181. judgeval/v1/tracer/tracer.py +67 -0
  182. judgeval/v1/tracer/tracer_factory.py +38 -0
  183. judgeval/v1/trainers/__init__.py +5 -0
  184. judgeval/v1/trainers/base_trainer.py +62 -0
  185. judgeval/v1/trainers/config.py +123 -0
  186. judgeval/v1/trainers/console.py +144 -0
  187. judgeval/v1/trainers/fireworks_trainer.py +392 -0
  188. judgeval/v1/trainers/trainable_model.py +252 -0
  189. judgeval/v1/trainers/trainers_factory.py +37 -0
  190. judgeval/v1/utils.py +18 -0
  191. judgeval/version.py +5 -0
  192. judgeval/warnings.py +4 -0
  193. judgeval-0.23.0.dist-info/METADATA +266 -0
  194. judgeval-0.23.0.dist-info/RECORD +201 -0
  195. judgeval-0.23.0.dist-info/entry_points.txt +2 -0
  196. judgeval/clients.py +0 -34
  197. judgeval/common/__init__.py +0 -13
  198. judgeval/common/api/__init__.py +0 -3
  199. judgeval/common/api/api.py +0 -352
  200. judgeval/common/api/constants.py +0 -165
  201. judgeval/common/exceptions.py +0 -27
  202. judgeval/common/storage/__init__.py +0 -6
  203. judgeval/common/storage/s3_storage.py +0 -98
  204. judgeval/common/tracer/__init__.py +0 -31
  205. judgeval/common/tracer/constants.py +0 -22
  206. judgeval/common/tracer/core.py +0 -1916
  207. judgeval/common/tracer/otel_exporter.py +0 -108
  208. judgeval/common/tracer/otel_span_processor.py +0 -234
  209. judgeval/common/tracer/span_processor.py +0 -37
  210. judgeval/common/tracer/span_transformer.py +0 -211
  211. judgeval/common/tracer/trace_manager.py +0 -92
  212. judgeval/common/utils.py +0 -940
  213. judgeval/data/datasets/__init__.py +0 -4
  214. judgeval/data/datasets/dataset.py +0 -341
  215. judgeval/data/datasets/eval_dataset_client.py +0 -214
  216. judgeval/data/tool.py +0 -5
  217. judgeval/data/trace_run.py +0 -37
  218. judgeval/evaluation_run.py +0 -75
  219. judgeval/integrations/langgraph.py +0 -843
  220. judgeval/judges/mixture_of_judges.py +0 -286
  221. judgeval/judgment_client.py +0 -369
  222. judgeval/rules.py +0 -521
  223. judgeval/run_evaluation.py +0 -684
  224. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
  225. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
  226. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
  227. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
  228. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
  229. judgeval/utils/alerts.py +0 -93
  230. judgeval/utils/requests.py +0 -50
  231. judgeval-0.1.0.dist-info/METADATA +0 -202
  232. judgeval-0.1.0.dist-info/RECORD +0 -73
  233. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/WHEEL +0 -0
  234. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/v1/internal/api/api_types.py ADDED
@@ -0,0 +1,413 @@
+ # generated by datamodel-codegen:
+ # filename: tmp5p8bhyaw.json
+ # timestamp: 2025-11-18T18:52:12+00:00
+
+ from __future__ import annotations
+ from typing import Any, Dict, List, Literal, Optional, TypedDict, Union
+ from typing_extensions import NotRequired
+
+
+ TraceAndSpanId = List
+
+
+ class LogEvalResultsResponse(TypedDict):
+     ui_results_url: str
+
+
+ class EvalResultsFetch(TypedDict):
+     experiment_run_id: str
+     project_name: str
+
+
+ class FetchExperimentRunResponse(TypedDict):
+     results: NotRequired[Optional[List]]
+     ui_results_url: NotRequired[Optional[str]]
+
+
+ class DatasetFetch(TypedDict):
+     dataset_name: str
+     project_name: str
+
+
+ class DatasetsFetch(TypedDict):
+     project_name: str
+
+
+ class ProjectAdd(TypedDict):
+     project_name: str
+
+
+ class ProjectAddResponse(TypedDict):
+     project_id: str
+
+
+ class ProjectDeleteFromJudgevalResponse(TypedDict):
+     project_name: str
+
+
+ class ProjectDeleteResponse(TypedDict):
+     message: str
+
+
+ class ScorerExistsRequest(TypedDict):
+     name: str
+
+
+ class ScorerExistsResponse(TypedDict):
+     exists: bool
+
+
+ class SavePromptScorerRequest(TypedDict):
+     name: str
+     prompt: str
+     threshold: float
+     model: NotRequired[str]
+     is_trace: NotRequired[bool]
+     options: NotRequired[Optional[Dict[str, float]]]
+     description: NotRequired[Optional[str]]
+
+
+ class FetchPromptScorersRequest(TypedDict):
+     names: NotRequired[Optional[List[str]]]
+     is_trace: NotRequired[Optional[bool]]
+
+
+ class CustomScorerUploadPayload(TypedDict):
+     scorer_name: str
+     scorer_code: str
+     requirements_text: str
+     overwrite: NotRequired[bool]
+
+
+ class CustomScorerTemplateResponse(TypedDict):
+     scorer_name: str
+     status: str
+     message: str
+
+
+ class PromptInsertRequest(TypedDict):
+     project_id: str
+     name: str
+     prompt: str
+     tags: List[str]
+
+
+ class PromptInsertResponse(TypedDict):
+     commit_id: str
+     parent_commit_id: NotRequired[Optional[str]]
+     created_at: str
+
+
+ class PromptTagRequest(TypedDict):
+     project_id: str
+     name: str
+     commit_id: str
+     tags: List[str]
+
+
+ class PromptTagResponse(TypedDict):
+     commit_id: str
+
+
+ class PromptUntagRequest(TypedDict):
+     project_id: str
+     name: str
+     tags: List[str]
+
+
+ class PromptUntagResponse(TypedDict):
+     commit_ids: List[str]
+
+
+ class ResolveProjectNameRequest(TypedDict):
+     project_name: str
+
+
+ class ResolveProjectNameResponse(TypedDict):
+     project_id: str
+
+
+ class TraceIdRequest(TypedDict):
+     trace_id: str
+
+
+ class SpanScoreRequest(TypedDict):
+     span_id: str
+     trace_id: str
+
+
+ class BaseScorer(TypedDict):
+     score_type: str
+     threshold: NotRequired[float]
+     name: NotRequired[Optional[str]]
+     class_name: NotRequired[Optional[str]]
+     score: NotRequired[Optional[float]]
+     score_breakdown: NotRequired[Optional[Dict[str, Any]]]
+     reason: NotRequired[Optional[str]]
+     using_native_model: NotRequired[Optional[bool]]
+     success: NotRequired[Optional[bool]]
+     model: NotRequired[Optional[str]]
+     model_client: NotRequired[Any]
+     strict_mode: NotRequired[bool]
+     error: NotRequired[Optional[str]]
+     additional_metadata: NotRequired[Optional[Dict[str, Any]]]
+     user: NotRequired[Optional[str]]
+     server_hosted: NotRequired[bool]
+
+
+ class ScorerConfig(TypedDict):
+     score_type: str
+     name: NotRequired[Optional[str]]
+     threshold: NotRequired[float]
+     strict_mode: NotRequired[bool]
+     required_params: NotRequired[List[str]]
+     kwargs: NotRequired[Optional[Dict[str, Any]]]
+
+
+ class Example(TypedDict):
+     example_id: NotRequired[str]
+     created_at: NotRequired[str]
+     name: NotRequired[Optional[str]]
+
+
+ class ValidationError(TypedDict):
+     loc: List[Union[str, int]]
+     msg: str
+     type: str
+
+
+ class UsageInfo(TypedDict):
+     total_judgees: int
+     regular_use: int
+     pay_as_you_go_use: int
+     remaining_regular: int
+     remaining_after: int
+
+
+ DatasetKind = Literal["trace", "example"]
+
+
+ class PromptScorer(TypedDict):
+     id: str
+     user_id: str
+     organization_id: str
+     name: str
+     prompt: str
+     threshold: float
+     model: NotRequired[str]
+     options: NotRequired[Optional[Dict[str, float]]]
+     description: NotRequired[Optional[str]]
+     created_at: NotRequired[Optional[str]]
+     updated_at: NotRequired[Optional[str]]
+     is_trace: NotRequired[Optional[bool]]
+     is_bucket_rubric: NotRequired[Optional[bool]]
+
+
+ class PromptCommitInfo(TypedDict):
+     name: str
+     prompt: str
+     tags: List[str]
+     commit_id: str
+     parent_commit_id: NotRequired[Optional[str]]
+     created_at: str
+     first_name: str
+     last_name: str
+     user_email: str
+
+
+ class ScorerData(TypedDict):
+     id: NotRequired[str]
+     name: str
+     threshold: float
+     success: bool
+     score: NotRequired[Optional[float]]
+     reason: NotRequired[Optional[str]]
+     strict_mode: NotRequired[Optional[bool]]
+     evaluation_model: NotRequired[Optional[str]]
+     error: NotRequired[Optional[str]]
+     additional_metadata: NotRequired[Optional[Dict[str, Any]]]
+
+
+ class OtelTraceSpan(TypedDict):
+     organization_id: str
+     project_id: NotRequired[Optional[str]]
+     user_id: str
+     timestamp: str
+     trace_id: str
+     span_id: str
+     parent_span_id: NotRequired[Optional[str]]
+     trace_state: NotRequired[Optional[str]]
+     span_name: NotRequired[Optional[str]]
+     span_kind: NotRequired[Optional[str]]
+     service_name: NotRequired[Optional[str]]
+     resource_attributes: NotRequired[Optional[Dict[str, Any]]]
+     span_attributes: NotRequired[Optional[Dict[str, Any]]]
+     duration: NotRequired[Optional[int]]
+     status_code: NotRequired[Optional[int]]
+     status_message: NotRequired[Optional[str]]
+     events: NotRequired[Optional[List[Dict[str, Any]]]]
+     links: NotRequired[Optional[List[Dict[str, Any]]]]
+
+
+ class OtelSpanListItemScores(TypedDict):
+     success: bool
+     score: float
+     reason: NotRequired[Optional[str]]
+     name: str
+
+
+ class OtelSpanDetailScores(TypedDict):
+     success: bool
+     score: float
+     reason: NotRequired[Optional[str]]
+     name: str
+     example_id: NotRequired[Optional[str]]
+
+
+ class ExampleEvaluationRun(TypedDict):
+     id: NotRequired[str]
+     project_name: str
+     eval_name: str
+     custom_scorers: NotRequired[List[BaseScorer]]
+     judgment_scorers: NotRequired[List[ScorerConfig]]
+     created_at: NotRequired[str]
+     examples: List[Example]
+     trace_span_id: NotRequired[Optional[str]]
+     trace_id: NotRequired[Optional[str]]
+
+
+ class HTTPValidationError(TypedDict):
+     detail: NotRequired[List[ValidationError]]
+
+
+ class TraceEvaluationRun(TypedDict):
+     id: NotRequired[str]
+     project_name: str
+     eval_name: str
+     custom_scorers: NotRequired[List[BaseScorer]]
+     judgment_scorers: NotRequired[List[ScorerConfig]]
+     created_at: NotRequired[str]
+     trace_and_span_ids: List[TraceAndSpanId]
+     is_offline: NotRequired[bool]
+     is_bucket_run: NotRequired[bool]
+
+
+ class DatasetInsertExamples(TypedDict):
+     dataset_name: str
+     examples: List[Example]
+     project_name: str
+
+
+ class DatasetInfo(TypedDict):
+     dataset_id: str
+     name: str
+     created_at: str
+     kind: DatasetKind
+     entries: int
+     creator: str
+
+
+ class DatasetCreate(TypedDict):
+     name: str
+     dataset_kind: DatasetKind
+     project_name: str
+     examples: List[Example]
+     overwrite: bool
+
+
+ class SavePromptScorerResponse(TypedDict):
+     scorer_response: PromptScorer
+
+
+ class FetchPromptScorersResponse(TypedDict):
+     scorers: List[PromptScorer]
+
+
+ class PromptFetchResponse(TypedDict):
+     commit: NotRequired[Optional[PromptCommitInfo]]
+
+
+ class PromptVersionsResponse(TypedDict):
+     versions: List[PromptCommitInfo]
+
+
+ class ScoringResult(TypedDict):
+     success: bool
+     scorers_data: List[ScorerData]
+     name: NotRequired[Optional[str]]
+     data_object: NotRequired[Optional[Union[OtelTraceSpan, Example]]]
+     trace_id: NotRequired[Optional[str]]
+     run_duration: NotRequired[Optional[float]]
+     evaluation_cost: NotRequired[Optional[float]]
+
+
+ class OtelTraceListItem(TypedDict):
+     organization_id: str
+     project_id: str
+     trace_id: str
+     created_at: str
+     duration: NotRequired[Optional[int]]
+     tags: NotRequired[Optional[List[str]]]
+     experiment_run_id: NotRequired[Optional[str]]
+     span_name: NotRequired[Optional[str]]
+     llm_cost: NotRequired[Optional[float]]
+     error: NotRequired[str]
+     scores: NotRequired[List[OtelSpanListItemScores]]
+     rules_invoked: NotRequired[List[str]]
+     customer_id: NotRequired[Optional[str]]
+     input: NotRequired[Optional[str]]
+     output: NotRequired[Optional[str]]
+     input_preview: NotRequired[Optional[str]]
+     output_preview: NotRequired[Optional[str]]
+     annotation_count: NotRequired[int]
+     span_id: str
+     rule_id: NotRequired[Optional[str]]
+
+
+ class OtelSpanDetail(TypedDict):
+     organization_id: str
+     project_id: str
+     timestamp: str
+     trace_id: str
+     span_id: str
+     parent_span_id: NotRequired[Optional[str]]
+     trace_state: NotRequired[Optional[str]]
+     span_name: NotRequired[Optional[str]]
+     span_kind: NotRequired[Optional[str]]
+     service_name: NotRequired[Optional[str]]
+     resource_attributes: NotRequired[Optional[Dict[str, Any]]]
+     span_attributes: NotRequired[Optional[Dict[str, Any]]]
+     duration: NotRequired[Optional[int]]
+     status_code: NotRequired[Optional[int]]
+     status_message: NotRequired[Optional[str]]
+     events: NotRequired[Optional[List[Dict[str, Any]]]]
+     links: NotRequired[Optional[Union[List[Dict[str, Any]], Dict[str, Any]]]]
+     llm_cost: NotRequired[Optional[float]]
+     prompt_tokens: NotRequired[Optional[int]]
+     completion_tokens: NotRequired[Optional[int]]
+     scores: NotRequired[Optional[List[OtelSpanDetailScores]]]
+
+
+ class EvaluateResponse(TypedDict):
+     status: str
+     results: List[ScoringResult]
+     resource_usage: NotRequired[Optional[UsageInfo]]
+
+
+ class EvalResults(TypedDict):
+     results: List[ScoringResult]
+     run: Union[ExampleEvaluationRun, TraceEvaluationRun]
+
+
+ class DatasetTraceWithSpans(TypedDict):
+     dataset_id: str
+     trace_detail: OtelTraceListItem
+     spans: List[OtelSpanDetail]
+
+
+ class DatasetReturn(TypedDict):
+     name: str
+     project_name: str
+     dataset_kind: DatasetKind
+     examples: NotRequired[List[Example]]
+     traces: NotRequired[Optional[List[DatasetTraceWithSpans]]]
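Note: since these generated types are TypedDicts, API payloads are ordinary dictionaries at runtime and are only checked by a static type checker. A minimal sketch of building a request body with them (field values are illustrative, and the import path assumes this hunk is the judgeval/v1/internal/api/api_types.py module; the sibling judgeval/api/api_types.py adds the same 413 lines):

from judgeval.v1.internal.api.api_types import Example, ExampleEvaluationRun, ScorerConfig

# Hypothetical values; only the keys come from the definitions above.
scorer: ScorerConfig = {"score_type": "faithfulness", "threshold": 0.8}
example: Example = {"example_id": "example-1", "name": "smoke-test"}
run: ExampleEvaluationRun = {
    "project_name": "my-project",
    "eval_name": "nightly-eval",
    "judgment_scorers": [scorer],
    "examples": [example],
}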
judgeval/v1/prompts/__init__.py ADDED
@@ -0,0 +1,6 @@
+ from __future__ import annotations
+
+ from judgeval.v1.prompts.prompt import Prompt
+ from judgeval.v1.prompts.prompt_factory import PromptFactory
+
+ __all__ = ["Prompt", "PromptFactory"]
judgeval/v1/prompts/prompt.py ADDED
@@ -0,0 +1,29 @@
+ from __future__ import annotations
+
+ import re
+ from dataclasses import dataclass, field
+ from string import Template
+ from typing import Dict, List, Optional
+
+
+ @dataclass
+ class Prompt:
+     name: str
+     prompt: str
+     created_at: str
+     tags: List[str]
+     commit_id: str
+     parent_commit_id: Optional[str] = None
+     metadata: Dict[str, str] = field(default_factory=dict)
+     _template: Template = field(init=False, repr=False)
+
+     def __post_init__(self):
+         template_str = re.sub(r"\{\{([^}]+)\}\}", r"$\1", self.prompt)
+         self._template = Template(template_str)
+
+     def compile(self, **kwargs) -> str:
+         try:
+             return self._template.substitute(**kwargs)
+         except KeyError as e:
+             missing_var = str(e).strip("'")
+             raise ValueError(f"Missing required variable: {missing_var}")
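As the dataclass above shows, Prompt.compile rewrites {{variable}} placeholders into string.Template $variable syntax at construction time and then substitutes keyword arguments, raising ValueError when a placeholder is left unfilled. A minimal usage sketch with hypothetical field values:

from judgeval.v1.prompts.prompt import Prompt

# Hypothetical prompt record; in practice these fields come back from the Judgment API.
greeting = Prompt(
    name="greeting",
    prompt="Hello {{name}}, welcome to {{product}}!",
    created_at="2025-11-18T00:00:00Z",
    tags=["v1"],
    commit_id="abc123",
)
print(greeting.compile(name="Ada", product="judgeval"))
# -> Hello Ada, welcome to judgeval!
greeting.compile(name="Ada")  # raises ValueError: Missing required variable: product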
judgeval/v1/prompts/prompt_factory.py ADDED
@@ -0,0 +1,189 @@
+ from __future__ import annotations
+
+ from typing import List, Optional, overload
+
+ from judgeval.logger import judgeval_logger
+ from judgeval.utils.decorators.dont_throw import dont_throw
+ from judgeval.v1.internal.api import JudgmentSyncClient
+ from judgeval.v1.prompts.prompt import Prompt
+ from judgeval.v1.utils import resolve_project_id
+
+
+ class PromptFactory:
+     __slots__ = "_client"
+
+     def __init__(self, client: JudgmentSyncClient):
+         self._client = client
+
+     def create(
+         self,
+         project_name: str,
+         name: str,
+         prompt: str,
+         tags: Optional[List[str]] = None,
+     ) -> Prompt:
+         try:
+             if tags is None:
+                 tags = []
+
+             project_id = resolve_project_id(self._client, project_name)
+             assert project_id is not None
+             response = self._client.prompts_insert(
+                 {
+                     "project_id": project_id,
+                     "name": name,
+                     "prompt": prompt,
+                     "tags": tags,
+                 }
+             )
+             return Prompt(
+                 name=name,
+                 prompt=prompt,
+                 created_at=response["created_at"],
+                 tags=tags,
+                 commit_id=response["commit_id"],
+                 parent_commit_id=response.get("parent_commit_id"),
+             )
+         except Exception as e:
+             judgeval_logger.error(f"Failed to create prompt: {str(e)}")
+             raise
+
+     @overload
+     def get(
+         self,
+         /,
+         *,
+         project_name: str,
+         name: str,
+         commit_id: str,
+     ) -> Optional[Prompt]: ...
+
+     @overload
+     def get(
+         self,
+         /,
+         *,
+         project_name: str,
+         name: str,
+         tag: str,
+     ) -> Optional[Prompt]: ...
+
+     @dont_throw
+     def get(
+         self,
+         /,
+         *,
+         project_name: str,
+         name: str,
+         commit_id: Optional[str] = None,
+         tag: Optional[str] = None,
+     ) -> Optional[Prompt]:
+         if commit_id is not None and tag is not None:
+             judgeval_logger.error("Cannot fetch prompt by both commit_id and tag")
+             return None
+
+         project_id = resolve_project_id(self._client, project_name)
+         if project_id is None:
+             return None
+
+         response = self._client.prompts_fetch(
+             project_id=project_id,
+             name=name,
+             commit_id=commit_id,
+             tag=tag,
+         )
+
+         prompt_config = response.get("commit")
+         if prompt_config is None:
+             return None
+
+         return Prompt(
+             name=prompt_config["name"],
+             prompt=prompt_config["prompt"],
+             created_at=prompt_config["created_at"],
+             tags=prompt_config["tags"],
+             commit_id=prompt_config["commit_id"],
+             parent_commit_id=prompt_config.get("parent_commit_id"),
+             metadata={
+                 "creator_first_name": prompt_config["first_name"],
+                 "creator_last_name": prompt_config["last_name"],
+                 "creator_email": prompt_config["user_email"],
+             },
+         )
+
+     def tag(
+         self,
+         project_name: str,
+         name: str,
+         commit_id: str,
+         tags: List[str],
+     ) -> str:
+         try:
+             project_id = resolve_project_id(self._client, project_name)
+             assert project_id is not None
+             response = self._client.prompts_tag(
+                 {
+                     "project_id": project_id,
+                     "name": name,
+                     "commit_id": commit_id,
+                     "tags": tags,
+                 }
+             )
+             return response["commit_id"]
+         except Exception as e:
+             judgeval_logger.error(f"Failed to tag prompt: {str(e)}")
+             raise
+
+     def untag(
+         self,
+         project_name: str,
+         name: str,
+         tags: List[str],
+     ) -> List[str]:
+         try:
+             project_id = resolve_project_id(self._client, project_name)
+             assert project_id is not None
+             response = self._client.prompts_untag(
+                 {
+                     "project_id": project_id,
+                     "name": name,
+                     "tags": tags,
+                 }
+             )
+             return response["commit_ids"]
+         except Exception as e:
+             judgeval_logger.error(f"Failed to untag prompt: {str(e)}")
+             raise
+
+     def list(
+         self,
+         project_name: str,
+         name: str,
+     ) -> List[Prompt]:
+         try:
+             project_id = resolve_project_id(self._client, project_name)
+             assert project_id is not None
+             response = self._client.prompts_get_prompt_versions(
+                 project_id=project_id,
+                 name=name,
+             )
+
+             return [
+                 Prompt(
+                     name=prompt_config["name"],
+                     prompt=prompt_config["prompt"],
+                     tags=prompt_config["tags"],
+                     created_at=prompt_config["created_at"],
+                     commit_id=prompt_config["commit_id"],
+                     parent_commit_id=prompt_config.get("parent_commit_id"),
+                     metadata={
+                         "creator_first_name": prompt_config["first_name"],
+                         "creator_last_name": prompt_config["last_name"],
+                         "creator_email": prompt_config["user_email"],
+                     },
+                 )
+                 for prompt_config in response["versions"]
+             ]
+         except Exception as e:
+             judgeval_logger.error(f"Failed to list prompt versions: {str(e)}")
+             raise
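Taken together, PromptFactory wraps the prompts_* endpoints of JudgmentSyncClient and returns Prompt objects. A minimal usage sketch, under the assumption that a configured client is already available (its constructor arguments are not shown in this diff, and the project name below is hypothetical):

from judgeval.v1.internal.api import JudgmentSyncClient
from judgeval.v1.prompts.prompt_factory import PromptFactory

client: JudgmentSyncClient = ...  # assumed: an already-configured sync client
prompts = PromptFactory(client)

# Create a version, tag it, then fetch it back by tag.
created = prompts.create(
    project_name="my-project",
    name="greeting",
    prompt="Hello {{name}}!",
    tags=["draft"],
)
prompts.tag("my-project", "greeting", created.commit_id, ["production"])
latest = prompts.get(project_name="my-project", name="greeting", tag="production")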
judgeval/v1/py.typed ADDED
File without changes
judgeval/v1/scorers/__init__.py ADDED
@@ -0,0 +1,6 @@
+ from __future__ import annotations
+
+ from judgeval.v1.scorers.base_scorer import BaseScorer
+ from judgeval.v1.scorers.scorers_factory import ScorersFactory
+
+ __all__ = ["BaseScorer", "ScorersFactory"]