judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl

This diff compares the contents of two package versions publicly released to a supported registry, and is provided for informational purposes only.
Files changed (234)
  1. judgeval/__init__.py +173 -10
  2. judgeval/api/__init__.py +523 -0
  3. judgeval/api/api_types.py +413 -0
  4. judgeval/cli.py +112 -0
  5. judgeval/constants.py +7 -30
  6. judgeval/data/__init__.py +1 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +14 -40
  9. judgeval/data/judgment_types.py +396 -146
  10. judgeval/data/result.py +11 -18
  11. judgeval/data/scorer_data.py +3 -26
  12. judgeval/data/scripts/openapi_transform.py +5 -5
  13. judgeval/data/trace.py +115 -194
  14. judgeval/dataset/__init__.py +335 -0
  15. judgeval/env.py +55 -0
  16. judgeval/evaluation/__init__.py +346 -0
  17. judgeval/exceptions.py +28 -0
  18. judgeval/integrations/langgraph/__init__.py +13 -0
  19. judgeval/integrations/openlit/__init__.py +51 -0
  20. judgeval/judges/__init__.py +2 -2
  21. judgeval/judges/litellm_judge.py +77 -16
  22. judgeval/judges/together_judge.py +88 -17
  23. judgeval/judges/utils.py +7 -20
  24. judgeval/judgment_attribute_keys.py +55 -0
  25. judgeval/{common/logger.py → logger.py} +24 -8
  26. judgeval/prompt/__init__.py +330 -0
  27. judgeval/scorers/__init__.py +11 -11
  28. judgeval/scorers/agent_scorer.py +15 -19
  29. judgeval/scorers/api_scorer.py +21 -23
  30. judgeval/scorers/base_scorer.py +54 -36
  31. judgeval/scorers/example_scorer.py +1 -3
  32. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -24
  33. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
  34. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
  35. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
  36. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
  37. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +171 -59
  38. judgeval/scorers/score.py +64 -47
  39. judgeval/scorers/utils.py +2 -107
  40. judgeval/tracer/__init__.py +1111 -2
  41. judgeval/tracer/constants.py +1 -0
  42. judgeval/tracer/exporters/__init__.py +40 -0
  43. judgeval/tracer/exporters/s3.py +119 -0
  44. judgeval/tracer/exporters/store.py +59 -0
  45. judgeval/tracer/exporters/utils.py +32 -0
  46. judgeval/tracer/keys.py +63 -0
  47. judgeval/tracer/llm/__init__.py +7 -0
  48. judgeval/tracer/llm/config.py +78 -0
  49. judgeval/tracer/llm/constants.py +9 -0
  50. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  51. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  52. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  53. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  54. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  55. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  56. judgeval/tracer/llm/llm_google/config.py +6 -0
  57. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  58. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  59. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  60. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  61. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  62. judgeval/tracer/llm/llm_openai/config.py +6 -0
  63. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  64. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  65. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  66. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  67. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  68. judgeval/tracer/llm/llm_together/config.py +6 -0
  69. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  70. judgeval/tracer/llm/providers.py +19 -0
  71. judgeval/tracer/managers.py +167 -0
  72. judgeval/tracer/processors/__init__.py +220 -0
  73. judgeval/tracer/utils.py +19 -0
  74. judgeval/trainer/__init__.py +14 -0
  75. judgeval/trainer/base_trainer.py +122 -0
  76. judgeval/trainer/config.py +123 -0
  77. judgeval/trainer/console.py +144 -0
  78. judgeval/trainer/fireworks_trainer.py +392 -0
  79. judgeval/trainer/trainable_model.py +252 -0
  80. judgeval/trainer/trainer.py +70 -0
  81. judgeval/utils/async_utils.py +39 -0
  82. judgeval/utils/decorators/__init__.py +0 -0
  83. judgeval/utils/decorators/dont_throw.py +37 -0
  84. judgeval/utils/decorators/use_once.py +13 -0
  85. judgeval/utils/file_utils.py +74 -28
  86. judgeval/utils/guards.py +36 -0
  87. judgeval/utils/meta.py +27 -0
  88. judgeval/utils/project.py +15 -0
  89. judgeval/utils/serialize.py +253 -0
  90. judgeval/utils/testing.py +70 -0
  91. judgeval/utils/url.py +10 -0
  92. judgeval/{version_check.py → utils/version_check.py} +5 -3
  93. judgeval/utils/wrappers/README.md +3 -0
  94. judgeval/utils/wrappers/__init__.py +15 -0
  95. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  96. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  97. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  98. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  99. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  100. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  101. judgeval/utils/wrappers/py.typed +0 -0
  102. judgeval/utils/wrappers/utils.py +35 -0
  103. judgeval/v1/__init__.py +88 -0
  104. judgeval/v1/data/__init__.py +7 -0
  105. judgeval/v1/data/example.py +44 -0
  106. judgeval/v1/data/scorer_data.py +42 -0
  107. judgeval/v1/data/scoring_result.py +44 -0
  108. judgeval/v1/datasets/__init__.py +6 -0
  109. judgeval/v1/datasets/dataset.py +214 -0
  110. judgeval/v1/datasets/dataset_factory.py +94 -0
  111. judgeval/v1/evaluation/__init__.py +6 -0
  112. judgeval/v1/evaluation/evaluation.py +182 -0
  113. judgeval/v1/evaluation/evaluation_factory.py +17 -0
  114. judgeval/v1/instrumentation/__init__.py +6 -0
  115. judgeval/v1/instrumentation/llm/__init__.py +7 -0
  116. judgeval/v1/instrumentation/llm/config.py +78 -0
  117. judgeval/v1/instrumentation/llm/constants.py +11 -0
  118. judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
  119. judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
  120. judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
  121. judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
  122. judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
  123. judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
  124. judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
  125. judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
  126. judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
  127. judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
  128. judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
  129. judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
  130. judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
  131. judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
  132. judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
  133. judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
  134. judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
  135. judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
  136. judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
  137. judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
  138. judgeval/v1/instrumentation/llm/providers.py +19 -0
  139. judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
  140. judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
  141. judgeval/v1/integrations/langgraph/__init__.py +13 -0
  142. judgeval/v1/integrations/openlit/__init__.py +47 -0
  143. judgeval/v1/internal/api/__init__.py +525 -0
  144. judgeval/v1/internal/api/api_types.py +413 -0
  145. judgeval/v1/prompts/__init__.py +6 -0
  146. judgeval/v1/prompts/prompt.py +29 -0
  147. judgeval/v1/prompts/prompt_factory.py +189 -0
  148. judgeval/v1/py.typed +0 -0
  149. judgeval/v1/scorers/__init__.py +6 -0
  150. judgeval/v1/scorers/api_scorer.py +82 -0
  151. judgeval/v1/scorers/base_scorer.py +17 -0
  152. judgeval/v1/scorers/built_in/__init__.py +17 -0
  153. judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
  154. judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
  155. judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
  156. judgeval/v1/scorers/built_in/faithfulness.py +28 -0
  157. judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
  158. judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
  159. judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
  160. judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
  161. judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
  162. judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
  163. judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
  164. judgeval/v1/scorers/scorers_factory.py +49 -0
  165. judgeval/v1/tracer/__init__.py +7 -0
  166. judgeval/v1/tracer/base_tracer.py +520 -0
  167. judgeval/v1/tracer/exporters/__init__.py +14 -0
  168. judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
  169. judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
  170. judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
  171. judgeval/v1/tracer/exporters/span_store.py +50 -0
  172. judgeval/v1/tracer/judgment_tracer_provider.py +70 -0
  173. judgeval/v1/tracer/processors/__init__.py +6 -0
  174. judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
  175. judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
  176. judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
  177. judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
  178. judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
  179. judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
  180. judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
  181. judgeval/v1/tracer/tracer.py +67 -0
  182. judgeval/v1/tracer/tracer_factory.py +38 -0
  183. judgeval/v1/trainers/__init__.py +5 -0
  184. judgeval/v1/trainers/base_trainer.py +62 -0
  185. judgeval/v1/trainers/config.py +123 -0
  186. judgeval/v1/trainers/console.py +144 -0
  187. judgeval/v1/trainers/fireworks_trainer.py +392 -0
  188. judgeval/v1/trainers/trainable_model.py +252 -0
  189. judgeval/v1/trainers/trainers_factory.py +37 -0
  190. judgeval/v1/utils.py +18 -0
  191. judgeval/version.py +5 -0
  192. judgeval/warnings.py +4 -0
  193. judgeval-0.23.0.dist-info/METADATA +266 -0
  194. judgeval-0.23.0.dist-info/RECORD +201 -0
  195. judgeval-0.23.0.dist-info/entry_points.txt +2 -0
  196. judgeval/clients.py +0 -34
  197. judgeval/common/__init__.py +0 -13
  198. judgeval/common/api/__init__.py +0 -3
  199. judgeval/common/api/api.py +0 -352
  200. judgeval/common/api/constants.py +0 -165
  201. judgeval/common/exceptions.py +0 -27
  202. judgeval/common/storage/__init__.py +0 -6
  203. judgeval/common/storage/s3_storage.py +0 -98
  204. judgeval/common/tracer/__init__.py +0 -31
  205. judgeval/common/tracer/constants.py +0 -22
  206. judgeval/common/tracer/core.py +0 -1916
  207. judgeval/common/tracer/otel_exporter.py +0 -108
  208. judgeval/common/tracer/otel_span_processor.py +0 -234
  209. judgeval/common/tracer/span_processor.py +0 -37
  210. judgeval/common/tracer/span_transformer.py +0 -211
  211. judgeval/common/tracer/trace_manager.py +0 -92
  212. judgeval/common/utils.py +0 -940
  213. judgeval/data/datasets/__init__.py +0 -4
  214. judgeval/data/datasets/dataset.py +0 -341
  215. judgeval/data/datasets/eval_dataset_client.py +0 -214
  216. judgeval/data/tool.py +0 -5
  217. judgeval/data/trace_run.py +0 -37
  218. judgeval/evaluation_run.py +0 -75
  219. judgeval/integrations/langgraph.py +0 -843
  220. judgeval/judges/mixture_of_judges.py +0 -286
  221. judgeval/judgment_client.py +0 -369
  222. judgeval/rules.py +0 -521
  223. judgeval/run_evaluation.py +0 -684
  224. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
  225. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
  226. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
  227. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
  228. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
  229. judgeval/utils/alerts.py +0 -93
  230. judgeval/utils/requests.py +0 -50
  231. judgeval-0.1.0.dist-info/METADATA +0 -202
  232. judgeval-0.1.0.dist-info/RECORD +0 -73
  233. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/WHEEL +0 -0
  234. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/__init__.py CHANGED
@@ -1,13 +1,176 @@
- # Import key components that should be publicly accessible
- from judgeval.clients import client, together_client
- from judgeval.judgment_client import JudgmentClient
- from judgeval.version_check import check_latest_version
+ from __future__ import annotations
+
+ from judgeval.data.result import ScoringResult
+ from judgeval.evaluation import run_eval
+ from judgeval.data.evaluation_run import ExampleEvaluationRun
+
+
+ from typing import List, Optional, Union, Sequence
+ import ast
+ from judgeval.scorers import ExampleAPIScorerConfig
+ from judgeval.scorers.example_scorer import ExampleScorer
+ from judgeval.data.example import Example
+ from judgeval.logger import judgeval_logger
+ from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID
+ from judgeval.utils.meta import SingletonMeta
+ from judgeval.exceptions import JudgmentRuntimeError, JudgmentTestError
+ from judgeval.api import JudgmentSyncClient
+ from judgeval.utils.file_utils import extract_scorer_name
+ from judgeval.utils.guards import expect_api_key, expect_organization_id
+ from judgeval.utils.version_check import check_latest_version
+ from judgeval.utils.testing import assert_test_results
+ from judgeval.v1 import Judgeval

  check_latest_version()

- __all__ = [
-     # Clients
-     "client",
-     "together_client",
-     "JudgmentClient",
- ]
+
+ class JudgmentClient(metaclass=SingletonMeta):
+     __slots__ = ("api_key", "organization_id")
+
+     def __init__(
+         self,
+         api_key: Optional[str] = None,
+         organization_id: Optional[str] = None,
+     ):
+         _api_key = api_key or JUDGMENT_API_KEY
+         _organization_id = organization_id or JUDGMENT_ORG_ID
+
+         self.api_key = expect_api_key(_api_key)
+         self.organization_id = expect_organization_id(_organization_id)
+
+     def run_evaluation(
+         self,
+         examples: List[Example],
+         scorers: Sequence[Union[ExampleAPIScorerConfig, ExampleScorer, None]],
+         project_name: str = "default_project",
+         eval_run_name: str = "default_eval_run",
+         assert_test: bool = False,
+     ) -> List[ScoringResult]:
+         try:
+             for scorer in scorers:
+                 if scorer is None:
+                     raise ValueError(
+                         "Failed to run evaluation: At least one Prompt Scorer was not successfully retrieved."
+                     )
+             eval = ExampleEvaluationRun(
+                 project_name=project_name,
+                 eval_name=eval_run_name,
+                 examples=examples,
+                 scorers=scorers,  # type: ignore
+             )
+
+             results = run_eval(eval)
+             if assert_test:
+                 assert_test_results(results)
+
+             return results
+
+         except JudgmentTestError as e:
+             raise JudgmentTestError(e)
+         except ValueError as e:
+             raise ValueError(
+                 f"Please check your EvaluationRun object, one or more fields are invalid: \n{e}"
+             )
+         except Exception as e:
+             raise JudgmentRuntimeError(
+                 f"An unexpected error occurred during evaluation: {e}"
+             ) from e
+
+     def upload_custom_scorer(
+         self,
+         scorer_file_path: str,
+         requirements_file_path: Optional[str] = None,
+         unique_name: Optional[str] = None,
+         overwrite: bool = False,
+     ) -> bool:
+         """
+         Upload custom ExampleScorer from files to backend.
+
+         Args:
+             scorer_file_path: Path to Python file containing CustomScorer class
+             requirements_file_path: Optional path to requirements.txt
+             unique_name: Optional unique identifier (auto-detected from scorer.name if not provided)
+             overwrite: Whether to overwrite existing scorer if it already exists
+
+         Returns:
+             bool: True if upload successful
+
+         Raises:
+             ValueError: If scorer file is invalid
+             FileNotFoundError: If scorer file doesn't exist
+         """
+         import os
+
+         if not os.path.exists(scorer_file_path):
+             raise FileNotFoundError(f"Scorer file not found: {scorer_file_path}")
+
+         # Auto-detect scorer name if not provided
+         if unique_name is None:
+             unique_name = extract_scorer_name(scorer_file_path)
+             judgeval_logger.info(f"Auto-detected scorer name: '{unique_name}'")
+
+         # Read scorer code
+         with open(scorer_file_path, "r") as f:
+             scorer_code = f.read()
+
+         try:
+             tree = ast.parse(scorer_code, filename=scorer_file_path)
+         except SyntaxError as e:
+             error_msg = f"Invalid Python syntax in {scorer_file_path}: {e}"
+             judgeval_logger.error(error_msg)
+             raise ValueError(error_msg)
+
+         scorer_classes = []
+         for node in ast.walk(tree):
+             if isinstance(node, ast.ClassDef):
+                 for base in node.bases:
+                     if (isinstance(base, ast.Name) and base.id == "ExampleScorer") or (
+                         isinstance(base, ast.Attribute) and base.attr == "ExampleScorer"
+                     ):
+                         scorer_classes.append(node.name)
+
+         if len(scorer_classes) > 1:
+             error_msg = f"Multiple ExampleScorer classes found in {scorer_file_path}: {scorer_classes}. Please only upload one scorer class per file."
+             judgeval_logger.error(error_msg)
+             raise ValueError(error_msg)
+         elif len(scorer_classes) == 0:
+             error_msg = f"No ExampleScorer class was found in {scorer_file_path}. Please ensure the file contains a valid scorer class that inherits from ExampleScorer."
+             judgeval_logger.error(error_msg)
+             raise ValueError(error_msg)
+
+         # Read requirements (optional)
+         requirements_text = ""
+         if requirements_file_path and os.path.exists(requirements_file_path):
+             with open(requirements_file_path, "r") as f:
+                 requirements_text = f.read()
+
+         try:
+             if not self.api_key or not self.organization_id:
+                 raise ValueError("Judgment API key and organization ID are required")
+             client = JudgmentSyncClient(
+                 api_key=self.api_key,
+                 organization_id=self.organization_id,
+             )
+             response = client.upload_custom_scorer(
+                 payload={
+                     "scorer_name": unique_name,
+                     "scorer_code": scorer_code,
+                     "requirements_text": requirements_text,
+                     "overwrite": overwrite,
+                 }
+             )
+
+             if response.get("status") == "success":
+                 judgeval_logger.info(
+                     f"Successfully uploaded custom scorer: {unique_name}"
+                 )
+                 return True
+             else:
+                 judgeval_logger.error(f"Failed to upload custom scorer: {unique_name}")
+                 return False
+
+         except Exception:
+             raise
+
+
+ __all__ = ("JudgmentClient", "Judgeval")
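For orientation, here is a minimal usage sketch of the new JudgmentClient entry point added above. Only the client construction and method signatures come from this diff; the Example field names, file paths, and scorer values are illustrative assumptions.

from judgeval import JudgmentClient
from judgeval.data.example import Example

# Credentials fall back to the JUDGMENT_API_KEY / JUDGMENT_ORG_ID environment
# variables (see the judgeval.env import above); SingletonMeta makes repeated
# construction return the same instance.
client = JudgmentClient()

# Upload a custom ExampleScorer; the name is auto-detected from the single
# ExampleScorer subclass that the ast scan above finds in the file.
client.upload_custom_scorer(
    scorer_file_path="my_scorer.py",            # hypothetical path
    requirements_file_path="requirements.txt",  # hypothetical path
    overwrite=True,
)

# Run an evaluation. A None scorer entry (e.g. a failed scorer fetch) raises
# ValueError; assert_test=True raises JudgmentTestError on failing results.
results = client.run_evaluation(
    examples=[Example(input="What is 2+2?", actual_output="4")],  # field names assumed
    scorers=[],  # ExampleAPIScorerConfig / ExampleScorer instances in real use
    project_name="demo",
    eval_run_name="smoke_test",
    assert_test=False,
)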
judgeval/api/__init__.py ADDED
@@ -0,0 +1,523 @@
+ from typing import Dict, Any, Mapping, Literal, Optional
+ import httpx
+ from httpx import Response
+ from judgeval.exceptions import JudgmentAPIError
+ from judgeval.utils.url import url_for
+ from judgeval.utils.serialize import json_encoder
+ from judgeval.api.api_types import *
+
+
+ def _headers(api_key: str, organization_id: str) -> Mapping[str, str]:
+     return {
+         "Content-Type": "application/json",
+         "Authorization": f"Bearer {api_key}",
+         "X-Organization-Id": organization_id,
+     }
+
+
+ def _handle_response(r: Response) -> Any:
+     if r.status_code >= 400:
+         try:
+             detail = r.json().get("detail", "")
+         except Exception:
+             detail = r.text
+         raise JudgmentAPIError(r.status_code, detail, r)
+     return r.json()
+
+
+ class JudgmentSyncClient:
+     __slots__ = ("api_key", "organization_id", "client")
+
+     def __init__(self, api_key: str, organization_id: str):
+         self.api_key = api_key
+         self.organization_id = organization_id
+         self.client = httpx.Client(timeout=30)
+
+     def _request(
+         self,
+         method: Literal["POST", "PATCH", "GET", "DELETE"],
+         url: str,
+         payload: Any,
+         params: Optional[Dict[str, Any]] = None,
+     ) -> Any:
+         if method == "GET":
+             r = self.client.request(
+                 method,
+                 url,
+                 params=payload if params is None else params,
+                 headers=_headers(self.api_key, self.organization_id),
+             )
+         else:
+             r = self.client.request(
+                 method,
+                 url,
+                 json=json_encoder(payload),
+                 params=params,
+                 headers=_headers(self.api_key, self.organization_id),
+             )
+         return _handle_response(r)
+
+     def add_to_run_eval_queue_examples(self, payload: ExampleEvaluationRun) -> Any:
+         return self._request(
+             "POST",
+             url_for("/add_to_run_eval_queue/examples"),
+             payload,
+         )
+
+     def add_to_run_eval_queue_traces(self, payload: TraceEvaluationRun) -> Any:
+         return self._request(
+             "POST",
+             url_for("/add_to_run_eval_queue/traces"),
+             payload,
+         )
+
+     def evaluate_examples(
+         self, payload: ExampleEvaluationRun, stream: Optional[str] = None
+     ) -> EvaluateResponse:
+         query_params = {}
+         if stream is not None:
+             query_params["stream"] = stream
+         return self._request(
+             "POST",
+             url_for("/evaluate/examples"),
+             payload,
+             params=query_params,
+         )
+
+     def evaluate_traces(
+         self, payload: TraceEvaluationRun, stream: Optional[str] = None
+     ) -> EvaluateResponse:
+         query_params = {}
+         if stream is not None:
+             query_params["stream"] = stream
+         return self._request(
+             "POST",
+             url_for("/evaluate/traces"),
+             payload,
+             params=query_params,
+         )
+
+     def log_eval_results(self, payload: EvalResults) -> LogEvalResultsResponse:
+         return self._request(
+             "POST",
+             url_for("/log_eval_results/"),
+             payload,
+         )
+
+     def fetch_experiment_run(
+         self, payload: EvalResultsFetch
+     ) -> FetchExperimentRunResponse:
+         return self._request(
+             "POST",
+             url_for("/fetch_experiment_run/"),
+             payload,
+         )
+
+     def datasets_insert_examples_for_judgeval(
+         self, payload: DatasetInsertExamples
+     ) -> Any:
+         return self._request(
+             "POST",
+             url_for("/datasets/insert_examples_for_judgeval/"),
+             payload,
+         )
+
+     def datasets_pull_for_judgeval(self, payload: DatasetFetch) -> DatasetReturn:
+         return self._request(
+             "POST",
+             url_for("/datasets/pull_for_judgeval/"),
+             payload,
+         )
+
+     def datasets_pull_all_for_judgeval(self, payload: DatasetsFetch) -> Any:
+         return self._request(
+             "POST",
+             url_for("/datasets/pull_all_for_judgeval/"),
+             payload,
+         )
+
+     def datasets_create_for_judgeval(self, payload: DatasetCreate) -> Any:
+         return self._request(
+             "POST",
+             url_for("/datasets/create_for_judgeval/"),
+             payload,
+         )
+
+     def projects_add(self, payload: ProjectAdd) -> ProjectAddResponse:
+         return self._request(
+             "POST",
+             url_for("/projects/add/"),
+             payload,
+         )
+
+     def projects_delete_from_judgeval(
+         self, payload: ProjectDeleteFromJudgevalResponse
+     ) -> ProjectDeleteResponse:
+         return self._request(
+             "DELETE",
+             url_for("/projects/delete_from_judgeval/"),
+             payload,
+         )
+
+     def scorer_exists(self, payload: ScorerExistsRequest) -> ScorerExistsResponse:
+         return self._request(
+             "POST",
+             url_for("/scorer_exists/"),
+             payload,
+         )
+
+     def save_scorer(self, payload: SavePromptScorerRequest) -> SavePromptScorerResponse:
+         return self._request(
+             "POST",
+             url_for("/save_scorer/"),
+             payload,
+         )
+
+     def fetch_scorers(
+         self, payload: FetchPromptScorersRequest
+     ) -> FetchPromptScorersResponse:
+         return self._request(
+             "POST",
+             url_for("/fetch_scorers/"),
+             payload,
+         )
+
+     def upload_custom_scorer(
+         self, payload: CustomScorerUploadPayload
+     ) -> CustomScorerTemplateResponse:
+         return self._request(
+             "POST",
+             url_for("/upload_custom_scorer/"),
+             payload,
+         )
+
+     def prompts_insert(self, payload: PromptInsertRequest) -> PromptInsertResponse:
+         return self._request(
+             "POST",
+             url_for("/prompts/insert/"),
+             payload,
+         )
+
+     def prompts_tag(self, payload: PromptTagRequest) -> PromptTagResponse:
+         return self._request(
+             "POST",
+             url_for("/prompts/tag/"),
+             payload,
+         )
+
+     def prompts_untag(self, payload: PromptUntagRequest) -> PromptUntagResponse:
+         return self._request(
+             "POST",
+             url_for("/prompts/untag/"),
+             payload,
+         )
+
+     def prompts_fetch(
+         self,
+         project_id: str,
+         name: str,
+         commit_id: Optional[str] = None,
+         tag: Optional[str] = None,
+     ) -> PromptFetchResponse:
+         query_params = {}
+         query_params["project_id"] = project_id
+         query_params["name"] = name
+         if commit_id is not None:
+             query_params["commit_id"] = commit_id
+         if tag is not None:
+             query_params["tag"] = tag
+         return self._request(
+             "GET",
+             url_for("/prompts/fetch/"),
+             query_params,
+         )
+
+     def prompts_get_prompt_versions(
+         self, project_id: str, name: str
+     ) -> PromptVersionsResponse:
+         query_params = {}
+         query_params["project_id"] = project_id
+         query_params["name"] = name
+         return self._request(
+             "GET",
+             url_for("/prompts/get_prompt_versions/"),
+             query_params,
+         )
+
+     def projects_resolve(
+         self, payload: ResolveProjectNameRequest
+     ) -> ResolveProjectNameResponse:
+         return self._request(
+             "POST",
+             url_for("/projects/resolve/"),
+             payload,
+         )
+
+     def e2e_fetch_trace(self, payload: TraceIdRequest) -> Any:
+         return self._request(
+             "POST",
+             url_for("/e2e_fetch_trace/"),
+             payload,
+         )
+
+     def e2e_fetch_span_score(self, payload: SpanScoreRequest) -> Any:
+         return self._request(
+             "POST",
+             url_for("/e2e_fetch_span_score/"),
+             payload,
+         )
+
+
+ class JudgmentAsyncClient:
+     __slots__ = ("api_key", "organization_id", "client")
+
+     def __init__(self, api_key: str, organization_id: str):
+         self.api_key = api_key
+         self.organization_id = organization_id
+         self.client = httpx.AsyncClient(timeout=30)
+
+     async def _request(
+         self,
+         method: Literal["POST", "PATCH", "GET", "DELETE"],
+         url: str,
+         payload: Any,
+         params: Optional[Dict[str, Any]] = None,
+     ) -> Any:
+         if method == "GET":
+             r = self.client.request(
+                 method,
+                 url,
+                 params=payload if params is None else params,
+                 headers=_headers(self.api_key, self.organization_id),
+             )
+         else:
+             r = self.client.request(
+                 method,
+                 url,
+                 json=json_encoder(payload),
+                 params=params,
+                 headers=_headers(self.api_key, self.organization_id),
+             )
+         return _handle_response(await r)
+
+     async def add_to_run_eval_queue_examples(
+         self, payload: ExampleEvaluationRun
+     ) -> Any:
+         return await self._request(
+             "POST",
+             url_for("/add_to_run_eval_queue/examples"),
+             payload,
+         )
+
+     async def add_to_run_eval_queue_traces(self, payload: TraceEvaluationRun) -> Any:
+         return await self._request(
+             "POST",
+             url_for("/add_to_run_eval_queue/traces"),
+             payload,
+         )
+
+     async def evaluate_examples(
+         self, payload: ExampleEvaluationRun, stream: Optional[str] = None
+     ) -> EvaluateResponse:
+         query_params = {}
+         if stream is not None:
+             query_params["stream"] = stream
+         return await self._request(
+             "POST",
+             url_for("/evaluate/examples"),
+             payload,
+             params=query_params,
+         )
+
+     async def evaluate_traces(
+         self, payload: TraceEvaluationRun, stream: Optional[str] = None
+     ) -> EvaluateResponse:
+         query_params = {}
+         if stream is not None:
+             query_params["stream"] = stream
+         return await self._request(
+             "POST",
+             url_for("/evaluate/traces"),
+             payload,
+             params=query_params,
+         )
+
+     async def log_eval_results(self, payload: EvalResults) -> LogEvalResultsResponse:
+         return await self._request(
+             "POST",
+             url_for("/log_eval_results/"),
+             payload,
+         )
+
+     async def fetch_experiment_run(
+         self, payload: EvalResultsFetch
+     ) -> FetchExperimentRunResponse:
+         return await self._request(
+             "POST",
+             url_for("/fetch_experiment_run/"),
+             payload,
+         )
+
+     async def datasets_insert_examples_for_judgeval(
+         self, payload: DatasetInsertExamples
+     ) -> Any:
+         return await self._request(
+             "POST",
+             url_for("/datasets/insert_examples_for_judgeval/"),
+             payload,
+         )
+
+     async def datasets_pull_for_judgeval(self, payload: DatasetFetch) -> DatasetReturn:
+         return await self._request(
+             "POST",
+             url_for("/datasets/pull_for_judgeval/"),
+             payload,
+         )
+
+     async def datasets_pull_all_for_judgeval(self, payload: DatasetsFetch) -> Any:
+         return await self._request(
+             "POST",
+             url_for("/datasets/pull_all_for_judgeval/"),
+             payload,
+         )
+
+     async def datasets_create_for_judgeval(self, payload: DatasetCreate) -> Any:
+         return await self._request(
+             "POST",
+             url_for("/datasets/create_for_judgeval/"),
+             payload,
+         )
+
+     async def projects_add(self, payload: ProjectAdd) -> ProjectAddResponse:
+         return await self._request(
+             "POST",
+             url_for("/projects/add/"),
+             payload,
+         )
+
+     async def projects_delete_from_judgeval(
+         self, payload: ProjectDeleteFromJudgevalResponse
+     ) -> ProjectDeleteResponse:
+         return await self._request(
+             "DELETE",
+             url_for("/projects/delete_from_judgeval/"),
+             payload,
+         )
+
+     async def scorer_exists(self, payload: ScorerExistsRequest) -> ScorerExistsResponse:
+         return await self._request(
+             "POST",
+             url_for("/scorer_exists/"),
+             payload,
+         )
+
+     async def save_scorer(
+         self, payload: SavePromptScorerRequest
+     ) -> SavePromptScorerResponse:
+         return await self._request(
+             "POST",
+             url_for("/save_scorer/"),
+             payload,
+         )
+
+     async def fetch_scorers(
+         self, payload: FetchPromptScorersRequest
+     ) -> FetchPromptScorersResponse:
+         return await self._request(
+             "POST",
+             url_for("/fetch_scorers/"),
+             payload,
+         )
+
+     async def upload_custom_scorer(
+         self, payload: CustomScorerUploadPayload
+     ) -> CustomScorerTemplateResponse:
+         return await self._request(
+             "POST",
+             url_for("/upload_custom_scorer/"),
+             payload,
+         )
+
+     async def prompts_insert(
+         self, payload: PromptInsertRequest
+     ) -> PromptInsertResponse:
+         return await self._request(
+             "POST",
+             url_for("/prompts/insert/"),
+             payload,
+         )
+
+     async def prompts_tag(self, payload: PromptTagRequest) -> PromptTagResponse:
+         return await self._request(
+             "POST",
+             url_for("/prompts/tag/"),
+             payload,
+         )
+
+     async def prompts_untag(self, payload: PromptUntagRequest) -> PromptUntagResponse:
+         return await self._request(
+             "POST",
+             url_for("/prompts/untag/"),
+             payload,
+         )
+
+     async def prompts_fetch(
+         self,
+         project_id: str,
+         name: str,
+         commit_id: Optional[str] = None,
+         tag: Optional[str] = None,
+     ) -> PromptFetchResponse:
+         query_params = {}
+         query_params["project_id"] = project_id
+         query_params["name"] = name
+         if commit_id is not None:
+             query_params["commit_id"] = commit_id
+         if tag is not None:
+             query_params["tag"] = tag
+         return await self._request(
+             "GET",
+             url_for("/prompts/fetch/"),
+             query_params,
+         )
+
+     async def prompts_get_prompt_versions(
+         self, project_id: str, name: str
+     ) -> PromptVersionsResponse:
+         query_params = {}
+         query_params["project_id"] = project_id
+         query_params["name"] = name
+         return await self._request(
+             "GET",
+             url_for("/prompts/get_prompt_versions/"),
+             query_params,
+         )
+
+     async def projects_resolve(
+         self, payload: ResolveProjectNameRequest
+     ) -> ResolveProjectNameResponse:
+         return await self._request(
+             "POST",
+             url_for("/projects/resolve/"),
+             payload,
+         )
+
+     async def e2e_fetch_trace(self, payload: TraceIdRequest) -> Any:
+         return await self._request(
+             "POST",
+             url_for("/e2e_fetch_trace/"),
+             payload,
+         )
+
+     async def e2e_fetch_span_score(self, payload: SpanScoreRequest) -> Any:
+         return await self._request(
+             "POST",
+             url_for("/e2e_fetch_span_score/"),
+             payload,
+         )
+
+
+ __all__ = [
+     "JudgmentSyncClient",
+     "JudgmentAsyncClient",
+ ]
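A minimal sketch of calling these generated clients directly. The constructors and method signatures come from this diff; the credential strings, project ID, and payload values are placeholders, and the payload dict mirrors the upload_custom_scorer call made by JudgmentClient above (the full request/response types live in judgeval/api/api_types.py, not reproduced in this hunk).

import asyncio
from judgeval.api import JudgmentSyncClient, JudgmentAsyncClient

sync_client = JudgmentSyncClient(api_key="sk-...", organization_id="org-...")

# POST /upload_custom_scorer/; a non-2xx response raises
# JudgmentAPIError(status_code, detail, response) via _handle_response.
resp = sync_client.upload_custom_scorer(
    payload={
        "scorer_name": "my_scorer",  # placeholder
        "scorer_code": "...",        # scorer source text
        "requirements_text": "",
        "overwrite": False,
    }
)

async def main() -> None:
    async_client = JudgmentAsyncClient(api_key="sk-...", organization_id="org-...")
    # GET /prompts/get_prompt_versions/ with project_id/name sent as query params.
    versions = await async_client.prompts_get_prompt_versions(
        project_id="proj_123",  # placeholder
        name="my_prompt",
    )

asyncio.run(main())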