judgeval 0.0.11__py3-none-any.whl → 0.22.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of judgeval might be problematic.

Files changed (171)
  1. judgeval/__init__.py +177 -12
  2. judgeval/api/__init__.py +519 -0
  3. judgeval/api/api_types.py +407 -0
  4. judgeval/cli.py +79 -0
  5. judgeval/constants.py +76 -47
  6. judgeval/data/__init__.py +3 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +15 -56
  9. judgeval/data/judgment_types.py +450 -0
  10. judgeval/data/result.py +29 -73
  11. judgeval/data/scorer_data.py +29 -62
  12. judgeval/data/scripts/fix_default_factory.py +23 -0
  13. judgeval/data/scripts/openapi_transform.py +123 -0
  14. judgeval/data/trace.py +121 -0
  15. judgeval/dataset/__init__.py +264 -0
  16. judgeval/env.py +52 -0
  17. judgeval/evaluation/__init__.py +344 -0
  18. judgeval/exceptions.py +27 -0
  19. judgeval/integrations/langgraph/__init__.py +13 -0
  20. judgeval/integrations/openlit/__init__.py +50 -0
  21. judgeval/judges/__init__.py +2 -3
  22. judgeval/judges/base_judge.py +2 -3
  23. judgeval/judges/litellm_judge.py +100 -20
  24. judgeval/judges/together_judge.py +101 -20
  25. judgeval/judges/utils.py +20 -24
  26. judgeval/logger.py +62 -0
  27. judgeval/prompt/__init__.py +330 -0
  28. judgeval/scorers/__init__.py +18 -25
  29. judgeval/scorers/agent_scorer.py +17 -0
  30. judgeval/scorers/api_scorer.py +45 -41
  31. judgeval/scorers/base_scorer.py +83 -38
  32. judgeval/scorers/example_scorer.py +17 -0
  33. judgeval/scorers/exceptions.py +1 -0
  34. judgeval/scorers/judgeval_scorers/__init__.py +0 -148
  35. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +19 -17
  36. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +13 -19
  37. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +12 -19
  38. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +13 -19
  39. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +15 -0
  40. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +327 -0
  41. judgeval/scorers/score.py +77 -306
  42. judgeval/scorers/utils.py +4 -199
  43. judgeval/tracer/__init__.py +1122 -2
  44. judgeval/tracer/constants.py +1 -0
  45. judgeval/tracer/exporters/__init__.py +40 -0
  46. judgeval/tracer/exporters/s3.py +119 -0
  47. judgeval/tracer/exporters/store.py +59 -0
  48. judgeval/tracer/exporters/utils.py +32 -0
  49. judgeval/tracer/keys.py +63 -0
  50. judgeval/tracer/llm/__init__.py +7 -0
  51. judgeval/tracer/llm/config.py +78 -0
  52. judgeval/tracer/llm/constants.py +9 -0
  53. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  54. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  55. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  56. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  57. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  58. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  59. judgeval/tracer/llm/llm_google/config.py +6 -0
  60. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  61. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  62. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  63. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  64. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  65. judgeval/tracer/llm/llm_openai/config.py +6 -0
  66. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  67. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  68. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  69. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  70. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  71. judgeval/tracer/llm/llm_together/config.py +6 -0
  72. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  73. judgeval/tracer/llm/providers.py +19 -0
  74. judgeval/tracer/managers.py +167 -0
  75. judgeval/tracer/processors/__init__.py +220 -0
  76. judgeval/tracer/utils.py +19 -0
  77. judgeval/trainer/__init__.py +14 -0
  78. judgeval/trainer/base_trainer.py +122 -0
  79. judgeval/trainer/config.py +128 -0
  80. judgeval/trainer/console.py +144 -0
  81. judgeval/trainer/fireworks_trainer.py +396 -0
  82. judgeval/trainer/trainable_model.py +243 -0
  83. judgeval/trainer/trainer.py +70 -0
  84. judgeval/utils/async_utils.py +39 -0
  85. judgeval/utils/decorators/__init__.py +0 -0
  86. judgeval/utils/decorators/dont_throw.py +37 -0
  87. judgeval/utils/decorators/use_once.py +13 -0
  88. judgeval/utils/file_utils.py +97 -0
  89. judgeval/utils/guards.py +36 -0
  90. judgeval/utils/meta.py +27 -0
  91. judgeval/utils/project.py +15 -0
  92. judgeval/utils/serialize.py +253 -0
  93. judgeval/utils/testing.py +70 -0
  94. judgeval/utils/url.py +10 -0
  95. judgeval/utils/version_check.py +28 -0
  96. judgeval/utils/wrappers/README.md +3 -0
  97. judgeval/utils/wrappers/__init__.py +15 -0
  98. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  99. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  100. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  101. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  102. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  103. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  104. judgeval/utils/wrappers/py.typed +0 -0
  105. judgeval/utils/wrappers/utils.py +35 -0
  106. judgeval/version.py +5 -0
  107. judgeval/warnings.py +4 -0
  108. judgeval-0.22.2.dist-info/METADATA +265 -0
  109. judgeval-0.22.2.dist-info/RECORD +112 -0
  110. judgeval-0.22.2.dist-info/entry_points.txt +2 -0
  111. judgeval/clients.py +0 -39
  112. judgeval/common/__init__.py +0 -8
  113. judgeval/common/exceptions.py +0 -28
  114. judgeval/common/logger.py +0 -189
  115. judgeval/common/tracer.py +0 -798
  116. judgeval/common/utils.py +0 -763
  117. judgeval/data/api_example.py +0 -111
  118. judgeval/data/datasets/__init__.py +0 -5
  119. judgeval/data/datasets/dataset.py +0 -286
  120. judgeval/data/datasets/eval_dataset_client.py +0 -193
  121. judgeval/data/datasets/ground_truth.py +0 -54
  122. judgeval/data/datasets/utils.py +0 -74
  123. judgeval/evaluation_run.py +0 -132
  124. judgeval/judges/mixture_of_judges.py +0 -248
  125. judgeval/judgment_client.py +0 -354
  126. judgeval/run_evaluation.py +0 -439
  127. judgeval/scorers/judgeval_scorer.py +0 -140
  128. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -19
  129. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -19
  130. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -22
  131. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -19
  132. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -32
  133. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -20
  134. judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -19
  135. judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
  136. judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
  137. judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -54
  138. judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -24
  139. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
  140. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -277
  141. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
  142. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
  143. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
  144. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
  145. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
  146. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
  147. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
  148. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
  149. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
  150. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
  151. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
  152. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
  153. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
  154. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
  155. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -325
  156. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
  157. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
  158. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -263
  159. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
  160. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
  161. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
  162. judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
  163. judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
  164. judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -550
  165. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -3
  166. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -157
  167. judgeval/scorers/prompt_scorer.py +0 -439
  168. judgeval-0.0.11.dist-info/METADATA +0 -36
  169. judgeval-0.0.11.dist-info/RECORD +0 -84
  170. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
  171. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0
judgeval/dataset/__init__.py ADDED
@@ -0,0 +1,264 @@
+ import datetime
+ import orjson
+ import os
+ import yaml
+ from dataclasses import dataclass
+ from typing import List, Literal, Optional
+
+ from judgeval.data import Example
+ from judgeval.data.trace import Trace
+ from judgeval.utils.file_utils import get_examples_from_yaml, get_examples_from_json
+ from judgeval.api import JudgmentSyncClient
+ from judgeval.logger import judgeval_logger
+ from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID
+
+ from judgeval.data.judgment_types import DatasetKind
+
+
+ @dataclass
+ class DatasetInfo:
+     dataset_id: str
+     name: str
+     created_at: str
+     kind: DatasetKind
+     entries: int
+     creator: str
+
+
+ @dataclass
+ class Dataset:
+     name: str
+     project_name: str
+     dataset_kind: DatasetKind = DatasetKind.example
+     examples: Optional[List[Example]] = None
+     traces: Optional[List[Trace]] = None
+     judgment_api_key: str | None = JUDGMENT_API_KEY
+     organization_id: str | None = JUDGMENT_ORG_ID
+
+     @classmethod
+     def get(
+         cls,
+         name: str,
+         project_name: str,
+     ):
+         if not cls.judgment_api_key or not cls.organization_id:
+             raise ValueError("Judgment API key and organization ID are required")
+         client = JudgmentSyncClient(cls.judgment_api_key, cls.organization_id)
+         dataset = client.datasets_pull_for_judgeval(
+             {
+                 "dataset_name": name,
+                 "project_name": project_name,
+             },
+         )
+         if not dataset:
+             raise ValueError(f"Dataset {name} not found in project {project_name}")
+
+         dataset_kind = DatasetKind(dataset.get("dataset_kind", "example"))
+
+         if dataset_kind == DatasetKind.example:
+             examples = dataset.get("examples", [])
+             if examples is None:
+                 examples = []
+
+             for e in examples:
+                 if isinstance(e, dict) and isinstance(e.get("data", {}), dict):
+                     e.update(e.pop("data"))  # type: ignore
+                     e.pop(
+                         "example_id"
+                     )  # TODO: remove once scorer data migration is complete
+             judgeval_logger.info(f"Successfully retrieved example dataset {name}!")
+             return cls(
+                 name=name,
+                 project_name=project_name,
+                 dataset_kind=dataset_kind,
+                 examples=[Example(**e) for e in examples],
+             )
+
+         elif dataset_kind == DatasetKind.trace:
+             trace_data = dataset.get("traces", [])
+             if trace_data is None:
+                 trace_data = []
+
+             traces = []
+             for trace_item in trace_data:
+                 if isinstance(trace_item, dict):
+                     trace = Trace.from_dataset_trace_with_spans(trace_item)
+                     traces.append(trace)
+
+             judgeval_logger.info(f"Successfully retrieved trace dataset {name}!")
+             return cls(
+                 name=name,
+                 project_name=project_name,
+                 dataset_kind=dataset_kind,
+                 traces=traces,
+             )
+
+         else:
+             raise ValueError(f"Unsupported dataset kind: {dataset_kind}")
+
+     @classmethod
+     def create(
+         cls,
+         name: str,
+         project_name: str,
+         examples: List[Example] = [],
+         overwrite: bool = False,
+     ):
+         if not cls.judgment_api_key or not cls.organization_id:
+             raise ValueError("Judgment API key and organization ID are required")
+         if not examples:
+             examples = []
+
+         client = JudgmentSyncClient(cls.judgment_api_key, cls.organization_id)
+         client.datasets_create_for_judgeval(
+             {
+                 "name": name,
+                 "project_name": project_name,
+                 "examples": examples,  # type: ignore
+                 "dataset_kind": "example",
+                 "overwrite": overwrite,
+             }
+         )
+
+         judgeval_logger.info(f"Successfully created dataset {name}!")
+         return cls(
+             name=name,
+             project_name=project_name,
+             examples=examples,
+         )
+
+     @classmethod
+     def list(cls, project_name: str):
+         if not cls.judgment_api_key or not cls.organization_id:
+             raise ValueError("Judgment API key and organization ID are required")
+         client = JudgmentSyncClient(cls.judgment_api_key, cls.organization_id)
+         datasets = client.datasets_pull_all_for_judgeval({"project_name": project_name})
+
+         judgeval_logger.info(f"Fetched all datasets for project {project_name}!")
+
+         return [DatasetInfo(**dataset_info) for dataset_info in datasets]
+
+     def add_from_json(self, file_path: str) -> None:
+         """
+         Adds examples from a JSON file.
+
+         The JSON file is expected to have the following format:
+         [
+             {
+                 "key_01": "value_01",
+                 "key_02": "value_02"
+             },
+             {
+                 "key_11": "value_11",
+                 "key_12": "value_12",
+                 "key_13": "value_13"
+             },
+             ...
+         ]
+         """
+         examples = get_examples_from_json(file_path)
+         self.add_examples(examples)
+
+     def add_from_yaml(self, file_path: str) -> None:
+         """
+         Adds examples from a YAML file.
+
+         The YAML file is expected to have the following format:
+         - key_01: value_01
+           key_02: value_02
+         - key_11: value_11
+           key_12: value_12
+           key_13: value_13
+         ...
+         """
+
+         examples = get_examples_from_yaml(file_path)
+         self.add_examples(examples)
+
+     def add_examples(self, examples: List[Example]) -> None:
+         if not isinstance(examples, list):
+             raise TypeError("examples must be a list")
+
+         if not self.judgment_api_key or not self.organization_id:
+             raise ValueError("Judgment API key and organization ID are required")
+
+         client = JudgmentSyncClient(self.judgment_api_key, self.organization_id)
+         client.datasets_insert_examples_for_judgeval(
+             {
+                 "dataset_name": self.name,
+                 "project_name": self.project_name,
+                 "examples": examples,  # type: ignore
+             }
+         )
+
+     def save_as(
+         self,
+         file_type: Literal["json", "yaml"],
+         dir_path: str,
+         save_name: str | None = None,
+     ) -> None:
+         """
+         Saves the dataset as a file. Saves only the examples.
+
+         Args:
+             file_type (Literal["json", "yaml"]): The file type to save the dataset as.
+             dir_path (str): The directory path to save the file to.
+             save_name (str, optional): The name of the file to save. Defaults to None.
+         """
+         if not os.path.exists(dir_path):
+             os.makedirs(dir_path)
+         file_name = (
+             datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+             if save_name is None
+             else save_name
+         )
+         complete_path = os.path.join(dir_path, f"{file_name}.{file_type}")
+         if file_type == "json":
+             with open(complete_path, "wb") as file:
+                 file.write(
+                     orjson.dumps(
+                         {
+                             "examples": [e.to_dict() for e in self.examples]
+                             if self.examples
+                             else [],
+                         },
+                         option=orjson.OPT_INDENT_2,
+                     )
+                 )
+         elif file_type == "yaml":
+             with open(complete_path, "w") as file:
+                 yaml_data = {
+                     "examples": [e.to_dict() for e in self.examples]
+                     if self.examples
+                     else [],
+                 }
+                 yaml.dump(yaml_data, file, default_flow_style=False)
+         else:
+             ACCEPTABLE_FILE_TYPES = ["json", "yaml"]
+             raise TypeError(
+                 f"Invalid file type: {file_type}. Please choose from {ACCEPTABLE_FILE_TYPES}"
+             )
+
+     def __iter__(self):
+         if self.dataset_kind == DatasetKind.example and self.examples:
+             return iter(self.examples)
+         elif self.dataset_kind == DatasetKind.trace and self.traces:
+             return iter(self.traces)
+         else:
+             return iter([])
+
+     def __len__(self):
+         if self.dataset_kind == DatasetKind.example and self.examples:
+             return len(self.examples)
+         elif self.dataset_kind == DatasetKind.trace and self.traces:
+             return len(self.traces)
+         else:
+             return 0
+
+     def __str__(self):
+         if self.dataset_kind == DatasetKind.example:
+             return (
+                 f"{self.__class__.__name__}(examples={self.examples}, name={self.name})"
+             )
+         else:
+             return f"{self.__class__.__name__}(traces={self.traces}, name={self.name})"
judgeval/env.py ADDED
@@ -0,0 +1,52 @@
+ from __future__ import annotations
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ import os
+ from typing import overload
+
+
+ @overload
+ def optional_env_var(var_name: str) -> str | None: ...
+
+
+ @overload
+ def optional_env_var(var_name: str, default: str) -> str: ...
+
+
+ def optional_env_var(var_name: str, default: str | None = None) -> str | None:
+     return os.getenv(var_name, default)
+
+
+ JUDGMENT_API_KEY = optional_env_var("JUDGMENT_API_KEY")
+ JUDGMENT_ORG_ID = optional_env_var("JUDGMENT_ORG_ID")
+ JUDGMENT_API_URL = optional_env_var("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
+
+ JUDGMENT_DEFAULT_GPT_MODEL = optional_env_var("JUDGMENT_DEFAULT_GPT_MODEL", "gpt-5")
+ JUDGMENT_DEFAULT_TOGETHER_MODEL = optional_env_var(
+     "JUDGMENT_DEFAULT_TOGETHER_MODEL", "meta-llama/Meta-Llama-3-8B-Instruct-Lite"
+ )
+ JUDGMENT_MAX_CONCURRENT_EVALUATIONS = int(
+     optional_env_var("JUDGMENT_MAX_CONCURRENT_EVALUATIONS", "10")
+ )
+
+
+ JUDGMENT_ENABLE_MONITORING = optional_env_var("JUDGMENT_ENABLE_MONITORING", "true")
+ JUDGMENT_ENABLE_EVALUATIONS = optional_env_var("JUDGMENT_ENABLE_EVALUATIONS", "true")
+
+ JUDGMENT_S3_ACCESS_KEY_ID = optional_env_var("JUDGMENT_S3_ACCESS_KEY_ID")
+ JUDGMENT_S3_SECRET_ACCESS_KEY = optional_env_var("JUDGMENT_S3_SECRET_ACCESS_KEY")
+ JUDGMENT_S3_REGION_NAME = optional_env_var("JUDGMENT_S3_REGION_NAME")
+ JUDGMENT_S3_BUCKET_NAME = optional_env_var("JUDGMENT_S3_BUCKET_NAME")
+ JUDGMENT_S3_PREFIX = optional_env_var("JUDGMENT_S3_PREFIX", "spans/")
+ JUDGMENT_S3_ENDPOINT_URL = optional_env_var("JUDGMENT_S3_ENDPOINT_URL")
+ JUDGMENT_S3_SIGNATURE_VERSION = optional_env_var("JUDGMENT_S3_SIGNATURE_VERSION", "s3")
+ JUDGMENT_S3_ADDRESSING_STYLE = optional_env_var("JUDGMENT_S3_ADDRESSING_STYLE", "auto")
+
+
+ JUDGMENT_NO_COLOR = optional_env_var("JUDGMENT_NO_COLOR")
+
+
+ TOGETHERAI_API_KEY = optional_env_var("TOGETHERAI_API_KEY")
+ TOGETHER_API_KEY = optional_env_var("TOGETHER_API_KEY")
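judgeval/env.py resolves all of these settings once at import time (after load_dotenv()), so overrides have to be in place before the module is imported. A short sketch under that assumption; the key values below are placeholders, not real credentials:

import os

# Placeholder credentials; real values would come from your environment or a .env file.
os.environ.setdefault("JUDGMENT_API_KEY", "placeholder-key")
os.environ.setdefault("JUDGMENT_ORG_ID", "placeholder-org")
os.environ.setdefault("JUDGMENT_DEFAULT_GPT_MODEL", "gpt-4o")  # override the default judge model

from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_API_URL, JUDGMENT_DEFAULT_GPT_MODEL

print(JUDGMENT_API_URL)            # falls back to https://api.judgmentlabs.ai
print(JUDGMENT_DEFAULT_GPT_MODEL)  # picks up the override above
print(JUDGMENT_API_KEY is not None)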
judgeval/evaluation/__init__.py ADDED
@@ -0,0 +1,344 @@
+ from __future__ import annotations
+
+ import asyncio
+ import concurrent.futures
+ import time
+ import threading
+ from typing import List, Tuple, TYPE_CHECKING
+ from rich import print as rprint
+
+ from judgeval.data import ScorerData, ScoringResult
+ from judgeval.scorers.score import a_execute_scoring
+ from judgeval.api import JudgmentSyncClient
+ from judgeval.env import (
+     JUDGMENT_MAX_CONCURRENT_EVALUATIONS,
+ )
+ from judgeval.exceptions import JudgmentAPIError, JudgmentRuntimeError
+ from judgeval.logger import judgeval_logger
+
+ from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID
+
+ if TYPE_CHECKING:
+     from judgeval.data.evaluation_run import ExampleEvaluationRun
+
+
+ def safe_run_async(coro):
+     """
+     Safely run an async coroutine whether or not there's already an event loop running.
+
+     Args:
+         coro: The coroutine to run
+
+     Returns:
+         The result of the coroutine
+     """
+     try:
+         # Try to get the running loop
+         asyncio.get_running_loop()
+         # If we get here, there's already a loop running.
+         # Run in a separate thread to avoid "asyncio.run() cannot be called from a running event loop"
+         with concurrent.futures.ThreadPoolExecutor() as executor:
+             future = executor.submit(asyncio.run, coro)
+             return future.result()
+     except RuntimeError:
+         # No event loop is running, safe to use asyncio.run()
+         return asyncio.run(coro)
+
+
+ def log_evaluation_results(
+     scoring_results: List[ScoringResult],
+     run: ExampleEvaluationRun,
+ ) -> str:
+     """
+     Logs evaluation results to the Judgment API database.
+
+     Args:
+         scoring_results (List[ScoringResult]): The results to log
+         run (ExampleEvaluationRun): The evaluation run containing project info
+
+     Raises:
+         JudgmentAPIError: If there's an API error during logging
+         ValueError: If there's a validation error with the results
+     """
+     try:
+         if not JUDGMENT_API_KEY or not JUDGMENT_ORG_ID:
+             raise ValueError("API key and organization ID are required")
+
+         api_client = JudgmentSyncClient(JUDGMENT_API_KEY, JUDGMENT_ORG_ID)
+         response = api_client.log_eval_results(
+             {
+                 "results": scoring_results,  # type: ignore
+                 "run": run.model_dump(warnings=False),  # type: ignore
+             }
+         )
+         url = response.get("ui_results_url")
+         return url
+
+     except Exception as e:
+         judgeval_logger.error(f"Failed to save evaluation results to DB: {str(e)}")
+         raise JudgmentRuntimeError(
+             f"Request failed while saving evaluation results to DB: {str(e)}"
+         )
+
+
+ def _poll_evaluation_until_complete(
+     evaluation_run: ExampleEvaluationRun,
+     expected_examples_count: int,
+     poll_interval_seconds: float = 5,
+     max_failures: int = 5,
+     max_poll_count: int = 60,  # This should be equivalent to 5 minutes
+ ) -> Tuple[List[ScoringResult], str]:
+     """
+     Polls until the evaluation is complete and returns the results.
+
+     Args:
+         evaluation_run (ExampleEvaluationRun): The evaluation run to poll for
+         expected_examples_count (int): The number of example results expected
+         poll_interval_seconds (float, optional): Time between status checks in seconds. Defaults to 5.
+         max_failures (int, optional): Maximum consecutive errors tolerated before giving up. Defaults to 5.
+         max_poll_count (int, optional): Maximum number of status checks before giving up. Defaults to 60.
+
+     Returns:
+         Tuple[List[ScoringResult], str]: The evaluation results and the UI results URL
+     """
+     project_name = evaluation_run.project_name
+     experiment_run_id = evaluation_run.id
+
+     if not project_name or not experiment_run_id:
+         raise ValueError("Project name and experiment run ID are required")
+
+     poll_count = 0
+     exception_count = 0
+     if not JUDGMENT_API_KEY or not JUDGMENT_ORG_ID:
+         raise ValueError("Judgment API key and organization ID are required")
+     api_client = JudgmentSyncClient(JUDGMENT_API_KEY, JUDGMENT_ORG_ID)
+     while poll_count < max_poll_count:
+         poll_count += 1
+         try:
+             # Check status
+             results_response = api_client.fetch_experiment_run(
+                 {
+                     "experiment_run_id": experiment_run_id,
+                     "project_name": project_name,
+                 }
+             )
+
+             example_scorer_pairings = results_response.get("results", [])
+             if len(example_scorer_pairings) != expected_examples_count:
+                 time.sleep(poll_interval_seconds)
+                 continue
+
+             url = results_response.get("ui_results_url")
+
+             scoring_result_list = []
+             for res in example_scorer_pairings:
+                 example = res.get("data", {}).copy()
+                 example["example_id"] = res.get("example_id")
+                 scoring_result = ScoringResult(
+                     scorers_data=res.get("scorers", []),
+                     success=all(
+                         t.get("success", False) for t in res.get("scorers", [])
+                     ),
+                     data_object=example,
+                 )
+                 scoring_result_list.append(scoring_result)
+
+             return scoring_result_list, url
+         except Exception as e:
+             exception_count += 1
+             if isinstance(e, JudgmentAPIError):
+                 raise
+
+             judgeval_logger.error(f"Error checking evaluation status: {str(e)}")
+             if exception_count > max_failures:
+                 raise JudgmentRuntimeError(
+                     f"Error checking evaluation status after {poll_count} attempts: {str(e)}"
+                 )
+
+             time.sleep(poll_interval_seconds)
+
+     raise JudgmentRuntimeError(
+         f"Error checking evaluation status after {poll_count} attempts"
+     )
+
+
+ def progress_logger(stop_event, msg="Working...", interval=5):
+     start = time.time()
+     while not stop_event.is_set():
+         elapsed = int(time.time() - start)
+         judgeval_logger.info(f"{msg} ({elapsed} sec)")
+         stop_event.wait(interval)
+
+
+ def run_eval(
+     evaluation_run: ExampleEvaluationRun,
+ ) -> List[ScoringResult]:
+     """
+     Executes an evaluation of `Example`s using one or more `Scorer`s.
+
+     Args:
+         evaluation_run (ExampleEvaluationRun): Stores the examples and scorers together for running
+
+     Returns:
+         List[ScoringResult]: A list of ScoringResult objects
+     """
+     # Check that every example has the same keys
+     keys = evaluation_run.examples[0].get_fields().keys()
+     for example in evaluation_run.examples:
+         current_keys = example.get_fields().keys()
+         if current_keys != keys:
+             raise ValueError(
+                 f"All examples must have the same keys: {current_keys} != {keys}"
+             )
+
+     results: List[ScoringResult] = []
+     url = ""
+
+     if (
+         len(evaluation_run.custom_scorers) > 0
+         and len(evaluation_run.judgment_scorers) > 0
+     ):
+         error_msg = "We currently do not support running both local and Judgment API scorers at the same time. Please run your evaluation with either local scorers or Judgment API scorers, but not both."
+         judgeval_logger.error(error_msg)
+         raise ValueError(error_msg)
+
+     e2b_scorers = [cs for cs in evaluation_run.custom_scorers if cs.server_hosted]
+
+     if evaluation_run.judgment_scorers or e2b_scorers:
+         if evaluation_run.judgment_scorers and e2b_scorers:
+             error_msg = "We currently do not support running both hosted custom scorers and Judgment API scorers at the same time. Please run your evaluation with one or the other, but not both."
+             judgeval_logger.error(error_msg)
+             raise ValueError(error_msg)
+
+         if len(e2b_scorers) > 1:
+             error_msg = "We currently do not support running multiple hosted custom scorers at the same time."
+             judgeval_logger.error(error_msg)
+             raise ValueError(error_msg)
+
+         stop_event = threading.Event()
+         t = threading.Thread(
+             target=progress_logger, args=(stop_event, "Running evaluation...")
+         )
+         t.start()
+         try:
+             if not JUDGMENT_API_KEY or not JUDGMENT_ORG_ID:
+                 raise ValueError("Judgment API key and organization ID are required")
+             api_client = JudgmentSyncClient(JUDGMENT_API_KEY, JUDGMENT_ORG_ID)
+             response = api_client.add_to_run_eval_queue_examples(
+                 evaluation_run.model_dump(warnings=False)  # type: ignore
+             )
+
+             if not response.get("success", False):
+                 error_message = response.get("error")
+                 judgeval_logger.error(
+                     f"Error adding evaluation to queue: {error_message}"
+                 )
+                 raise JudgmentRuntimeError(error_message)
+
+             results, url = _poll_evaluation_until_complete(
+                 evaluation_run=evaluation_run,
+                 expected_examples_count=len(evaluation_run.examples),
+             )
+         finally:
+             stop_event.set()
+             t.join()
+     else:
+         results = safe_run_async(
+             a_execute_scoring(
+                 evaluation_run.examples,
+                 evaluation_run.custom_scorers,
+                 model=evaluation_run.model,
+                 throttle_value=0,
+                 max_concurrent=JUDGMENT_MAX_CONCURRENT_EVALUATIONS,
+             )
+         )
+
+         send_results = [
+             scoring_result.model_dump(warnings=False) for scoring_result in results
+         ]
+         url = log_evaluation_results(send_results, evaluation_run)
+     rprint(
+         f"\nšŸ” You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
+     )
+     return results
+
+
+ def assert_test(scoring_results: List[ScoringResult]) -> None:
+     """
+     Collects all failed scorers from the scoring results.
+
+     Args:
+         scoring_results (List[ScoringResult]): List of scoring results to check
+
+     Returns:
+         None. Raises exceptions for any failed test cases.
+     """
+     failed_cases: List[List[ScorerData]] = []
+
+     for result in scoring_results:
+         if not result.success:
+             # Create a test case context with all relevant fields
+             test_case: List[ScorerData] = []
+             if result.scorers_data:
+                 # If the result was not successful, check each scorer_data
+                 for scorer_data in result.scorers_data:
+                     if not scorer_data.success:
+                         test_case.append(scorer_data)
+             failed_cases.append(test_case)
+
+     if failed_cases:
+         error_msg = "The following test cases failed: \n"
+         for fail_case in failed_cases:
+             for fail_scorer in fail_case:
+                 error_msg += (
+                     f"\nScorer Name: {fail_scorer.name}\n"
+                     f"Threshold: {fail_scorer.threshold}\n"
+                     f"Success: {fail_scorer.success}\n"
+                     f"Score: {fail_scorer.score}\n"
+                     f"Reason: {fail_scorer.reason}\n"
+                     f"Strict Mode: {fail_scorer.strict_mode}\n"
+                     f"Evaluation Model: {fail_scorer.evaluation_model}\n"
+                     f"Error: {fail_scorer.error}\n"
+                     f"Additional Metadata: {fail_scorer.additional_metadata}\n"
+                 )
+             error_msg += "-" * 100
+
+     total_tests = len(scoring_results)
+     failed_tests = len(failed_cases)
+     passed_tests = total_tests - failed_tests
+
+     # Print summary with colors
+     rprint("\n" + "=" * 80)
+     if failed_tests == 0:
+         rprint(
+             f"[bold green]šŸŽ‰ ALL TESTS PASSED! {passed_tests}/{total_tests} tests successful[/bold green]"
+         )
+     else:
+         rprint(
+             f"[bold red]āš ļø TEST RESULTS: {passed_tests}/{total_tests} passed ({failed_tests} failed)[/bold red]"
+         )
+     rprint("=" * 80 + "\n")
+
+     # Print individual test cases
+     for i, result in enumerate(scoring_results):
+         test_num = i + 1
+         if result.success:
+             rprint(f"[green]āœ“ Test {test_num}: PASSED[/green]")
+         else:
+             rprint(f"[red]āœ— Test {test_num}: FAILED[/red]")
+             if result.scorers_data:
+                 for scorer_data in result.scorers_data:
+                     if not scorer_data.success:
+                         rprint(f" [yellow]Scorer: {scorer_data.name}[/yellow]")
+                         rprint(f" [red] Score: {scorer_data.score}[/red]")
+                         rprint(f" [red] Reason: {scorer_data.reason}[/red]")
+                         if scorer_data.error:
+                             rprint(f" [red] Error: {scorer_data.error}[/red]")
+                         rprint(" " + "-" * 40)
+
+     rprint("\n" + "=" * 80)
+     if failed_tests > 0:
+         raise AssertionError(failed_cases)
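The safe_run_async helper above is what lets run_eval execute local scorers from both plain scripts and code that is already inside an event loop (for example a notebook). A small, self-contained sketch of that behavior:

import asyncio

from judgeval.evaluation import safe_run_async


async def compute() -> int:
    await asyncio.sleep(0.1)
    return 42


# No loop is running here, so the helper simply calls asyncio.run().
print(safe_run_async(compute()))


async def main() -> None:
    # A loop is already running here, so the helper runs the coroutine
    # in a worker thread instead of raising
    # "asyncio.run() cannot be called from a running event loop".
    print(safe_run_async(compute()))


asyncio.run(main())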
judgeval/exceptions.py ADDED
@@ -0,0 +1,27 @@
+ from __future__ import annotations
+
+ from httpx import HTTPError, Response
+
+
+ class JudgmentAPIError(HTTPError):
+     status_code: int
+     detail: str
+     response: Response
+
+     def __init__(self, status_code: int, detail: str, response: Response):
+         self.status_code = status_code
+         self.detail = detail
+         self.response = response
+         super().__init__(f"{status_code}: {detail}")
+
+
+ class JudgmentTestError(Exception): ...
+
+
+ class JudgmentRuntimeError(RuntimeError): ...
+
+
+ class InvalidJudgeModelError(Exception): ...
+
+
+ __all__ = ("JudgmentAPIError", "JudgmentRuntimeError", "InvalidJudgeModelError")
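A hedged sketch of how callers are expected to handle these exception types; the failing call below is a stand-in, not an API from this release:

import httpx

from judgeval.exceptions import JudgmentAPIError, JudgmentRuntimeError


def flaky_call() -> None:
    # Stand-in for any judgeval operation that talks to the Judgment API.
    raise JudgmentAPIError(401, "invalid API key", httpx.Response(401))


try:
    flaky_call()
except JudgmentAPIError as e:
    # The HTTP status code and detail are preserved on the exception.
    print(f"API error {e.status_code}: {e.detail}")
except JudgmentRuntimeError as e:
    # Raised for client-side failures such as queueing or polling errors.
    print(f"Runtime failure: {e}")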
judgeval/integrations/langgraph/__init__.py ADDED
@@ -0,0 +1,13 @@
+ from __future__ import annotations
+
+ from abc import ABC
+ import os
+
+
+ class Langgraph(ABC):
+     @staticmethod
+     def initialize(otel_only: bool = True):
+         os.environ["LANGSMITH_OTEL_ENABLED"] = "true"
+         os.environ["LANGSMITH_TRACING"] = "true"
+         if otel_only:
+             os.environ["LANGSMITH_OTEL_ONLY"] = "true"