judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (234)
  1. judgeval/__init__.py +173 -10
  2. judgeval/api/__init__.py +523 -0
  3. judgeval/api/api_types.py +413 -0
  4. judgeval/cli.py +112 -0
  5. judgeval/constants.py +7 -30
  6. judgeval/data/__init__.py +1 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +14 -40
  9. judgeval/data/judgment_types.py +396 -146
  10. judgeval/data/result.py +11 -18
  11. judgeval/data/scorer_data.py +3 -26
  12. judgeval/data/scripts/openapi_transform.py +5 -5
  13. judgeval/data/trace.py +115 -194
  14. judgeval/dataset/__init__.py +335 -0
  15. judgeval/env.py +55 -0
  16. judgeval/evaluation/__init__.py +346 -0
  17. judgeval/exceptions.py +28 -0
  18. judgeval/integrations/langgraph/__init__.py +13 -0
  19. judgeval/integrations/openlit/__init__.py +51 -0
  20. judgeval/judges/__init__.py +2 -2
  21. judgeval/judges/litellm_judge.py +77 -16
  22. judgeval/judges/together_judge.py +88 -17
  23. judgeval/judges/utils.py +7 -20
  24. judgeval/judgment_attribute_keys.py +55 -0
  25. judgeval/{common/logger.py → logger.py} +24 -8
  26. judgeval/prompt/__init__.py +330 -0
  27. judgeval/scorers/__init__.py +11 -11
  28. judgeval/scorers/agent_scorer.py +15 -19
  29. judgeval/scorers/api_scorer.py +21 -23
  30. judgeval/scorers/base_scorer.py +54 -36
  31. judgeval/scorers/example_scorer.py +1 -3
  32. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -24
  33. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
  34. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
  35. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
  36. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
  37. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +171 -59
  38. judgeval/scorers/score.py +64 -47
  39. judgeval/scorers/utils.py +2 -107
  40. judgeval/tracer/__init__.py +1111 -2
  41. judgeval/tracer/constants.py +1 -0
  42. judgeval/tracer/exporters/__init__.py +40 -0
  43. judgeval/tracer/exporters/s3.py +119 -0
  44. judgeval/tracer/exporters/store.py +59 -0
  45. judgeval/tracer/exporters/utils.py +32 -0
  46. judgeval/tracer/keys.py +63 -0
  47. judgeval/tracer/llm/__init__.py +7 -0
  48. judgeval/tracer/llm/config.py +78 -0
  49. judgeval/tracer/llm/constants.py +9 -0
  50. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  51. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  52. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  53. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  54. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  55. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  56. judgeval/tracer/llm/llm_google/config.py +6 -0
  57. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  58. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  59. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  60. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  61. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  62. judgeval/tracer/llm/llm_openai/config.py +6 -0
  63. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  64. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  65. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  66. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  67. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  68. judgeval/tracer/llm/llm_together/config.py +6 -0
  69. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  70. judgeval/tracer/llm/providers.py +19 -0
  71. judgeval/tracer/managers.py +167 -0
  72. judgeval/tracer/processors/__init__.py +220 -0
  73. judgeval/tracer/utils.py +19 -0
  74. judgeval/trainer/__init__.py +14 -0
  75. judgeval/trainer/base_trainer.py +122 -0
  76. judgeval/trainer/config.py +123 -0
  77. judgeval/trainer/console.py +144 -0
  78. judgeval/trainer/fireworks_trainer.py +392 -0
  79. judgeval/trainer/trainable_model.py +252 -0
  80. judgeval/trainer/trainer.py +70 -0
  81. judgeval/utils/async_utils.py +39 -0
  82. judgeval/utils/decorators/__init__.py +0 -0
  83. judgeval/utils/decorators/dont_throw.py +37 -0
  84. judgeval/utils/decorators/use_once.py +13 -0
  85. judgeval/utils/file_utils.py +74 -28
  86. judgeval/utils/guards.py +36 -0
  87. judgeval/utils/meta.py +27 -0
  88. judgeval/utils/project.py +15 -0
  89. judgeval/utils/serialize.py +253 -0
  90. judgeval/utils/testing.py +70 -0
  91. judgeval/utils/url.py +10 -0
  92. judgeval/{version_check.py → utils/version_check.py} +5 -3
  93. judgeval/utils/wrappers/README.md +3 -0
  94. judgeval/utils/wrappers/__init__.py +15 -0
  95. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  96. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  97. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  98. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  99. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  100. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  101. judgeval/utils/wrappers/py.typed +0 -0
  102. judgeval/utils/wrappers/utils.py +35 -0
  103. judgeval/v1/__init__.py +88 -0
  104. judgeval/v1/data/__init__.py +7 -0
  105. judgeval/v1/data/example.py +44 -0
  106. judgeval/v1/data/scorer_data.py +42 -0
  107. judgeval/v1/data/scoring_result.py +44 -0
  108. judgeval/v1/datasets/__init__.py +6 -0
  109. judgeval/v1/datasets/dataset.py +214 -0
  110. judgeval/v1/datasets/dataset_factory.py +94 -0
  111. judgeval/v1/evaluation/__init__.py +6 -0
  112. judgeval/v1/evaluation/evaluation.py +182 -0
  113. judgeval/v1/evaluation/evaluation_factory.py +17 -0
  114. judgeval/v1/instrumentation/__init__.py +6 -0
  115. judgeval/v1/instrumentation/llm/__init__.py +7 -0
  116. judgeval/v1/instrumentation/llm/config.py +78 -0
  117. judgeval/v1/instrumentation/llm/constants.py +11 -0
  118. judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
  119. judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
  120. judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
  121. judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
  122. judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
  123. judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
  124. judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
  125. judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
  126. judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
  127. judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
  128. judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
  129. judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
  130. judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
  131. judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
  132. judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
  133. judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
  134. judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
  135. judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
  136. judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
  137. judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
  138. judgeval/v1/instrumentation/llm/providers.py +19 -0
  139. judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
  140. judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
  141. judgeval/v1/integrations/langgraph/__init__.py +13 -0
  142. judgeval/v1/integrations/openlit/__init__.py +47 -0
  143. judgeval/v1/internal/api/__init__.py +525 -0
  144. judgeval/v1/internal/api/api_types.py +413 -0
  145. judgeval/v1/prompts/__init__.py +6 -0
  146. judgeval/v1/prompts/prompt.py +29 -0
  147. judgeval/v1/prompts/prompt_factory.py +189 -0
  148. judgeval/v1/py.typed +0 -0
  149. judgeval/v1/scorers/__init__.py +6 -0
  150. judgeval/v1/scorers/api_scorer.py +82 -0
  151. judgeval/v1/scorers/base_scorer.py +17 -0
  152. judgeval/v1/scorers/built_in/__init__.py +17 -0
  153. judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
  154. judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
  155. judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
  156. judgeval/v1/scorers/built_in/faithfulness.py +28 -0
  157. judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
  158. judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
  159. judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
  160. judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
  161. judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
  162. judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
  163. judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
  164. judgeval/v1/scorers/scorers_factory.py +49 -0
  165. judgeval/v1/tracer/__init__.py +7 -0
  166. judgeval/v1/tracer/base_tracer.py +520 -0
  167. judgeval/v1/tracer/exporters/__init__.py +14 -0
  168. judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
  169. judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
  170. judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
  171. judgeval/v1/tracer/exporters/span_store.py +50 -0
  172. judgeval/v1/tracer/judgment_tracer_provider.py +70 -0
  173. judgeval/v1/tracer/processors/__init__.py +6 -0
  174. judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
  175. judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
  176. judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
  177. judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
  178. judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
  179. judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
  180. judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
  181. judgeval/v1/tracer/tracer.py +67 -0
  182. judgeval/v1/tracer/tracer_factory.py +38 -0
  183. judgeval/v1/trainers/__init__.py +5 -0
  184. judgeval/v1/trainers/base_trainer.py +62 -0
  185. judgeval/v1/trainers/config.py +123 -0
  186. judgeval/v1/trainers/console.py +144 -0
  187. judgeval/v1/trainers/fireworks_trainer.py +392 -0
  188. judgeval/v1/trainers/trainable_model.py +252 -0
  189. judgeval/v1/trainers/trainers_factory.py +37 -0
  190. judgeval/v1/utils.py +18 -0
  191. judgeval/version.py +5 -0
  192. judgeval/warnings.py +4 -0
  193. judgeval-0.23.0.dist-info/METADATA +266 -0
  194. judgeval-0.23.0.dist-info/RECORD +201 -0
  195. judgeval-0.23.0.dist-info/entry_points.txt +2 -0
  196. judgeval/clients.py +0 -34
  197. judgeval/common/__init__.py +0 -13
  198. judgeval/common/api/__init__.py +0 -3
  199. judgeval/common/api/api.py +0 -352
  200. judgeval/common/api/constants.py +0 -165
  201. judgeval/common/exceptions.py +0 -27
  202. judgeval/common/storage/__init__.py +0 -6
  203. judgeval/common/storage/s3_storage.py +0 -98
  204. judgeval/common/tracer/__init__.py +0 -31
  205. judgeval/common/tracer/constants.py +0 -22
  206. judgeval/common/tracer/core.py +0 -1916
  207. judgeval/common/tracer/otel_exporter.py +0 -108
  208. judgeval/common/tracer/otel_span_processor.py +0 -234
  209. judgeval/common/tracer/span_processor.py +0 -37
  210. judgeval/common/tracer/span_transformer.py +0 -211
  211. judgeval/common/tracer/trace_manager.py +0 -92
  212. judgeval/common/utils.py +0 -940
  213. judgeval/data/datasets/__init__.py +0 -4
  214. judgeval/data/datasets/dataset.py +0 -341
  215. judgeval/data/datasets/eval_dataset_client.py +0 -214
  216. judgeval/data/tool.py +0 -5
  217. judgeval/data/trace_run.py +0 -37
  218. judgeval/evaluation_run.py +0 -75
  219. judgeval/integrations/langgraph.py +0 -843
  220. judgeval/judges/mixture_of_judges.py +0 -286
  221. judgeval/judgment_client.py +0 -369
  222. judgeval/rules.py +0 -521
  223. judgeval/run_evaluation.py +0 -684
  224. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
  225. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
  226. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
  227. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
  228. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
  229. judgeval/utils/alerts.py +0 -93
  230. judgeval/utils/requests.py +0 -50
  231. judgeval-0.1.0.dist-info/METADATA +0 -202
  232. judgeval-0.1.0.dist-info/RECORD +0 -73
  233. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/WHEEL +0 -0
  234. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/licenses/LICENSE.md +0 -0
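For orientation before the hunks below: the 0.1.0 modules under judgeval/common/ and judgeval/data/datasets/ are removed in this release in favor of new top-level packages (judgeval/api/, judgeval/dataset/, judgeval/tracer/, judgeval/trainer/) plus a parallel judgeval/v1/ namespace. A minimal, hedged import sketch covering only the two symbols that the judgeval/dataset/__init__.py hunk below actually uses; everything else in the listing is not exercised here:

    # 0.23.0 import paths, inferred from the file listing and the hunk below
    from judgeval.dataset import Dataset          # replaces judgeval/data/datasets/ (removed above)
    from judgeval.api import JudgmentSyncClient   # replaces judgeval/common/api/ (removed above)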
judgeval/dataset/__init__.py ADDED
@@ -0,0 +1,335 @@
+ import datetime
+ import orjson
+ import os
+ import yaml
+ from dataclasses import dataclass
+ from typing import List, Literal, Optional, Iterable, Iterator
+ from itertools import islice
+ from rich.progress import (
+     Progress,
+     SpinnerColumn,
+     TextColumn,
+     BarColumn,
+     TaskProgressColumn,
+ )
+
+ from judgeval.data import Example
+ from judgeval.data.trace import Trace
+ from judgeval.utils.file_utils import get_examples_from_yaml, get_examples_from_json
+ from judgeval.api import JudgmentSyncClient
+ from judgeval.logger import judgeval_logger
+ from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID
+
+ from judgeval.data.judgment_types import DatasetKind
+
+
+ def _batch_examples(
+     examples: Iterable[Example], batch_size: int = 100
+ ) -> Iterator[List[Example]]:
+     """Generator that yields batches of examples for efficient memory usage.
+
+     Works with any iterable including generators, consuming only batch_size items at a time.
+     """
+     iterator = iter(examples)
+     while True:
+         batch = list(islice(iterator, batch_size))
+         if not batch:
+             break
+         yield batch
+
+
+ @dataclass
+ class DatasetInfo:
+     dataset_id: str
+     name: str
+     created_at: str
+     kind: DatasetKind
+     entries: int
+     creator: str
+
+
+ @dataclass
+ class Dataset:
+     name: str
+     project_name: str
+     dataset_kind: DatasetKind = DatasetKind.example
+     examples: Optional[List[Example]] = None
+     traces: Optional[List[Trace]] = None
+     judgment_api_key: str | None = JUDGMENT_API_KEY
+     organization_id: str | None = JUDGMENT_ORG_ID
+
+     @classmethod
+     def get(
+         cls,
+         name: str,
+         project_name: str,
+     ):
+         if not cls.judgment_api_key or not cls.organization_id:
+             raise ValueError("Judgment API key and organization ID are required")
+         client = JudgmentSyncClient(cls.judgment_api_key, cls.organization_id)
+         dataset = client.datasets_pull_for_judgeval(
+             {
+                 "dataset_name": name,
+                 "project_name": project_name,
+             },
+         )
+         if not dataset:
+             raise ValueError(f"Dataset {name} not found in project {project_name}")
+
+         dataset_kind = DatasetKind(dataset.get("dataset_kind", "example"))
+
+         if dataset_kind == DatasetKind.example:
+             examples = dataset.get("examples", [])
+             if examples is None:
+                 examples = []
+
+             for e in examples:
+                 if isinstance(e, dict) and isinstance(e.get("data", {}), dict):
+                     e.update(e.pop("data"))  # type: ignore
+                     e.pop(
+                         "example_id"
+                     )  # TODO: remove once scorer data migration is complete
+             judgeval_logger.info(f"Successfully retrieved example dataset {name}!")
+             return cls(
+                 name=name,
+                 project_name=project_name,
+                 dataset_kind=dataset_kind,
+                 examples=[Example(**e) for e in examples],
+             )
+
+         elif dataset_kind == DatasetKind.trace:
+             trace_data = dataset.get("traces", [])
+             if trace_data is None:
+                 trace_data = []
+
+             traces = []
+             for trace_item in trace_data:
+                 if isinstance(trace_item, dict):
+                     trace = Trace.from_dataset_trace_with_spans(trace_item)
+                     traces.append(trace)
+
+             judgeval_logger.info(f"Successfully retrieved trace dataset {name}!")
+             return cls(
+                 name=name,
+                 project_name=project_name,
+                 dataset_kind=dataset_kind,
+                 traces=traces,
+             )
+
+         else:
+             raise ValueError(f"Unsupported dataset kind: {dataset_kind}")
+
+     @classmethod
+     def create(
+         cls,
+         name: str,
+         project_name: str,
+         examples: Iterable[Example] = [],
+         overwrite: bool = False,
+         batch_size: int = 100,
+     ):
+         """Create a dataset with batched example uploads for large datasets.
+
+         Args:
+             name: Dataset name
+             project_name: Project name
+             examples: Iterable of examples to add (can be a list, generator, etc.)
+             overwrite: Whether to overwrite existing dataset
+             batch_size: Number of examples to upload per batch (default: 100)
+         """
+         if not cls.judgment_api_key or not cls.organization_id:
+             raise ValueError("Judgment API key and organization ID are required")
+
+         client = JudgmentSyncClient(cls.judgment_api_key, cls.organization_id)
+
+         client.datasets_create_for_judgeval(
+             {
+                 "name": name,
+                 "project_name": project_name,
+                 "examples": [],  # type: ignore
+                 "dataset_kind": "example",
+                 "overwrite": overwrite,
+             }
+         )
+         judgeval_logger.info(f"Created dataset {name}")
+
+         if not isinstance(examples, list):
+             examples = list(examples)
+
+         dataset = cls(
+             name=name,
+             project_name=project_name,
+             examples=examples,
+         )
+         dataset.add_examples(examples, batch_size=batch_size)
+
+         return dataset
+
+     @classmethod
+     def list(cls, project_name: str):
+         if not cls.judgment_api_key or not cls.organization_id:
+             raise ValueError("Judgment API key and organization ID are required")
+         client = JudgmentSyncClient(cls.judgment_api_key, cls.organization_id)
+         datasets = client.datasets_pull_all_for_judgeval({"project_name": project_name})
+
+         judgeval_logger.info(f"Fetched all datasets for project {project_name}!")
+
+         return [DatasetInfo(**dataset_info) for dataset_info in datasets]
+
+     def add_from_json(self, file_path: str) -> None:
+         """
+         Adds examples from a JSON file.
+
+         The JSON file is expected to have the following format:
+         [
+             {
+                 "key_01": "value_01",
+                 "key_02": "value_02"
+             },
+             {
+                 "key_11": "value_11",
+                 "key_12": "value_12",
+                 "key_13": "value_13"
+             },
+             ...
+         ]
+         """
+         examples = get_examples_from_json(file_path)
+         self.add_examples(examples)
+
+     def add_from_yaml(self, file_path: str) -> None:
+         """
+         Adds examples from a YAML file.
+
+         The YAML file is expected to have the following format:
+         - key_01: value_01
+           key_02: value_02
+         - key_11: value_11
+           key_12: value_12
+           key_13: value_13
+         ...
+         """
+
+         examples = get_examples_from_yaml(file_path)
+         self.add_examples(examples)
+
+     def add_examples(self, examples: Iterable[Example], batch_size: int = 100) -> None:
+         if not self.judgment_api_key or not self.organization_id:
+             raise ValueError("Judgment API key and organization ID are required")
+
+         client = JudgmentSyncClient(self.judgment_api_key, self.organization_id)
+
+         batches = _batch_examples(examples, batch_size)
+         total_uploaded = 0
+
+         with Progress(
+             SpinnerColumn(),
+             TextColumn("[bold blue]{task.description}"),
+             BarColumn(pulse_style="green"),
+             TaskProgressColumn(),
+             TextColumn("[dim]{task.fields[info]}"),
+         ) as progress:
+             task = progress.add_task(
+                 f"Uploading to {self.name}",
+                 total=None,
+                 info="",
+             )
+
+             batch_num = 0
+             for batch in batches:
+                 if len(batch) > 0 and not isinstance(batch[0], Example):
+                     raise TypeError("Examples must be a list of Example objects")
+
+                 batch_num += 1
+                 batch_size_actual = len(batch)
+                 total_uploaded += batch_size_actual
+
+                 progress.update(
+                     task,
+                     advance=1,
+                     info=f"Batch {batch_num} ({batch_size_actual} examples, {total_uploaded} total)",
+                 )
+
+                 client.datasets_insert_examples_for_judgeval(
+                     {
+                         "dataset_name": self.name,
+                         "project_name": self.project_name,
+                         "examples": batch,  # type: ignore
+                     }
+                 )
+
+         judgeval_logger.info(
+             f"Successfully added {total_uploaded} examples to dataset {self.name}"
+         )
+
+     def save_as(
+         self,
+         file_type: Literal["json", "yaml"],
+         dir_path: str,
+         save_name: str | None = None,
+     ) -> None:
+         """
+         Saves the dataset as a file. Saves only the examples.
+
+         Args:
+             file_type (Literal["json", "yaml"]): The file type to save the dataset as.
+             dir_path (str): The directory path to save the file to.
+             save_name (str, optional): The name of the file to save. Defaults to None.
+         """
+         if not os.path.exists(dir_path):
+             os.makedirs(dir_path)
+         file_name = (
+             datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+             if save_name is None
+             else save_name
+         )
+         complete_path = os.path.join(dir_path, f"{file_name}.{file_type}")
+         if file_type == "json":
+             with open(complete_path, "wb") as file:
+                 file.write(
+                     orjson.dumps(
+                         {
+                             "examples": [e.to_dict() for e in self.examples]
+                             if self.examples
+                             else [],
+                         },
+                         option=orjson.OPT_INDENT_2,
+                     )
+                 )
+         elif file_type == "yaml":
+             with open(complete_path, "w") as file:
+                 yaml_data = {
+                     "examples": [e.to_dict() for e in self.examples]
+                     if self.examples
+                     else [],
+                 }
+                 yaml.dump(yaml_data, file, default_flow_style=False)
+         else:
+             ACCEPTABLE_FILE_TYPES = ["json", "yaml"]
+             raise TypeError(
+                 f"Invalid file type: {file_type}. Please choose from {ACCEPTABLE_FILE_TYPES}"
+             )
+
+     def __iter__(self):
+         if self.dataset_kind == DatasetKind.example and self.examples:
+             return iter(self.examples)
+         elif self.dataset_kind == DatasetKind.trace and self.traces:
+             return iter(self.traces)
+         else:
+             return iter([])
+
+     def __len__(self):
+         if self.dataset_kind == DatasetKind.example and self.examples:
+             return len(self.examples)
+         elif self.dataset_kind == DatasetKind.trace and self.traces:
+             return len(self.traces)
+         else:
+             return 0
+
+     def __str__(self):
+         if self.dataset_kind == DatasetKind.example:
+             return (
+                 f"{self.__class__.__name__}(examples={self.examples}, name={self.name})"
+             )
+         else:
+             return f"{self.__class__.__name__}(traces={self.traces}, name={self.name})"
judgeval/env.py ADDED
@@ -0,0 +1,55 @@
+ from __future__ import annotations
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ import os
+ from typing import overload
+
+
+ @overload
+ def optional_env_var(var_name: str) -> str | None: ...
+
+
+ @overload
+ def optional_env_var(var_name: str, default: str) -> str: ...
+
+
+ def optional_env_var(var_name: str, default: str | None = None) -> str | None:
+     return os.getenv(var_name, default)
+
+
+ JUDGMENT_API_KEY = optional_env_var("JUDGMENT_API_KEY")
+ JUDGMENT_ORG_ID = optional_env_var("JUDGMENT_ORG_ID")
+ JUDGMENT_API_URL = optional_env_var("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
+
+ JUDGMENT_DEFAULT_GPT_MODEL = optional_env_var(
+     "JUDGMENT_DEFAULT_GPT_MODEL", "gpt-5-mini"
+ )
+ JUDGMENT_DEFAULT_TOGETHER_MODEL = optional_env_var(
+     "JUDGMENT_DEFAULT_TOGETHER_MODEL", "meta-llama/Meta-Llama-3-8B-Instruct-Lite"
+ )
+ JUDGMENT_MAX_CONCURRENT_EVALUATIONS = int(
+     optional_env_var("JUDGMENT_MAX_CONCURRENT_EVALUATIONS", "10")
+ )
+
+
+ JUDGMENT_ENABLE_MONITORING = optional_env_var("JUDGMENT_ENABLE_MONITORING", "true")
+ JUDGMENT_ENABLE_EVALUATIONS = optional_env_var("JUDGMENT_ENABLE_EVALUATIONS", "true")
+
+ JUDGMENT_S3_ACCESS_KEY_ID = optional_env_var("JUDGMENT_S3_ACCESS_KEY_ID")
+ JUDGMENT_S3_SECRET_ACCESS_KEY = optional_env_var("JUDGMENT_S3_SECRET_ACCESS_KEY")
+ JUDGMENT_S3_REGION_NAME = optional_env_var("JUDGMENT_S3_REGION_NAME")
+ JUDGMENT_S3_BUCKET_NAME = optional_env_var("JUDGMENT_S3_BUCKET_NAME")
+ JUDGMENT_S3_PREFIX = optional_env_var("JUDGMENT_S3_PREFIX", "spans/")
+ JUDGMENT_S3_ENDPOINT_URL = optional_env_var("JUDGMENT_S3_ENDPOINT_URL")
+ JUDGMENT_S3_SIGNATURE_VERSION = optional_env_var("JUDGMENT_S3_SIGNATURE_VERSION", "s3")
+ JUDGMENT_S3_ADDRESSING_STYLE = optional_env_var("JUDGMENT_S3_ADDRESSING_STYLE", "auto")
+
+
+ JUDGMENT_NO_COLOR = optional_env_var("JUDGMENT_NO_COLOR")
+ JUDGMENT_LOG_LEVEL = optional_env_var("JUDGMENT_LOG_LEVEL", "WARNING")
+
+
+ TOGETHERAI_API_KEY = optional_env_var("TOGETHERAI_API_KEY")
+ TOGETHER_API_KEY = optional_env_var("TOGETHER_API_KEY")
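The env.py hunk above centralizes configuration: each setting is read once at import time through optional_env_var, whose overloads make a constant str when a default is supplied and str | None when it is not. A small, hedged sketch of the resulting behavior; the override URL is an assumption used only for illustration:

    import os

    # Must be set before judgeval.env is imported, since the constants are evaluated at import time.
    os.environ["JUDGMENT_API_URL"] = "https://judgment.internal.example.com"

    from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_API_URL

    print(JUDGMENT_API_URL)  # the override above, instead of the default https://api.judgmentlabs.ai
    print(JUDGMENT_API_KEY)  # None unless JUDGMENT_API_KEY is set; no default is defined for it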