judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (234)
  1. judgeval/__init__.py +173 -10
  2. judgeval/api/__init__.py +523 -0
  3. judgeval/api/api_types.py +413 -0
  4. judgeval/cli.py +112 -0
  5. judgeval/constants.py +7 -30
  6. judgeval/data/__init__.py +1 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +14 -40
  9. judgeval/data/judgment_types.py +396 -146
  10. judgeval/data/result.py +11 -18
  11. judgeval/data/scorer_data.py +3 -26
  12. judgeval/data/scripts/openapi_transform.py +5 -5
  13. judgeval/data/trace.py +115 -194
  14. judgeval/dataset/__init__.py +335 -0
  15. judgeval/env.py +55 -0
  16. judgeval/evaluation/__init__.py +346 -0
  17. judgeval/exceptions.py +28 -0
  18. judgeval/integrations/langgraph/__init__.py +13 -0
  19. judgeval/integrations/openlit/__init__.py +51 -0
  20. judgeval/judges/__init__.py +2 -2
  21. judgeval/judges/litellm_judge.py +77 -16
  22. judgeval/judges/together_judge.py +88 -17
  23. judgeval/judges/utils.py +7 -20
  24. judgeval/judgment_attribute_keys.py +55 -0
  25. judgeval/{common/logger.py → logger.py} +24 -8
  26. judgeval/prompt/__init__.py +330 -0
  27. judgeval/scorers/__init__.py +11 -11
  28. judgeval/scorers/agent_scorer.py +15 -19
  29. judgeval/scorers/api_scorer.py +21 -23
  30. judgeval/scorers/base_scorer.py +54 -36
  31. judgeval/scorers/example_scorer.py +1 -3
  32. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -24
  33. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
  34. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
  35. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
  36. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
  37. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +171 -59
  38. judgeval/scorers/score.py +64 -47
  39. judgeval/scorers/utils.py +2 -107
  40. judgeval/tracer/__init__.py +1111 -2
  41. judgeval/tracer/constants.py +1 -0
  42. judgeval/tracer/exporters/__init__.py +40 -0
  43. judgeval/tracer/exporters/s3.py +119 -0
  44. judgeval/tracer/exporters/store.py +59 -0
  45. judgeval/tracer/exporters/utils.py +32 -0
  46. judgeval/tracer/keys.py +63 -0
  47. judgeval/tracer/llm/__init__.py +7 -0
  48. judgeval/tracer/llm/config.py +78 -0
  49. judgeval/tracer/llm/constants.py +9 -0
  50. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  51. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  52. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  53. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  54. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  55. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  56. judgeval/tracer/llm/llm_google/config.py +6 -0
  57. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  58. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  59. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  60. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  61. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  62. judgeval/tracer/llm/llm_openai/config.py +6 -0
  63. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  64. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  65. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  66. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  67. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  68. judgeval/tracer/llm/llm_together/config.py +6 -0
  69. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  70. judgeval/tracer/llm/providers.py +19 -0
  71. judgeval/tracer/managers.py +167 -0
  72. judgeval/tracer/processors/__init__.py +220 -0
  73. judgeval/tracer/utils.py +19 -0
  74. judgeval/trainer/__init__.py +14 -0
  75. judgeval/trainer/base_trainer.py +122 -0
  76. judgeval/trainer/config.py +123 -0
  77. judgeval/trainer/console.py +144 -0
  78. judgeval/trainer/fireworks_trainer.py +392 -0
  79. judgeval/trainer/trainable_model.py +252 -0
  80. judgeval/trainer/trainer.py +70 -0
  81. judgeval/utils/async_utils.py +39 -0
  82. judgeval/utils/decorators/__init__.py +0 -0
  83. judgeval/utils/decorators/dont_throw.py +37 -0
  84. judgeval/utils/decorators/use_once.py +13 -0
  85. judgeval/utils/file_utils.py +74 -28
  86. judgeval/utils/guards.py +36 -0
  87. judgeval/utils/meta.py +27 -0
  88. judgeval/utils/project.py +15 -0
  89. judgeval/utils/serialize.py +253 -0
  90. judgeval/utils/testing.py +70 -0
  91. judgeval/utils/url.py +10 -0
  92. judgeval/{version_check.py → utils/version_check.py} +5 -3
  93. judgeval/utils/wrappers/README.md +3 -0
  94. judgeval/utils/wrappers/__init__.py +15 -0
  95. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  96. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  97. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  98. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  99. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  100. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  101. judgeval/utils/wrappers/py.typed +0 -0
  102. judgeval/utils/wrappers/utils.py +35 -0
  103. judgeval/v1/__init__.py +88 -0
  104. judgeval/v1/data/__init__.py +7 -0
  105. judgeval/v1/data/example.py +44 -0
  106. judgeval/v1/data/scorer_data.py +42 -0
  107. judgeval/v1/data/scoring_result.py +44 -0
  108. judgeval/v1/datasets/__init__.py +6 -0
  109. judgeval/v1/datasets/dataset.py +214 -0
  110. judgeval/v1/datasets/dataset_factory.py +94 -0
  111. judgeval/v1/evaluation/__init__.py +6 -0
  112. judgeval/v1/evaluation/evaluation.py +182 -0
  113. judgeval/v1/evaluation/evaluation_factory.py +17 -0
  114. judgeval/v1/instrumentation/__init__.py +6 -0
  115. judgeval/v1/instrumentation/llm/__init__.py +7 -0
  116. judgeval/v1/instrumentation/llm/config.py +78 -0
  117. judgeval/v1/instrumentation/llm/constants.py +11 -0
  118. judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
  119. judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
  120. judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
  121. judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
  122. judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
  123. judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
  124. judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
  125. judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
  126. judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
  127. judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
  128. judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
  129. judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
  130. judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
  131. judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
  132. judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
  133. judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
  134. judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
  135. judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
  136. judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
  137. judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
  138. judgeval/v1/instrumentation/llm/providers.py +19 -0
  139. judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
  140. judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
  141. judgeval/v1/integrations/langgraph/__init__.py +13 -0
  142. judgeval/v1/integrations/openlit/__init__.py +47 -0
  143. judgeval/v1/internal/api/__init__.py +525 -0
  144. judgeval/v1/internal/api/api_types.py +413 -0
  145. judgeval/v1/prompts/__init__.py +6 -0
  146. judgeval/v1/prompts/prompt.py +29 -0
  147. judgeval/v1/prompts/prompt_factory.py +189 -0
  148. judgeval/v1/py.typed +0 -0
  149. judgeval/v1/scorers/__init__.py +6 -0
  150. judgeval/v1/scorers/api_scorer.py +82 -0
  151. judgeval/v1/scorers/base_scorer.py +17 -0
  152. judgeval/v1/scorers/built_in/__init__.py +17 -0
  153. judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
  154. judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
  155. judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
  156. judgeval/v1/scorers/built_in/faithfulness.py +28 -0
  157. judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
  158. judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
  159. judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
  160. judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
  161. judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
  162. judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
  163. judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
  164. judgeval/v1/scorers/scorers_factory.py +49 -0
  165. judgeval/v1/tracer/__init__.py +7 -0
  166. judgeval/v1/tracer/base_tracer.py +520 -0
  167. judgeval/v1/tracer/exporters/__init__.py +14 -0
  168. judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
  169. judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
  170. judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
  171. judgeval/v1/tracer/exporters/span_store.py +50 -0
  172. judgeval/v1/tracer/judgment_tracer_provider.py +70 -0
  173. judgeval/v1/tracer/processors/__init__.py +6 -0
  174. judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
  175. judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
  176. judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
  177. judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
  178. judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
  179. judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
  180. judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
  181. judgeval/v1/tracer/tracer.py +67 -0
  182. judgeval/v1/tracer/tracer_factory.py +38 -0
  183. judgeval/v1/trainers/__init__.py +5 -0
  184. judgeval/v1/trainers/base_trainer.py +62 -0
  185. judgeval/v1/trainers/config.py +123 -0
  186. judgeval/v1/trainers/console.py +144 -0
  187. judgeval/v1/trainers/fireworks_trainer.py +392 -0
  188. judgeval/v1/trainers/trainable_model.py +252 -0
  189. judgeval/v1/trainers/trainers_factory.py +37 -0
  190. judgeval/v1/utils.py +18 -0
  191. judgeval/version.py +5 -0
  192. judgeval/warnings.py +4 -0
  193. judgeval-0.23.0.dist-info/METADATA +266 -0
  194. judgeval-0.23.0.dist-info/RECORD +201 -0
  195. judgeval-0.23.0.dist-info/entry_points.txt +2 -0
  196. judgeval/clients.py +0 -34
  197. judgeval/common/__init__.py +0 -13
  198. judgeval/common/api/__init__.py +0 -3
  199. judgeval/common/api/api.py +0 -352
  200. judgeval/common/api/constants.py +0 -165
  201. judgeval/common/exceptions.py +0 -27
  202. judgeval/common/storage/__init__.py +0 -6
  203. judgeval/common/storage/s3_storage.py +0 -98
  204. judgeval/common/tracer/__init__.py +0 -31
  205. judgeval/common/tracer/constants.py +0 -22
  206. judgeval/common/tracer/core.py +0 -1916
  207. judgeval/common/tracer/otel_exporter.py +0 -108
  208. judgeval/common/tracer/otel_span_processor.py +0 -234
  209. judgeval/common/tracer/span_processor.py +0 -37
  210. judgeval/common/tracer/span_transformer.py +0 -211
  211. judgeval/common/tracer/trace_manager.py +0 -92
  212. judgeval/common/utils.py +0 -940
  213. judgeval/data/datasets/__init__.py +0 -4
  214. judgeval/data/datasets/dataset.py +0 -341
  215. judgeval/data/datasets/eval_dataset_client.py +0 -214
  216. judgeval/data/tool.py +0 -5
  217. judgeval/data/trace_run.py +0 -37
  218. judgeval/evaluation_run.py +0 -75
  219. judgeval/integrations/langgraph.py +0 -843
  220. judgeval/judges/mixture_of_judges.py +0 -286
  221. judgeval/judgment_client.py +0 -369
  222. judgeval/rules.py +0 -521
  223. judgeval/run_evaluation.py +0 -684
  224. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
  225. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
  226. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
  227. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
  228. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
  229. judgeval/utils/alerts.py +0 -93
  230. judgeval/utils/requests.py +0 -50
  231. judgeval-0.1.0.dist-info/METADATA +0 -202
  232. judgeval-0.1.0.dist-info/RECORD +0 -73
  233. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/WHEEL +0 -0
  234. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/v1/data/scorer_data.py
@@ -0,0 +1,42 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+ from typing import Any, Dict, Optional
+
+ from judgeval.v1.internal.api.api_types import ScorerData as APIScorerData
+
+
+ @dataclass(slots=True)
+ class ScorerData:
+     name: str
+     threshold: float
+     success: bool
+     score: Optional[float] = None
+     reason: Optional[str] = None
+     strict_mode: Optional[bool] = None
+     evaluation_model: Optional[str] = None
+     error: Optional[str] = None
+     additional_metadata: Dict[str, Any] = field(default_factory=dict)
+     id: Optional[str] = None
+
+     def to_dict(self) -> APIScorerData:
+         result: APIScorerData = {
+             "name": self.name,
+             "threshold": self.threshold,
+             "success": self.success,
+         }
+         if self.score is not None:
+             result["score"] = self.score
+         if self.reason is not None:
+             result["reason"] = self.reason
+         if self.strict_mode is not None:
+             result["strict_mode"] = self.strict_mode
+         if self.evaluation_model is not None:
+             result["evaluation_model"] = self.evaluation_model
+         if self.error is not None:
+             result["error"] = self.error
+         if self.additional_metadata:
+             result["additional_metadata"] = self.additional_metadata
+         if self.id is not None:
+             result["id"] = self.id
+         return result
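For orientation, a minimal usage sketch of the new ScorerData dataclass, based only on the fields and to_dict() shown in the hunk above (names and values are illustrative):

    from judgeval.v1.data.scorer_data import ScorerData

    # Optional fields left as None are omitted from the serialized payload.
    data = ScorerData(name="faithfulness", threshold=0.7, success=True, score=0.92)
    payload = data.to_dict()
    # {"name": "faithfulness", "threshold": 0.7, "success": True, "score": 0.92}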
judgeval/v1/data/scoring_result.py
@@ -0,0 +1,44 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from typing import List, Optional, Union
+
+ from judgeval.v1.internal.api.api_types import (
+     OtelTraceSpan,
+     ScorerData as APIScorerData,
+     ScoringResult as APIScoringResult,
+ )
+ from judgeval.v1.data.example import Example
+ from judgeval.v1.data.scorer_data import ScorerData
+
+
+ @dataclass(slots=True)
+ class ScoringResult:
+     success: bool
+     scorers_data: List[ScorerData]
+     name: Optional[str] = None
+     data_object: Optional[Union[OtelTraceSpan, Example]] = None
+     trace_id: Optional[str] = None
+     run_duration: Optional[float] = None
+     evaluation_cost: Optional[float] = None
+
+     def to_dict(self) -> APIScoringResult:
+         scorers_list: List[APIScorerData] = [s.to_dict() for s in self.scorers_data]
+         result: APIScoringResult = {
+             "success": self.success,
+             "scorers_data": scorers_list,
+         }
+         if self.name is not None:
+             result["name"] = self.name
+         if self.data_object is not None:
+             if isinstance(self.data_object, Example):
+                 result["data_object"] = self.data_object.to_dict()
+             else:
+                 result["data_object"] = self.data_object
+         if self.trace_id is not None:
+             result["trace_id"] = self.trace_id
+         if self.run_duration is not None:
+             result["run_duration"] = self.run_duration
+         if self.evaluation_cost is not None:
+             result["evaluation_cost"] = self.evaluation_cost
+         return result
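ScoringResult aggregates per-scorer results and serializes nested ScorerData entries through their own to_dict(). A small sketch under the same assumptions (illustrative values only):

    from judgeval.v1.data.scorer_data import ScorerData
    from judgeval.v1.data.scoring_result import ScoringResult

    scorer = ScorerData(name="answer_relevancy", threshold=0.5, success=True, score=0.81)
    result = ScoringResult(success=True, scorers_data=[scorer], run_duration=1.4)
    print(result.to_dict()["scorers_data"][0]["score"])  # 0.81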
judgeval/v1/datasets/__init__.py
@@ -0,0 +1,6 @@
+ from __future__ import annotations
+
+ from judgeval.v1.datasets.dataset import Dataset, DatasetInfo
+ from judgeval.v1.datasets.dataset_factory import DatasetFactory
+
+ __all__ = ["Dataset", "DatasetInfo", "DatasetFactory"]
judgeval/v1/datasets/dataset.py
@@ -0,0 +1,214 @@
+ from __future__ import annotations
+
+ import datetime
+ import orjson
+ import os
+ import yaml
+ from dataclasses import dataclass
+ from typing import List, Literal, Optional, Iterable, Iterator
+ from itertools import islice
+ from rich.progress import (
+     Progress,
+     SpinnerColumn,
+     TextColumn,
+     BarColumn,
+     TaskProgressColumn,
+ )
+
+
+ from judgeval.v1.data.example import Example
+ from judgeval.v1.internal.api import JudgmentSyncClient
+ from judgeval.logger import judgeval_logger
+
+
+ def _batch_examples(
+     examples: Iterable[Example], batch_size: int = 100
+ ) -> Iterator[List[Example]]:
+     """Generator that yields batches of examples for efficient memory usage.
+
+     Works with any iterable including generators, consuming only batch_size items at a time.
+     """
+     iterator = iter(examples)
+     while True:
+         batch = list(islice(iterator, batch_size))
+         if not batch:
+             break
+         yield batch
+
+
+ @dataclass
+ class DatasetInfo:
+     dataset_id: str
+     name: str
+     created_at: str
+     kind: str
+     entries: int
+     creator: str
+
+
+ @dataclass
+ class Dataset:
+     name: str
+     project_name: str
+     dataset_kind: str = "example"
+     examples: Optional[List[Example]] = None
+     client: Optional[JudgmentSyncClient] = None
+
+     def add_from_json(self, file_path: str, batch_size: int = 100) -> None:
+         with open(file_path, "rb") as file:
+             data = orjson.loads(file.read())
+         examples = []
+         for e in data:
+             if isinstance(e, dict):
+                 name = e.get("name")
+                 example = Example(name=name)
+                 for key, value in e.items():
+                     if key != "name":
+                         example.set_property(key, value)
+                 examples.append(example)
+             else:
+                 examples.append(e)
+         self.add_examples(examples, batch_size=batch_size)
+
+     def add_from_yaml(self, file_path: str, batch_size: int = 100) -> None:
+         with open(file_path, "r") as file:
+             data = yaml.safe_load(file)
+         examples = []
+         for e in data:
+             if isinstance(e, dict):
+                 name = e.get("name")
+                 example = Example(name=name)
+                 for key, value in e.items():
+                     if key != "name":
+                         example.set_property(key, value)
+                 examples.append(example)
+             else:
+                 examples.append(e)
+         self.add_examples(examples, batch_size=batch_size)
+
+     def add_examples(self, examples: Iterable[Example], batch_size: int = 100) -> None:
+         if not self.client:
+             return
+
+         batches = _batch_examples(examples, batch_size)
+         total_uploaded = 0
+
+         with Progress(
+             SpinnerColumn(),
+             TextColumn("[bold blue]{task.description}"),
+             BarColumn(pulse_style="green"),
+             TaskProgressColumn(),
+             TextColumn("[dim]{task.fields[info]}"),
+         ) as progress:
+             task = progress.add_task(
+                 f"Uploading to {self.name}",
+                 total=None,
+                 info="",
+             )
+
+             batch_num = 0
+             for batch in batches:
+                 if len(batch) > 0 and not isinstance(batch[0], Example):
+                     raise TypeError("Examples must be a list of Example objects")
+
+                 batch_num += 1
+                 batch_size_actual = len(batch)
+                 total_uploaded += batch_size_actual
+
+                 progress.update(
+                     task,
+                     advance=1,
+                     info=f"Batch {batch_num} ({batch_size_actual} examples, {total_uploaded} total)",
+                 )
+
+                 self.client.datasets_insert_examples_for_judgeval(
+                     {
+                         "dataset_name": self.name,
+                         "project_name": self.project_name,
+                         "examples": [e.to_dict() for e in batch],
+                     }
+                 )
+
+         judgeval_logger.info(
+             f"Successfully added {total_uploaded} examples to dataset {self.name}"
+         )
+
+     def save_as(
+         self,
+         file_type: Literal["json", "yaml"],
+         dir_path: str,
+         save_name: Optional[str] = None,
+     ) -> None:
+         if not os.path.exists(dir_path):
+             os.makedirs(dir_path)
+
+         file_name = save_name or datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+         complete_path = os.path.join(dir_path, f"{file_name}.{file_type}")
+
+         examples_data = [e.to_dict() for e in self.examples] if self.examples else []
+
+         if file_type == "json":
+             with open(complete_path, "wb") as file:
+                 file.write(
+                     orjson.dumps(
+                         {"examples": examples_data}, option=orjson.OPT_INDENT_2
+                     )
+                 )
+         elif file_type == "yaml":
+             with open(complete_path, "w") as file:
+                 yaml.dump({"examples": examples_data}, file, default_flow_style=False)
+
+     def __iter__(self):
+         return iter(self.examples or [])
+
+     def __len__(self):
+         return len(self.examples) if self.examples else 0
+
+     def __str__(self):
+         return f"Dataset(name={self.name}, examples={len(self.examples) if self.examples else 0})"
+
+     def display(self, max_examples: int = 5) -> None:
+         from rich.console import Console
+         from rich.table import Table
+
+         console = Console()
+
+         total = len(self.examples) if self.examples else 0
+         console.print(f"\n[bold cyan]Dataset: {self.name}[/bold cyan]")
+         console.print(f"[dim]Project:[/dim] {self.project_name}")
+         console.print(f"[dim]Total examples:[/dim] {total}")
+
+         if not self.examples:
+             console.print("[dim]No examples found[/dim]")
+             return
+
+         display_count = min(max_examples, total)
+
+         if total > 0:
+             first_example = self.examples[0]
+             property_keys = list(first_example.properties.keys())
+
+             table = Table(show_header=True, header_style="bold")
+             table.add_column("#", style="dim", width=4)
+             table.add_column("Name", style="cyan")
+             for key in property_keys[:3]:
+                 table.add_column(key, max_width=30)
+
+             for i, example in enumerate(self.examples[:display_count]):
+                 row = [str(i + 1), example.name or "—"]
+                 for key in property_keys[:3]:
+                     value = str(example.get_property(key) or "")
+                     if len(value) > 30:
+                         value = value[:27] + "..."
+                     row.append(value)
+                 table.add_row(*row)
+
+             console.print()
+             console.print(table)
+
+             if total > display_count:
+                 console.print(
+                     f"[dim]... and {total - display_count} more examples[/dim]"
+                 )
+
+         console.print()
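A client-free sketch of the local parts of Dataset (construction, len(), and save_as); it assumes Example(name=...), set_property(), and to_dict() behave as they are used in the code above, since example.py itself is not shown in this excerpt:

    from judgeval.v1.data.example import Example
    from judgeval.v1.datasets.dataset import Dataset

    ex = Example(name="greeting")
    ex.set_property("input", "Say hello")
    ex.set_property("expected_output", "Hello!")

    ds = Dataset(name="smoke-tests", project_name="demo", examples=[ex])
    print(len(ds))  # 1
    # Writes {"examples": [...]} to ./exports/smoke_tests.json
    ds.save_as("json", "./exports", save_name="smoke_tests")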
judgeval/v1/datasets/dataset_factory.py
@@ -0,0 +1,94 @@
+ from __future__ import annotations
+
+ from typing import List, Iterable
+
+ from judgeval.v1.internal.api import JudgmentSyncClient
+ from judgeval.v1.datasets.dataset import Dataset, DatasetInfo
+ from judgeval.v1.data.example import Example
+ from judgeval.logger import judgeval_logger
+
+
+ class DatasetFactory:
+     __slots__ = "_client"
+
+     def __init__(self, client: JudgmentSyncClient):
+         self._client = client
+
+     def get(self, name: str, project_name: str) -> Dataset:
+         dataset = self._client.datasets_pull_for_judgeval(
+             {
+                 "dataset_name": name,
+                 "project_name": project_name,
+             }
+         )
+
+         dataset_kind = dataset.get("dataset_kind", "example")
+         examples_data = dataset.get("examples", []) or []
+
+         examples = []
+         for e in examples_data:
+             if isinstance(e, dict):
+                 judgeval_logger.debug(f"Raw example keys: {e.keys()}")
+
+                 data_obj = e.get("data", {})
+                 if isinstance(data_obj, dict):
+                     example_id = data_obj.get("example_id", "")
+                     created_at = data_obj.get("created_at", "")
+                     name_field = data_obj.get("name")
+
+                     example = Example(
+                         example_id=example_id, created_at=created_at, name=name_field
+                     )
+
+                     for key, value in data_obj.items():
+                         if key not in ["example_id", "created_at", "name"]:
+                             example.set_property(key, value)
+
+                     examples.append(example)
+                     judgeval_logger.debug(
+                         f"Created example with name={name_field}, properties={list(example.properties.keys())}"
+                     )
+
+         judgeval_logger.info(f"Retrieved dataset {name} with {len(examples)} examples")
+         return Dataset(
+             name=name,
+             project_name=project_name,
+             dataset_kind=dataset_kind,
+             examples=examples,
+             client=self._client,
+         )
+
+     def create(
+         self,
+         name: str,
+         project_name: str,
+         examples: Iterable[Example] = [],
+         overwrite: bool = False,
+         batch_size: int = 100,
+     ) -> Dataset:
+         self._client.datasets_create_for_judgeval(
+             {
+                 "name": name,
+                 "project_name": project_name,
+                 "examples": [],
+                 "dataset_kind": "example",
+                 "overwrite": overwrite,
+             }
+         )
+         judgeval_logger.info(f"Created dataset {name}")
+
+         if not isinstance(examples, list):
+             examples = list(examples)
+
+         dataset = Dataset(
+             name=name, project_name=project_name, examples=examples, client=self._client
+         )
+         dataset.add_examples(examples, batch_size=batch_size)
+         return dataset
+
+     def list(self, project_name: str) -> List[DatasetInfo]:
+         datasets = self._client.datasets_pull_all_for_judgeval(
+             {"project_name": project_name}
+         )
+         judgeval_logger.info(f"Fetched datasets for project {project_name}")
+         return [DatasetInfo(**d) for d in datasets]
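A hedged sketch of the factory methods above; `client` is a placeholder for however JudgmentSyncClient is constructed, since its constructor is not part of this excerpt:

    from judgeval.v1.internal.api import JudgmentSyncClient
    from judgeval.v1.datasets.dataset_factory import DatasetFactory

    client: JudgmentSyncClient = ...  # constructed elsewhere; not shown in this diff
    factory = DatasetFactory(client)

    ds = factory.create(name="smoke-tests", project_name="demo", overwrite=True)
    ds.add_from_json("examples.json")  # uploads in batches of 100 by default
    for info in factory.list(project_name="demo"):
        print(info.name, info.entries)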
judgeval/v1/evaluation/__init__.py
@@ -0,0 +1,6 @@
+ from __future__ import annotations
+
+ from judgeval.v1.evaluation.evaluation import Evaluation
+ from judgeval.v1.evaluation.evaluation_factory import EvaluationFactory
+
+ __all__ = ["Evaluation", "EvaluationFactory"]
judgeval/v1/evaluation/evaluation.py
@@ -0,0 +1,182 @@
+ from __future__ import annotations
+
+ import time
+ import uuid
+ from datetime import datetime, timezone
+ from typing import List, Optional
+
+ from rich.console import Console
+ from rich.progress import Progress, SpinnerColumn, TextColumn, TimeElapsedColumn
+
+ from judgeval.v1.internal.api import JudgmentSyncClient
+ from judgeval.v1.internal.api.api_types import ExampleEvaluationRun
+ from judgeval.v1.data.example import Example
+ from judgeval.v1.data.scoring_result import ScoringResult
+ from judgeval.v1.data.scorer_data import ScorerData
+ from judgeval.v1.scorers.base_scorer import BaseScorer
+ from judgeval.logger import judgeval_logger
+
+
+ class Evaluation:
+     __slots__ = ("_client",)
+
+     def __init__(self, client: JudgmentSyncClient):
+         self._client = client
+
+     def run(
+         self,
+         examples: List[Example],
+         scorers: List[BaseScorer],
+         project_name: str,
+         eval_run_name: str,
+         model: Optional[str] = None,
+         assert_test: bool = False,
+         timeout_seconds: int = 300,
+     ) -> List[ScoringResult]:
+         console = Console()
+         eval_id = str(uuid.uuid4())
+         created_at = datetime.now(timezone.utc).isoformat()
+
+         console.print("\n[bold cyan]Starting Evaluation[/bold cyan]")
+         console.print(f"[dim]Run:[/dim] {eval_run_name}")
+         console.print(f"[dim]Project:[/dim] {project_name}")
+         console.print(
+             f"[dim]Examples:[/dim] {len(examples)} | [dim]Scorers:[/dim] {len(scorers)}"
+         )
+         if model:
+             console.print(f"[dim]Model:[/dim] {model}")
+
+         judgeval_logger.info(f"Starting evaluation: {eval_run_name}")
+         judgeval_logger.info(f"Examples: {len(examples)}, Scorers: {len(scorers)}")
+
+         payload: ExampleEvaluationRun = {
+             "id": eval_id,
+             "project_name": project_name,
+             "eval_name": eval_run_name,
+             "created_at": created_at,
+             "examples": [e.to_dict() for e in examples],
+             "judgment_scorers": [s.get_scorer_config() for s in scorers],
+             "custom_scorers": [],
+         }
+
+         console.print()
+         with Progress(
+             SpinnerColumn(),
+             TextColumn("[progress.description]{task.description}"),
+             TimeElapsedColumn(),
+             console=console,
+         ) as progress:
+             task = progress.add_task("Submitting evaluation...", total=None)
+             self._client.add_to_run_eval_queue_examples(payload)
+             judgeval_logger.info(f"Evaluation submitted: {eval_id}")
+
+             progress.update(task, description="Running evaluation...")
+             start_time = time.time()
+             poll_count = 0
+
+             while True:
+                 elapsed = time.time() - start_time
+                 if elapsed > timeout_seconds:
+                     raise TimeoutError(f"Evaluation timed out after {timeout_seconds}s")
+
+                 response = self._client.fetch_experiment_run(
+                     {"experiment_run_id": eval_id, "project_name": project_name}
+                 )
+                 results_data = response.get("results", []) or []
+                 poll_count += 1
+
+                 completed = len(results_data)
+                 total = len(examples)
+                 progress.update(
+                     task,
+                     description=f"Running evaluation... ({completed}/{total} completed)",
+                 )
+                 judgeval_logger.info(
+                     f"Poll {poll_count}: {completed}/{total} results ready"
+                 )
+
+                 if completed == total:
+                     break
+                 time.sleep(2)
+
+         console.print(
+             f"[green]✓[/green] Evaluation completed in [bold]{elapsed:.1f}s[/bold]"
+         )
+         judgeval_logger.info(f"Evaluation completed in {elapsed:.1f}s")
+
+         console.print()
+         results = []
+         passed = 0
+         failed = 0
+
+         for i, res in enumerate(results_data):
+             judgeval_logger.info(f"Processing result {i + 1}: {res.keys()}")
+
+             scorers_raw = res.get("scorers", [])
+             scorers_data = []
+             for scorer_dict in scorers_raw:
+                 judgeval_logger.debug(f"Scorer data fields: {scorer_dict.keys()}")
+
+                 scorer_fields = {
+                     "name": scorer_dict.get("name"),
+                     "threshold": scorer_dict.get("threshold"),
+                     "success": scorer_dict.get("success"),
+                     "score": scorer_dict.get("score"),
+                     "reason": scorer_dict.get("reason"),
+                     "strict_mode": scorer_dict.get("strict_mode"),
+                     "evaluation_model": scorer_dict.get("evaluation_model"),
+                     "error": scorer_dict.get("error"),
+                     "additional_metadata": scorer_dict.get("additional_metadata", {}),
+                     "id": scorer_dict.get("scorer_data_id") or scorer_dict.get("id"),
+                 }
+                 scorers_data.append(ScorerData(**scorer_fields))
+
+             success = all(s.success for s in scorers_data)
+
+             if success:
+                 passed += 1
+                 console.print(
+                     f"[green]✓[/green] Example {i + 1}: [green]PASSED[/green]"
+                 )
+             else:
+                 failed += 1
+                 console.print(f"[red]✗[/red] Example {i + 1}: [red]FAILED[/red]")
+
+             for scorer_data in scorers_data:
+                 score_str = (
+                     f"{scorer_data.score:.3f}"
+                     if scorer_data.score is not None
+                     else "N/A"
+                 )
+                 status_color = "green" if scorer_data.success else "red"
+                 console.print(
+                     f" [dim]{scorer_data.name}:[/dim] [{status_color}]{score_str}[/{status_color}] (threshold: {scorer_data.threshold})"
+                 )
+
+             results.append(
+                 ScoringResult(
+                     success=success,
+                     scorers_data=scorers_data,
+                 )
+             )
+
+         console.print()
+         url = response.get("ui_results_url", "")
+
+         if passed == len(results):
+             console.print(
+                 f"[bold green]✓ All tests passed![/bold green] ({passed}/{len(results)})"
+             )
+         else:
+             console.print(
+                 f"[bold yellow]⚠ Results:[/bold yellow] [green]{passed} passed[/green] | [red]{failed} failed[/red]"
+             )
+
+         console.print(f"[dim]View full details:[/dim] [link={url}]{url}[/link]\n")
+
+         if assert_test and not all(r.success for r in results):
+             raise AssertionError(
+                 f"Evaluation failed: {failed}/{len(results)} tests failed"
+             )
+
+         return results
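A sketch of driving Evaluation.run() end to end, following the signature and polling behavior above. The scorer is a placeholder: the built-in scorer constructors (e.g. judgeval/v1/scorers/built_in/faithfulness.py) are listed in this release but not shown in this excerpt, so only the BaseScorer requirement is assumed:

    from judgeval.v1.evaluation.evaluation import Evaluation
    from judgeval.v1.data.example import Example

    evaluation = Evaluation(client=client)  # client: JudgmentSyncClient, constructed elsewhere
    example = Example(name="capital-question")
    example.set_property("input", "What is the capital of France?")
    example.set_property("actual_output", "Paris")

    results = evaluation.run(
        examples=[example],
        scorers=[some_scorer],  # placeholder; any BaseScorer exposing get_scorer_config()
        project_name="demo",
        eval_run_name="nightly-regression",
        assert_test=True,  # raises AssertionError if any example fails
    )
    print(all(r.success for r in results))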
judgeval/v1/evaluation/evaluation_factory.py
@@ -0,0 +1,17 @@
+ from __future__ import annotations
+
+ from judgeval.v1.internal.api import JudgmentSyncClient
+ from judgeval.v1.evaluation.evaluation import Evaluation
+
+
+ class EvaluationFactory:
+     __slots__ = "_client"
+
+     def __init__(
+         self,
+         client: JudgmentSyncClient,
+     ):
+         self._client = client
+
+     def create(self) -> Evaluation:
+         return Evaluation(client=self._client)
judgeval/v1/instrumentation/__init__.py
@@ -0,0 +1,6 @@
+ from __future__ import annotations
+
+ from .llm import *
+
+
+ __all__ = ["wrap_provider"]
judgeval/v1/instrumentation/llm/__init__.py
@@ -0,0 +1,7 @@
+ from __future__ import annotations
+
+
+ from .config import _detect_provider, wrap_provider
+
+
+ __all__ = ["_detect_provider", "wrap_provider"]