judgeval 0.10.1__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
judgeval/__init__.py CHANGED
@@ -5,8 +5,9 @@ from judgeval.evaluation import run_eval
  from judgeval.data.evaluation_run import ExampleEvaluationRun


- from typing import List, Optional, Union
- from judgeval.scorers import BaseScorer, ExampleAPIScorerConfig
+ from typing import List, Optional, Union, Sequence
+ from judgeval.scorers import ExampleAPIScorerConfig
+ from judgeval.scorers.example_scorer import ExampleScorer
  from judgeval.data.example import Example
  from judgeval.logger import judgeval_logger
  from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_DEFAULT_GPT_MODEL, JUDGMENT_ORG_ID
@@ -38,7 +39,7 @@ class JudgmentClient(metaclass=SingletonMeta):
      def run_evaluation(
          self,
          examples: List[Example],
-         scorers: List[Union[ExampleAPIScorerConfig, BaseScorer]],
+         scorers: Sequence[Union[ExampleAPIScorerConfig, ExampleScorer]],
          project_name: str = "default_project",
          eval_run_name: str = "default_eval_run",
          model: str = JUDGMENT_DEFAULT_GPT_MODEL,
@@ -51,10 +52,9 @@ class JudgmentClient(metaclass=SingletonMeta):
              examples=examples,
              scorers=scorers,
              model=model,
-             organization_id=self.organization_id,
          )

-         results = run_eval(eval, self.api_key)
+         results = run_eval(eval)
          if assert_test:
              assert_test_results(results)

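The signature change above means scorers may now be any Sequence mixing API scorer configs and custom ExampleScorer subclasses, and run_eval no longer takes the API key explicitly. A minimal usage sketch under those assumptions (the Example field names shown are illustrative, and FaithfulnessScorer is assumed to construct with defaults):

from judgeval import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import FaithfulnessScorer

# Credentials are read from JUDGMENT_API_KEY / JUDGMENT_ORG_ID in the environment.
client = JudgmentClient()
results = client.run_evaluation(
    examples=[Example(input="What is the capital of France?", actual_output="Paris")],
    scorers=(FaithfulnessScorer(),),  # any Sequence type-checks now, not only List
    project_name="default_project",
)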
judgeval/api/__init__.py CHANGED
@@ -137,12 +137,13 @@ class JudgmentSyncClient:
137
137
  payload,
138
138
  )
139
139
 
140
- def datasets_pull_all_for_judgeval(self, payload: DatasetsFetch) -> List[DatasetInfo]:
140
+ def datasets_pull_all_for_judgeval(self, payload: DatasetsFetch) -> Any:
141
141
  return self._request(
142
142
  "POST",
143
143
  url_for("/datasets/pull_all_for_judgeval/"),
144
144
  payload,
145
145
  )
146
+
146
147
  def datasets_create_for_judgeval(self, payload: DatasetCreate) -> Any:
147
148
  return self._request(
148
149
  "POST",
@@ -180,12 +181,12 @@ class JudgmentSyncClient:
180
181
  payload,
181
182
  )
182
183
 
183
- def fetch_scorer(
184
- self, payload: FetchPromptScorerRequest
185
- ) -> FetchPromptScorerResponse:
184
+ def fetch_scorers(
185
+ self, payload: FetchPromptScorersRequest
186
+ ) -> FetchPromptScorersResponse:
186
187
  return self._request(
187
188
  "POST",
188
- url_for("/fetch_scorer/"),
189
+ url_for("/fetch_scorers/"),
189
190
  payload,
190
191
  )
191
192
 
@@ -345,6 +346,13 @@ class JudgmentAsyncClient:
345
346
  payload,
346
347
  )
347
348
 
349
+ async def datasets_pull_all_for_judgeval(self, payload: DatasetsFetch) -> Any:
350
+ return await self._request(
351
+ "POST",
352
+ url_for("/datasets/pull_all_for_judgeval/"),
353
+ payload,
354
+ )
355
+
348
356
  async def datasets_create_for_judgeval(self, payload: DatasetCreate) -> Any:
349
357
  return await self._request(
350
358
  "POST",
@@ -384,12 +392,12 @@ class JudgmentAsyncClient:
384
392
  payload,
385
393
  )
386
394
 
387
- async def fetch_scorer(
388
- self, payload: FetchPromptScorerRequest
389
- ) -> FetchPromptScorerResponse:
395
+ async def fetch_scorers(
396
+ self, payload: FetchPromptScorersRequest
397
+ ) -> FetchPromptScorersResponse:
390
398
  return await self._request(
391
399
  "POST",
392
- url_for("/fetch_scorer/"),
400
+ url_for("/fetch_scorers/"),
393
401
  payload,
394
402
  )
395
403
 
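The fetch_scorer to fetch_scorers rename changes the route, the payload, and the response shape: the request now carries an optional list of names and the response wraps a list of scorers. A hedged sketch of the new call, mirroring how fetch_prompt_scorer uses it later in this diff (api_key and organization_id are assumed to be defined elsewhere):

from judgeval.api import JudgmentSyncClient

client = JudgmentSyncClient(api_key, organization_id)
# POST /fetch_scorers/ with an optional list of names; the response is {"scorers": [...]}
response = client.fetch_scorers({"names": ["my_prompt_scorer"]})
first_scorer = response["scorers"][0]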
judgeval/api/api_types.py CHANGED
@@ -1,6 +1,6 @@
1
1
  # generated by datamodel-codegen:
2
2
  # filename: .openapi.json
3
- # timestamp: 2025-09-10T17:42:12+00:00
3
+ # timestamp: 2025-09-12T16:54:35+00:00
4
4
 
5
5
  from __future__ import annotations
6
6
  from typing import Any, Dict, List, Literal, Optional, TypedDict, Union
@@ -19,6 +19,7 @@ class DatasetFetch(TypedDict):
19
19
  dataset_name: str
20
20
  project_name: str
21
21
 
22
+
22
23
  class DatasetsFetch(TypedDict):
23
24
  project_name: str
24
25
 
@@ -60,8 +61,8 @@ class SavePromptScorerResponse(TypedDict):
60
61
  name: str
61
62
 
62
63
 
63
- class FetchPromptScorerRequest(TypedDict):
64
- name: str
64
+ class FetchPromptScorersRequest(TypedDict):
65
+ names: NotRequired[Optional[List[str]]]
65
66
 
66
67
 
67
68
  class CustomScorerUploadPayload(TypedDict):
@@ -154,7 +155,7 @@ class ScorerData(TypedDict):
154
155
  score: NotRequired[Optional[float]]
155
156
  reason: NotRequired[Optional[str]]
156
157
  strict_mode: NotRequired[Optional[bool]]
157
- evaluation_model: NotRequired[str]
158
+ evaluation_model: NotRequired[Optional[str]]
158
159
  error: NotRequired[Optional[str]]
159
160
  additional_metadata: NotRequired[Optional[Dict[str, Any]]]
160
161
 
@@ -189,13 +190,13 @@ class OtelTraceSpan(TypedDict):
189
190
 
190
191
 
191
192
  class ExampleEvaluationRun(TypedDict):
192
- id: NotRequired[Optional[str]]
193
- project_name: NotRequired[Optional[str]]
194
- eval_name: NotRequired[Optional[str]]
193
+ id: NotRequired[str]
194
+ project_name: str
195
+ eval_name: str
195
196
  custom_scorers: NotRequired[List[BaseScorer]]
196
197
  judgment_scorers: NotRequired[List[ScorerConfig]]
197
198
  model: str
198
- created_at: NotRequired[Optional[str]]
199
+ created_at: NotRequired[str]
199
200
  examples: List[Example]
200
201
  trace_span_id: NotRequired[Optional[str]]
201
202
  trace_id: NotRequired[Optional[str]]
@@ -206,13 +207,13 @@ class HTTPValidationError(TypedDict):
206
207
 
207
208
 
208
209
  class TraceEvaluationRun(TypedDict):
209
- id: NotRequired[Optional[str]]
210
- project_name: NotRequired[Optional[str]]
211
- eval_name: NotRequired[Optional[str]]
210
+ id: NotRequired[str]
211
+ project_name: str
212
+ eval_name: str
212
213
  custom_scorers: NotRequired[List[BaseScorer]]
213
214
  judgment_scorers: NotRequired[List[ScorerConfig]]
214
215
  model: str
215
- created_at: NotRequired[Optional[str]]
216
+ created_at: NotRequired[str]
216
217
  trace_and_span_ids: List[TraceAndSpanId]
217
218
  is_offline: NotRequired[bool]
218
219
 
@@ -228,30 +229,31 @@ class DatasetReturn(TypedDict):
228
229
  project_name: str
229
230
  examples: NotRequired[Optional[List[Example]]]
230
231
 
232
+
231
233
  class DatasetInfo(TypedDict):
232
234
  dataset_id: str
233
235
  name: str
234
236
  created_at: str
235
237
  dataset_kind: DatasetKind
236
238
  entries: int
237
- creator: str
239
+ creator: str
238
240
 
239
241
 
240
242
  class DatasetCreate(TypedDict):
241
243
  name: str
242
244
  dataset_kind: DatasetKind
243
245
  project_name: str
244
- examples: NotRequired[Optional[List[Example]]]
245
- overwrite: NotRequired[Optional[bool]]
246
+ examples: List[Example]
247
+ overwrite: bool
246
248
 
247
249
 
248
- class FetchPromptScorerResponse(TypedDict):
249
- scorer: PromptScorer
250
+ class FetchPromptScorersResponse(TypedDict):
251
+ scorers: List[PromptScorer]
250
252
 
251
253
 
252
254
  class ScoringResult(TypedDict):
253
255
  success: bool
254
- scorers_data: Optional[List[ScorerData]]
256
+ scorers_data: List[ScorerData]
255
257
  name: NotRequired[Optional[str]]
256
258
  data_object: NotRequired[Optional[Union[OtelTraceSpan, Example]]]
257
259
  trace_id: NotRequired[Optional[str]]
judgeval/data/evaluation_run.py CHANGED
@@ -1,11 +1,11 @@
- from typing import List, Optional, Union, Tuple
- from litellm.files.main import BaseModel
- from pydantic import field_validator, model_validator, Field
+ from typing import List, Optional, Union, Tuple, Sequence
+ from pydantic import field_validator, model_validator, Field, BaseModel
  from datetime import datetime, timezone
  import uuid

  from judgeval.data import Example
- from judgeval.scorers import BaseScorer, APIScorerConfig
+ from judgeval.scorers import APIScorerConfig
+ from judgeval.scorers.example_scorer import ExampleScorer
  from judgeval.constants import ACCEPTABLE_MODELS
  from judgeval.data.judgment_types import (
      ExampleEvaluationRun as ExampleEvaluationRunJudgmentType,
@@ -14,19 +14,20 @@ from judgeval.data.judgment_types import (


  class EvaluationRun(BaseModel):
-     id: Optional[str] = Field(default_factory=lambda: str(uuid.uuid4()))
-     created_at: Optional[str] = Field(
+     id: str = Field(default_factory=lambda: str(uuid.uuid4()))
+     created_at: str = Field(
          default_factory=lambda: datetime.now(timezone.utc).isoformat()
      )
-     organization_id: Optional[str] = None
-     custom_scorers: Optional[List[BaseScorer]] = None
-     judgment_scorers: Optional[List[APIScorerConfig]] = None
-     scorers: Optional[List[Union[BaseScorer, APIScorerConfig]]] = None
+     custom_scorers: List[ExampleScorer] = Field(default_factory=list)
+     judgment_scorers: Sequence[APIScorerConfig] = Field(default_factory=list)
+     scorers: Sequence[Union[ExampleScorer, APIScorerConfig]] = Field(
+         default_factory=list
+     )
      model: str

      def __init__(
          self,
-         scorers: Optional[List[Union[BaseScorer, APIScorerConfig]]] = None,
+         scorers: Optional[List[Union[ExampleScorer, APIScorerConfig]]] = None,
          **kwargs,
      ):
          """
@@ -38,7 +39,7 @@ class EvaluationRun(BaseModel):
          """
          if scorers is not None:
              # Automatically sort scorers into appropriate fields
-             custom_scorers = [s for s in scorers if isinstance(s, BaseScorer)]
+             custom_scorers = [s for s in scorers if isinstance(s, ExampleScorer)]
              judgment_scorers = [s for s in scorers if isinstance(s, APIScorerConfig)]

              # Always set both fields as lists (even if empty) to satisfy validation
@@ -1,6 +1,6 @@
1
1
  # generated by datamodel-codegen:
2
2
  # filename: .openapi.json
3
- # timestamp: 2025-09-10T17:42:11+00:00
3
+ # timestamp: 2025-09-12T16:54:34+00:00
4
4
 
5
5
  from __future__ import annotations
6
6
  from typing import Annotated, Any, Dict, List, Optional, Union
@@ -22,6 +22,10 @@ class DatasetFetch(BaseModel):
22
22
  project_name: Annotated[str, Field(title="Project Name")]
23
23
 
24
24
 
25
+ class DatasetsFetch(BaseModel):
26
+ project_name: Annotated[str, Field(title="Project Name")]
27
+
28
+
25
29
  class ProjectAdd(BaseModel):
26
30
  project_name: Annotated[str, Field(title="Project Name")]
27
31
 
@@ -59,8 +63,8 @@ class SavePromptScorerResponse(BaseModel):
59
63
  name: Annotated[str, Field(title="Name")]
60
64
 
61
65
 
62
- class FetchPromptScorerRequest(BaseModel):
63
- name: Annotated[str, Field(title="Name")]
66
+ class FetchPromptScorersRequest(BaseModel):
67
+ names: Annotated[Optional[List[str]], Field(title="Names")] = None
64
68
 
65
69
 
66
70
  class CustomScorerUploadPayload(BaseModel):
@@ -210,8 +214,8 @@ class OtelTraceSpan(BaseModel):
210
214
 
211
215
  class ExampleEvaluationRun(BaseModel):
212
216
  id: Annotated[Optional[str], Field(title="Id")] = None
213
- project_name: Annotated[Optional[str], Field(title="Project Name")] = None
214
- eval_name: Annotated[Optional[str], Field(title="Eval Name")] = None
217
+ project_name: Annotated[str, Field(title="Project Name")]
218
+ eval_name: Annotated[str, Field(title="Eval Name")]
215
219
  custom_scorers: Annotated[
216
220
  Optional[List[BaseScorer]], Field(title="Custom Scorers")
217
221
  ] = []
@@ -231,8 +235,8 @@ class HTTPValidationError(BaseModel):
231
235
 
232
236
  class TraceEvaluationRun(BaseModel):
233
237
  id: Annotated[Optional[str], Field(title="Id")] = None
234
- project_name: Annotated[Optional[str], Field(title="Project Name")] = None
235
- eval_name: Annotated[Optional[str], Field(title="Eval Name")] = None
238
+ project_name: Annotated[str, Field(title="Project Name")]
239
+ eval_name: Annotated[str, Field(title="Eval Name")]
236
240
  custom_scorers: Annotated[
237
241
  Optional[List[BaseScorer]], Field(title="Custom Scorers")
238
242
  ] = []
@@ -259,23 +263,30 @@ class DatasetReturn(BaseModel):
259
263
  examples: Annotated[Optional[List[Example]], Field(title="Examples")] = None
260
264
 
261
265
 
266
+ class DatasetInfo(BaseModel):
267
+ dataset_id: Annotated[str, Field(title="Dataset Id")]
268
+ name: Annotated[str, Field(title="Name")]
269
+ created_at: Annotated[str, Field(title="Created At")]
270
+ dataset_kind: DatasetKind
271
+ entries: Annotated[int, Field(title="Entries")]
272
+ creator: Annotated[str, Field(title="Creator")]
273
+
274
+
262
275
  class DatasetCreate(BaseModel):
263
276
  name: Annotated[str, Field(title="Name")]
264
277
  dataset_kind: DatasetKind
265
278
  project_name: Annotated[str, Field(title="Project Name")]
266
- examples: Annotated[Optional[List[Example]], Field(title="Examples")] = None
267
- overwrite: Annotated[Optional[bool], Field(title="Overwrite")] = False
279
+ examples: Annotated[List[Example], Field(title="Examples")]
280
+ overwrite: Annotated[bool, Field(title="Overwrite")]
268
281
 
269
282
 
270
- class FetchPromptScorerResponse(BaseModel):
271
- scorer: PromptScorer
283
+ class FetchPromptScorersResponse(BaseModel):
284
+ scorers: Annotated[List[PromptScorer], Field(title="Scorers")]
272
285
 
273
286
 
274
287
  class ScoringResult(BaseModel):
275
288
  success: Annotated[bool, Field(title="Success")]
276
- scorers_data: Annotated[Optional[List[ScorerData]], Field(title="Scorers Data")] = (
277
- None
278
- )
289
+ scorers_data: Annotated[List[ScorerData], Field(title="Scorers Data")]
279
290
  name: Annotated[Optional[str], Field(title="Name")] = None
280
291
  data_object: Annotated[
281
292
  Optional[Union[OtelTraceSpan, Example]], Field(title="Data Object")
judgeval/data/result.py CHANGED
@@ -18,6 +18,7 @@ class ScoringResult(JudgmentScoringResult):

      # Need to override this so that it uses this repo's Example class
      data_object: Example
+     scorers_data: List[ScorerData]

      def model_dump(self, **kwargs):
          data = super().model_dump(**kwargs)
judgeval/data/scorer_data.py CHANGED
@@ -6,36 +6,11 @@ ScorerData holds the information related to a single, completed Scorer evaluatio

  from __future__ import annotations

- from judgeval.data.judgment_types import ScorerData as JudgmentScorerData
+ from judgeval.data.judgment_types import ScorerData
  from judgeval.scorers import BaseScorer
  from typing import List


- class ScorerData(JudgmentScorerData):
-     """
-     ScorerData holds the information related to a single, completed Scorer evaluation run.
-
-     For example, if running the Judgment Faithfulness scorer on an example, the ScorerData
-     object will contain whether the example passed its threshold expectation, as well as more detailed
-     information surrounding the evaluation run such as the claims and verdicts generated by the
-     judge model(s).
-     """
-
-     def to_dict(self) -> dict:
-         """Convert the ScorerData instance to a JSON-serializable dictionary."""
-         return {
-             "name": self.name,
-             "threshold": self.threshold,
-             "success": self.success,
-             "score": self.score,
-             "reason": self.reason,
-             "strict_mode": self.strict_mode,
-             "evaluation_model": self.evaluation_model,
-             "error": self.error,
-             "additional_metadata": self.additional_metadata,
-         }
-
-
  def create_scorer_data(scorer: BaseScorer) -> List[ScorerData]:
      """
      After a `scorer` is run, it contains information about the example that was evaluated
judgeval/dataset/__init__.py CHANGED
@@ -3,7 +3,7 @@ import orjson
  import os
  import yaml
  from dataclasses import dataclass
- from typing import List, Literal, Optional
+ from typing import List, Literal

  from judgeval.data import Example
  from judgeval.utils.file_utils import get_examples_from_yaml, get_examples_from_json
@@ -13,15 +13,17 @@ from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID

  from judgeval.api.api_types import DatasetKind

+
  @dataclass
  class DatasetInfo:
      dataset_id: str
-     name: str
+     name: str
      created_at: str
      dataset_kind: DatasetKind
      entries: int
      creator: str

+
  @dataclass
  class Dataset:
      examples: List[Example]
@@ -46,9 +48,12 @@ class Dataset:
          if not dataset:
              raise ValueError(f"Dataset {name} not found in project {project_name}")
          examples = dataset.get("examples", [])
+         if examples is None:
+             examples = []
+
          for e in examples:
-             if isinstance(e, dict) and isinstance(e.get("data"), dict):
-                 e.update(e.pop("data"))
+             if isinstance(e, dict) and isinstance(e.get("data", {}), dict):
+                 e.update(e.pop("data"))  # type: ignore
              e.pop(
                  "example_id"
              )  # TODO: remove once scorer data migraiton is complete
@@ -64,7 +69,7 @@ class Dataset:
          cls,
          name: str,
          project_name: str,
-         examples: Optional[List[Example]] = None,
+         examples: List[Example] = [],
          overwrite: bool = False,
      ):
          if not examples:
@@ -75,7 +80,7 @@ class Dataset:
              {
                  "name": name,
                  "project_name": project_name,
-                 "examples": [e.model_dump() for e in examples],
+                 "examples": examples,  # type: ignore
                  "dataset_kind": "example",
                  "overwrite": overwrite,
              }
@@ -87,18 +92,14 @@ class Dataset:
              project_name=project_name,
              examples=examples,
          )
+
      @classmethod
-     def list(
-         cls,
-         project_name: str
-     ):
+     def list(cls, project_name: str):
          client = JudgmentSyncClient(cls.judgment_api_key, cls.organization_id)
-         datasets = client.datasets_pull_all_for_judgeval(
-             {"project_name": project_name}
-         )
-
+         datasets = client.datasets_pull_all_for_judgeval({"project_name": project_name})
+
          judgeval_logger.info(f"Fetched all datasets for project {project_name}!")
-
+
          return [DatasetInfo(**dataset_info) for dataset_info in datasets]

      def add_from_json(self, file_path: str) -> None:
@@ -147,7 +148,7 @@ class Dataset:
              {
                  "dataset_name": self.name,
                  "project_name": self.project_name,
-                 "examples": [e.model_dump() for e in examples],
+                 "examples": examples,  # type: ignore
              }
          )

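With datasets_pull_all_for_judgeval now exposed on the client and Dataset.list collapsed to a single call, listing the datasets of a project looks roughly like this (a sketch; the DatasetInfo fields come from the dataclass above, and the project name is illustrative):

from judgeval.dataset import Dataset

for info in Dataset.list(project_name="default_project"):
    # Each entry is a DatasetInfo with dataset_id, name, created_at, dataset_kind, entries, creator.
    print(info.name, info.entries, info.creator)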
judgeval/env.py CHANGED
@@ -19,8 +19,17 @@ def optional_env_var(var_name: str, default: str | None = None) -> str | None:
      return os.getenv(var_name, default)


- JUDGMENT_API_KEY = optional_env_var("JUDGMENT_API_KEY")
- JUDGMENT_ORG_ID = optional_env_var("JUDGMENT_ORG_ID")
+ def required_env_var(var_name: str) -> str:
+     value = os.getenv(var_name)
+     if value is None:
+         raise EnvironmentError(
+             f"Environment variable '{var_name}' is required but not set."
+         )
+     return value
+
+
+ JUDGMENT_API_KEY = required_env_var("JUDGMENT_API_KEY")
+ JUDGMENT_ORG_ID = required_env_var("JUDGMENT_ORG_ID")
  JUDGMENT_API_URL = optional_env_var("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")

  JUDGMENT_DEFAULT_GPT_MODEL = optional_env_var("JUDGMENT_DEFAULT_GPT_MODEL", "gpt-4.1")
judgeval/evaluation/__init__.py CHANGED
@@ -3,14 +3,11 @@ from __future__ import annotations
  import asyncio
  import concurrent.futures
  import time
- import orjson
- import sys
  import threading
- from typing import List, Dict, Union, Tuple, TYPE_CHECKING
+ from typing import List, Tuple, TYPE_CHECKING
  from rich import print as rprint

- from judgeval.data import ScorerData, ScoringResult, Example
- from judgeval.scorers import BaseScorer, ExampleAPIScorerConfig
+ from judgeval.data import ScorerData, ScoringResult
  from judgeval.scorers.score import a_execute_scoring
  from judgeval.api import JudgmentSyncClient
  from judgeval.env import (
@@ -19,9 +16,10 @@ from judgeval.env import (
  from judgeval.exceptions import JudgmentAPIError, JudgmentRuntimeError
  from judgeval.logger import judgeval_logger

+ from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID

  if TYPE_CHECKING:
-     from judgeval.data.evaluation_run import EvaluationRun
+     from judgeval.data.evaluation_run import ExampleEvaluationRun


  def safe_run_async(coro):
@@ -49,8 +47,7 @@ def safe_run_async(coro):

  def log_evaluation_results(
      scoring_results: List[ScoringResult],
-     run: EvaluationRun,
-     judgment_api_key: str,
+     run: ExampleEvaluationRun,
  ) -> str:
      """
      Logs evaluation results to the Judgment API database.
@@ -65,10 +62,10 @@ def log_evaluation_results(
          ValueError: If there's a validation error with the results
      """
      try:
-         if not judgment_api_key or not run.organization_id:
+         if not JUDGMENT_API_KEY or not JUDGMENT_ORG_ID:
              raise ValueError("API key and organization ID are required")

-         api_client = JudgmentSyncClient(judgment_api_key, run.organization_id)
+         api_client = JudgmentSyncClient(JUDGMENT_API_KEY, JUDGMENT_ORG_ID)
          response = api_client.log_eval_results(
              {
                  "results": scoring_results,  # type: ignore
@@ -85,41 +82,8 @@ def log_evaluation_results(
          )


- def check_examples(
-     examples: List[Example], scorers: List[Union[ExampleAPIScorerConfig, BaseScorer]]
- ) -> None:
-     """
-     Checks if the example contains the necessary parameters for the scorer.
-     """
-     prompt_user = False
-     for scorer in scorers:
-         for example in examples:
-             missing_params = []
-             for param in scorer.required_params:
-                 if getattr(example, param.value) is None:
-                     missing_params.append(f"{param.value}")
-             if missing_params:
-                 rprint(
-                     f"[yellow]⚠️ WARNING:[/yellow] Example is missing required parameters for scorer [bold]{scorer.score_type.value}[/bold]"
-                 )
-                 rprint(f"Missing parameters: {', '.join(missing_params)}")
-                 rprint(
-                     f"Example: {orjson.dumps(example.model_dump(), option=orjson.OPT_INDENT_2).decode('utf-8')}"
-                 )
-                 rprint("-" * 40)
-                 prompt_user = True
-
-     if prompt_user:
-         user_input = input("Do you want to continue? (y/n)")
-         if user_input.lower() != "y":
-             sys.exit(0)
-         else:
-             rprint("[green]Continuing...[/green]")
-
-
  def _poll_evaluation_until_complete(
-     evaluation_run: EvaluationRun,
-     judgment_api_key: str,
+     evaluation_run: ExampleEvaluationRun,
      expected_scorer_data_count: int,
      poll_interval_seconds: float = 5,
      max_failures: int = 5,
@@ -140,13 +104,15 @@ def _poll_evaluation_until_complete(
      Returns:
          List[ScoringResult]: The evaluation results
      """
-     organization_id = evaluation_run.organization_id
      project_name = evaluation_run.project_name
      experiment_run_id = evaluation_run.id

+     if not project_name or not experiment_run_id:
+         raise ValueError("Project name and experiment run ID are required")
+
      poll_count = 0
      exception_count = 0
-     api_client = JudgmentSyncClient(judgment_api_key, organization_id)
+     api_client = JudgmentSyncClient(JUDGMENT_API_KEY, JUDGMENT_ORG_ID)
      while poll_count < max_poll_count:
          poll_count += 1
          try:
@@ -213,14 +179,13 @@ def progress_logger(stop_event, msg="Working...", interval=5):


  def run_eval(
-     evaluation_run: EvaluationRun,
-     judgment_api_key: str,
+     evaluation_run: ExampleEvaluationRun,
  ) -> List[ScoringResult]:
      """
      Executes an evaluation of `Example`s using one or more `Scorer`s

      Args:
-         evaluation_run (EvaluationRun): Stores example and evaluation together for running
+         evaluation_run (ExampleEvaluationRun): Stores example and evaluation together for running

      Returns:
          List[ScoringResult]: A list of ScoringResult objects
@@ -258,16 +223,13 @@ def run_eval(
          judgeval_logger.error(error_msg)
          raise ValueError(error_msg)

-     check_examples(evaluation_run.examples, evaluation_run.judgment_scorers)
      stop_event = threading.Event()
      t = threading.Thread(
          target=progress_logger, args=(stop_event, "Running evaluation...")
      )
      t.start()
      try:
-         api_client = JudgmentSyncClient(
-             judgment_api_key, evaluation_run.organization_id
-         )
+         api_client = JudgmentSyncClient(JUDGMENT_API_KEY, JUDGMENT_ORG_ID)
          response = api_client.add_to_run_eval_queue_examples(
              evaluation_run.model_dump(warnings=False)  # type: ignore
          )
@@ -286,7 +248,6 @@ def run_eval(
          )
          results, url = _poll_evaluation_until_complete(
              evaluation_run=evaluation_run,
-             judgment_api_key=judgment_api_key,
              expected_scorer_data_count=(num_scorers * len(evaluation_run.examples)),
          )
      finally:
@@ -306,7 +267,7 @@ def run_eval(
          send_results = [
              scoring_result.model_dump(warnings=False) for scoring_result in results
          ]
-         url = log_evaluation_results(send_results, evaluation_run, judgment_api_key)
+         url = log_evaluation_results(send_results, evaluation_run)
          rprint(
              f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
          )
@@ -323,27 +284,23 @@ def assert_test(scoring_results: List[ScoringResult]) -> None:
      Returns:
          None. Raises exceptions for any failed test cases.
      """
-     failed_cases: List[ScorerData] = []
+     failed_cases: List[List[ScorerData]] = []

      for result in scoring_results:
          if not result.success:
              # Create a test case context with all relevant fields
-             test_case: Dict = {"failed_scorers": []}
+             test_case: List[ScorerData] = []
              if result.scorers_data:
                  # If the result was not successful, check each scorer_data
                  for scorer_data in result.scorers_data:
                      if not scorer_data.success:
-                         if scorer_data.name == "Tool Order":
-                             # Remove threshold, evaluation model for Tool Order scorer
-                             scorer_data.threshold = None
-                             scorer_data.evaluation_model = None
-                         test_case["failed_scorers"].append(scorer_data)
+                         test_case.append(scorer_data)
              failed_cases.append(test_case)

      if failed_cases:
          error_msg = "The following test cases failed: \n"
          for fail_case in failed_cases:
-             for fail_scorer in fail_case["failed_scorers"]:
+             for fail_scorer in fail_case:
                  error_msg += (
                      f"\nScorer Name: {fail_scorer.name}\n"
                      f"Threshold: {fail_scorer.threshold}\n"
judgeval/integrations/langgraph/__init__.py CHANGED
@@ -507,6 +507,7 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
          )

          # Extract response content
+         output: Any
          if response.generations:
              last_generation = response.generations[-1][-1]
              if (
@@ -547,7 +548,7 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
              for key, value in usage_attrs.items():
                  span.set_attribute(key, value)

-             self._end_span(run_id=run_id, outputs=output, **usage_attrs)
+             self._end_span(run_id=run_id, outputs=output, **usage_attrs)  # type: ignore

          except Exception as e:
              judgeval_logger.exception(f"Error in on_llm_end: {e}")
judgeval/scorers/__init__.py CHANGED
@@ -4,6 +4,7 @@ from judgeval.scorers.api_scorer import (
      TraceAPIScorerConfig,
  )
  from judgeval.scorers.base_scorer import BaseScorer
+ from judgeval.scorers.example_scorer import ExampleScorer
  from judgeval.scorers.judgeval_scorers.api_scorers import (
      FaithfulnessScorer,
      AnswerRelevancyScorer,
@@ -18,6 +19,7 @@ __all__ = [
      "ExampleAPIScorerConfig",
      "TraceAPIScorerConfig",
      "BaseScorer",
+     "ExampleScorer",
      "TracePromptScorer",
      "PromptScorer",
      "FaithfulnessScorer",
judgeval/scorers/agent_scorer.py CHANGED
@@ -1,17 +1,17 @@
- from judgeval.scorers.base_scorer import BaseScorer
- from judgeval.data.judgment_types import Trace as JudgmentTrace
- from typing import List, Optional
- from abc import abstractmethod
+ # from judgeval.scorers.base_scorer import BaseScorer
+ # from judgeval.data.judgment_types import Trace as JudgmentTrace
+ # from typing import List, Optional
+ # from abc import abstractmethod


- class TraceScorer(BaseScorer):
-     @abstractmethod
-     async def a_score_trace(
-         self, trace: JudgmentTrace, tools: Optional[List] = None, *args, **kwargs
-     ) -> float:
-         """
-         Asynchronously measures the score on a trace
-         """
-         raise NotImplementedError(
-             "You must implement the `a_score_trace` method in your custom scorer"
-         )
+ # class TraceScorer(BaseScorer):
+ #     @abstractmethod
+ #     async def a_score_trace(
+ #         self, trace: JudgmentTrace, tools: Optional[List] = None, *args, **kwargs
+ #     ) -> float:
+ #         """
+ #         Asynchronously measures the score on a trace
+ #         """
+ #         raise NotImplementedError(
+ #             "You must implement the `a_score_trace` method in your custom scorer"
+ #         )
judgeval/scorers/base_scorer.py CHANGED
@@ -27,7 +27,7 @@ class BaseScorer(BaseModel):
      threshold: float = 0.5

      # name of your scorer (Faithfulness, PromptScorer-randomslug)
-     name: Optional[str] = None
+     name: str = ""

      # The name of the class of the scorer
      class_name: Optional[str] = None
@@ -42,7 +42,7 @@ class BaseScorer(BaseModel):
      using_native_model: Optional[bool] = None

      # Whether the test case passed or failed
-     success: Optional[bool] = None
+     success: bool = False

      # The name of the model used to evaluate the test case
      model: Optional[str] = None
judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py CHANGED
@@ -55,7 +55,7 @@ def fetch_prompt_scorer(
  ):
      client = JudgmentSyncClient(judgment_api_key, organization_id)
      try:
-         scorer_config = client.fetch_scorer({"name": name})["scorer"]
+         scorer_config = client.fetch_scorers({"names": [name]})["scorers"][0]
          scorer_config.pop("created_at")
          scorer_config.pop("updated_at")
          return scorer_config
judgeval/scorers/score.py CHANGED
@@ -21,7 +21,7 @@ from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL


  async def safe_a_score_example(
-     scorer: ExampleScorer,
+     scorer: Union[ExampleScorer],
      example: Example,
  ):
      """
judgeval/tracer/__init__.py CHANGED
@@ -43,8 +43,8 @@ from judgeval.env import (
      JUDGMENT_ORG_ID,
  )
  from judgeval.logger import judgeval_logger
- from judgeval.scorers.api_scorer import ExampleAPIScorerConfig, TraceAPIScorerConfig
- from judgeval.scorers.base_scorer import BaseScorer
+ from judgeval.scorers.api_scorer import TraceAPIScorerConfig, ExampleAPIScorerConfig
+ from judgeval.scorers.example_scorer import ExampleScorer
  from judgeval.tracer.constants import JUDGEVAL_TRACER_INSTRUMENTING_MODULE_NAME
  from judgeval.tracer.managers import (
      sync_span_context,
@@ -358,7 +358,6 @@ class Tracer:
          eval_run_name = f"async_trace_evaluate_{span_id}"

          eval_run = TraceEvaluationRun(
-             organization_id=self.organization_id,
              project_name=self.project_name,
              eval_name=eval_run_name,
              scorers=[scorer],
@@ -862,7 +861,7 @@ class Tracer:
          self,
          /,
          *,
-         scorer: Union[ExampleAPIScorerConfig, BaseScorer],
+         scorer: Union[ExampleAPIScorerConfig, ExampleScorer],
          example: Example,
          model: str = JUDGMENT_DEFAULT_GPT_MODEL,
          sampling_rate: float = 1.0,
@@ -871,9 +870,9 @@ class Tracer:
              judgeval_logger.info("Evaluation is not enabled, skipping evaluation")
              return

-         if not isinstance(scorer, (ExampleAPIScorerConfig, BaseScorer)):
+         if not isinstance(scorer, (ExampleAPIScorerConfig, ExampleScorer)):
              judgeval_logger.error(
-                 "Scorer must be an instance of ExampleAPIScorerConfig or BaseScorer, got %s, skipping evaluation."
+                 "Scorer must be an instance of ExampleAPIScorerConfig or ExampleScorer, got %s, skipping evaluation."
                  % type(scorer)
              )
              return
@@ -903,12 +902,11 @@ class Tracer:
          trace_id = format(span_context.trace_id, "032x")
          span_id = format(span_context.span_id, "016x")
          hosted_scoring = isinstance(scorer, ExampleAPIScorerConfig) or (
-             isinstance(scorer, BaseScorer) and scorer.server_hosted
+             isinstance(scorer, ExampleScorer) and scorer.server_hosted
          )
          eval_run_name = f"async_evaluate_{span_id}"  # note this name doesnt matter because we don't save the experiment only the example and scorer_data
          if hosted_scoring:
              eval_run = ExampleEvaluationRun(
-                 organization_id=self.organization_id,
                  project_name=self.project_name,
                  eval_name=eval_run_name,
                  examples=[example],
@@ -923,7 +921,6 @@ class Tracer:
          else:
              # Handle custom scorers using local evaluation queue
              eval_run = ExampleEvaluationRun(
-                 organization_id=self.organization_id,
                  project_name=self.project_name,
                  eval_name=eval_run_name,
                  examples=[example],
judgeval/tracer/local_eval_queue.py CHANGED
@@ -13,7 +13,7 @@ import time
  from judgeval.logger import judgeval_logger
  from judgeval.env import JUDGMENT_MAX_CONCURRENT_EVALUATIONS
  from judgeval.data import ScoringResult
- from judgeval.data.evaluation_run import EvaluationRun
+ from judgeval.data.evaluation_run import ExampleEvaluationRun
  from judgeval.utils.async_utils import safe_run_async
  from judgeval.scorers.score import a_execute_scoring
  from judgeval.api import JudgmentSyncClient
@@ -34,7 +34,7 @@ class LocalEvaluationQueue:
      ):
          if num_workers <= 0:
              raise ValueError("num_workers must be a positive integer.")
-         self._queue: queue.Queue[Optional[EvaluationRun]] = queue.Queue()
+         self._queue: queue.Queue[Optional[ExampleEvaluationRun]] = queue.Queue()
          self._max_concurrent = max_concurrent
          self._num_workers = num_workers  # Number of worker threads
          self._worker_threads: List[threading.Thread] = []
@@ -44,11 +44,11 @@ class LocalEvaluationQueue:
              organization_id=JUDGMENT_ORG_ID,
          )

-     def enqueue(self, evaluation_run: EvaluationRun) -> None:
+     def enqueue(self, evaluation_run: ExampleEvaluationRun) -> None:
          """Add evaluation run to the queue."""
          self._queue.put(evaluation_run)

-     def _process_run(self, evaluation_run: EvaluationRun) -> List[ScoringResult]:
+     def _process_run(self, evaluation_run: ExampleEvaluationRun) -> List[ScoringResult]:
          """Execute evaluation run locally and return results."""

          if not evaluation_run.custom_scorers:
@@ -70,7 +70,9 @@ class LocalEvaluationQueue:

      def run_all(
          self,
-         callback: Optional[Callable[[EvaluationRun, List[ScoringResult]], None]] = None,
+         callback: Optional[
+             Callable[[ExampleEvaluationRun, List[ScoringResult]], None]
+         ] = None,
      ) -> None:
          """Process all queued runs synchronously.

@@ -134,7 +136,9 @@ class LocalEvaluationQueue:

      def start_worker(
          self,
-         callback: Optional[Callable[[EvaluationRun, List[ScoringResult]], None]] = None,
+         callback: Optional[
+             Callable[[ExampleEvaluationRun, List[ScoringResult]], None]
+         ] = None,
      ) -> Optional[threading.Thread]:
          """Start a single background thread to process runs (backward compatibility).

@@ -144,7 +148,7 @@ class LocalEvaluationQueue:
          Returns:
              The started thread, or None if no threads were started.
          """
-         threads = self.start_workers(callback)
+         threads = self.start_workers()
          return threads[0] if threads else None

      def wait_for_completion(self, timeout: Optional[float] = None) -> bool:
judgeval/trainer/config.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional, Dict, Any, TYPE_CHECKING
  import json

  if TYPE_CHECKING:
-     from fireworks.llm.llm_reinforcement_step import ReinforcementAcceleratorTypeLiteral
+     from fireworks.llm.llm_reinforcement_step import ReinforcementAcceleratorTypeLiteral  # type: ignore[import-not-found]


  @dataclass
1
- from fireworks import LLM
1
+ from fireworks import LLM # type: ignore[import-not-found]
2
2
  from .config import TrainerConfig, ModelConfig
3
3
  from typing import Optional, Dict, Any, Callable
4
4
  from .console import _model_spinner_progress, _print_model_progress
judgeval/trainer/trainer.py CHANGED
@@ -2,7 +2,7 @@ import asyncio
  import json
  import time
  from typing import Optional, Callable, Any, List, Union, Dict
- from fireworks import Dataset
+ from fireworks import Dataset  # type: ignore[import-not-found]
  from .config import TrainerConfig, ModelConfig
  from .trainable_model import TrainableModel
  from judgeval.tracer import Tracer
@@ -10,7 +10,7 @@ from judgeval.tracer.exporters.store import SpanStore
  from judgeval.tracer.exporters import InMemorySpanExporter
  from judgeval.tracer.keys import AttributeKeys
  from judgeval import JudgmentClient
- from judgeval.scorers import BaseScorer, ExampleAPIScorerConfig
+ from judgeval.scorers import ExampleScorer, ExampleAPIScorerConfig
  from judgeval.data import Example
  from .console import _spinner_progress, _print_progress, _print_progress_update
  from judgeval.exceptions import JudgmentRuntimeError
@@ -85,7 +85,9 @@ class JudgmentTrainer:
                  if not first_found and span_attributes.get(
                      AttributeKeys.JUDGMENT_INPUT
                  ):
-                     input_data = span_attributes.get(AttributeKeys.JUDGMENT_INPUT, {})
+                     input_data: Any = span_attributes.get(
+                         AttributeKeys.JUDGMENT_INPUT, {}
+                     )
                      if isinstance(input_data, dict) and "messages" in input_data:
                          input_messages = input_data["messages"]
                          if input_messages:
@@ -154,7 +156,7 @@ class JudgmentTrainer:
      async def generate_rollouts_and_rewards(
          self,
          agent_function: Callable[[Any], Any],
-         scorers: List[Union[ExampleAPIScorerConfig, BaseScorer]],
+         scorers: List[Union[ExampleAPIScorerConfig, ExampleScorer]],
          prompts: List[Any],
          num_prompts_per_step: Optional[int] = None,
          num_generations_per_prompt: Optional[int] = None,
@@ -264,7 +266,7 @@ class JudgmentTrainer:
      async def run_reinforcement_learning(
          self,
          agent_function: Callable[[Any], Any],
-         scorers: List[Union[ExampleAPIScorerConfig, BaseScorer]],
+         scorers: List[Union[ExampleAPIScorerConfig, ExampleScorer]],
          prompts: List[Any],
      ) -> ModelConfig:
          """
@@ -370,7 +372,7 @@ class JudgmentTrainer:
      async def train(
          self,
          agent_function: Callable[[Any], Any],
-         scorers: List[Union[ExampleAPIScorerConfig, BaseScorer]],
+         scorers: List[Union[ExampleAPIScorerConfig, ExampleScorer]],
          prompts: List[Any],
          rft_provider: Optional[str] = None,
      ) -> ModelConfig:
judgeval/utils/async_utils.py CHANGED
@@ -2,13 +2,13 @@

  import asyncio
  import concurrent.futures
- from typing import Awaitable, TypeVar
+ from typing import Awaitable, TypeVar, Coroutine


  T = TypeVar("T")


- def safe_run_async(coro: Awaitable[T]) -> T:  # type: ignore[type-var]
+ def safe_run_async(coro: Awaitable[T]) -> T:
      """Safely execute an async *coro* from synchronous code.

      This helper handles two common situations:
@@ -24,6 +24,8 @@ def safe_run_async(coro: Awaitable[T]) -> T:  # type: ignore[type-var]
      Returns:
          The result returned by *coro*.
      """
+     if not isinstance(coro, Coroutine):
+         raise TypeError("The provided awaitable must be a coroutine.")

      try:
          asyncio.get_running_loop()
@@ -31,5 +33,7 @@ def safe_run_async(coro: Awaitable[T]) -> T:  # type: ignore[type-var]
          return asyncio.run(coro)

      with concurrent.futures.ThreadPoolExecutor() as executor:
-         future = executor.submit(lambda: asyncio.run(coro))
+         future: concurrent.futures.Future[T] = executor.submit(
+             lambda: asyncio.run(coro)
+         )
          return future.result()
judgeval/utils/testing.py CHANGED
@@ -26,10 +26,6 @@ def assert_test_results(scoring_results: List[ScoringResult]) -> None:
              # If the result was not successful, check each scorer_data
              for scorer_data in result.scorers_data:
                  if not scorer_data.success:
-                     if scorer_data.name == "Tool Order":
-                         # Remove threshold, evaluation model for Tool Order scorer
-                         scorer_data.threshold = None
-                         scorer_data.evaluation_model = None
                      test_case.append(scorer_data)
              failed_cases.append(test_case)

judgeval/version.py CHANGED
@@ -1,4 +1,4 @@
- __version__ = "0.0.0"
+ __version__ = "0.12.0"


  def get_version() -> str:
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: judgeval
- Version: 0.10.1
+ Version: 0.12.0
  Summary: Judgeval Package
  Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
  Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -1,38 +1,37 @@
1
- judgeval/__init__.py,sha256=MqB1s0zp-Fr_KvKFjGKnRHUeulutmrlMcUyjNRRAU_4,4962
1
+ judgeval/__init__.py,sha256=LDL_vOvI6LmMwbVt6NMPwponDeEOaGHV-nd_0wSCLHM,4957
2
2
  judgeval/cli.py,sha256=R5IiIQmSVg21kQHX2kL3sOeXCxvvAMSqyva3Z9AoSXc,1560
3
3
  judgeval/constants.py,sha256=h7Cuf_2uvNzHZi8nqRFoMpvsQUZMS3mlNB3s2uduse8,3557
4
- judgeval/env.py,sha256=R0bj7XU29RIVVQjkVMa11ObhOYVMbaE_3LTvL3I9dWM,2212
4
+ judgeval/env.py,sha256=QO_77E2oX5LLf29XgqLdUoYUIqEaGxd9mcCco6rzS-w,2445
5
5
  judgeval/exceptions.py,sha256=tTbfe4yoOtPXmn22UQz9-6a-5PT9uOko85xaRRwr0Sw,621
6
6
  judgeval/logger.py,sha256=ZWbp0QfT1CJnQIjV-Zle4n489nFCKEmD2-ukx--iiow,1553
7
- judgeval/version.py,sha256=kJtYsih3hTYZ_rY_Lt0RcFqvjAfF5Xo1uNq0jZWJ5pw,73
7
+ judgeval/version.py,sha256=necdb4jxf2rIhW5LPI_UhDC8zSb9h-dNqtKbwoLv6z8,74
8
8
  judgeval/warnings.py,sha256=LbGte14ppiFjrkp-JJYueZ40NWFvMkWRvPXr6r-fUWw,73
9
- judgeval/api/__init__.py,sha256=asbr9nuP7H_0jh53P-LB8sQnRTYIRI6oBTxbigh3YdI,12993
10
- judgeval/api/api_types.py,sha256=hInVnVHrYFdPz9NiDtK5ik0rgRiB29a4PUkpRJYocRs,6666
9
+ judgeval/api/__init__.py,sha256=3Pm0qQ4ZQj76jUsJVrnuazRnYcqF3pzM_Wv_Z6lOv0w,13216
10
+ judgeval/api/api_types.py,sha256=AEh_9WpL0wTDUKZ0CwphkiGV3IeysBgTE9FzX4VYPic,6528
11
11
  judgeval/data/__init__.py,sha256=1tU0EN0ThIfQ1fad5I3dKxAfTcZ5U8cvTLcQ6qLVLU0,407
12
- judgeval/data/evaluation_run.py,sha256=G7ad4eDQTjketfcQRITk8bs8CIO8rm058H1G_qkLmhc,4729
12
+ judgeval/data/evaluation_run.py,sha256=N47waxScMFKvGBxADX2FrfjW4wT5Zqd8n1PZKWb7JMA,4766
13
13
  judgeval/data/example.py,sha256=eGJpF-lyUH734Cg90B7WtU9f8iKoS3VFGeV6R-GVCCc,1039
14
- judgeval/data/judgment_types.py,sha256=JkhNG6fRBFdryG8ogVZsMWtq3W3JmWh0AYIR8LdBAT4,11773
15
- judgeval/data/result.py,sha256=LA0OzwcVKwD5NkmtmFuA_EusmYRyE10mjDMXa2bgU1g,2067
16
- judgeval/data/scorer_data.py,sha256=g9PE0DNLikW0LgxGWhgpCiNVOX8PzqEaZKivifLOUDI,2997
17
- judgeval/data/tool.py,sha256=bj_WxFg22mypUUVR5KqQRxMDHWvKwiE1MMPjLnTCoDU,99
14
+ judgeval/data/judgment_types.py,sha256=8cGuj6VAHjYPfmHZL_Bb4D0D2bLP0V9-_Wec2WZhjKA,12130
15
+ judgeval/data/result.py,sha256=XufFGSAkBDfevPUmzSgsR9HEqytISkM0U5HkhJmsjpY,2102
16
+ judgeval/data/scorer_data.py,sha256=HeP15ZgftFTJCF8JmDJCLWXRnZJIaGDJCzl7Hg6gWwE,2006
18
17
  judgeval/data/trace.py,sha256=R9RF1kv1JHeOpjXLjErJcxV2RrNrJUSqWcWe73l3f9k,503
19
18
  judgeval/data/scripts/fix_default_factory.py,sha256=lvp2JwYZqz-XpD9LZNa3mANZVP-jJSZoNzolI6JWERM,591
20
19
  judgeval/data/scripts/openapi_transform.py,sha256=Sm04JClzyP1ga8KA3gkIdsae8Hlx-XU7-x0gHCQYOhg,3877
21
- judgeval/dataset/__init__.py,sha256=gzjozPF_Sz0DhlPflYyHsdZxU8K4L1MplZ2W9-qSJiU,6393
22
- judgeval/evaluation/__init__.py,sha256=u-aDyLTRebPZigeBbJHpnZk3wQAS7jv_VgLXIi-jMGU,15075
23
- judgeval/integrations/langgraph/__init__.py,sha256=VvqCKOk65A2gLlr8uWrJVzpRF5OnIja5zwF4hGPEFsw,27540
20
+ judgeval/dataset/__init__.py,sha256=2B3ifWP_gn_4l0GgZaY2tB9UuV8m7dI1BEWwMgckDOc,6348
21
+ judgeval/evaluation/__init__.py,sha256=6bSC1Sw-fpJN6OkZTv4UtAoYZqkjUy7OG17lxiRX5qE,13321
22
+ judgeval/integrations/langgraph/__init__.py,sha256=Ow2rl21SmRQNVVR_WfejCsxFPcLvFFlpvKVgG0_igEQ,27580
24
23
  judgeval/judges/__init__.py,sha256=e7JnTc1TG_SwqydDHTXHIP0EBazQxt-ydMQG7ghSU5A,228
25
24
  judgeval/judges/base_judge.py,sha256=_dz0qWsKRxzXxpRY9l6mrxTRYPSF2FE4ZXkrzhZ4gbY,986
26
25
  judgeval/judges/litellm_judge.py,sha256=5vEF0IUo7HVWnOF2ww-DMke8Xkarnz32B_qbgKjc0-I,4182
27
26
  judgeval/judges/together_judge.py,sha256=GzwlXZJzle8hT-vWKmq39JyIeanJqJfHDOkrksUbzk0,4398
28
27
  judgeval/judges/utils.py,sha256=ITbYwvjU3o9-FIAReFvxh24yJrx9LV3l9BnSBgKUpxg,2068
29
- judgeval/scorers/__init__.py,sha256=34PMPsfR2_3n7T96wpSfAZJWzWlU6v53S3mGX2PE87k,665
30
- judgeval/scorers/agent_scorer.py,sha256=V1NSwhGWgtXPsX-blKLkDLsPPbEiP-A4614X-95dtlQ,565
28
+ judgeval/scorers/__init__.py,sha256=pomKzEy4YNFyygYp8vbS3co8iB5CMstRkQwdUgi1u4g,744
29
+ judgeval/scorers/agent_scorer.py,sha256=-qcNSkY6i7ur2LXkM7H1jTKuuFbDuXbjTq42o3vjeQ8,595
31
30
  judgeval/scorers/api_scorer.py,sha256=8TUJut9r74v-qMACiSKAUbDI1v3ZItPXrTz8s4_Lrgk,2287
32
- judgeval/scorers/base_scorer.py,sha256=naGiZYHnkn9HVwY-jpOY7O6cYPJJJe5dHbrRBSOikxw,2723
31
+ judgeval/scorers/base_scorer.py,sha256=hsMuqdW8QtW5n9JzruXyaZC7im2K2sSmz1RDkbMisJ4,2702
33
32
  judgeval/scorers/example_scorer.py,sha256=o_BGUztJXjnKnuOqIa9T4PXe0wPoWg63FyH518N1LxA,561
34
33
  judgeval/scorers/exceptions.py,sha256=ACDHK5-TWiF3NTk-wycaedpbrdobm-CvvC1JA_iP-Mk,179
35
- judgeval/scorers/score.py,sha256=xquM59SCtNeuAsrBsHFgBQk3CHp4-bms4oFs24xfcU0,7176
34
+ judgeval/scorers/score.py,sha256=95tnNRnihrEVvG0yH-RDTQ8KoiBakDijjukclqxH5KE,7183
36
35
  judgeval/scorers/utils.py,sha256=iSZONwK0HecxUPz-cMCyra_87DSCag1E8BdpF2a4_44,377
37
36
  judgeval/scorers/judgeval_scorers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
38
37
  judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=wrq7y9I30GZbwDXIrSh81KRO_-j7i-1DjwX5Hc3PScI,728
@@ -40,11 +39,11 @@ judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=_qa1s
40
39
  judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py,sha256=ciiFBQQC4UDsk9qou9OiKbAR31s82eRUY1ZTt1gdM-0,407
41
40
  judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py,sha256=lIJ3GgOI9tfbrC7voZMvlxXdK3X1bhdj2zNxqdaGIkM,545
42
41
  judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py,sha256=bSwbpVNhpkpEeX3GtCJuyz5vFyY1gbyqYEfaBF2KTVY,697
43
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py,sha256=E2_TVO88iLSBAdcKYnfHYp4cUyffgG_p1th5aCpjCd8,9680
44
- judgeval/tracer/__init__.py,sha256=mQQaca8XJRYwSRn7a5x63dFQeA8xGjwfoZYikQCAAyI,35214
42
+ judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py,sha256=djPfHC8NP9srwTAgp075kK_zz6Tbn2WFIh6jOZjqppQ,9688
43
+ judgeval/tracer/__init__.py,sha256=YLJklv1YfNDV61GiJw3PflLp_cajxAnXHojJVKitbz4,35074
45
44
  judgeval/tracer/constants.py,sha256=ae8tivAW97awJQxdRB9OMqX50wOLX3zqChT_AGkPBu0,85
46
45
  judgeval/tracer/keys.py,sha256=qXPoZSkEhVF-YYfQ9-zeDMVdr4GtpPf2W7MPJaN2AQo,2889
47
- judgeval/tracer/local_eval_queue.py,sha256=iv9on1G4woGlhYn1mZATEMkzCiz-qVn2cdzEINzQFYQ,7242
46
+ judgeval/tracer/local_eval_queue.py,sha256=KZKvSSli7B-EVzdHa4-CmXUpv0uOjGLLRa2KTPg8lRc,7320
48
47
  judgeval/tracer/managers.py,sha256=h2ZHJ61_vf3cS-HlEUiodFzKDUuQWIhYC6n7pMVyM9c,6113
49
48
  judgeval/tracer/utils.py,sha256=3_8ZjjF4XgNyAu9LpThq5dVOcwdwI-E3vb-HRl_Px8c,594
50
49
  judgeval/tracer/exporters/__init__.py,sha256=lnZXfPGaQH844HAIuZCQqjqhnmZGA98kHY8Xp-Oi4Ws,1220
@@ -55,21 +54,21 @@ judgeval/tracer/llm/__init__.py,sha256=p9uwWPg9k-NcWjj9TbwQj55sHhBOqRYx2-Ld6YHaF
55
54
  judgeval/tracer/llm/providers.py,sha256=QQLJlSNnDjXRAc2Wqw78o254COJUSXX39D7D_mx3NVA,2651
56
55
  judgeval/tracer/processors/__init__.py,sha256=tXbQaXGMQeutgM_7d5Y2EFTeSjbVEBky685Dst_v3rg,8672
57
56
  judgeval/trainer/__init__.py,sha256=h_DDVV7HFF7HUPAJFpt2d9wjqgnmEVcHxqZyB1k7pPQ,257
58
- judgeval/trainer/config.py,sha256=8s0X8B334PJomorwONaUpb6K8cAMxRdYAeQdtx7HPHs,4258
57
+ judgeval/trainer/config.py,sha256=sAAVBgeoFDJWYjGIgOvoQoiO0gtqNAOI6MHncwdN_mk,4292
59
58
  judgeval/trainer/console.py,sha256=PJ0rCnDwC7aoW-VsLDS96ZyMyagh-l9EOJKff1ATIpo,4342
60
- judgeval/trainer/trainable_model.py,sha256=vSDtHJJ-fLczC2gkaY9jG6TQvLgWqaVjElm1l8YlJcU,8959
61
- judgeval/trainer/trainer.py,sha256=YhepEm3M-5z1RB50cAEsLbZiOIE_fOWiX-thyvBj6v4,16578
62
- judgeval/utils/async_utils.py,sha256=lgCgi8gkLUcAEepruEkx-AGQgJnAJpKmBIhZx6Y0q2s,935
59
+ judgeval/trainer/trainable_model.py,sha256=T-Sioi_sXtfYlcu3lE0cd60PHs8DrYaZ-Kxb4h1nU04,8993
60
+ judgeval/trainer/trainer.py,sha256=FBhHq2YPooKADDCC_IEKex81L6a5quCmAMyl9mn3QLk,16675
61
+ judgeval/utils/async_utils.py,sha256=AF1xdu8Ao5GyhFvfaLOaKJHn1RISyXZ4U70UZe9zfBA,1083
63
62
  judgeval/utils/decorators.py,sha256=rdqY1w0zNL6O6GU6Wdeo0-x5EgpFTEhU2vkgiWsRYdc,525
64
63
  judgeval/utils/file_utils.py,sha256=3LI1YCZwO5ogTgJreyOgRgDksey3natO2Td1PQqaPyY,3252
65
64
  judgeval/utils/guards.py,sha256=QBb6m6KElxdvt2bskLZCKh_zGHbBcqV-VfGzT63o3hY,807
66
65
  judgeval/utils/meta.py,sha256=wQFCLJTNKF9yUdXcw37AT6mC-wqzZpAvjn5gP_6flD8,349
67
66
  judgeval/utils/serialize.py,sha256=QXR-8Nj5rqOrI9zLx0oRLdk6DW6Bc7j8eyF4zQ7PLxA,6256
68
- judgeval/utils/testing.py,sha256=kJOq4LlEXaNThfg9oSIRqSK7IH8AwLgbukjn5uxMY7A,3661
67
+ judgeval/utils/testing.py,sha256=4HO4UCZQgeB7wi-LQoKPjiAYMbj4PpeApAnxZdmI_8w,3392
69
68
  judgeval/utils/url.py,sha256=Shf0v3XcbaWpL0m1eGJEEO_z4TsQCnDB2Rl25OTUmiI,195
70
69
  judgeval/utils/version_check.py,sha256=kcF6SvB6GbVKI0Gv9QRVm-kvBn9_z-c3jmPORsXO3h0,1015
71
- judgeval-0.10.1.dist-info/METADATA,sha256=Jd1eGkgAIO5XGpCaD42riNtaJ6DYJQeGCs8JLrl6Ibs,8870
72
- judgeval-0.10.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
73
- judgeval-0.10.1.dist-info/entry_points.txt,sha256=-eoeD-oDLn4A7MSgeBS9Akwanf3_0r0cgEleBcIOjg0,46
74
- judgeval-0.10.1.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
75
- judgeval-0.10.1.dist-info/RECORD,,
70
+ judgeval-0.12.0.dist-info/METADATA,sha256=RVS9bm8KrWk-ifawDz1s9oDx_NY3zjGPkbknKKzpjeM,8870
71
+ judgeval-0.12.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
72
+ judgeval-0.12.0.dist-info/entry_points.txt,sha256=-eoeD-oDLn4A7MSgeBS9Akwanf3_0r0cgEleBcIOjg0,46
73
+ judgeval-0.12.0.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
74
+ judgeval-0.12.0.dist-info/RECORD,,
judgeval/data/tool.py DELETED
@@ -1,5 +0,0 @@
- from judgeval.data.judgment_types import Tool as JudgmentTool
-
-
- class Tool(JudgmentTool):
-     pass