judgeval 0.10.1__tar.gz → 0.12.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {judgeval-0.10.1 → judgeval-0.12.0}/.github/workflows/ci.yaml +2 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/.github/workflows/release.yaml +1 -1
- judgeval-0.12.0/.pre-commit-config.yaml +23 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/PKG-INFO +1 -1
- {judgeval-0.10.1 → judgeval-0.12.0}/pyproject.toml +1 -1
- {judgeval-0.10.1 → judgeval-0.12.0}/scripts/api_generator.py +2 -1
- {judgeval-0.10.1 → judgeval-0.12.0}/scripts/openapi_transform.py +2 -1
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/__init__.py +5 -5
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/api/__init__.py +17 -9
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/api/api_types.py +20 -18
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/data/evaluation_run.py +13 -12
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/data/judgment_types.py +25 -14
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/data/result.py +1 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/data/scorer_data.py +1 -26
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/dataset/__init__.py +17 -16
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/env.py +11 -2
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/evaluation/__init__.py +20 -63
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/integrations/langgraph/__init__.py +2 -1
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/scorers/__init__.py +2 -0
- judgeval-0.12.0/src/judgeval/scorers/agent_scorer.py +17 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/scorers/base_scorer.py +2 -2
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +1 -1
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/scorers/score.py +1 -1
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/tracer/__init__.py +6 -9
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/tracer/local_eval_queue.py +11 -7
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/trainer/config.py +1 -1
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/trainer/trainable_model.py +1 -1
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/trainer/trainer.py +8 -6
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/utils/async_utils.py +7 -3
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/utils/testing.py +0 -4
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/version.py +1 -1
- judgeval-0.12.0/update_version.py +35 -0
- judgeval-0.10.1/.pre-commit-config.yaml +0 -23
- judgeval-0.10.1/src/judgeval/data/tool.py +0 -5
- judgeval-0.10.1/src/judgeval/scorers/agent_scorer.py +0 -17
- judgeval-0.10.1/update_version.py +0 -32
- {judgeval-0.10.1 → judgeval-0.12.0}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/.github/pull_request_template.md +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/.github/workflows/blocked-pr.yaml +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/.github/workflows/claude-code-review.yml +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/.github/workflows/claude.yml +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/.github/workflows/lint.yaml +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/.github/workflows/merge-branch-check.yaml +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/.github/workflows/mypy.yaml +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/.github/workflows/pre-commit-autoupdate.yaml +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/.github/workflows/validate-branch.yaml +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/.gitignore +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/LICENSE.md +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/README.md +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/assets/Screenshot 2025-05-17 at 8.14.27 PM.png +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/assets/agent.gif +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/assets/agent_trace_example.png +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/assets/data.gif +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/assets/dataset_clustering_screenshot.png +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/assets/dataset_clustering_screenshot_dm.png +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/assets/datasets_preview_screenshot.png +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/assets/document.gif +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/assets/error_analysis_dashboard.png +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/assets/errors.png +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/assets/experiments_dashboard_screenshot.png +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/assets/experiments_page.png +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/assets/experiments_pagev2.png +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/assets/logo-dark.svg +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/assets/logo-light.svg +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/assets/monitoring_screenshot.png +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/assets/new_darkmode.svg +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/assets/new_lightmode.svg +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/assets/online_eval.png +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/assets/product_shot.png +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/assets/test.png +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/assets/tests.png +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/assets/trace.gif +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/assets/trace_demo.png +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/assets/trace_screenshot.png +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/assets/trace_screenshot_old.png +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/pytest.ini +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/scripts/update_types.sh +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/cli.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/constants.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/data/__init__.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/data/example.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/data/scripts/fix_default_factory.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/data/scripts/openapi_transform.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/data/trace.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/exceptions.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/judges/__init__.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/judges/base_judge.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/judges/litellm_judge.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/judges/together_judge.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/judges/utils.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/logger.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/scorers/api_scorer.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/scorers/example_scorer.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/scorers/exceptions.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/scorers/utils.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/tracer/constants.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/tracer/exporters/__init__.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/tracer/exporters/s3.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/tracer/exporters/store.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/tracer/exporters/utils.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/tracer/keys.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/tracer/llm/__init__.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/tracer/llm/providers.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/tracer/managers.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/tracer/processors/__init__.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/tracer/utils.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/trainer/__init__.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/trainer/console.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/utils/decorators.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/utils/file_utils.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/utils/guards.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/utils/meta.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/utils/serialize.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/utils/url.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/utils/version_check.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/src/judgeval/warnings.py +0 -0
- {judgeval-0.10.1 → judgeval-0.12.0}/uv.lock +0 -0
judgeval-0.12.0/.pre-commit-config.yaml (new file)
@@ -0,0 +1,23 @@
+repos:
+  - repo: https://github.com/astral-sh/uv-pre-commit
+    rev: 0.8.17
+    hooks:
+      - id: uv-lock
+
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.13.0
+    hooks:
+      - id: ruff
+        name: ruff (linter)
+        args: [--fix]
+      - id: ruff-format
+        name: ruff (formatter)
+
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v1.17.0
+    hooks:
+      - id: mypy
+        language: system
+        # These next two lines allow commits even if mypy fails, REMOVE once we fix all mypy errors
+        verbose: true
+        entry: bash -c 'mypy src/judgeval/ || true'
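In practice, this hook set means "pre-commit run --all-files" executes uv-lock, both ruff hooks, and mypy; because the mypy entry is wrapped in bash -c 'mypy src/judgeval/ || true' with verbose: true, type errors are printed but can never block a commit, as the in-file comment notes.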
scripts/api_generator.py
@@ -33,12 +33,13 @@ JUDGEVAL_PATHS: List[str] = [
     "/add_to_run_eval_queue/traces",
     "/get_evaluation_status/",
     "/save_scorer/",
-    "/
+    "/fetch_scorers/",
     "/scorer_exists/",
     "/upload_custom_scorer/",
     "/datasets/create_for_judgeval/",
     "/datasets/insert_examples_for_judgeval/",
     "/datasets/pull_for_judgeval/",
+    "/datasets/pull_all_for_judgeval/",
     "/projects/resolve/",
     "/e2e_fetch_trace/",
     "/e2e_fetch_span_score/",
scripts/openapi_transform.py
@@ -32,12 +32,13 @@ JUDGEVAL_PATHS: List[str] = [
     "/add_to_run_eval_queue/traces",
     "/get_evaluation_status/",
     "/save_scorer/",
-    "/
+    "/fetch_scorers/",
     "/scorer_exists/",
     "/upload_custom_scorer/",
     "/datasets/create_for_judgeval/",
     "/datasets/insert_examples_for_judgeval/",
     "/datasets/pull_for_judgeval/",
+    "/datasets/pull_all_for_judgeval/",
     "/projects/resolve/",
     "/e2e_fetch_trace/",
     "/e2e_fetch_span_score/",
src/judgeval/__init__.py
@@ -5,8 +5,9 @@ from judgeval.evaluation import run_eval
 from judgeval.data.evaluation_run import ExampleEvaluationRun
 
 
-from typing import List, Optional, Union
-from judgeval.scorers import
+from typing import List, Optional, Union, Sequence
+from judgeval.scorers import ExampleAPIScorerConfig
+from judgeval.scorers.example_scorer import ExampleScorer
 from judgeval.data.example import Example
 from judgeval.logger import judgeval_logger
 from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_DEFAULT_GPT_MODEL, JUDGMENT_ORG_ID
@@ -38,7 +39,7 @@ class JudgmentClient(metaclass=SingletonMeta):
     def run_evaluation(
         self,
         examples: List[Example],
-        scorers:
+        scorers: Sequence[Union[ExampleAPIScorerConfig, ExampleScorer]],
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
         model: str = JUDGMENT_DEFAULT_GPT_MODEL,
@@ -51,10 +52,9 @@ class JudgmentClient(metaclass=SingletonMeta):
             examples=examples,
             scorers=scorers,
             model=model,
-            organization_id=self.organization_id,
         )
 
-        results = run_eval(eval
+        results = run_eval(eval)
         if assert_test:
             assert_test_results(results)
 
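Taken together, run_evaluation now accepts any Sequence mixing server-side scorer configs with custom ExampleScorers, and organization_id is no longer threaded through the run payload. A hypothetical usage sketch; the Example field names and the FaithfulnessScorer re-export are assumptions, not shown in this diff:

from judgeval import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import FaithfulnessScorer  # re-export assumed

client = JudgmentClient()
results = client.run_evaluation(
    examples=[Example(input="What is 2 + 2?", actual_output="4")],  # field names assumed
    scorers=[FaithfulnessScorer(threshold=0.8)],  # may mix API configs and ExampleScorers
    project_name="default_project",
    eval_run_name="smoke_test",
)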
src/judgeval/api/__init__.py
@@ -137,12 +137,13 @@ class JudgmentSyncClient:
             payload,
         )
 
-    def datasets_pull_all_for_judgeval(self, payload: DatasetsFetch) ->
+    def datasets_pull_all_for_judgeval(self, payload: DatasetsFetch) -> Any:
         return self._request(
             "POST",
             url_for("/datasets/pull_all_for_judgeval/"),
             payload,
         )
+
     def datasets_create_for_judgeval(self, payload: DatasetCreate) -> Any:
         return self._request(
             "POST",
@@ -180,12 +181,12 @@ class JudgmentSyncClient:
             payload,
         )
 
-    def
-        self, payload:
-    ) ->
+    def fetch_scorers(
+        self, payload: FetchPromptScorersRequest
+    ) -> FetchPromptScorersResponse:
         return self._request(
             "POST",
-            url_for("/
+            url_for("/fetch_scorers/"),
             payload,
         )
 
@@ -345,6 +346,13 @@ class JudgmentAsyncClient:
             payload,
         )
 
+    async def datasets_pull_all_for_judgeval(self, payload: DatasetsFetch) -> Any:
+        return await self._request(
+            "POST",
+            url_for("/datasets/pull_all_for_judgeval/"),
+            payload,
+        )
+
     async def datasets_create_for_judgeval(self, payload: DatasetCreate) -> Any:
         return await self._request(
             "POST",
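The async client gains the same dataset-listing call as the sync client. A minimal sketch, assuming the constructor mirrors the JudgmentSyncClient(api_key, organization_id) pattern seen later in this diff:

import asyncio

from judgeval.api import JudgmentAsyncClient

async def main() -> None:
    client = JudgmentAsyncClient("your-api-key", "your-org-id")  # placeholder credentials
    # DatasetsFetch is a TypedDict with a single project_name key
    datasets = await client.datasets_pull_all_for_judgeval({"project_name": "default_project"})
    print(datasets)

asyncio.run(main())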
@@ -384,12 +392,12 @@ class JudgmentAsyncClient:
             payload,
         )
 
-    async def
-        self, payload:
-    ) ->
+    async def fetch_scorers(
+        self, payload: FetchPromptScorersRequest
+    ) -> FetchPromptScorersResponse:
         return await self._request(
             "POST",
-            url_for("/
+            url_for("/fetch_scorers/"),
             payload,
         )
 
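Both clients now expose the renamed scorer-fetch endpoint with typed request and response payloads. A sync-call sketch, assuming the client returns the parsed response body and using placeholder credentials:

from judgeval.api import JudgmentSyncClient

client = JudgmentSyncClient("your-api-key", "your-org-id")
response = client.fetch_scorers({"names": None})  # names=None fetches all prompt scorers
for scorer in response["scorers"]:  # FetchPromptScorersResponse carries a scorers list
    print(scorer)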
src/judgeval/api/api_types.py
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 # filename: .openapi.json
-# timestamp: 2025-09-
+# timestamp: 2025-09-12T16:54:35+00:00
 
 from __future__ import annotations
 from typing import Any, Dict, List, Literal, Optional, TypedDict, Union
@@ -19,6 +19,7 @@ class DatasetFetch(TypedDict):
     dataset_name: str
     project_name: str
 
+
 class DatasetsFetch(TypedDict):
     project_name: str
 
@@ -60,8 +61,8 @@ class SavePromptScorerResponse(TypedDict):
     name: str
 
 
-class
-
+class FetchPromptScorersRequest(TypedDict):
+    names: NotRequired[Optional[List[str]]]
 
 
 class CustomScorerUploadPayload(TypedDict):
@@ -154,7 +155,7 @@ class ScorerData(TypedDict):
     score: NotRequired[Optional[float]]
     reason: NotRequired[Optional[str]]
     strict_mode: NotRequired[Optional[bool]]
-    evaluation_model: NotRequired[str]
+    evaluation_model: NotRequired[Optional[str]]
     error: NotRequired[Optional[str]]
     additional_metadata: NotRequired[Optional[Dict[str, Any]]]
 
@@ -189,13 +190,13 @@ class OtelTraceSpan(TypedDict):
 
 
 class ExampleEvaluationRun(TypedDict):
-    id: NotRequired[
-    project_name:
-    eval_name:
+    id: NotRequired[str]
+    project_name: str
+    eval_name: str
     custom_scorers: NotRequired[List[BaseScorer]]
     judgment_scorers: NotRequired[List[ScorerConfig]]
     model: str
-    created_at: NotRequired[
+    created_at: NotRequired[str]
     examples: List[Example]
     trace_span_id: NotRequired[Optional[str]]
     trace_id: NotRequired[Optional[str]]
@@ -206,13 +207,13 @@ class HTTPValidationError(TypedDict):
 
 
 class TraceEvaluationRun(TypedDict):
-    id: NotRequired[
-    project_name:
-    eval_name:
+    id: NotRequired[str]
+    project_name: str
+    eval_name: str
     custom_scorers: NotRequired[List[BaseScorer]]
     judgment_scorers: NotRequired[List[ScorerConfig]]
     model: str
-    created_at: NotRequired[
+    created_at: NotRequired[str]
     trace_and_span_ids: List[TraceAndSpanId]
     is_offline: NotRequired[bool]
 
@@ -228,30 +229,31 @@ class DatasetReturn(TypedDict):
     project_name: str
     examples: NotRequired[Optional[List[Example]]]
 
+
 class DatasetInfo(TypedDict):
     dataset_id: str
     name: str
     created_at: str
     dataset_kind: DatasetKind
     entries: int
-    creator: str 
+    creator: str
 
 
 class DatasetCreate(TypedDict):
     name: str
     dataset_kind: DatasetKind
     project_name: str
-    examples:
-    overwrite:
+    examples: List[Example]
+    overwrite: bool
 
 
-class
-
+class FetchPromptScorersResponse(TypedDict):
+    scorers: List[PromptScorer]
 
 
 class ScoringResult(TypedDict):
     success: bool
-    scorers_data:
+    scorers_data: List[ScorerData]
     name: NotRequired[Optional[str]]
     data_object: NotRequired[Optional[Union[OtelTraceSpan, Example]]]
     trace_id: NotRequired[Optional[str]]
src/judgeval/data/evaluation_run.py
@@ -1,11 +1,11 @@
-from typing import List, Optional, Union, Tuple
-from
-from pydantic import field_validator, model_validator, Field
+from typing import List, Optional, Union, Tuple, Sequence
+from pydantic import field_validator, model_validator, Field, BaseModel
 from datetime import datetime, timezone
 import uuid
 
 from judgeval.data import Example
-from judgeval.scorers import
+from judgeval.scorers import APIScorerConfig
+from judgeval.scorers.example_scorer import ExampleScorer
 from judgeval.constants import ACCEPTABLE_MODELS
 from judgeval.data.judgment_types import (
     ExampleEvaluationRun as ExampleEvaluationRunJudgmentType,
@@ -14,19 +14,20 @@ from judgeval.data.judgment_types import (
 
 
 class EvaluationRun(BaseModel):
-    id:
-    created_at:
+    id: str = Field(default_factory=lambda: str(uuid.uuid4()))
+    created_at: str = Field(
         default_factory=lambda: datetime.now(timezone.utc).isoformat()
     )
-
-
-
-
+    custom_scorers: List[ExampleScorer] = Field(default_factory=list)
+    judgment_scorers: Sequence[APIScorerConfig] = Field(default_factory=list)
+    scorers: Sequence[Union[ExampleScorer, APIScorerConfig]] = Field(
+        default_factory=list
+    )
     model: str
 
     def __init__(
         self,
-        scorers: Optional[List[Union[
+        scorers: Optional[List[Union[ExampleScorer, APIScorerConfig]]] = None,
         **kwargs,
     ):
         """
@@ -38,7 +39,7 @@ class EvaluationRun(BaseModel):
         """
         if scorers is not None:
             # Automatically sort scorers into appropriate fields
-            custom_scorers = [s for s in scorers if isinstance(s,
+            custom_scorers = [s for s in scorers if isinstance(s, ExampleScorer)]
             judgment_scorers = [s for s in scorers if isinstance(s, APIScorerConfig)]
 
             # Always set both fields as lists (even if empty) to satisfy validation
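The routing in __init__ is a plain pair of isinstance filters. A schematic restatement with stand-in classes so it runs anywhere; it is not the judgeval implementation itself:

# Stand-in classes; the real ones are ExampleScorer and APIScorerConfig.
class ExampleScorer: ...
class APIScorerConfig: ...

scorers = [ExampleScorer(), APIScorerConfig()]
custom_scorers = [s for s in scorers if isinstance(s, ExampleScorer)]
judgment_scorers = [s for s in scorers if isinstance(s, APIScorerConfig)]
assert len(custom_scorers) == 1 and len(judgment_scorers) == 1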
src/judgeval/data/judgment_types.py
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 # filename: .openapi.json
-# timestamp: 2025-09-
+# timestamp: 2025-09-12T16:54:34+00:00
 
 from __future__ import annotations
 from typing import Annotated, Any, Dict, List, Optional, Union
@@ -22,6 +22,10 @@ class DatasetFetch(BaseModel):
     project_name: Annotated[str, Field(title="Project Name")]
 
 
+class DatasetsFetch(BaseModel):
+    project_name: Annotated[str, Field(title="Project Name")]
+
+
 class ProjectAdd(BaseModel):
     project_name: Annotated[str, Field(title="Project Name")]
 
@@ -59,8 +63,8 @@ class SavePromptScorerResponse(BaseModel):
     name: Annotated[str, Field(title="Name")]
 
 
-class
-
+class FetchPromptScorersRequest(BaseModel):
+    names: Annotated[Optional[List[str]], Field(title="Names")] = None
 
 
 class CustomScorerUploadPayload(BaseModel):
@@ -210,8 +214,8 @@ class OtelTraceSpan(BaseModel):
 
 class ExampleEvaluationRun(BaseModel):
     id: Annotated[Optional[str], Field(title="Id")] = None
-    project_name: Annotated[
-    eval_name: Annotated[
+    project_name: Annotated[str, Field(title="Project Name")]
+    eval_name: Annotated[str, Field(title="Eval Name")]
     custom_scorers: Annotated[
         Optional[List[BaseScorer]], Field(title="Custom Scorers")
     ] = []
@@ -231,8 +235,8 @@ class HTTPValidationError(BaseModel):
 
 class TraceEvaluationRun(BaseModel):
     id: Annotated[Optional[str], Field(title="Id")] = None
-    project_name: Annotated[
-    eval_name: Annotated[
+    project_name: Annotated[str, Field(title="Project Name")]
+    eval_name: Annotated[str, Field(title="Eval Name")]
     custom_scorers: Annotated[
         Optional[List[BaseScorer]], Field(title="Custom Scorers")
     ] = []
@@ -259,23 +263,30 @@ class DatasetReturn(BaseModel):
     examples: Annotated[Optional[List[Example]], Field(title="Examples")] = None
 
 
+class DatasetInfo(BaseModel):
+    dataset_id: Annotated[str, Field(title="Dataset Id")]
+    name: Annotated[str, Field(title="Name")]
+    created_at: Annotated[str, Field(title="Created At")]
+    dataset_kind: DatasetKind
+    entries: Annotated[int, Field(title="Entries")]
+    creator: Annotated[str, Field(title="Creator")]
+
+
 class DatasetCreate(BaseModel):
     name: Annotated[str, Field(title="Name")]
     dataset_kind: DatasetKind
     project_name: Annotated[str, Field(title="Project Name")]
-    examples: Annotated[
-    overwrite: Annotated[
+    examples: Annotated[List[Example], Field(title="Examples")]
+    overwrite: Annotated[bool, Field(title="Overwrite")]
 
 
-class
-
+class FetchPromptScorersResponse(BaseModel):
+    scorers: Annotated[List[PromptScorer], Field(title="Scorers")]
 
 
 class ScoringResult(BaseModel):
     success: Annotated[bool, Field(title="Success")]
-    scorers_data: Annotated[
-        None
-    )
+    scorers_data: Annotated[List[ScorerData], Field(title="Scorers Data")]
     name: Annotated[Optional[str], Field(title="Name")] = None
     data_object: Annotated[
         Optional[Union[OtelTraceSpan, Example]], Field(title="Data Object")
src/judgeval/data/scorer_data.py
@@ -6,36 +6,11 @@ ScorerData holds the information related to a single, completed Scorer evaluatio
 
 from __future__ import annotations
 
-from judgeval.data.judgment_types import ScorerData
+from judgeval.data.judgment_types import ScorerData
 from judgeval.scorers import BaseScorer
 from typing import List
 
 
-class ScorerData(JudgmentScorerData):
-    """
-    ScorerData holds the information related to a single, completed Scorer evaluation run.
-
-    For example, if running the Judgment Faithfulness scorer on an example, the ScorerData
-    object will contain whether the example passed its threshold expectation, as well as more detailed
-    information surrounding the evaluation run such as the claims and verdicts generated by the
-    judge model(s).
-    """
-
-    def to_dict(self) -> dict:
-        """Convert the ScorerData instance to a JSON-serializable dictionary."""
-        return {
-            "name": self.name,
-            "threshold": self.threshold,
-            "success": self.success,
-            "score": self.score,
-            "reason": self.reason,
-            "strict_mode": self.strict_mode,
-            "evaluation_model": self.evaluation_model,
-            "error": self.error,
-            "additional_metadata": self.additional_metadata,
-        }
-
-
 def create_scorer_data(scorer: BaseScorer) -> List[ScorerData]:
     """
     After a `scorer` is run, it contains information about the example that was evaluated
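With the hand-written subclass and its to_dict gone, callers use the generated pydantic model directly; pydantic's standard model_dump() covers the same fields the old method enumerated. An illustrative sketch only, since which fields are required is not visible in this diff:

from judgeval.data.judgment_types import ScorerData

# Field names come from the removed to_dict; requiredness is assumed.
data = ScorerData(name="Faithfulness", threshold=0.8, success=True)
payload = data.model_dump()  # replaces the deleted hand-written to_dict()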
src/judgeval/dataset/__init__.py
@@ -3,7 +3,7 @@ import orjson
 import os
 import yaml
 from dataclasses import dataclass
-from typing import List, Literal
+from typing import List, Literal
 
 from judgeval.data import Example
 from judgeval.utils.file_utils import get_examples_from_yaml, get_examples_from_json
@@ -13,15 +13,17 @@ from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID
 
 from judgeval.api.api_types import DatasetKind
 
+
 @dataclass
 class DatasetInfo:
     dataset_id: str
-    name: str
+    name: str
     created_at: str
     dataset_kind: DatasetKind
     entries: int
     creator: str
 
+
 @dataclass
 class Dataset:
     examples: List[Example]
@@ -46,9 +48,12 @@ class Dataset:
         if not dataset:
             raise ValueError(f"Dataset {name} not found in project {project_name}")
         examples = dataset.get("examples", [])
+        if examples is None:
+            examples = []
+
         for e in examples:
-            if isinstance(e, dict) and isinstance(e.get("data"), dict):
-                e.update(e.pop("data"))
+            if isinstance(e, dict) and isinstance(e.get("data", {}), dict):
+                e.update(e.pop("data"))  # type: ignore
             e.pop(
                 "example_id"
             )  # TODO: remove once scorer data migraiton is complete
@@ -64,7 +69,7 @@
         cls,
         name: str,
         project_name: str,
-        examples:
+        examples: List[Example] = [],
         overwrite: bool = False,
     ):
         if not examples:
@@ -75,7 +80,7 @@
             {
                 "name": name,
                 "project_name": project_name,
-                "examples":
+                "examples": examples,  # type: ignore
                 "dataset_kind": "example",
                 "overwrite": overwrite,
             }
@@ -87,18 +92,14 @@
             project_name=project_name,
             examples=examples,
         )
+
     @classmethod
-    def list(
-        cls,
-        project_name: str
-    ):
+    def list(cls, project_name: str):
         client = JudgmentSyncClient(cls.judgment_api_key, cls.organization_id)
-        datasets = client.datasets_pull_all_for_judgeval(
-
-        )
-
+        datasets = client.datasets_pull_all_for_judgeval({"project_name": project_name})
+
         judgeval_logger.info(f"Fetched all datasets for project {project_name}!")
-
+
         return [DatasetInfo(**dataset_info) for dataset_info in datasets]
 
     def add_from_json(self, file_path: str) -> None:
@@ -147,7 +148,7 @@
             {
                 "dataset_name": self.name,
                 "project_name": self.project_name,
-                "examples":
+                "examples": examples,  # type: ignore
             }
         )
 
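The reworked Dataset.list returns one DatasetInfo per dataset in the project. A usage sketch; the judgeval.dataset import path follows this diff's file list:

from judgeval.dataset import Dataset, DatasetInfo

infos: list[DatasetInfo] = Dataset.list(project_name="default_project")
for info in infos:
    print(info.name, info.entries, info.created_at)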
src/judgeval/env.py
@@ -19,8 +19,17 @@ def optional_env_var(var_name: str, default: str | None = None) -> str | None:
     return os.getenv(var_name, default)
 
 
-
-
+def required_env_var(var_name: str) -> str:
+    value = os.getenv(var_name)
+    if value is None:
+        raise EnvironmentError(
+            f"Environment variable '{var_name}' is required but not set."
+        )
+    return value
+
+
+JUDGMENT_API_KEY = required_env_var("JUDGMENT_API_KEY")
+JUDGMENT_ORG_ID = required_env_var("JUDGMENT_ORG_ID")
 JUDGMENT_API_URL = optional_env_var("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
 
 JUDGMENT_DEFAULT_GPT_MODEL = optional_env_var("JUDGMENT_DEFAULT_GPT_MODEL", "gpt-4.1")
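The new module-level assignments validate credentials at import time rather than at first use. A reproduction sketch of the failure mode (EnvironmentError is the built-in alias of OSError):

import os

# With neither variable set, merely importing judgeval.env now raises.
os.environ.pop("JUDGMENT_API_KEY", None)
os.environ.pop("JUDGMENT_ORG_ID", None)

try:
    import judgeval.env  # noqa: F401
except EnvironmentError as exc:
    print(exc)  # e.g. Environment variable 'JUDGMENT_API_KEY' is required but not set.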