judgeval 0.9.4__tar.gz → 0.10.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {judgeval-0.9.4 → judgeval-0.10.0}/.github/workflows/ci.yaml +8 -2
- {judgeval-0.9.4 → judgeval-0.10.0}/.pre-commit-config.yaml +2 -2
- {judgeval-0.9.4 → judgeval-0.10.0}/PKG-INFO +1 -1
- {judgeval-0.9.4 → judgeval-0.10.0}/pyproject.toml +1 -1
- {judgeval-0.9.4 → judgeval-0.10.0}/scripts/api_generator.py +4 -4
- {judgeval-0.9.4 → judgeval-0.10.0}/scripts/openapi_transform.py +2 -3
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/__init__.py +2 -2
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/api/__init__.py +28 -96
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/api/api_types.py +49 -140
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/constants.py +1 -5
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/data/__init__.py +1 -3
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/data/example.py +4 -2
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/data/judgment_types.py +57 -165
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/data/result.py +1 -2
- judgeval-0.10.0/src/judgeval/data/trace.py +14 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/dataset/__init__.py +15 -42
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/evaluation/__init__.py +23 -34
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/scorers/__init__.py +9 -7
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/scorers/api_scorer.py +8 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/scorers/base_scorer.py +0 -1
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -10
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -2
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -2
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -2
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +43 -4
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/tracer/__init__.py +13 -50
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/tracer/local_eval_queue.py +2 -2
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/tracer/processors/__init__.py +1 -1
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/tracer/utils.py +1 -1
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/trainer/trainer.py +4 -4
- judgeval-0.9.4/src/judgeval/data/trace.py +0 -40
- judgeval-0.9.4/src/judgeval/data/trace_run.py +0 -39
- judgeval-0.9.4/src/judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
- judgeval-0.9.4/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
- judgeval-0.9.4/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
- judgeval-0.9.4/src/judgeval/scorers/trace_api_scorer.py +0 -5
- {judgeval-0.9.4 → judgeval-0.10.0}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/.github/pull_request_template.md +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/.github/workflows/blocked-pr.yaml +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/.github/workflows/claude-code-review.yml +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/.github/workflows/claude.yml +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/.github/workflows/lint.yaml +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/.github/workflows/merge-branch-check.yaml +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/.github/workflows/mypy.yaml +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/.github/workflows/pre-commit-autoupdate.yaml +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/.github/workflows/release.yaml +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/.github/workflows/validate-branch.yaml +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/.gitignore +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/LICENSE.md +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/README.md +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/assets/Screenshot 2025-05-17 at 8.14.27 PM.png +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/assets/agent.gif +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/assets/agent_trace_example.png +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/assets/data.gif +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/assets/dataset_clustering_screenshot.png +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/assets/dataset_clustering_screenshot_dm.png +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/assets/datasets_preview_screenshot.png +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/assets/document.gif +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/assets/error_analysis_dashboard.png +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/assets/errors.png +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/assets/experiments_dashboard_screenshot.png +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/assets/experiments_page.png +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/assets/experiments_pagev2.png +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/assets/logo-dark.svg +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/assets/logo-light.svg +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/assets/monitoring_screenshot.png +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/assets/new_darkmode.svg +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/assets/new_lightmode.svg +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/assets/online_eval.png +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/assets/product_shot.png +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/assets/test.png +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/assets/tests.png +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/assets/trace.gif +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/assets/trace_demo.png +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/assets/trace_screenshot.png +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/assets/trace_screenshot_old.png +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/pytest.ini +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/scripts/update_types.sh +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/cli.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/data/evaluation_run.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/data/scorer_data.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/data/scripts/fix_default_factory.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/data/scripts/openapi_transform.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/data/tool.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/env.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/exceptions.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/integrations/langgraph/__init__.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/judges/__init__.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/judges/base_judge.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/judges/litellm_judge.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/judges/together_judge.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/judges/utils.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/logger.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/scorers/agent_scorer.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/scorers/example_scorer.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/scorers/exceptions.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/scorers/score.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/scorers/utils.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/tracer/constants.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/tracer/exporters/__init__.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/tracer/exporters/s3.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/tracer/exporters/store.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/tracer/exporters/utils.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/tracer/keys.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/tracer/llm/__init__.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/tracer/llm/providers.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/tracer/managers.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/trainer/__init__.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/trainer/config.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/trainer/console.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/trainer/trainable_model.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/utils/async_utils.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/utils/decorators.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/utils/file_utils.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/utils/guards.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/utils/meta.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/utils/serialize.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/utils/testing.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/utils/url.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/utils/version_check.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/version.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/src/judgeval/warnings.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/update_version.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.0}/uv.lock +0 -0
.github/workflows/ci.yaml

```diff
@@ -97,8 +97,11 @@ jobs:
       - name: Run E2E tests
         working-directory: src
         run: |
-          SECRET_VARS=$(aws secretsmanager get-secret-value --secret-id
+          SECRET_VARS=$(aws secretsmanager get-secret-value --secret-id stg/api-keys/e2e-tests --query SecretString --output text)
           export $(echo "$SECRET_VARS" | jq -r 'to_entries | .[] | "\(.key)=\(.value)"')
+          export JUDGMENT_API_KEY="$JUDGEVAL_GH_JUDGMENT_API_KEY"
+          export JUDGMENT_ORG_ID="$JUDGEVAL_GH_JUDGMENT_ORG_ID"
+          export JUDGMENT_API_URL=https://staging.api.judgmentlabs.ai
           timeout ${TEST_TIMEOUT_SECONDS}s uv run pytest -n auto --dist=loadfile --durations=0 --cov=. --cov-config=.coveragerc --cov-report=html ./e2etests

       - name: Upload coverage HTML report (staging)
@@ -156,8 +159,11 @@ jobs:
       - name: Run E2E tests
         working-directory: src
         run: |
-          SECRET_VARS=$(aws secretsmanager get-secret-value --secret-id
+          SECRET_VARS=$(aws secretsmanager get-secret-value --secret-id prod/api-keys/e2e-tests --query SecretString --output text)
           export $(echo "$SECRET_VARS" | jq -r 'to_entries | .[] | "\(.key)=\(.value)"')
+          export JUDGMENT_API_KEY="$JUDGEVAL_GH_JUDGMENT_API_KEY"
+          export JUDGMENT_ORG_ID="$JUDGEVAL_GH_JUDGMENT_ORG_ID"
+          export JUDGMENT_API_URL=https://api.judgmentlabs.ai
           timeout ${TEST_TIMEOUT_SECONDS}s uv run pytest -n auto --dist=loadfile --durations=0 --cov=. --cov-config=.coveragerc --cov-report=html ./e2etests

       - name: Upload coverage HTML report (production)
```
scripts/api_generator.py

```diff
@@ -36,13 +36,13 @@ JUDGEVAL_PATHS: List[str] = [
     "/fetch_scorer/",
     "/scorer_exists/",
     "/upload_custom_scorer/",
-    "/datasets/
-    "/datasets/
+    "/datasets/create_for_judgeval/",
+    "/datasets/insert_examples_for_judgeval/",
     "/datasets/pull_for_judgeval/",
-    "/datasets/fetch_stats_by_project/",
     "/projects/resolve/",
     "/e2e_fetch_trace/",
     "/e2e_fetch_span_score/",
+    "/e2e_fetch_trace_scorer_span_score/",
 ]


@@ -253,7 +253,7 @@ def generate_client_class(

 def generate_api_file() -> str:
     lines = [
-        "from typing import
+        "from typing import Dict, Any, Mapping, Literal, Optional",
         "import httpx",
         "from httpx import Response",
         "from judgeval.exceptions import JudgmentAPIError",
```
scripts/openapi_transform.py

```diff
@@ -35,10 +35,9 @@ JUDGEVAL_PATHS: List[str] = [
     "/fetch_scorer/",
     "/scorer_exists/",
     "/upload_custom_scorer/",
-    "/datasets/
-    "/datasets/
+    "/datasets/create_for_judgeval/",
+    "/datasets/insert_examples_for_judgeval/",
     "/datasets/pull_for_judgeval/",
-    "/datasets/fetch_stats_by_project/",
     "/projects/resolve/",
     "/e2e_fetch_trace/",
     "/e2e_fetch_span_score/",
```
src/judgeval/__init__.py

```diff
@@ -6,7 +6,7 @@ from judgeval.data.evaluation_run import ExampleEvaluationRun


 from typing import List, Optional, Union
-from judgeval.scorers import BaseScorer,
+from judgeval.scorers import BaseScorer, ExampleAPIScorerConfig
 from judgeval.data.example import Example
 from judgeval.logger import judgeval_logger
 from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_DEFAULT_GPT_MODEL, JUDGMENT_ORG_ID
@@ -38,7 +38,7 @@ class JudgmentClient(metaclass=SingletonMeta):
     def run_evaluation(
         self,
         examples: List[Example],
-        scorers: List[Union[
+        scorers: List[Union[ExampleAPIScorerConfig, BaseScorer]],
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
         model: str = JUDGMENT_DEFAULT_GPT_MODEL,
```
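The `run_evaluation` signature above narrows the scorer union to `ExampleAPIScorerConfig` and `BaseScorer`. A minimal usage sketch against the new signature; the `FaithfulnessScorer` import path follows the file list above, but its constructor arguments and the `Example` fields (`input`, `actual_output`) are assumptions not confirmed by this diff:

```python
from judgeval import JudgmentClient
from judgeval.data.example import Example
from judgeval.scorers.judgeval_scorers.api_scorers.faithfulness import FaithfulnessScorer

client = JudgmentClient()  # SingletonMeta: repeated instantiation returns the same client

results = client.run_evaluation(
    # Example fields below are assumed; only example_id/created_at/name appear in this diff
    examples=[Example(input="What is the capital of France?", actual_output="Paris")],
    scorers=[FaithfulnessScorer()],  # an ExampleAPIScorerConfig subclass (assumed default args)
    project_name="default_project",  # defaults shown in the hunk above
    eval_run_name="default_eval_run",
)
```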
src/judgeval/api/__init__.py

```diff
@@ -71,13 +71,6 @@ class JudgmentSyncClient:
             payload,
         )

-    def evaluate_trace(self, payload: TraceRun) -> Any:
-        return self._request(
-            "POST",
-            url_for("/evaluate_trace/"),
-            payload,
-        )
-
     def evaluate_examples(
         self, payload: ExampleEvaluationRun, stream: Optional[str] = None
     ) -> Any:
@@ -128,59 +121,26 @@ class JudgmentSyncClient:
             query_params,
         )

-    def
+    def datasets_insert_examples_for_judgeval(
+        self, payload: DatasetInsertExamples
+    ) -> Any:
         return self._request(
             "POST",
-            url_for("/datasets/
+            url_for("/datasets/insert_examples_for_judgeval/"),
             payload,
         )

-    def datasets_pull_for_judgeval(self, payload: DatasetFetch) ->
+    def datasets_pull_for_judgeval(self, payload: DatasetFetch) -> DatasetReturn:
         return self._request(
             "POST",
             url_for("/datasets/pull_for_judgeval/"),
             payload,
         )

-    def
+    def datasets_create_for_judgeval(self, payload: DatasetCreate) -> Any:
         return self._request(
             "POST",
-            url_for("/datasets/
-            payload,
-        )
-
-    def traces_upsert(self, payload: TraceSave) -> Any:
-        return self._request(
-            "POST",
-            url_for("/traces/upsert/"),
-            payload,
-        )
-
-    def traces_fetch(self, payload: TraceFetch) -> Any:
-        return self._request(
-            "POST",
-            url_for("/traces/fetch/"),
-            payload,
-        )
-
-    def traces_add_to_dataset(self, payload: TraceAddToDataset) -> Any:
-        return self._request(
-            "POST",
-            url_for("/traces/add_to_dataset/"),
-            payload,
-        )
-
-    def traces_spans_batch(self, payload: SpansBatchRequest) -> Any:
-        return self._request(
-            "POST",
-            url_for("/traces/spans/batch/"),
-            payload,
-        )
-
-    def traces_evaluation_runs_batch(self, payload: EvaluationRunsBatchRequest) -> Any:
-        return self._request(
-            "POST",
-            url_for("/traces/evaluation_runs/batch/"),
+            url_for("/datasets/create_for_judgeval/"),
             payload,
         )
@@ -255,6 +215,13 @@ class JudgmentSyncClient:
             payload,
         )

+    def e2e_fetch_trace_scorer_span_score(self, payload: SpanScoreRequest) -> Any:
+        return self._request(
+            "POST",
+            url_for("/e2e_fetch_trace_scorer_span_score/"),
+            payload,
+        )
+

 class JudgmentAsyncClient:
     __slots__ = ("api_key", "organization_id", "client")
@@ -304,13 +271,6 @@ class JudgmentAsyncClient:
             payload,
         )

-    async def evaluate_trace(self, payload: TraceRun) -> Any:
-        return await self._request(
-            "POST",
-            url_for("/evaluate_trace/"),
-            payload,
-        )
-
     async def evaluate_examples(
         self, payload: ExampleEvaluationRun, stream: Optional[str] = None
     ) -> Any:
@@ -363,61 +323,26 @@ class JudgmentAsyncClient:
             query_params,
         )

-    async def
+    async def datasets_insert_examples_for_judgeval(
+        self, payload: DatasetInsertExamples
+    ) -> Any:
         return await self._request(
             "POST",
-            url_for("/datasets/
+            url_for("/datasets/insert_examples_for_judgeval/"),
             payload,
         )

-    async def datasets_pull_for_judgeval(self, payload: DatasetFetch) ->
+    async def datasets_pull_for_judgeval(self, payload: DatasetFetch) -> DatasetReturn:
         return await self._request(
             "POST",
             url_for("/datasets/pull_for_judgeval/"),
             payload,
         )

-    async def
-        return await self._request(
-            "POST",
-            url_for("/datasets/push/"),
-            payload,
-        )
-
-    async def traces_upsert(self, payload: TraceSave) -> Any:
-        return await self._request(
-            "POST",
-            url_for("/traces/upsert/"),
-            payload,
-        )
-
-    async def traces_fetch(self, payload: TraceFetch) -> Any:
-        return await self._request(
-            "POST",
-            url_for("/traces/fetch/"),
-            payload,
-        )
-
-    async def traces_add_to_dataset(self, payload: TraceAddToDataset) -> Any:
-        return await self._request(
-            "POST",
-            url_for("/traces/add_to_dataset/"),
-            payload,
-        )
-
-    async def traces_spans_batch(self, payload: SpansBatchRequest) -> Any:
-        return await self._request(
-            "POST",
-            url_for("/traces/spans/batch/"),
-            payload,
-        )
-
-    async def traces_evaluation_runs_batch(
-        self, payload: EvaluationRunsBatchRequest
-    ) -> Any:
+    async def datasets_create_for_judgeval(self, payload: DatasetCreate) -> Any:
         return await self._request(
             "POST",
-            url_for("/
+            url_for("/datasets/create_for_judgeval/"),
             payload,
         )
@@ -494,6 +419,13 @@ class JudgmentAsyncClient:
             payload,
         )

+    async def e2e_fetch_trace_scorer_span_score(self, payload: SpanScoreRequest) -> Any:
+        return await self._request(
+            "POST",
+            url_for("/e2e_fetch_trace_scorer_span_score/"),
+            payload,
+        )
+

 __all__ = [
     "JudgmentSyncClient",
```
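The `/evaluate_trace/` and `/traces/*` methods are removed on both clients, and the dataset methods are renamed onto the `*_for_judgeval` routes, with `datasets_pull_for_judgeval` now carrying a typed return. A hedged sketch of calling the renamed pull endpoint; the constructor arguments are inferred from `__slots__ = ("api_key", "organization_id", "client")` and are an assumption:

```python
from judgeval.api import JudgmentSyncClient

# Constructor signature is an assumption inferred from __slots__ above
api = JudgmentSyncClient(api_key="sk-...", organization_id="org-...")

# /datasets/pull_for_judgeval/ takes a DatasetFetch payload and returns a DatasetReturn
dataset = api.datasets_pull_for_judgeval(
    {"dataset_name": "my_dataset", "project_name": "default_project"}
)
for example in dataset.get("examples") or []:  # examples is NotRequired[Optional[...]]
    print(example.get("name"))
```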
src/judgeval/api/api_types.py

```diff
@@ -1,9 +1,9 @@
 # generated by datamodel-codegen:
 #   filename: .openapi.json
-#   timestamp: 2025-
+#   timestamp: 2025-09-10T17:42:12+00:00

 from __future__ import annotations
-from typing import Any, Dict, List, Optional, TypedDict, Union
+from typing import Any, Dict, List, Literal, Optional, TypedDict, Union
 from typing_extensions import NotRequired

@@ -16,40 +16,10 @@ class EvalResultsFetch(TypedDict):


 class DatasetFetch(TypedDict):
-
+    dataset_name: str
     project_name: str


-class TraceSave(TypedDict):
-    project_name: str
-    trace_id: str
-    name: str
-    created_at: str
-    duration: float
-    offline_mode: NotRequired[bool]
-    has_notification: NotRequired[bool]
-    customer_id: NotRequired[Optional[str]]
-    tags: NotRequired[List[str]]
-    metadata: NotRequired[Dict[str, Any]]
-    update_id: NotRequired[int]
-
-
-class TraceFetch(TypedDict):
-    trace_id: str
-
-
-class TraceAddToDataset(TypedDict):
-    trace_id: str
-    trace_span_id: str
-    dataset_alias: str
-    project_name: str
-
-
-class EvaluationRunsBatchRequest(TypedDict):
-    organization_id: str
-    evaluation_entries: List[Dict[str, Any]]
-
-
 class ProjectAdd(TypedDict):
     project_name: str

@@ -149,8 +119,8 @@ class ScorerConfig(TypedDict):


 class Example(TypedDict):
-    example_id: str
-    created_at: str
+    example_id: NotRequired[str]
+    created_at: NotRequired[str]
     name: NotRequired[Optional[str]]

@@ -160,28 +130,7 @@ class ValidationError(TypedDict):
     type: str


-
-    span_id: str
-    trace_id: str
-    function: str
-    created_at: NotRequired[Any]
-    parent_span_id: NotRequired[Optional[str]]
-    span_type: NotRequired[Optional[str]]
-    inputs: NotRequired[Optional[Dict[str, Any]]]
-    output: NotRequired[Any]
-    error: NotRequired[Optional[Dict[str, Any]]]
-    usage: NotRequired[Optional[Dict[str, Any]]]
-    duration: NotRequired[Optional[float]]
-    expected_tools: NotRequired[Optional[List[Dict[str, Any]]]]
-    additional_metadata: NotRequired[Optional[Dict[str, Any]]]
-    has_evaluation: NotRequired[Optional[bool]]
-    agent_name: NotRequired[Optional[str]]
-    class_name: NotRequired[Optional[str]]
-    state_before: NotRequired[Optional[Dict[str, Any]]]
-    state_after: NotRequired[Optional[Dict[str, Any]]]
-    span_state: str
-    update_id: NotRequired[int]
-    queued_at: float
+DatasetKind = Literal["trace", "example"]


 class PromptScorer(TypedDict):
@@ -195,36 +144,45 @@ class PromptScorer(TypedDict):


 class ScorerData(TypedDict):
+    id: NotRequired[str]
     name: str
     threshold: float
     success: bool
     score: NotRequired[Optional[float]]
     reason: NotRequired[Optional[str]]
     strict_mode: NotRequired[Optional[bool]]
-    evaluation_model: NotRequired[
+    evaluation_model: NotRequired[str]
     error: NotRequired[Optional[str]]
     additional_metadata: NotRequired[Optional[Dict[str, Any]]]


-class
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+class OtelTraceSpan(TypedDict):
+    organization_id: str
+    project_id: NotRequired[Optional[str]]
+    user_id: str
+    timestamp: str
+    trace_id: str
+    span_id: str
+    parent_span_id: NotRequired[Optional[str]]
+    trace_state: NotRequired[Optional[str]]
+    span_name: NotRequired[Optional[str]]
+    span_kind: NotRequired[Optional[str]]
+    service_name: NotRequired[Optional[str]]
+    resource_attributes: NotRequired[Optional[Dict[str, Any]]]
+    span_attributes: NotRequired[Optional[Dict[str, Any]]]
+    duration: NotRequired[Optional[int]]
+    status_code: NotRequired[Optional[str]]
+    status_message: NotRequired[Optional[str]]
+    events: NotRequired[Optional[List[Dict[str, Any]]]]
+    links: NotRequired[Optional[List[Dict[str, Any]]]]
+    legacy_span_id: NotRequired[Optional[str]]
+    inputs: NotRequired[Optional[Dict[str, Any]]]
+    output: Any
+    error: NotRequired[Optional[Dict[str, Any]]]
+    agent_id: NotRequired[Optional[str]]
+    cumulative_llm_cost: NotRequired[Optional[float]]
+    state_after: NotRequired[Optional[Dict[str, Any]]]
+    state_before: NotRequired[Optional[Dict[str, Any]]]


 class ExampleEvaluationRun(TypedDict):
@@ -257,88 +215,39 @@ class TraceEvaluationRun(TypedDict):


 class DatasetInsertExamples(TypedDict):
-
+    dataset_name: str
     examples: List[Example]
     project_name: str


-class
-
-
-
-
-class FetchPromptScorerResponse(TypedDict):
-    scorer: PromptScorer
+class DatasetReturn(TypedDict):
+    name: str
+    project_name: str
+    examples: NotRequired[Optional[List[Example]]]


-class
-
-
-
-
-
-    span_type: NotRequired[Optional[str]]
-    inputs: NotRequired[Optional[Dict[str, Any]]]
-    error: NotRequired[Optional[Dict[str, Any]]]
-    output: NotRequired[Any]
-    usage: NotRequired[Optional[TraceUsage]]
-    duration: NotRequired[Optional[float]]
-    expected_tools: NotRequired[Optional[List[Tool]]]
-    additional_metadata: NotRequired[Optional[Dict[str, Any]]]
-    has_evaluation: NotRequired[Optional[bool]]
-    agent_name: NotRequired[Optional[str]]
-    class_name: NotRequired[Optional[str]]
-    state_before: NotRequired[Optional[Dict[str, Any]]]
-    state_after: NotRequired[Optional[Dict[str, Any]]]
-    update_id: NotRequired[int]
+class DatasetCreate(TypedDict):
+    name: str
+    dataset_kind: DatasetKind
+    project_name: str
+    examples: NotRequired[Optional[List[Example]]]
+    overwrite: NotRequired[Optional[bool]]


-class
-
-    name: str
-    created_at: str
-    duration: float
-    trace_spans: List[TraceSpan]
-    offline_mode: NotRequired[bool]
-    rules: NotRequired[Dict[str, Any]]
-    has_notification: NotRequired[bool]
-    customer_id: NotRequired[Optional[str]]
-    tags: NotRequired[List[str]]
-    metadata: NotRequired[Dict[str, Any]]
-    update_id: NotRequired[int]
+class FetchPromptScorerResponse(TypedDict):
+    scorer: PromptScorer


 class ScoringResult(TypedDict):
     success: bool
     scorers_data: Optional[List[ScorerData]]
     name: NotRequired[Optional[str]]
-    data_object: NotRequired[Optional[Union[
+    data_object: NotRequired[Optional[Union[OtelTraceSpan, Example]]]
     trace_id: NotRequired[Optional[str]]
     run_duration: NotRequired[Optional[float]]
     evaluation_cost: NotRequired[Optional[float]]


-class TraceRun(TypedDict):
-    project_name: NotRequired[Optional[str]]
-    eval_name: NotRequired[Optional[str]]
-    traces: List[Trace]
-    scorers: List[ScorerConfig]
-    model: str
-    trace_span_id: NotRequired[Optional[str]]
-    tools: NotRequired[Optional[List[Dict[str, Any]]]]
-
-
 class EvalResults(TypedDict):
     results: List[ScoringResult]
     run: Union[ExampleEvaluationRun, TraceEvaluationRun]
-
-
-class DatasetPush(TypedDict):
-    dataset_alias: str
-    comments: NotRequired[Optional[str]]
-    source_file: NotRequired[Optional[str]]
-    examples: NotRequired[Optional[List[Example]]]
-    traces: NotRequired[Optional[List[Trace]]]
-    is_trace: NotRequired[bool]
-    project_name: str
-    overwrite: NotRequired[Optional[bool]]
```
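The dataset payloads now carry a `dataset_kind` discriminator, and `Example` loosens `example_id`/`created_at` to `NotRequired`. A sketch constructing the new `DatasetCreate` payload; all type and field names are taken directly from the hunks above:

```python
from judgeval.api.api_types import DatasetCreate, Example

# example_id and created_at became NotRequired in 0.10.0, so a bare example type-checks
ex: Example = {"name": "smoke-test"}

payload: DatasetCreate = {
    "name": "my_dataset",
    "dataset_kind": "example",  # DatasetKind = Literal["trace", "example"]
    "project_name": "default_project",
    "examples": [ex],
    "overwrite": False,  # NotRequired[Optional[bool]]
}
```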
src/judgeval/constants.py

```diff
@@ -14,16 +14,12 @@ class APIScorerType(str, Enum):
     """

     PROMPT_SCORER = "Prompt Scorer"
+    TRACE_PROMPT_SCORER = "Trace Prompt Scorer"
     FAITHFULNESS = "Faithfulness"
     ANSWER_RELEVANCY = "Answer Relevancy"
     ANSWER_CORRECTNESS = "Answer Correctness"
     INSTRUCTION_ADHERENCE = "Instruction Adherence"
     EXECUTION_ORDER = "Execution Order"
-    DERAILMENT = "Derailment"
-    TOOL_ORDER = "Tool Order"
-    MOCK_TRACE_SCORER = "Mock Trace Scorer"
-    CLASSIFIER = "Classifier"
-    TOOL_DEPENDENCY = "Tool Dependency"
     CUSTOM = "Custom"

     @classmethod
```
src/judgeval/data/__init__.py

```diff
@@ -1,7 +1,7 @@
 from judgeval.data.example import Example, ExampleParams
 from judgeval.data.scorer_data import ScorerData, create_scorer_data
 from judgeval.data.result import ScoringResult, generate_scoring_result
-from judgeval.data.trace import
+from judgeval.data.trace import TraceUsage


 __all__ = [
@@ -11,7 +11,5 @@ __all__ = [
     "create_scorer_data",
     "ScoringResult",
     "generate_scoring_result",
-    "Trace",
-    "TraceSpan",
     "TraceUsage",
 ]
```
src/judgeval/data/example.py

```diff
@@ -6,6 +6,8 @@ from enum import Enum
 from datetime import datetime
 from typing import Dict, Any, Optional
 from judgeval.data.judgment_types import Example as JudgmentExample
+from uuid import uuid4
+from pydantic import Field


 class ExampleParams(str, Enum):
@@ -20,8 +22,8 @@ class ExampleParams(str, Enum):


 class Example(JudgmentExample):
-    example_id: str =
-    created_at: str = datetime.now().isoformat()
+    example_id: str = Field(default_factory=lambda: str(uuid4()))
+    created_at: str = Field(default_factory=lambda: datetime.now().isoformat())
     name: Optional[str] = None

     def to_dict(self) -> Dict[str, Any]:
```
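This change fixes a classic default-value bug: `created_at: str = datetime.now().isoformat()` is evaluated once, when the class body executes at import time, so every `Example` created in a process shared the same timestamp (and a plain default would do the same for `example_id`). `Field(default_factory=...)` re-runs the callable for each instance. A minimal standalone demonstration with hypothetical model names:

```python
from datetime import datetime
from pydantic import BaseModel, Field


class Shared(BaseModel):
    # Evaluated once, when the class body executes
    created_at: str = datetime.now().isoformat()


class Fresh(BaseModel):
    # Evaluated again for each new instance
    created_at: str = Field(default_factory=lambda: datetime.now().isoformat())


a, b = Shared(), Shared()
assert a.created_at == b.created_at  # always identical, however far apart they are created

c, d = Fresh(), Fresh()
# c.created_at and d.created_at are computed independently per instance
```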