judgeval 0.9.4__tar.gz → 0.10.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {judgeval-0.9.4 → judgeval-0.10.1}/.github/workflows/ci.yaml +8 -2
- {judgeval-0.9.4 → judgeval-0.10.1}/.pre-commit-config.yaml +2 -2
- {judgeval-0.9.4 → judgeval-0.10.1}/PKG-INFO +1 -1
- {judgeval-0.9.4 → judgeval-0.10.1}/pyproject.toml +1 -1
- {judgeval-0.9.4 → judgeval-0.10.1}/scripts/api_generator.py +4 -4
- {judgeval-0.9.4 → judgeval-0.10.1}/scripts/openapi_transform.py +2 -3
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/__init__.py +2 -2
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/api/__init__.py +30 -92
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/api/api_types.py +57 -137
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/constants.py +1 -5
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/data/__init__.py +1 -3
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/data/example.py +4 -2
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/data/judgment_types.py +57 -165
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/data/result.py +1 -2
- judgeval-0.10.1/src/judgeval/data/trace.py +14 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/dataset/__init__.py +40 -44
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/evaluation/__init__.py +23 -34
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/scorers/__init__.py +9 -7
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/scorers/api_scorer.py +8 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/scorers/base_scorer.py +0 -1
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -10
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -2
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -2
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -2
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +43 -4
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/tracer/__init__.py +13 -50
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/tracer/local_eval_queue.py +2 -2
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/tracer/processors/__init__.py +1 -1
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/tracer/utils.py +1 -1
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/trainer/trainer.py +4 -4
- judgeval-0.9.4/src/judgeval/data/trace.py +0 -40
- judgeval-0.9.4/src/judgeval/data/trace_run.py +0 -39
- judgeval-0.9.4/src/judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
- judgeval-0.9.4/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
- judgeval-0.9.4/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
- judgeval-0.9.4/src/judgeval/scorers/trace_api_scorer.py +0 -5
- {judgeval-0.9.4 → judgeval-0.10.1}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/.github/pull_request_template.md +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/.github/workflows/blocked-pr.yaml +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/.github/workflows/claude-code-review.yml +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/.github/workflows/claude.yml +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/.github/workflows/lint.yaml +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/.github/workflows/merge-branch-check.yaml +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/.github/workflows/mypy.yaml +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/.github/workflows/pre-commit-autoupdate.yaml +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/.github/workflows/release.yaml +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/.github/workflows/validate-branch.yaml +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/.gitignore +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/LICENSE.md +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/README.md +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/assets/Screenshot 2025-05-17 at 8.14.27 PM.png +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/assets/agent.gif +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/assets/agent_trace_example.png +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/assets/data.gif +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/assets/dataset_clustering_screenshot.png +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/assets/dataset_clustering_screenshot_dm.png +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/assets/datasets_preview_screenshot.png +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/assets/document.gif +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/assets/error_analysis_dashboard.png +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/assets/errors.png +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/assets/experiments_dashboard_screenshot.png +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/assets/experiments_page.png +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/assets/experiments_pagev2.png +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/assets/logo-dark.svg +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/assets/logo-light.svg +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/assets/monitoring_screenshot.png +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/assets/new_darkmode.svg +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/assets/new_lightmode.svg +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/assets/online_eval.png +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/assets/product_shot.png +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/assets/test.png +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/assets/tests.png +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/assets/trace.gif +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/assets/trace_demo.png +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/assets/trace_screenshot.png +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/assets/trace_screenshot_old.png +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/pytest.ini +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/scripts/update_types.sh +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/cli.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/data/evaluation_run.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/data/scorer_data.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/data/scripts/fix_default_factory.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/data/scripts/openapi_transform.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/data/tool.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/env.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/exceptions.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/integrations/langgraph/__init__.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/judges/__init__.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/judges/base_judge.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/judges/litellm_judge.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/judges/together_judge.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/judges/utils.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/logger.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/scorers/agent_scorer.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/scorers/example_scorer.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/scorers/exceptions.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/scorers/score.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/scorers/utils.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/tracer/constants.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/tracer/exporters/__init__.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/tracer/exporters/s3.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/tracer/exporters/store.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/tracer/exporters/utils.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/tracer/keys.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/tracer/llm/__init__.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/tracer/llm/providers.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/tracer/managers.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/trainer/__init__.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/trainer/config.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/trainer/console.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/trainer/trainable_model.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/utils/async_utils.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/utils/decorators.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/utils/file_utils.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/utils/guards.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/utils/meta.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/utils/serialize.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/utils/testing.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/utils/url.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/utils/version_check.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/version.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/warnings.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/update_version.py +0 -0
- {judgeval-0.9.4 → judgeval-0.10.1}/uv.lock +0 -0
{judgeval-0.9.4 → judgeval-0.10.1}/.github/workflows/ci.yaml

@@ -97,8 +97,11 @@ jobs:
       - name: Run E2E tests
         working-directory: src
         run: |
-          SECRET_VARS=$(aws secretsmanager get-secret-value --secret-id
+          SECRET_VARS=$(aws secretsmanager get-secret-value --secret-id stg/api-keys/e2e-tests --query SecretString --output text)
           export $(echo "$SECRET_VARS" | jq -r 'to_entries | .[] | "\(.key)=\(.value)"')
+          export JUDGMENT_API_KEY="$JUDGEVAL_GH_JUDGMENT_API_KEY"
+          export JUDGMENT_ORG_ID="$JUDGEVAL_GH_JUDGMENT_ORG_ID"
+          export JUDGMENT_API_URL=https://staging.api.judgmentlabs.ai
           timeout ${TEST_TIMEOUT_SECONDS}s uv run pytest -n auto --dist=loadfile --durations=0 --cov=. --cov-config=.coveragerc --cov-report=html ./e2etests

       - name: Upload coverage HTML report (staging)

@@ -156,8 +159,11 @@ jobs:
       - name: Run E2E tests
         working-directory: src
         run: |
-          SECRET_VARS=$(aws secretsmanager get-secret-value --secret-id
+          SECRET_VARS=$(aws secretsmanager get-secret-value --secret-id prod/api-keys/e2e-tests --query SecretString --output text)
           export $(echo "$SECRET_VARS" | jq -r 'to_entries | .[] | "\(.key)=\(.value)"')
+          export JUDGMENT_API_KEY="$JUDGEVAL_GH_JUDGMENT_API_KEY"
+          export JUDGMENT_ORG_ID="$JUDGEVAL_GH_JUDGMENT_ORG_ID"
+          export JUDGMENT_API_URL=https://api.judgmentlabs.ai
           timeout ${TEST_TIMEOUT_SECONDS}s uv run pytest -n auto --dist=loadfile --durations=0 --cov=. --cov-config=.coveragerc --cov-report=html ./e2etests

       - name: Upload coverage HTML report (production)

{judgeval-0.9.4 → judgeval-0.10.1}/scripts/api_generator.py

@@ -36,13 +36,13 @@ JUDGEVAL_PATHS: List[str] = [
     "/fetch_scorer/",
     "/scorer_exists/",
     "/upload_custom_scorer/",
-    "/datasets/
-    "/datasets/
+    "/datasets/create_for_judgeval/",
+    "/datasets/insert_examples_for_judgeval/",
     "/datasets/pull_for_judgeval/",
-    "/datasets/fetch_stats_by_project/",
     "/projects/resolve/",
     "/e2e_fetch_trace/",
     "/e2e_fetch_span_score/",
+    "/e2e_fetch_trace_scorer_span_score/",
 ]

@@ -253,7 +253,7 @@ def generate_client_class(

 def generate_api_file() -> str:
     lines = [
-        "from typing import
+        "from typing import Dict, Any, Mapping, Literal, Optional",
         "import httpx",
         "from httpx import Response",
         "from judgeval.exceptions import JudgmentAPIError",

{judgeval-0.9.4 → judgeval-0.10.1}/scripts/openapi_transform.py

@@ -35,10 +35,9 @@ JUDGEVAL_PATHS: List[str] = [
     "/fetch_scorer/",
     "/scorer_exists/",
     "/upload_custom_scorer/",
-    "/datasets/
-    "/datasets/
+    "/datasets/create_for_judgeval/",
+    "/datasets/insert_examples_for_judgeval/",
     "/datasets/pull_for_judgeval/",
-    "/datasets/fetch_stats_by_project/",
     "/projects/resolve/",
     "/e2e_fetch_trace/",
     "/e2e_fetch_span_score/",

{judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/__init__.py

@@ -6,7 +6,7 @@ from judgeval.data.evaluation_run import ExampleEvaluationRun


 from typing import List, Optional, Union
-from judgeval.scorers import BaseScorer,
+from judgeval.scorers import BaseScorer, ExampleAPIScorerConfig
 from judgeval.data.example import Example
 from judgeval.logger import judgeval_logger
 from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_DEFAULT_GPT_MODEL, JUDGMENT_ORG_ID

@@ -38,7 +38,7 @@ class JudgmentClient(metaclass=SingletonMeta):
     def run_evaluation(
         self,
         examples: List[Example],
-        scorers: List[Union[
+        scorers: List[Union[ExampleAPIScorerConfig, BaseScorer]],
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
         model: str = JUDGMENT_DEFAULT_GPT_MODEL,

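For orientation, here is a minimal sketch of calling the narrowed signature. Only the keyword names and defaults come from this hunk; the Example fields and the FaithfulnessScorer import are assumptions based on judgeval's public scorer modules listed in the file table above.

```python
# Hypothetical usage of the narrowed run_evaluation signature (sketch only).
# The keyword names and defaults come from this diff; Example's fields and
# FaithfulnessScorer are assumptions about judgeval's public API.
from judgeval import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import FaithfulnessScorer

client = JudgmentClient()  # SingletonMeta: repeat calls yield the same instance
client.run_evaluation(
    examples=[
        Example(
            input="What is the capital of France?",            # assumed field
            actual_output="Paris is the capital of France.",   # assumed field
        )
    ],
    scorers=[FaithfulnessScorer(threshold=0.5)],  # fits Union[ExampleAPIScorerConfig, BaseScorer]
    project_name="default_project",
    eval_run_name="default_eval_run",
)
```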
{judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/api/__init__.py

@@ -71,13 +71,6 @@ class JudgmentSyncClient:
             payload,
         )

-    def evaluate_trace(self, payload: TraceRun) -> Any:
-        return self._request(
-            "POST",
-            url_for("/evaluate_trace/"),
-            payload,
-        )
-
     def evaluate_examples(
         self, payload: ExampleEvaluationRun, stream: Optional[str] = None
     ) -> Any:

@@ -128,59 +121,32 @@ class JudgmentSyncClient:
             query_params,
         )

-    def
+    def datasets_insert_examples_for_judgeval(
+        self, payload: DatasetInsertExamples
+    ) -> Any:
         return self._request(
             "POST",
-            url_for("/datasets/
+            url_for("/datasets/insert_examples_for_judgeval/"),
             payload,
         )

-    def datasets_pull_for_judgeval(self, payload: DatasetFetch) ->
+    def datasets_pull_for_judgeval(self, payload: DatasetFetch) -> DatasetReturn:
         return self._request(
             "POST",
             url_for("/datasets/pull_for_judgeval/"),
             payload,
         )

-    def
-        return self._request(
-            "POST",
-            url_for("/datasets/push/"),
-            payload,
-        )
-
-    def traces_upsert(self, payload: TraceSave) -> Any:
+    def datasets_pull_all_for_judgeval(self, payload: DatasetsFetch) -> List[DatasetInfo]:
         return self._request(
             "POST",
-            url_for("/
+            url_for("/datasets/pull_all_for_judgeval/"),
             payload,
         )
-
-    def traces_fetch(self, payload: TraceFetch) -> Any:
+    def datasets_create_for_judgeval(self, payload: DatasetCreate) -> Any:
         return self._request(
             "POST",
-            url_for("/
-            payload,
-        )
-
-    def traces_add_to_dataset(self, payload: TraceAddToDataset) -> Any:
-        return self._request(
-            "POST",
-            url_for("/traces/add_to_dataset/"),
-            payload,
-        )
-
-    def traces_spans_batch(self, payload: SpansBatchRequest) -> Any:
-        return self._request(
-            "POST",
-            url_for("/traces/spans/batch/"),
-            payload,
-        )
-
-    def traces_evaluation_runs_batch(self, payload: EvaluationRunsBatchRequest) -> Any:
-        return self._request(
-            "POST",
-            url_for("/traces/evaluation_runs/batch/"),
+            url_for("/datasets/create_for_judgeval/"),
             payload,
         )

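To see how the renamed endpoints line up in practice, a rough sketch of driving the generated sync client directly. The payload keys come from the TypedDicts later in this diff; the constructor arguments are inferred from the class's __slots__ and should be treated as an assumption.

```python
# Sketch only: the renamed dataset endpoints on the generated sync client.
# Constructor arguments are inferred from __slots__ ("api_key",
# "organization_id", "client") and are an assumption, not documented API.
from judgeval.api import JudgmentSyncClient

client = JudgmentSyncClient(api_key="sk-...", organization_id="org-...")

# /datasets/create_for_judgeval/ replaces the old /datasets/push/
client.datasets_create_for_judgeval({
    "name": "qa-regression",          # DatasetCreate keys, per api_types below
    "dataset_kind": "example",
    "project_name": "default_project",
})

# /datasets/pull_for_judgeval/ is now annotated to return DatasetReturn
dataset = client.datasets_pull_for_judgeval({
    "dataset_name": "qa-regression",  # DatasetFetch gained dataset_name
    "project_name": "default_project",
})
```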
@@ -255,6 +221,13 @@ class JudgmentSyncClient:
             payload,
         )

+    def e2e_fetch_trace_scorer_span_score(self, payload: SpanScoreRequest) -> Any:
+        return self._request(
+            "POST",
+            url_for("/e2e_fetch_trace_scorer_span_score/"),
+            payload,
+        )
+

 class JudgmentAsyncClient:
     __slots__ = ("api_key", "organization_id", "client")

@@ -304,13 +277,6 @@ class JudgmentAsyncClient:
             payload,
         )

-    async def evaluate_trace(self, payload: TraceRun) -> Any:
-        return await self._request(
-            "POST",
-            url_for("/evaluate_trace/"),
-            payload,
-        )
-
     async def evaluate_examples(
         self, payload: ExampleEvaluationRun, stream: Optional[str] = None
     ) -> Any:

@@ -363,61 +329,26 @@ class JudgmentAsyncClient:
             query_params,
         )

-    async def
+    async def datasets_insert_examples_for_judgeval(
+        self, payload: DatasetInsertExamples
+    ) -> Any:
         return await self._request(
             "POST",
-            url_for("/datasets/
+            url_for("/datasets/insert_examples_for_judgeval/"),
             payload,
         )

-    async def datasets_pull_for_judgeval(self, payload: DatasetFetch) ->
+    async def datasets_pull_for_judgeval(self, payload: DatasetFetch) -> DatasetReturn:
         return await self._request(
             "POST",
             url_for("/datasets/pull_for_judgeval/"),
             payload,
         )

-    async def
-        return await self._request(
-            "POST",
-            url_for("/datasets/push/"),
-            payload,
-        )
-
-    async def traces_upsert(self, payload: TraceSave) -> Any:
-        return await self._request(
-            "POST",
-            url_for("/traces/upsert/"),
-            payload,
-        )
-
-    async def traces_fetch(self, payload: TraceFetch) -> Any:
-        return await self._request(
-            "POST",
-            url_for("/traces/fetch/"),
-            payload,
-        )
-
-    async def traces_add_to_dataset(self, payload: TraceAddToDataset) -> Any:
+    async def datasets_create_for_judgeval(self, payload: DatasetCreate) -> Any:
         return await self._request(
             "POST",
-            url_for("/
-            payload,
-        )
-
-    async def traces_spans_batch(self, payload: SpansBatchRequest) -> Any:
-        return await self._request(
-            "POST",
-            url_for("/traces/spans/batch/"),
-            payload,
-        )
-
-    async def traces_evaluation_runs_batch(
-        self, payload: EvaluationRunsBatchRequest
-    ) -> Any:
-        return await self._request(
-            "POST",
-            url_for("/traces/evaluation_runs/batch/"),
+            url_for("/datasets/create_for_judgeval/"),
             payload,
         )

@@ -494,6 +425,13 @@ class JudgmentAsyncClient:
             payload,
         )

+    async def e2e_fetch_trace_scorer_span_score(self, payload: SpanScoreRequest) -> Any:
+        return await self._request(
+            "POST",
+            url_for("/e2e_fetch_trace_scorer_span_score/"),
+            payload,
+        )
+

 __all__ = [
     "JudgmentSyncClient",

{judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/api/api_types.py

@@ -1,9 +1,9 @@
 # generated by datamodel-codegen:
 #   filename: .openapi.json
-#   timestamp: 2025-
+#   timestamp: 2025-09-10T17:42:12+00:00

 from __future__ import annotations
-from typing import Any, Dict, List, Optional, TypedDict, Union
+from typing import Any, Dict, List, Literal, Optional, TypedDict, Union
 from typing_extensions import NotRequired

@@ -16,40 +16,13 @@ class EvalResultsFetch(TypedDict):


 class DatasetFetch(TypedDict):
-
+    dataset_name: str
     project_name: str

-
-class TraceSave(TypedDict):
-    project_name: str
-    trace_id: str
-    name: str
-    created_at: str
-    duration: float
-    offline_mode: NotRequired[bool]
-    has_notification: NotRequired[bool]
-    customer_id: NotRequired[Optional[str]]
-    tags: NotRequired[List[str]]
-    metadata: NotRequired[Dict[str, Any]]
-    update_id: NotRequired[int]
-
-
-class TraceFetch(TypedDict):
-    trace_id: str
-
-
-class TraceAddToDataset(TypedDict):
-    trace_id: str
-    trace_span_id: str
-    dataset_alias: str
+class DatasetsFetch(TypedDict):
     project_name: str


-class EvaluationRunsBatchRequest(TypedDict):
-    organization_id: str
-    evaluation_entries: List[Dict[str, Any]]
-
-
 class ProjectAdd(TypedDict):
     project_name: str

@@ -149,8 +122,8 @@ class ScorerConfig(TypedDict):


 class Example(TypedDict):
-    example_id: str
-    created_at: str
+    example_id: NotRequired[str]
+    created_at: NotRequired[str]
     name: NotRequired[Optional[str]]

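Making example_id and created_at NotRequired changes what type checkers accept: callers may now omit the server-assigned keys when building an Example dict. A self-contained illustration of the PEP 655 semantics, with the class body mirroring just the keys shown in this hunk:

```python
# PEP 655: NotRequired keys may be omitted when constructing the TypedDict.
from typing import Optional, TypedDict
from typing_extensions import NotRequired

class Example(TypedDict):
    example_id: NotRequired[str]
    created_at: NotRequired[str]
    name: NotRequired[Optional[str]]

partial: Example = {"name": "greeting"}   # valid only after this change
full: Example = {"example_id": "ex-1", "created_at": "2025-09-10", "name": None}
```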
@@ -160,28 +133,7 @@ class ValidationError(TypedDict):
     type: str


-
-    span_id: str
-    trace_id: str
-    function: str
-    created_at: NotRequired[Any]
-    parent_span_id: NotRequired[Optional[str]]
-    span_type: NotRequired[Optional[str]]
-    inputs: NotRequired[Optional[Dict[str, Any]]]
-    output: NotRequired[Any]
-    error: NotRequired[Optional[Dict[str, Any]]]
-    usage: NotRequired[Optional[Dict[str, Any]]]
-    duration: NotRequired[Optional[float]]
-    expected_tools: NotRequired[Optional[List[Dict[str, Any]]]]
-    additional_metadata: NotRequired[Optional[Dict[str, Any]]]
-    has_evaluation: NotRequired[Optional[bool]]
-    agent_name: NotRequired[Optional[str]]
-    class_name: NotRequired[Optional[str]]
-    state_before: NotRequired[Optional[Dict[str, Any]]]
-    state_after: NotRequired[Optional[Dict[str, Any]]]
-    span_state: str
-    update_id: NotRequired[int]
-    queued_at: float
+DatasetKind = Literal["trace", "example"]


 class PromptScorer(TypedDict):
|
@@ -195,36 +147,45 @@ class PromptScorer(TypedDict):
|
|
195
147
|
|
196
148
|
|
197
149
|
class ScorerData(TypedDict):
|
150
|
+
id: NotRequired[str]
|
198
151
|
name: str
|
199
152
|
threshold: float
|
200
153
|
success: bool
|
201
154
|
score: NotRequired[Optional[float]]
|
202
155
|
reason: NotRequired[Optional[str]]
|
203
156
|
strict_mode: NotRequired[Optional[bool]]
|
204
|
-
evaluation_model: NotRequired[
|
157
|
+
evaluation_model: NotRequired[str]
|
205
158
|
error: NotRequired[Optional[str]]
|
206
159
|
additional_metadata: NotRequired[Optional[Dict[str, Any]]]
|
207
160
|
|
208
161
|
|
209
|
-
class
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
162
|
+
class OtelTraceSpan(TypedDict):
|
163
|
+
organization_id: str
|
164
|
+
project_id: NotRequired[Optional[str]]
|
165
|
+
user_id: str
|
166
|
+
timestamp: str
|
167
|
+
trace_id: str
|
168
|
+
span_id: str
|
169
|
+
parent_span_id: NotRequired[Optional[str]]
|
170
|
+
trace_state: NotRequired[Optional[str]]
|
171
|
+
span_name: NotRequired[Optional[str]]
|
172
|
+
span_kind: NotRequired[Optional[str]]
|
173
|
+
service_name: NotRequired[Optional[str]]
|
174
|
+
resource_attributes: NotRequired[Optional[Dict[str, Any]]]
|
175
|
+
span_attributes: NotRequired[Optional[Dict[str, Any]]]
|
176
|
+
duration: NotRequired[Optional[int]]
|
177
|
+
status_code: NotRequired[Optional[str]]
|
178
|
+
status_message: NotRequired[Optional[str]]
|
179
|
+
events: NotRequired[Optional[List[Dict[str, Any]]]]
|
180
|
+
links: NotRequired[Optional[List[Dict[str, Any]]]]
|
181
|
+
legacy_span_id: NotRequired[Optional[str]]
|
182
|
+
inputs: NotRequired[Optional[Dict[str, Any]]]
|
183
|
+
output: Any
|
184
|
+
error: NotRequired[Optional[Dict[str, Any]]]
|
185
|
+
agent_id: NotRequired[Optional[str]]
|
186
|
+
cumulative_llm_cost: NotRequired[Optional[float]]
|
187
|
+
state_after: NotRequired[Optional[Dict[str, Any]]]
|
188
|
+
state_before: NotRequired[Optional[Dict[str, Any]]]
|
228
189
|
|
229
190
|
|
230
191
|
class ExampleEvaluationRun(TypedDict):
|
@@ -257,88 +218,47 @@ class TraceEvaluationRun(TypedDict):


 class DatasetInsertExamples(TypedDict):
-
+    dataset_name: str
     examples: List[Example]
     project_name: str


-class
-
-
-
+class DatasetReturn(TypedDict):
+    name: str
+    project_name: str
+    examples: NotRequired[Optional[List[Example]]]

-class
-
+class DatasetInfo(TypedDict):
+    dataset_id: str
+    name: str
+    created_at: str
+    dataset_kind: DatasetKind
+    entries: int
+    creator: str


-class
-
-
-
-
-
-    span_type: NotRequired[Optional[str]]
-    inputs: NotRequired[Optional[Dict[str, Any]]]
-    error: NotRequired[Optional[Dict[str, Any]]]
-    output: NotRequired[Any]
-    usage: NotRequired[Optional[TraceUsage]]
-    duration: NotRequired[Optional[float]]
-    expected_tools: NotRequired[Optional[List[Tool]]]
-    additional_metadata: NotRequired[Optional[Dict[str, Any]]]
-    has_evaluation: NotRequired[Optional[bool]]
-    agent_name: NotRequired[Optional[str]]
-    class_name: NotRequired[Optional[str]]
-    state_before: NotRequired[Optional[Dict[str, Any]]]
-    state_after: NotRequired[Optional[Dict[str, Any]]]
-    update_id: NotRequired[int]
+class DatasetCreate(TypedDict):
+    name: str
+    dataset_kind: DatasetKind
+    project_name: str
+    examples: NotRequired[Optional[List[Example]]]
+    overwrite: NotRequired[Optional[bool]]


-class
-
-    name: str
-    created_at: str
-    duration: float
-    trace_spans: List[TraceSpan]
-    offline_mode: NotRequired[bool]
-    rules: NotRequired[Dict[str, Any]]
-    has_notification: NotRequired[bool]
-    customer_id: NotRequired[Optional[str]]
-    tags: NotRequired[List[str]]
-    metadata: NotRequired[Dict[str, Any]]
-    update_id: NotRequired[int]
+class FetchPromptScorerResponse(TypedDict):
+    scorer: PromptScorer


 class ScoringResult(TypedDict):
     success: bool
     scorers_data: Optional[List[ScorerData]]
     name: NotRequired[Optional[str]]
-    data_object: NotRequired[Optional[Union[
+    data_object: NotRequired[Optional[Union[OtelTraceSpan, Example]]]
     trace_id: NotRequired[Optional[str]]
     run_duration: NotRequired[Optional[float]]
    evaluation_cost: NotRequired[Optional[float]]


-class TraceRun(TypedDict):
-    project_name: NotRequired[Optional[str]]
-    eval_name: NotRequired[Optional[str]]
-    traces: List[Trace]
-    scorers: List[ScorerConfig]
-    model: str
-    trace_span_id: NotRequired[Optional[str]]
-    tools: NotRequired[Optional[List[Dict[str, Any]]]]
-
-
 class EvalResults(TypedDict):
     results: List[ScoringResult]
     run: Union[ExampleEvaluationRun, TraceEvaluationRun]
-
-
-class DatasetPush(TypedDict):
-    dataset_alias: str
-    comments: NotRequired[Optional[str]]
-    source_file: NotRequired[Optional[str]]
-    examples: NotRequired[Optional[List[Example]]]
-    traces: NotRequired[Optional[List[Trace]]]
-    is_trace: NotRequired[bool]
-    project_name: str
-    overwrite: NotRequired[Optional[bool]]

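Since these are TypedDicts, the new dataset payloads are plain dicts. A sketch with illustrative values; the import path follows the api_types.py location in the file table:

```python
# Sketch: composing the new dataset request/response shapes (values invented).
from judgeval.api.api_types import DatasetCreate, DatasetInfo

payload: DatasetCreate = {
    "name": "checkout-traces",
    "dataset_kind": "trace",        # DatasetKind = Literal["trace", "example"]
    "project_name": "default_project",
    "overwrite": True,              # NotRequired: may be omitted
}

info: DatasetInfo = {               # shape returned by datasets_pull_all_for_judgeval
    "dataset_id": "ds_123",
    "name": "checkout-traces",
    "created_at": "2025-09-10T17:42:12+00:00",
    "dataset_kind": "trace",
    "entries": 0,
    "creator": "ci-bot",
}
```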
{judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/constants.py

@@ -14,16 +14,12 @@ class APIScorerType(str, Enum):
     """

     PROMPT_SCORER = "Prompt Scorer"
+    TRACE_PROMPT_SCORER = "Trace Prompt Scorer"
     FAITHFULNESS = "Faithfulness"
     ANSWER_RELEVANCY = "Answer Relevancy"
     ANSWER_CORRECTNESS = "Answer Correctness"
     INSTRUCTION_ADHERENCE = "Instruction Adherence"
     EXECUTION_ORDER = "Execution Order"
-    DERAILMENT = "Derailment"
-    TOOL_ORDER = "Tool Order"
-    MOCK_TRACE_SCORER = "Mock Trace Scorer"
-    CLASSIFIER = "Classifier"
-    TOOL_DEPENDENCY = "Tool Dependency"
     CUSTOM = "Custom"

     @classmethod
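Because APIScorerType subclasses str, members compare equal to their wire values and can be rebuilt from them via standard Enum value lookup; a quick check of the 0.10.1 surface:

```python
from judgeval.constants import APIScorerType

assert APIScorerType.TRACE_PROMPT_SCORER == "Trace Prompt Scorer"   # new member
assert APIScorerType("Faithfulness") is APIScorerType.FAITHFULNESS  # lookup by value
assert "Derailment" not in {m.value for m in APIScorerType}         # removed in 0.10.1
```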
{judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/data/__init__.py

@@ -1,7 +1,7 @@
 from judgeval.data.example import Example, ExampleParams
 from judgeval.data.scorer_data import ScorerData, create_scorer_data
 from judgeval.data.result import ScoringResult, generate_scoring_result
-from judgeval.data.trace import
+from judgeval.data.trace import TraceUsage


 __all__ = [

@@ -11,7 +11,5 @@ __all__ = [
     "create_scorer_data",
     "ScoringResult",
     "generate_scoring_result",
-    "Trace",
-    "TraceSpan",
     "TraceUsage",
 ]

{judgeval-0.9.4 → judgeval-0.10.1}/src/judgeval/data/example.py

@@ -6,6 +6,8 @@ from enum import Enum
 from datetime import datetime
 from typing import Dict, Any, Optional
 from judgeval.data.judgment_types import Example as JudgmentExample
+from uuid import uuid4
+from pydantic import Field


 class ExampleParams(str, Enum):

@@ -20,8 +22,8 @@ class ExampleParams(str, Enum):


 class Example(JudgmentExample):
-    example_id: str =
-    created_at: str = datetime.now().isoformat()
+    example_id: str = Field(default_factory=lambda: str(uuid4()))
+    created_at: str = Field(default_factory=lambda: datetime.now().isoformat())
     name: Optional[str] = None

     def to_dict(self) -> Dict[str, Any]:
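The created_at change fixes a classic default-evaluation bug: a bare `datetime.now().isoformat()` runs once, when the class body is evaluated at import time, so every Example previously shared a single timestamp (and, analogously, a single example_id). `Field(default_factory=...)` defers evaluation to each instantiation. A standalone sketch of the difference, assuming pydantic v2:

```python
import time
from datetime import datetime
from pydantic import BaseModel, Field

class Stale(BaseModel):
    # Evaluated once at class-definition time; shared by all instances.
    created_at: str = datetime.now().isoformat()

class Fresh(BaseModel):
    # Factory runs again on every instantiation.
    created_at: str = Field(default_factory=lambda: datetime.now().isoformat())

a = Stale()
time.sleep(0.01)
b = Stale()
assert a.created_at == b.created_at   # same frozen timestamp

c = Fresh()
time.sleep(0.01)
d = Fresh()
assert c.created_at != d.created_at   # fresh per instance
```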
|