judgeval 0.9.3.tar.gz → 0.10.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {judgeval-0.9.3 → judgeval-0.10.0}/.github/workflows/ci.yaml +33 -22
- {judgeval-0.9.3 → judgeval-0.10.0}/.pre-commit-config.yaml +2 -2
- {judgeval-0.9.3 → judgeval-0.10.0}/PKG-INFO +2 -2
- {judgeval-0.9.3 → judgeval-0.10.0}/pyproject.toml +3 -2
- {judgeval-0.9.3 → judgeval-0.10.0}/scripts/api_generator.py +4 -4
- {judgeval-0.9.3 → judgeval-0.10.0}/scripts/openapi_transform.py +2 -3
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/__init__.py +2 -2
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/api/__init__.py +28 -96
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/api/api_types.py +49 -140
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/constants.py +1 -5
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/data/__init__.py +1 -3
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/data/example.py +4 -2
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/data/judgment_types.py +57 -165
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/data/result.py +1 -2
- judgeval-0.10.0/src/judgeval/data/trace.py +14 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/dataset/__init__.py +15 -42
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/evaluation/__init__.py +23 -34
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/scorers/__init__.py +9 -7
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/scorers/api_scorer.py +8 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/scorers/base_scorer.py +0 -1
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -10
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -2
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -2
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -2
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +43 -4
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/tracer/__init__.py +40 -93
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/tracer/local_eval_queue.py +2 -2
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/tracer/processors/__init__.py +84 -6
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/tracer/utils.py +1 -1
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/trainer/trainer.py +4 -4
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/utils/serialize.py +7 -1
- {judgeval-0.9.3 → judgeval-0.10.0}/uv.lock +2591 -2039
- judgeval-0.9.3/src/judgeval/data/trace.py +0 -40
- judgeval-0.9.3/src/judgeval/data/trace_run.py +0 -39
- judgeval-0.9.3/src/judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
- judgeval-0.9.3/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
- judgeval-0.9.3/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
- judgeval-0.9.3/src/judgeval/scorers/trace_api_scorer.py +0 -5
- {judgeval-0.9.3 → judgeval-0.10.0}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/.github/pull_request_template.md +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/.github/workflows/blocked-pr.yaml +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/.github/workflows/claude-code-review.yml +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/.github/workflows/claude.yml +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/.github/workflows/lint.yaml +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/.github/workflows/merge-branch-check.yaml +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/.github/workflows/mypy.yaml +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/.github/workflows/pre-commit-autoupdate.yaml +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/.github/workflows/release.yaml +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/.github/workflows/validate-branch.yaml +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/.gitignore +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/LICENSE.md +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/README.md +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/assets/Screenshot 2025-05-17 at 8.14.27 PM.png +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/assets/agent.gif +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/assets/agent_trace_example.png +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/assets/data.gif +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/assets/dataset_clustering_screenshot.png +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/assets/dataset_clustering_screenshot_dm.png +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/assets/datasets_preview_screenshot.png +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/assets/document.gif +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/assets/error_analysis_dashboard.png +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/assets/errors.png +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/assets/experiments_dashboard_screenshot.png +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/assets/experiments_page.png +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/assets/experiments_pagev2.png +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/assets/logo-dark.svg +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/assets/logo-light.svg +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/assets/monitoring_screenshot.png +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/assets/new_darkmode.svg +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/assets/new_lightmode.svg +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/assets/online_eval.png +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/assets/product_shot.png +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/assets/test.png +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/assets/tests.png +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/assets/trace.gif +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/assets/trace_demo.png +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/assets/trace_screenshot.png +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/assets/trace_screenshot_old.png +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/pytest.ini +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/scripts/update_types.sh +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/cli.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/data/evaluation_run.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/data/scorer_data.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/data/scripts/fix_default_factory.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/data/scripts/openapi_transform.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/data/tool.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/env.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/exceptions.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/integrations/langgraph/__init__.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/judges/__init__.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/judges/base_judge.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/judges/litellm_judge.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/judges/together_judge.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/judges/utils.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/logger.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/scorers/agent_scorer.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/scorers/example_scorer.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/scorers/exceptions.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/scorers/score.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/scorers/utils.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/tracer/constants.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/tracer/exporters/__init__.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/tracer/exporters/s3.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/tracer/exporters/store.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/tracer/exporters/utils.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/tracer/keys.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/tracer/llm/__init__.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/tracer/llm/providers.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/tracer/managers.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/trainer/__init__.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/trainer/config.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/trainer/console.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/trainer/trainable_model.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/utils/async_utils.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/utils/decorators.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/utils/file_utils.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/utils/guards.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/utils/meta.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/utils/testing.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/utils/url.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/utils/version_check.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/version.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/warnings.py +0 -0
- {judgeval-0.9.3 → judgeval-0.10.0}/update_version.py +0 -0
{judgeval-0.9.3 → judgeval-0.10.0}/.github/workflows/ci.yaml

@@ -18,7 +18,10 @@ jobs:
       matrix:
         os: [ubuntu-latest, macos-latest]
         python-version:
+          - "3.10"
           - "3.11"
+          - "3.12"
+          - "3.13"
     name: Unit Tests
     runs-on: ${{ matrix.os }}
     env:
@@ -49,18 +52,19 @@ jobs:
   run-e2e-tests-staging:
     needs: [validate-branch]
     if: "github.base_ref == 'staging' && !contains(github.actor, '[bot]') && (needs.validate-branch.result == 'success' || needs.validate-branch.result == 'skipped')"
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version:
+          - "3.10"
+          - "3.11"
+          - "3.12"
+          - "3.13"
     name: Staging E2E Tests
     runs-on: ubuntu-latest
     env:
       TEST_TIMEOUT_SECONDS: ${{ secrets.TEST_TIMEOUT_SECONDS }}
     steps:
-      - name: Wait for turn
-        uses: softprops/turnstyle@v2
-        with:
-          poll-interval-seconds: 10
-          same-branch-only: false
-          job-to-wait-for: "Staging E2E Tests"
-
       - name: Configure AWS Credentials
         uses: aws-actions/configure-aws-credentials@v4
         with:
@@ -74,7 +78,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
-          python-version:
+          python-version: ${{ matrix.python-version }}
 
       - name: Install judgeval dependencies
         run: |
@@ -93,32 +97,36 @@ jobs:
       - name: Run E2E tests
         working-directory: src
         run: |
-          SECRET_VARS=$(aws secretsmanager get-secret-value --secret-id
+          SECRET_VARS=$(aws secretsmanager get-secret-value --secret-id stg/api-keys/e2e-tests --query SecretString --output text)
           export $(echo "$SECRET_VARS" | jq -r 'to_entries | .[] | "\(.key)=\(.value)"')
-
+          export JUDGMENT_API_KEY="$JUDGEVAL_GH_JUDGMENT_API_KEY"
+          export JUDGMENT_ORG_ID="$JUDGEVAL_GH_JUDGMENT_ORG_ID"
+          export JUDGMENT_API_URL=https://staging.api.judgmentlabs.ai
+          timeout ${TEST_TIMEOUT_SECONDS}s uv run pytest -n auto --dist=loadfile --durations=0 --cov=. --cov-config=.coveragerc --cov-report=html ./e2etests
 
       - name: Upload coverage HTML report (staging)
         if: always()
         uses: actions/upload-artifact@v4
         with:
-          name: coverage-html-staging
+          name: coverage-html-staging-${{ matrix.python-version }}
           path: src/htmlcov
 
   run-e2e-tests-main:
     needs: [validate-branch]
     if: "github.base_ref == 'main' && !contains(github.actor, '[bot]') && needs.validate-branch.result == 'success'"
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version:
+          - "3.10"
+          - "3.11"
+          - "3.12"
+          - "3.13"
     name: Production E2E Tests
     runs-on: ubuntu-latest
     env:
       TEST_TIMEOUT_SECONDS: ${{ secrets.TEST_TIMEOUT_SECONDS }}
     steps:
-      - name: Wait for turn
-        uses: softprops/turnstyle@v2
-        with:
-          poll-interval-seconds: 10
-          same-branch-only: false
-          job-to-wait-for: "Production E2E Tests"
-
       - name: Configure AWS Credentials
         uses: aws-actions/configure-aws-credentials@v4
         with:
@@ -132,7 +140,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
-          python-version:
+          python-version: ${{ matrix.python-version }}
 
       - name: Install judgeval dependencies
         run: |
@@ -151,13 +159,16 @@ jobs:
       - name: Run E2E tests
         working-directory: src
         run: |
-          SECRET_VARS=$(aws secretsmanager get-secret-value --secret-id
+          SECRET_VARS=$(aws secretsmanager get-secret-value --secret-id prod/api-keys/e2e-tests --query SecretString --output text)
           export $(echo "$SECRET_VARS" | jq -r 'to_entries | .[] | "\(.key)=\(.value)"')
-
+          export JUDGMENT_API_KEY="$JUDGEVAL_GH_JUDGMENT_API_KEY"
+          export JUDGMENT_ORG_ID="$JUDGEVAL_GH_JUDGMENT_ORG_ID"
+          export JUDGMENT_API_URL=https://api.judgmentlabs.ai
+          timeout ${TEST_TIMEOUT_SECONDS}s uv run pytest -n auto --dist=loadfile --durations=0 --cov=. --cov-config=.coveragerc --cov-report=html ./e2etests
 
       - name: Upload coverage HTML report (production)
         if: always()
         uses: actions/upload-artifact@v4
         with:
-          name: coverage-html-production
+          name: coverage-html-production-${{ matrix.python-version }}
           path: src/htmlcov
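Both E2E jobs now fan the suite out with pytest-xdist (added to the dev dependencies in pyproject.toml below), using `--dist=loadfile` so tests from the same file stay on one worker. As a minimal sketch, this is the local equivalent of the new CI invocation via pytest's Python entry point; the `./e2etests` path is taken from the workflow, everything else is standard pytest/pytest-xdist usage:

```python
# Minimal local equivalent of the new CI test invocation. Requires the
# pytest-xdist plugin added to the dev dependencies in this release.
import pytest

exit_code = pytest.main([
    "-n", "auto",        # one worker per available CPU core
    "--dist=loadfile",   # keep tests from the same file on the same worker
    "--durations=0",     # report all test durations, as in CI
    "./e2etests",
])
raise SystemExit(exit_code)
```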
{judgeval-0.9.3 → judgeval-0.10.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.9.3
+Version: 0.10.0
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -9,7 +9,7 @@ License-Expression: Apache-2.0
 License-File: LICENSE.md
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
-Requires-Python: >=3.11
+Requires-Python: >=3.10
 Requires-Dist: boto3>=1.40.11
 Requires-Dist: click<8.2.0
 Requires-Dist: dotenv
{judgeval-0.9.3 → judgeval-0.10.0}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "judgeval"
-version = "0.9.3"
+version = "0.10.0"
 authors = [
     { name = "Andrew Li", email = "andrew@judgmentlabs.ai" },
     { name = "Alex Shan", email = "alex@judgmentlabs.ai" },
@@ -8,7 +8,7 @@ authors = [
 ]
 description = "Judgeval Package"
 readme = "README.md"
-requires-python = ">=3.11"
+requires-python = ">=3.10"
 classifiers = [
     "Programming Language :: Python :: 3",
     "Operating System :: OS Independent",
@@ -75,6 +75,7 @@ dev = [
     "pytest-cov>=6.2.1",
     "types-tqdm>=4.67.0.20250809",
     "pytest-asyncio>=1.1.0",
+    "pytest-xdist>=3.8.0",
 ]
 
 
{judgeval-0.9.3 → judgeval-0.10.0}/scripts/api_generator.py

@@ -36,13 +36,13 @@ JUDGEVAL_PATHS: List[str] = [
     "/fetch_scorer/",
     "/scorer_exists/",
     "/upload_custom_scorer/",
-    "/datasets/
-    "/datasets/
+    "/datasets/create_for_judgeval/",
+    "/datasets/insert_examples_for_judgeval/",
     "/datasets/pull_for_judgeval/",
-    "/datasets/fetch_stats_by_project/",
     "/projects/resolve/",
     "/e2e_fetch_trace/",
     "/e2e_fetch_span_score/",
+    "/e2e_fetch_trace_scorer_span_score/",
 ]
 
 
@@ -253,7 +253,7 @@ def generate_client_class(
 
 def generate_api_file() -> str:
     lines = [
-        "from typing import
+        "from typing import Dict, Any, Mapping, Literal, Optional",
         "import httpx",
         "from httpx import Response",
         "from judgeval.exceptions import JudgmentAPIError",
{judgeval-0.9.3 → judgeval-0.10.0}/scripts/openapi_transform.py

@@ -35,10 +35,9 @@ JUDGEVAL_PATHS: List[str] = [
     "/fetch_scorer/",
     "/scorer_exists/",
     "/upload_custom_scorer/",
-    "/datasets/
-    "/datasets/
+    "/datasets/create_for_judgeval/",
+    "/datasets/insert_examples_for_judgeval/",
     "/datasets/pull_for_judgeval/",
-    "/datasets/fetch_stats_by_project/",
     "/projects/resolve/",
     "/e2e_fetch_trace/",
     "/e2e_fetch_span_score/",
{judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/__init__.py

@@ -6,7 +6,7 @@ from judgeval.data.evaluation_run import ExampleEvaluationRun
 
 
 from typing import List, Optional, Union
-from judgeval.scorers import BaseScorer,
+from judgeval.scorers import BaseScorer, ExampleAPIScorerConfig
 from judgeval.data.example import Example
 from judgeval.logger import judgeval_logger
 from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_DEFAULT_GPT_MODEL, JUDGMENT_ORG_ID
@@ -38,7 +38,7 @@ class JudgmentClient(metaclass=SingletonMeta):
     def run_evaluation(
         self,
         examples: List[Example],
-        scorers: List[Union[
+        scorers: List[Union[ExampleAPIScorerConfig, BaseScorer]],
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
         model: str = JUDGMENT_DEFAULT_GPT_MODEL,
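The signature change above means `run_evaluation` now accepts server-hosted scorer configs (`ExampleAPIScorerConfig`) alongside local `BaseScorer` instances. A minimal usage sketch against the new signature follows; the `FaithfulnessScorer` import, its `threshold` kwarg, and the `Example` field names are assumptions based on the package layout, not shown in this diff:

```python
# Hypothetical usage sketch for the judgeval 0.10.0 run_evaluation signature
# shown above. Example field names and the scorer constructor are assumed.
from judgeval import JudgmentClient
from judgeval.data.example import Example
from judgeval.scorers import FaithfulnessScorer  # assumed ExampleAPIScorerConfig subclass

client = JudgmentClient()  # singleton; assumed to read JUDGMENT_API_KEY / JUDGMENT_ORG_ID from env

example = Example(
    input="What is the capital of France?",           # assumed field name
    actual_output="Paris is the capital of France.",  # assumed field name
)

# scorers is now typed List[Union[ExampleAPIScorerConfig, BaseScorer]]
results = client.run_evaluation(
    examples=[example],
    scorers=[FaithfulnessScorer(threshold=0.5)],  # threshold kwarg assumed
    project_name="default_project",
    eval_run_name="default_eval_run",
)
```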
{judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/api/__init__.py

@@ -71,13 +71,6 @@ class JudgmentSyncClient:
             payload,
         )
 
-    def evaluate_trace(self, payload: TraceRun) -> Any:
-        return self._request(
-            "POST",
-            url_for("/evaluate_trace/"),
-            payload,
-        )
-
     def evaluate_examples(
         self, payload: ExampleEvaluationRun, stream: Optional[str] = None
     ) -> Any:
@@ -128,59 +121,26 @@ class JudgmentSyncClient:
             query_params,
         )
 
-    def
+    def datasets_insert_examples_for_judgeval(
+        self, payload: DatasetInsertExamples
+    ) -> Any:
         return self._request(
             "POST",
-            url_for("/datasets/
+            url_for("/datasets/insert_examples_for_judgeval/"),
             payload,
         )
 
-    def datasets_pull_for_judgeval(self, payload: DatasetFetch) ->
+    def datasets_pull_for_judgeval(self, payload: DatasetFetch) -> DatasetReturn:
         return self._request(
             "POST",
             url_for("/datasets/pull_for_judgeval/"),
             payload,
         )
 
-    def
+    def datasets_create_for_judgeval(self, payload: DatasetCreate) -> Any:
         return self._request(
             "POST",
-            url_for("/datasets/
-            payload,
-        )
-
-    def traces_upsert(self, payload: TraceSave) -> Any:
-        return self._request(
-            "POST",
-            url_for("/traces/upsert/"),
-            payload,
-        )
-
-    def traces_fetch(self, payload: TraceFetch) -> Any:
-        return self._request(
-            "POST",
-            url_for("/traces/fetch/"),
-            payload,
-        )
-
-    def traces_add_to_dataset(self, payload: TraceAddToDataset) -> Any:
-        return self._request(
-            "POST",
-            url_for("/traces/add_to_dataset/"),
-            payload,
-        )
-
-    def traces_spans_batch(self, payload: SpansBatchRequest) -> Any:
-        return self._request(
-            "POST",
-            url_for("/traces/spans/batch/"),
-            payload,
-        )
-
-    def traces_evaluation_runs_batch(self, payload: EvaluationRunsBatchRequest) -> Any:
-        return self._request(
-            "POST",
-            url_for("/traces/evaluation_runs/batch/"),
+            url_for("/datasets/create_for_judgeval/"),
             payload,
         )
 
@@ -255,6 +215,13 @@ class JudgmentSyncClient:
             payload,
         )
 
+    def e2e_fetch_trace_scorer_span_score(self, payload: SpanScoreRequest) -> Any:
+        return self._request(
+            "POST",
+            url_for("/e2e_fetch_trace_scorer_span_score/"),
+            payload,
+        )
+
 
 class JudgmentAsyncClient:
     __slots__ = ("api_key", "organization_id", "client")
@@ -304,13 +271,6 @@ class JudgmentAsyncClient:
             payload,
         )
 
-    async def evaluate_trace(self, payload: TraceRun) -> Any:
-        return await self._request(
-            "POST",
-            url_for("/evaluate_trace/"),
-            payload,
-        )
-
     async def evaluate_examples(
         self, payload: ExampleEvaluationRun, stream: Optional[str] = None
     ) -> Any:
@@ -363,61 +323,26 @@ class JudgmentAsyncClient:
             query_params,
         )
 
-    async def
+    async def datasets_insert_examples_for_judgeval(
+        self, payload: DatasetInsertExamples
+    ) -> Any:
         return await self._request(
             "POST",
-            url_for("/datasets/
+            url_for("/datasets/insert_examples_for_judgeval/"),
             payload,
         )
 
-    async def datasets_pull_for_judgeval(self, payload: DatasetFetch) ->
+    async def datasets_pull_for_judgeval(self, payload: DatasetFetch) -> DatasetReturn:
         return await self._request(
             "POST",
             url_for("/datasets/pull_for_judgeval/"),
             payload,
         )
 
-    async def
-        return await self._request(
-            "POST",
-            url_for("/datasets/push/"),
-            payload,
-        )
-
-    async def traces_upsert(self, payload: TraceSave) -> Any:
-        return await self._request(
-            "POST",
-            url_for("/traces/upsert/"),
-            payload,
-        )
-
-    async def traces_fetch(self, payload: TraceFetch) -> Any:
-        return await self._request(
-            "POST",
-            url_for("/traces/fetch/"),
-            payload,
-        )
-
-    async def traces_add_to_dataset(self, payload: TraceAddToDataset) -> Any:
-        return await self._request(
-            "POST",
-            url_for("/traces/add_to_dataset/"),
-            payload,
-        )
-
-    async def traces_spans_batch(self, payload: SpansBatchRequest) -> Any:
-        return await self._request(
-            "POST",
-            url_for("/traces/spans/batch/"),
-            payload,
-        )
-
-    async def traces_evaluation_runs_batch(
-        self, payload: EvaluationRunsBatchRequest
-    ) -> Any:
+    async def datasets_create_for_judgeval(self, payload: DatasetCreate) -> Any:
         return await self._request(
             "POST",
-            url_for("/
+            url_for("/datasets/create_for_judgeval/"),
             payload,
         )
 
@@ -494,6 +419,13 @@ class JudgmentAsyncClient:
             payload,
         )
 
+    async def e2e_fetch_trace_scorer_span_score(self, payload: SpanScoreRequest) -> Any:
+        return await self._request(
+            "POST",
+            url_for("/e2e_fetch_trace_scorer_span_score/"),
+            payload,
+        )
+
 
 __all__ = [
     "JudgmentSyncClient",