judgeval 0.4.0__tar.gz → 0.6.0__tar.gz
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- {judgeval-0.4.0 → judgeval-0.6.0}/.github/pull_request_template.md +1 -8
- {judgeval-0.4.0 → judgeval-0.6.0}/PKG-INFO +11 -12
- {judgeval-0.4.0 → judgeval-0.6.0}/pyproject.toml +33 -32
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/__init__.py +2 -0
- judgeval-0.6.0/src/judgeval/cli.py +65 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/clients.py +2 -1
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/common/api/api.py +46 -54
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/common/api/constants.py +18 -5
- judgeval-0.6.0/src/judgeval/common/api/json_encoder.py +241 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/common/tracer/core.py +772 -467
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/common/tracer/otel_span_processor.py +1 -1
- judgeval-0.6.0/src/judgeval/common/tracer/providers.py +119 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/common/tracer/span_processor.py +1 -1
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/common/tracer/span_transformer.py +16 -26
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/constants.py +1 -0
- judgeval-0.6.0/src/judgeval/data/evaluation_run.py +104 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/data/judgment_types.py +38 -8
- judgeval-0.6.0/src/judgeval/data/trace.py +83 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/data/trace_run.py +2 -3
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/dataset.py +2 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/integrations/langgraph.py +2 -1
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/judges/litellm_judge.py +2 -1
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/judges/mixture_of_judges.py +2 -1
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/judges/utils.py +2 -1
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/judgment_client.py +113 -53
- judgeval-0.6.0/src/judgeval/local_eval_queue.py +190 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/run_evaluation.py +43 -197
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/scorers/base_scorer.py +9 -10
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +17 -3
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/scorers/score.py +33 -11
- judgeval-0.6.0/src/judgeval/utils/async_utils.py +36 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/uv.lock +54 -530
- judgeval-0.4.0/src/judgeval/data/trace.py +0 -199
- judgeval-0.4.0/src/judgeval/evaluation_run.py +0 -76
- {judgeval-0.4.0 → judgeval-0.6.0}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/.github/workflows/blocked-pr.yaml +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/.github/workflows/ci.yaml +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/.github/workflows/lint.yaml +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/.github/workflows/merge-branch-check.yaml +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/.github/workflows/mypy.yaml +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/.github/workflows/pre-commit-autoupdate.yaml +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/.github/workflows/release.yaml +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/.github/workflows/validate-branch.yaml +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/.gitignore +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/.pre-commit-config.yaml +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/LICENSE.md +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/README.md +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/assets/Screenshot 2025-05-17 at 8.14.27 PM.png +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/assets/agent.gif +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/assets/agent_trace_example.png +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/assets/data.gif +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/assets/dataset_clustering_screenshot.png +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/assets/dataset_clustering_screenshot_dm.png +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/assets/datasets_preview_screenshot.png +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/assets/document.gif +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/assets/error_analysis_dashboard.png +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/assets/errors.png +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/assets/experiments_dashboard_screenshot.png +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/assets/experiments_page.png +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/assets/experiments_pagev2.png +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/assets/logo-dark.svg +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/assets/logo-light.svg +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/assets/monitoring_screenshot.png +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/assets/new_darkmode.svg +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/assets/new_lightmode.svg +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/assets/online_eval.png +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/assets/product_shot.png +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/assets/test.png +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/assets/tests.png +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/assets/trace.gif +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/assets/trace_demo.png +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/assets/trace_screenshot.png +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/assets/trace_screenshot_old.png +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/pytest.ini +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/.coveragerc +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/common/__init__.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/common/api/__init__.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/common/exceptions.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/common/logger.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/common/storage/__init__.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/common/storage/s3_storage.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/common/tracer/__init__.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/common/tracer/constants.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/common/tracer/otel_exporter.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/common/tracer/trace_manager.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/common/utils.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/data/__init__.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/data/example.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/data/result.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/data/scorer_data.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/data/scripts/fix_default_factory.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/data/scripts/openapi_transform.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/data/tool.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/judges/__init__.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/judges/base_judge.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/judges/together_judge.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/rules.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/scorers/__init__.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/scorers/agent_scorer.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/scorers/api_scorer.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/scorers/example_scorer.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/scorers/exceptions.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/scorers/utils.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/tracer/__init__.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/utils/alerts.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/utils/file_utils.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/utils/requests.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/version_check.py +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/src/update_types.sh +0 -0
- {judgeval-0.4.0 → judgeval-0.6.0}/update_version.py +0 -0
{judgeval-0.4.0 → judgeval-0.6.0}/.github/pull_request_template.md

```diff
@@ -10,14 +10,7 @@
 -->
 - [ ] 1. ...

-## 🎥 Demo of Changes
-
-<!-- Add a short 1-3 minute video describing/demoing the changes -->
-
 ## ✅ Checklist

-- [ ] Tagged Linear ticket in PR title. Ie. PR Title (JUD-XXXX)
-- [ ] Video demo of changes
-- [ ] Reviewers assigned
 - [ ] Docs updated ([if necessary](https://github.com/JudgmentLabs/docs))
-- [ ]
+- [ ] Changelogs are updated ([if necessary](https://github.com/JudgmentLabs/docs/tree/main/content/docs/changelog/%28weekly%29))
```
{judgeval-0.4.0 → judgeval-0.6.0}/PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.4.0
+Version: 0.6.0
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -10,27 +10,26 @@ License-File: LICENSE.md
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.11
-Requires-Dist: anthropic
 Requires-Dist: boto3
-Requires-Dist: datamodel-code-generator>=0.31.1
-Requires-Dist: google-genai
-Requires-Dist: groq>=0.30.0
+Requires-Dist: click<8.2.0
 Requires-Dist: langchain-anthropic
 Requires-Dist: langchain-core
 Requires-Dist: langchain-huggingface
 Requires-Dist: langchain-openai
 Requires-Dist: litellm>=1.61.15
-Requires-Dist: matplotlib>=3.10.3
-Requires-Dist: nest-asyncio
-Requires-Dist: openai
+Requires-Dist: nest-asyncio>=1.6.0
 Requires-Dist: opentelemetry-api>=1.34.1
 Requires-Dist: opentelemetry-sdk>=1.34.1
 Requires-Dist: orjson>=3.9.0
-Requires-Dist: pandas
-Requires-Dist: python-dotenv==1.0.1
-Requires-Dist: python-slugify>=8.0.4
+Requires-Dist: python-dotenv
 Requires-Dist: requests
-Requires-Dist: together
+Requires-Dist: rich
+Requires-Dist: typer>=0.9.0
+Provides-Extra: langchain
+Requires-Dist: langchain-anthropic; extra == 'langchain'
+Requires-Dist: langchain-core; extra == 'langchain'
+Requires-Dist: langchain-huggingface; extra == 'langchain'
+Requires-Dist: langchain-openai; extra == 'langchain'
 Description-Content-Type: text/markdown

 <div align="center">
```
{judgeval-0.4.0 → judgeval-0.6.0}/pyproject.toml

```diff
@@ -1,10 +1,10 @@
 [project]
 name = "judgeval"
-version = "0.4.0"
+version = "0.6.0"
 authors = [
-    { name="Andrew Li", email="andrew@judgmentlabs.ai" },
-    { name="Alex Shan", email="alex@judgmentlabs.ai" },
-    { name="Joseph Camyre", email="joseph@judgmentlabs.ai" },
+    { name = "Andrew Li", email = "andrew@judgmentlabs.ai" },
+    { name = "Alex Shan", email = "alex@judgmentlabs.ai" },
+    { name = "Joseph Camyre", email = "joseph@judgmentlabs.ai" },
 ]
 description = "Judgeval Package"
 readme = "README.md"
@@ -16,54 +16,54 @@ classifiers = [
 license = "Apache-2.0"
 license-files = ["LICENSE.md"]
 dependencies = [
+    "rich",
     "litellm>=1.61.15",
-    "python-dotenv==1.0.1",
+    "python-dotenv",
     "requests",
-    "pandas",
-    "openai",
-    "together",
-    "anthropic",
-    "nest-asyncio",
-    "langchain-huggingface",
-    "langchain-openai",
-    "langchain-anthropic",
-    "langchain-core",
-    "google-genai",
     "boto3",
-    "matplotlib>=3.10.3",
-    "python-slugify>=8.0.4",
-    "datamodel-code-generator>=0.31.1",
-    "groq>=0.30.0",
     "opentelemetry-api>=1.34.1",
     "opentelemetry-sdk>=1.34.1",
     "orjson>=3.9.0",
+    "nest-asyncio>=1.6.0",
+    "langchain-huggingface",
+    "langchain-openai",
+    "langchain-anthropic",
+    "langchain-core",
+    "click<8.2.0",
+    "typer>=0.9.0",
 ]

 [project.urls]
 Homepage = "https://github.com/JudgmentLabs/judgeval"
 Issues = "https://github.com/JudgmentLabs/judgeval/issues"

+[project.scripts]
+judgeval = "judgeval.cli:app"
+
 [build-system]
 requires = ["hatchling"]
 build-backend = "hatchling.build"

 [tool.hatch.build.targets.wheel]
 packages = ["src/judgeval"]
-include = [
-
-
+include = ["/src/judgeval", "/src/judgeval/**/*.py"]
+
+[project.optional-dependencies]
+langchain = [
+    "langchain-huggingface",
+    "langchain-openai",
+    "langchain-anthropic",
+    "langchain-core",
 ]

 [dependency-groups]
 dev = [
     "chromadb>=1.0.12",
-    "langchain-community>=0.3.24",
     "pytest>=8.4.0",
     "pytest-asyncio>=1.0.0",
     "pytest-cov>=6.1.1",
     "pytest-mock>=3.14.1",
     "tavily-python>=0.7.5",
-    "langgraph>=0.4.3",
     "pre-commit>=4.2.0",
     "types-requests>=2.32.4.20250611",
     "mypy>=1.17.0",
@@ -90,18 +90,19 @@ dev = [
     "types-tqdm>=4.67.0.20250516",
     "types-tree-sitter-languages>=1.10.0.20250530",
     "types-xmltodict>=0.14.0.20241009",
+    "datamodel-code-generator>=0.31.2",
+    "openai",
+    "together",
+    "anthropic",
+    "google-genai",
+    "groq",
+    "langgraph>=0.4.3",
 ]

 [tool.hatch.build]
 directory = "dist"
-artifacts = [
-
-]
-exclude = [
-    "src/e2etests/*",
-    "src/tests/*",
-    "src/demo/*"
-]
+artifacts = ["src/judgeval/**/*.py"]
+exclude = ["src/e2etests/*", "src/tests/*", "src/demo/*"]

 [tool.ruff]
 exclude = ["docs"]
```
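The two structural additions here are a console-script entry point (`judgeval = "judgeval.cli:app"`), which puts a `judgeval` command on PATH at install time, and a `langchain` extra, so the LangChain integrations can also be pulled in with `pip install "judgeval[langchain]"`. A minimal sketch of exercising the new entry point in-process via Typer's standard test runner rather than a shell; it assumes judgeval 0.6.0 is installed, and the `version` subcommand comes from the new src/judgeval/cli.py shown later in this diff:

```python
# Sketch: drive the new CLI in-process. `judgeval.cli:app` is the object the
# [project.scripts] entry resolves to.
from typer.testing import CliRunner

from judgeval.cli import app

runner = CliRunner()
result = runner.invoke(app, ["version"])
print(result.exit_code)  # 0 on success; the command's output goes through judgeval_logger
```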
{judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/__init__.py

```diff
@@ -2,6 +2,7 @@
 from judgeval.clients import client, together_client
 from judgeval.judgment_client import JudgmentClient
 from judgeval.version_check import check_latest_version
+from judgeval.local_eval_queue import LocalEvaluationQueue

 check_latest_version()

@@ -10,4 +11,5 @@ __all__ = [
     "client",
     "together_client",
     "JudgmentClient",
+    "LocalEvaluationQueue",
 ]
```
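0.6.0 also re-exports the new local evaluation queue at the package root. Only the import path is established by this diff (the implementation in src/judgeval/local_eval_queue.py, +190 lines, is not shown), so the sketch below deliberately stops at the import:

```python
# New in 0.6.0: LocalEvaluationQueue is importable from the package root.
# Its constructor and methods live in judgeval/local_eval_queue.py (not shown here).
from judgeval import LocalEvaluationQueue

print(LocalEvaluationQueue.__module__)  # expected: judgeval.local_eval_queue
```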
judgeval-0.6.0/src/judgeval/cli.py (new file)

```diff
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+
+import typer
+from pathlib import Path
+from dotenv import load_dotenv
+from judgeval.common.logger import judgeval_logger
+from judgeval.judgment_client import JudgmentClient
+
+load_dotenv()
+
+app = typer.Typer(
+    no_args_is_help=True,
+    rich_markup_mode=None,
+    rich_help_panel=None,
+    pretty_exceptions_enable=False,
+    pretty_exceptions_show_locals=False,
+    pretty_exceptions_short=False,
+)
+
+
+@app.command("upload_scorer")
+def upload_scorer(
+    scorer_file_path: str,
+    requirements_file_path: str,
+    unique_name: str = typer.Option(
+        None, help="Custom name for the scorer (auto-detected if not provided)"
+    ),
+):
+    # Validate file paths
+    if not Path(scorer_file_path).exists():
+        judgeval_logger.error(f"Scorer file not found: {scorer_file_path}")
+        raise typer.Exit(1)
+
+    if not Path(requirements_file_path).exists():
+        judgeval_logger.error(f"Requirements file not found: {requirements_file_path}")
+        raise typer.Exit(1)
+
+    try:
+        client = JudgmentClient()
+
+        result = client.save_custom_scorer(
+            scorer_file_path=scorer_file_path,
+            requirements_file_path=requirements_file_path,
+            unique_name=unique_name,
+        )
+
+        if not result:
+            judgeval_logger.error("Failed to upload custom scorer")
+            raise typer.Exit(1)
+
+        raise typer.Exit(0)
+    except Exception:
+        raise
+
+
+@app.command()
+def version():
+    """Show version info"""
+    judgeval_logger.info("JudgEval CLI v0.0.0")
+
+
+if __name__ == "__main__":
+    app()
+
+# judgeval upload_scorer /Users/alanzhang/repo/JudgmentLabs/judgeval/src/demo/profile_match_scorer.py /Users/alanzhang/repo/JudgmentLabs/judgeval/src/demo/requirements.txt
```
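The `upload_scorer` command is a thin wrapper around `JudgmentClient.save_custom_scorer`. A rough in-Python equivalent of `judgeval upload_scorer <scorer> <requirements>`, with placeholder file names and credentials assumed to come from the environment (mirroring the CLI's `load_dotenv()` call):

```python
# Sketch of the same flow the CLI wraps; paths are placeholders and
# JUDGMENT_* credentials are assumed to be configured in the environment.
from judgeval.judgment_client import JudgmentClient

client = JudgmentClient()
ok = client.save_custom_scorer(
    scorer_file_path="my_scorer.py",
    requirements_file_path="requirements.txt",
    unique_name=None,  # CLI option: auto-detected when not provided
)
print("uploaded" if ok else "upload failed")  # the CLI exits 1 on a falsy result
```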
{judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/clients.py

```diff
@@ -2,7 +2,6 @@ import os
 from dotenv import load_dotenv
 from openai import OpenAI
 from typing import Optional
-from together import Together, AsyncTogether

 PATH_TO_DOTENV = os.path.join(os.path.dirname(__file__), ".env")
 load_dotenv(dotenv_path=PATH_TO_DOTENV)
@@ -28,6 +27,8 @@ async_together_client: Optional["AsyncTogether"] = None
 together_api_key = os.getenv("TOGETHERAI_API_KEY") or os.getenv("TOGETHER_API_KEY")
 if together_api_key:
     try:
+        from together import Together, AsyncTogether
+
         together_client = Together(api_key=together_api_key)
         async_together_client = AsyncTogether(api_key=together_api_key)
     except Exception:
```
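Moving the Together SDK import inside the `try` block makes `together` a soft dependency: it is only imported when an API key is configured, and any import or initialization failure leaves the module-level clients as `None`. The same pattern in isolation, as a generic sketch:

```python
# Generic sketch of the deferred optional-dependency pattern used above.
import os
from typing import Any, Optional

client: Optional[Any] = None
if os.getenv("TOGETHER_API_KEY"):
    try:
        from together import Together  # imported only when the feature is configured

        client = Together(api_key=os.environ["TOGETHER_API_KEY"])
    except Exception:
        client = None  # package missing or init failed: stay unconfigured
```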
{judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/common/api/api.py

```diff
@@ -20,13 +20,11 @@ from judgeval.common.api.constants import (
     JUDGMENT_EVAL_DELETE_API_URL,
     JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
     JUDGMENT_GET_EVAL_STATUS_API_URL,
-    JUDGMENT_CHECK_EXPERIMENT_TYPE_API_URL,
-    JUDGMENT_EVAL_RUN_NAME_EXISTS_API_URL,
     JUDGMENT_SCORER_SAVE_API_URL,
     JUDGMENT_SCORER_FETCH_API_URL,
     JUDGMENT_SCORER_EXISTS_API_URL,
+    JUDGMENT_CUSTOM_SCORER_UPLOAD_API_URL,
     JUDGMENT_DATASETS_APPEND_TRACES_API_URL,
-    JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL,
 )
 from judgeval.common.api.constants import (
     TraceFetchPayload,
@@ -45,16 +43,14 @@ from judgeval.common.api.constants import (
     DeleteEvalRunRequestBody,
     EvalLogPayload,
     EvalStatusPayload,
-    CheckExperimentTypePayload,
-    EvalRunNameExistsPayload,
     ScorerSavePayload,
     ScorerFetchPayload,
     ScorerExistsPayload,
-    CheckExampleKeysPayload,
+    CustomScorerUploadPayload,
+    CustomScorerTemplateResponse,
 )
 from judgeval.utils.requests import requests
-
-import orjson
+from judgeval.common.api.json_encoder import json_encoder


 class JudgmentAPIException(exceptions.HTTPError):
@@ -98,22 +94,28 @@ class JudgmentApiClient:
         method: Literal["POST", "PATCH", "GET", "DELETE"],
         url: str,
         payload: Any,
+        timeout: Optional[Union[float, tuple]] = None,
     ) -> Any:
+        # Prepare request kwargs with optional timeout
+        request_kwargs = self._request_kwargs()
+        if timeout is not None:
+            request_kwargs["timeout"] = timeout
+
         if method == "GET":
             r = requests.request(
                 method,
                 url,
                 params=payload,
                 headers=self._headers(),
-                **self._request_kwargs(),
+                **request_kwargs,
             )
         else:
             r = requests.request(
                 method,
                 url,
-                data=self._serialize(payload),
+                json=json_encoder(payload),
                 headers=self._headers(),
-                **self._request_kwargs(),
+                **request_kwargs,
             )

         try:
```
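`_do_request` now accepts an optional per-call `timeout` that overrides whatever `_request_kwargs()` supplies by default. In requests, a float caps each phase of the call with a single value, while a 2-tuple sets connect and read limits separately; the latter is what the new `upload_custom_scorer` below passes as `timeout=(10, 300)`:

```python
# How requests interprets the two timeout shapes _do_request can now forward
# (example.com is a placeholder endpoint).
import requests

requests.get("https://example.com", timeout=5.0)        # 5 s limit for connect and for read
requests.get("https://example.com", timeout=(10, 300))  # 10 s connect, 300 s read
```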
src/judgeval/common/api/api.py (continued):

```diff
@@ -187,10 +189,10 @@
         payload: EvalLogPayload = {"results": results, "run": run}
         return self._do_request("POST", JUDGMENT_EVAL_LOG_API_URL, payload)

-    def fetch_evaluation_results(self, eval_name: str, project_name: str):
+    def fetch_evaluation_results(self, experiment_run_id: str, project_name: str):
         payload: EvalRunRequestBody = {
             "project_name": project_name,
-            "eval_name": eval_name,
+            "experiment_run_id": experiment_run_id,
         }
         return self._do_request("POST", JUDGMENT_EVAL_FETCH_API_URL, payload)

@@ -205,43 +207,21 @@
     def add_to_evaluation_queue(self, payload: Dict[str, Any]):
         return self._do_request("POST", JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL, payload)

-    def get_evaluation_status(self, eval_name: str, project_name: str):
+    def get_evaluation_status(self, experiment_run_id: str, project_name: str):
         payload: EvalStatusPayload = {
-            "eval_name": eval_name,
+            "experiment_run_id": experiment_run_id,
             "project_name": project_name,
             "judgment_api_key": self.api_key,
         }
         return self._do_request("GET", JUDGMENT_GET_EVAL_STATUS_API_URL, payload)

-    def check_experiment_type(self, project_name: str, is_trace: bool):
-
-        payload: CheckExperimentTypePayload = {
-            "project_name": project_name,
-            "judgment_api_key": self.api_key,
-            "is_trace": is_trace,
-        }
-        return self._do_request("POST", JUDGMENT_CHECK_EXPERIMENT_TYPE_API_URL, payload)
-
-    def check_eval_run_name_exists(self, eval_name: str, project_name: str):
-        payload: EvalRunNameExistsPayload = {
-            "eval_name": eval_name,
-            "project_name": project_name,
-            "judgment_api_key": self.api_key,
-        }
-        return self._do_request("POST", JUDGMENT_EVAL_RUN_NAME_EXISTS_API_URL, payload)
-
-    def check_example_keys(self, keys: List[str], eval_name: str, project_name: str):
-        payload: CheckExampleKeysPayload = {
-            "keys": keys,
-            "eval_name": eval_name,
-            "project_name": project_name,
-        }
-        return self._do_request("POST", JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL, payload)
-
-    def save_scorer(self, name: str, prompt: str, options: Optional[dict] = None):
+    def save_scorer(
+        self, name: str, prompt: str, threshold: float, options: Optional[dict] = None
+    ):
         payload: ScorerSavePayload = {
             "name": name,
             "prompt": prompt,
+            "threshold": threshold,
             "options": options,
         }
         try:
```
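`save_scorer` now requires a `threshold`, forwarded as a new field on `ScorerSavePayload`. A hedged call sketch; client construction is elided because `JudgmentApiClient.__init__` is not part of this diff, and all field values are placeholders:

```python
from typing import Optional

from judgeval.common.api.api import JudgmentApiClient


def register_scorer(api: JudgmentApiClient, options: Optional[dict] = None):
    # `api` is an already-authenticated client; construction is not shown in this diff.
    return api.save_scorer(
        name="helpfulness",                          # placeholder
        prompt="Rate how helpful the response is.",  # placeholder
        threshold=0.7,                               # newly required field
        options=options,
    )
```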
src/judgeval/common/api/api.py (continued):

```diff
@@ -293,6 +273,31 @@
                 request=e.request,
             )

+    def upload_custom_scorer(
+        self,
+        scorer_name: str,
+        scorer_code: str,
+        requirements_text: str,
+    ) -> CustomScorerTemplateResponse:
+        """Upload custom scorer to backend"""
+        payload: CustomScorerUploadPayload = {
+            "scorer_name": scorer_name,
+            "scorer_code": scorer_code,
+            "requirements_text": requirements_text,
+        }
+
+        try:
+            # Use longer timeout for custom scorer upload (5 minutes)
+            response = self._do_request(
+                "POST",
+                JUDGMENT_CUSTOM_SCORER_UPLOAD_API_URL,
+                payload,
+                timeout=(10, 300),
+            )
+            return response
+        except JudgmentAPIException as e:
+            raise e
+
     def push_dataset(
         self,
         dataset_alias: str,
@@ -368,16 +373,3 @@
             "verify": True,
             "timeout": 30,
         }
-
-    def _serialize(self, data: Any) -> str:
-        def fallback_encoder(obj):
-            try:
-                return repr(obj)
-            except Exception:
-                try:
-                    return str(obj)
-                except Exception as e:
-                    return f"<Unserializable object of type {type(obj).__name__}: {e}>"
-
-        # orjson returns bytes, so we need to decode to str
-        return orjson.dumps(data, default=fallback_encoder).decode("utf-8")
```
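With `_serialize` removed, the client no longer hand-rolls the request body: payloads are normalized by the new `judgeval.common.api.json_encoder.json_encoder` helper (241 lines, not shown in this diff) and handed to requests' `json=` parameter, which performs the final JSON serialization and sets the `Content-Type: application/json` header itself. The transport-level difference, sketched against a placeholder endpoint:

```python
# Placeholder endpoint; contrasts the new and old body-encoding styles.
import json

import requests

payload = {"project_name": "demo"}

# New style: requests serializes the (already JSON-safe) object itself.
requests.post("https://api.example.com/endpoint", json=payload)

# Old style: the caller serializes, then sends the string with an explicit header.
requests.post(
    "https://api.example.com/endpoint",
    data=json.dumps(payload),
    headers={"Content-Type": "application/json"},
)
```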
{judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/common/api/constants.py

```diff
@@ -49,9 +49,9 @@ JUDGMENT_EVAL_DELETE_API_URL = (
 JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
 JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
 JUDGMENT_GET_EVAL_STATUS_API_URL = f"{ROOT_API}/get_evaluation_status/"
-
-
-
+
+# Custom Scorers API
+JUDGMENT_CUSTOM_SCORER_UPLOAD_API_URL = f"{ROOT_API}/build_sandbox_template/"


 # Evaluation API Payloads
@@ -73,9 +73,9 @@ class EvalLogPayload(TypedDict):


 class EvalStatusPayload(TypedDict):
-    eval_name: str
-    project_name: str
+    experiment_run_id: str
     judgment_api_key: str
+    project_name: str


 class CheckExperimentTypePayload(TypedDict):
@@ -162,6 +162,7 @@ JUDGMENT_SCORER_EXISTS_API_URL = f"{ROOT_API}/scorer_exists/"
 class ScorerSavePayload(TypedDict):
     name: str
     prompt: str
+    threshold: float
     options: Optional[dict]


@@ -171,3 +172,15 @@ class ScorerFetchPayload(TypedDict):

 class ScorerExistsPayload(TypedDict):
     name: str
+
+
+class CustomScorerUploadPayload(TypedDict):
+    scorer_name: str
+    scorer_code: str
+    requirements_text: str
+
+
+class CustomScorerTemplateResponse(TypedDict):
+    scorer_name: str
+    status: str
+    message: str
```
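The new payload/response shapes are plain TypedDicts, i.e. ordinary dicts with static-typing annotations only. A construction sketch with placeholder values; real `status`/`message` values come from the backend and are not specified anywhere in this diff:

```python
# TypedDicts are just dicts at runtime; this only documents the shapes above.
from judgeval.common.api.constants import (
    CustomScorerTemplateResponse,
    CustomScorerUploadPayload,
)

payload: CustomScorerUploadPayload = {
    "scorer_name": "my_scorer",            # placeholder
    "scorer_code": "class MyScorer: ...",  # placeholder source text
    "requirements_text": "requests\n",     # placeholder requirements.txt body
}

# Shape of what upload_custom_scorer returns; the values here are invented.
response: CustomScorerTemplateResponse = {
    "scorer_name": "my_scorer",
    "status": "ok",
    "message": "template built",
}
```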