judgeval-0.4.0.tar.gz → judgeval-0.6.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123)
  1. {judgeval-0.4.0 → judgeval-0.6.0}/.github/pull_request_template.md +1 -8
  2. {judgeval-0.4.0 → judgeval-0.6.0}/PKG-INFO +11 -12
  3. {judgeval-0.4.0 → judgeval-0.6.0}/pyproject.toml +33 -32
  4. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/__init__.py +2 -0
  5. judgeval-0.6.0/src/judgeval/cli.py +65 -0
  6. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/clients.py +2 -1
  7. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/common/api/api.py +46 -54
  8. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/common/api/constants.py +18 -5
  9. judgeval-0.6.0/src/judgeval/common/api/json_encoder.py +241 -0
  10. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/common/tracer/core.py +772 -467
  11. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/common/tracer/otel_span_processor.py +1 -1
  12. judgeval-0.6.0/src/judgeval/common/tracer/providers.py +119 -0
  13. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/common/tracer/span_processor.py +1 -1
  14. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/common/tracer/span_transformer.py +16 -26
  15. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/constants.py +1 -0
  16. judgeval-0.6.0/src/judgeval/data/evaluation_run.py +104 -0
  17. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/data/judgment_types.py +38 -8
  18. judgeval-0.6.0/src/judgeval/data/trace.py +83 -0
  19. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/data/trace_run.py +2 -3
  20. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/dataset.py +2 -0
  21. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/integrations/langgraph.py +2 -1
  22. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/judges/litellm_judge.py +2 -1
  23. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/judges/mixture_of_judges.py +2 -1
  24. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/judges/utils.py +2 -1
  25. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/judgment_client.py +113 -53
  26. judgeval-0.6.0/src/judgeval/local_eval_queue.py +190 -0
  27. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/run_evaluation.py +43 -197
  28. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/scorers/base_scorer.py +9 -10
  29. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +17 -3
  30. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/scorers/score.py +33 -11
  31. judgeval-0.6.0/src/judgeval/utils/async_utils.py +36 -0
  32. {judgeval-0.4.0 → judgeval-0.6.0}/uv.lock +54 -530
  33. judgeval-0.4.0/src/judgeval/data/trace.py +0 -199
  34. judgeval-0.4.0/src/judgeval/evaluation_run.py +0 -76
  35. {judgeval-0.4.0 → judgeval-0.6.0}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  36. {judgeval-0.4.0 → judgeval-0.6.0}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  37. {judgeval-0.4.0 → judgeval-0.6.0}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  38. {judgeval-0.4.0 → judgeval-0.6.0}/.github/workflows/blocked-pr.yaml +0 -0
  39. {judgeval-0.4.0 → judgeval-0.6.0}/.github/workflows/ci.yaml +0 -0
  40. {judgeval-0.4.0 → judgeval-0.6.0}/.github/workflows/lint.yaml +0 -0
  41. {judgeval-0.4.0 → judgeval-0.6.0}/.github/workflows/merge-branch-check.yaml +0 -0
  42. {judgeval-0.4.0 → judgeval-0.6.0}/.github/workflows/mypy.yaml +0 -0
  43. {judgeval-0.4.0 → judgeval-0.6.0}/.github/workflows/pre-commit-autoupdate.yaml +0 -0
  44. {judgeval-0.4.0 → judgeval-0.6.0}/.github/workflows/release.yaml +0 -0
  45. {judgeval-0.4.0 → judgeval-0.6.0}/.github/workflows/validate-branch.yaml +0 -0
  46. {judgeval-0.4.0 → judgeval-0.6.0}/.gitignore +0 -0
  47. {judgeval-0.4.0 → judgeval-0.6.0}/.pre-commit-config.yaml +0 -0
  48. {judgeval-0.4.0 → judgeval-0.6.0}/LICENSE.md +0 -0
  49. {judgeval-0.4.0 → judgeval-0.6.0}/README.md +0 -0
  50. {judgeval-0.4.0 → judgeval-0.6.0}/assets/Screenshot 2025-05-17 at 8.14.27 PM.png +0 -0
  51. {judgeval-0.4.0 → judgeval-0.6.0}/assets/agent.gif +0 -0
  52. {judgeval-0.4.0 → judgeval-0.6.0}/assets/agent_trace_example.png +0 -0
  53. {judgeval-0.4.0 → judgeval-0.6.0}/assets/data.gif +0 -0
  54. {judgeval-0.4.0 → judgeval-0.6.0}/assets/dataset_clustering_screenshot.png +0 -0
  55. {judgeval-0.4.0 → judgeval-0.6.0}/assets/dataset_clustering_screenshot_dm.png +0 -0
  56. {judgeval-0.4.0 → judgeval-0.6.0}/assets/datasets_preview_screenshot.png +0 -0
  57. {judgeval-0.4.0 → judgeval-0.6.0}/assets/document.gif +0 -0
  58. {judgeval-0.4.0 → judgeval-0.6.0}/assets/error_analysis_dashboard.png +0 -0
  59. {judgeval-0.4.0 → judgeval-0.6.0}/assets/errors.png +0 -0
  60. {judgeval-0.4.0 → judgeval-0.6.0}/assets/experiments_dashboard_screenshot.png +0 -0
  61. {judgeval-0.4.0 → judgeval-0.6.0}/assets/experiments_page.png +0 -0
  62. {judgeval-0.4.0 → judgeval-0.6.0}/assets/experiments_pagev2.png +0 -0
  63. {judgeval-0.4.0 → judgeval-0.6.0}/assets/logo-dark.svg +0 -0
  64. {judgeval-0.4.0 → judgeval-0.6.0}/assets/logo-light.svg +0 -0
  65. {judgeval-0.4.0 → judgeval-0.6.0}/assets/monitoring_screenshot.png +0 -0
  66. {judgeval-0.4.0 → judgeval-0.6.0}/assets/new_darkmode.svg +0 -0
  67. {judgeval-0.4.0 → judgeval-0.6.0}/assets/new_lightmode.svg +0 -0
  68. {judgeval-0.4.0 → judgeval-0.6.0}/assets/online_eval.png +0 -0
  69. {judgeval-0.4.0 → judgeval-0.6.0}/assets/product_shot.png +0 -0
  70. {judgeval-0.4.0 → judgeval-0.6.0}/assets/test.png +0 -0
  71. {judgeval-0.4.0 → judgeval-0.6.0}/assets/tests.png +0 -0
  72. {judgeval-0.4.0 → judgeval-0.6.0}/assets/trace.gif +0 -0
  73. {judgeval-0.4.0 → judgeval-0.6.0}/assets/trace_demo.png +0 -0
  74. {judgeval-0.4.0 → judgeval-0.6.0}/assets/trace_screenshot.png +0 -0
  75. {judgeval-0.4.0 → judgeval-0.6.0}/assets/trace_screenshot_old.png +0 -0
  76. {judgeval-0.4.0 → judgeval-0.6.0}/pytest.ini +0 -0
  77. {judgeval-0.4.0 → judgeval-0.6.0}/src/.coveragerc +0 -0
  78. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/common/__init__.py +0 -0
  79. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/common/api/__init__.py +0 -0
  80. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/common/exceptions.py +0 -0
  81. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/common/logger.py +0 -0
  82. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/common/storage/__init__.py +0 -0
  83. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/common/storage/s3_storage.py +0 -0
  84. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/common/tracer/__init__.py +0 -0
  85. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/common/tracer/constants.py +0 -0
  86. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/common/tracer/otel_exporter.py +0 -0
  87. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/common/tracer/trace_manager.py +0 -0
  88. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/common/utils.py +0 -0
  89. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/data/__init__.py +0 -0
  90. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/data/example.py +0 -0
  91. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/data/result.py +0 -0
  92. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/data/scorer_data.py +0 -0
  93. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/data/scripts/fix_default_factory.py +0 -0
  94. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/data/scripts/openapi_transform.py +0 -0
  95. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/data/tool.py +0 -0
  96. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/judges/__init__.py +0 -0
  97. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/judges/base_judge.py +0 -0
  98. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/judges/together_judge.py +0 -0
  99. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/rules.py +0 -0
  100. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/scorers/__init__.py +0 -0
  101. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/scorers/agent_scorer.py +0 -0
  102. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/scorers/api_scorer.py +0 -0
  103. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/scorers/example_scorer.py +0 -0
  104. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/scorers/exceptions.py +0 -0
  105. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
  106. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
  107. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -0
  108. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +0 -0
  109. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -0
  110. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -0
  111. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -0
  112. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -0
  113. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +0 -0
  114. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -0
  115. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -0
  116. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/scorers/utils.py +0 -0
  117. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/tracer/__init__.py +0 -0
  118. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/utils/alerts.py +0 -0
  119. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/utils/file_utils.py +0 -0
  120. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/utils/requests.py +0 -0
  121. {judgeval-0.4.0 → judgeval-0.6.0}/src/judgeval/version_check.py +0 -0
  122. {judgeval-0.4.0 → judgeval-0.6.0}/src/update_types.sh +0 -0
  123. {judgeval-0.4.0 → judgeval-0.6.0}/update_version.py +0 -0
--- judgeval-0.4.0/.github/pull_request_template.md
+++ judgeval-0.6.0/.github/pull_request_template.md
@@ -10,14 +10,7 @@
 -->
 - [ ] 1. ...
 
-## 🎥 Demo of Changes
-
-<!-- Add a short 1-3 minute video describing/demoing the changes -->
-
 ## ✅ Checklist
 
-- [ ] Tagged Linear ticket in PR title. Ie. PR Title (JUD-XXXX)
-- [ ] Video demo of changes
-- [ ] Reviewers assigned
 - [ ] Docs updated ([if necessary](https://github.com/JudgmentLabs/docs))
-- [ ] Cookbooks updated ([if necessary](https://github.com/JudgmentLabs/judgment-cookbook))
+- [ ] Changelogs are updated ([if necessary](https://github.com/JudgmentLabs/docs/tree/main/content/docs/changelog/%28weekly%29))
--- judgeval-0.4.0/PKG-INFO
+++ judgeval-0.6.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.4.0
+Version: 0.6.0
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -10,27 +10,26 @@ License-File: LICENSE.md
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.11
-Requires-Dist: anthropic
 Requires-Dist: boto3
-Requires-Dist: datamodel-code-generator>=0.31.1
-Requires-Dist: google-genai
-Requires-Dist: groq>=0.30.0
+Requires-Dist: click<8.2.0
 Requires-Dist: langchain-anthropic
 Requires-Dist: langchain-core
 Requires-Dist: langchain-huggingface
 Requires-Dist: langchain-openai
 Requires-Dist: litellm>=1.61.15
-Requires-Dist: matplotlib>=3.10.3
-Requires-Dist: nest-asyncio
-Requires-Dist: openai
+Requires-Dist: nest-asyncio>=1.6.0
 Requires-Dist: opentelemetry-api>=1.34.1
 Requires-Dist: opentelemetry-sdk>=1.34.1
 Requires-Dist: orjson>=3.9.0
-Requires-Dist: pandas
-Requires-Dist: python-dotenv==1.0.1
-Requires-Dist: python-slugify>=8.0.4
+Requires-Dist: python-dotenv
 Requires-Dist: requests
-Requires-Dist: together
+Requires-Dist: rich
+Requires-Dist: typer>=0.9.0
+Provides-Extra: langchain
+Requires-Dist: langchain-anthropic; extra == 'langchain'
+Requires-Dist: langchain-core; extra == 'langchain'
+Requires-Dist: langchain-huggingface; extra == 'langchain'
+Requires-Dist: langchain-openai; extra == 'langchain'
 Description-Content-Type: text/markdown
 
 <div align="center">
--- judgeval-0.4.0/pyproject.toml
+++ judgeval-0.6.0/pyproject.toml
@@ -1,10 +1,10 @@
 [project]
 name = "judgeval"
-version = "0.4.0"
+version = "0.6.0"
 authors = [
-  { name="Andrew Li", email="andrew@judgmentlabs.ai" },
-  { name="Alex Shan", email="alex@judgmentlabs.ai" },
-  { name="Joseph Camyre", email="joseph@judgmentlabs.ai" },
+  { name = "Andrew Li", email = "andrew@judgmentlabs.ai" },
+  { name = "Alex Shan", email = "alex@judgmentlabs.ai" },
+  { name = "Joseph Camyre", email = "joseph@judgmentlabs.ai" },
 ]
 description = "Judgeval Package"
 readme = "README.md"
@@ -16,54 +16,54 @@ classifiers = [
 license = "Apache-2.0"
 license-files = ["LICENSE.md"]
 dependencies = [
+    "rich",
     "litellm>=1.61.15",
-    "python-dotenv==1.0.1",
+    "python-dotenv",
     "requests",
-    "pandas",
-    "openai",
-    "together",
-    "anthropic",
-    "nest-asyncio",
-    "langchain-huggingface",
-    "langchain-openai",
-    "langchain-anthropic",
-    "langchain-core",
-    "google-genai",
     "boto3",
-    "matplotlib>=3.10.3",
-    "python-slugify>=8.0.4",
-    "datamodel-code-generator>=0.31.1",
-    "groq>=0.30.0",
     "opentelemetry-api>=1.34.1",
     "opentelemetry-sdk>=1.34.1",
     "orjson>=3.9.0",
+    "nest-asyncio>=1.6.0",
+    "langchain-huggingface",
+    "langchain-openai",
+    "langchain-anthropic",
+    "langchain-core",
+    "click<8.2.0",
+    "typer>=0.9.0",
 ]
 
 [project.urls]
 Homepage = "https://github.com/JudgmentLabs/judgeval"
 Issues = "https://github.com/JudgmentLabs/judgeval/issues"
 
+[project.scripts]
+judgeval = "judgeval.cli:app"
+
 [build-system]
 requires = ["hatchling"]
 build-backend = "hatchling.build"
 
 [tool.hatch.build.targets.wheel]
 packages = ["src/judgeval"]
-include = [
-    "/src/judgeval",
-    "/src/judgeval/**/*.py",
+include = ["/src/judgeval", "/src/judgeval/**/*.py"]
+
+[project.optional-dependencies]
+langchain = [
+    "langchain-huggingface",
+    "langchain-openai",
+    "langchain-anthropic",
+    "langchain-core",
 ]
 
 [dependency-groups]
 dev = [
     "chromadb>=1.0.12",
-    "langchain-community>=0.3.24",
     "pytest>=8.4.0",
     "pytest-asyncio>=1.0.0",
     "pytest-cov>=6.1.1",
     "pytest-mock>=3.14.1",
     "tavily-python>=0.7.5",
-    "langgraph>=0.4.3",
     "pre-commit>=4.2.0",
     "types-requests>=2.32.4.20250611",
     "mypy>=1.17.0",
@@ -90,18 +90,19 @@ dev = [
     "types-tqdm>=4.67.0.20250516",
     "types-tree-sitter-languages>=1.10.0.20250530",
     "types-xmltodict>=0.14.0.20241009",
+    "datamodel-code-generator>=0.31.2",
+    "openai",
+    "together",
+    "anthropic",
+    "google-genai",
+    "groq",
+    "langgraph>=0.4.3",
 ]
 
 [tool.hatch.build]
 directory = "dist"
-artifacts = [
-    "src/judgeval/**/*.py",
-]
-exclude = [
-    "src/e2etests/*",
-    "src/tests/*",
-    "src/demo/*"
-]
+artifacts = ["src/judgeval/**/*.py"]
+exclude = ["src/e2etests/*", "src/tests/*", "src/demo/*"]
 
 [tool.ruff]
 exclude = ["docs"]
--- judgeval-0.4.0/src/judgeval/__init__.py
+++ judgeval-0.6.0/src/judgeval/__init__.py
@@ -2,6 +2,7 @@
 from judgeval.clients import client, together_client
 from judgeval.judgment_client import JudgmentClient
 from judgeval.version_check import check_latest_version
+from judgeval.local_eval_queue import LocalEvaluationQueue
 
 check_latest_version()
 
@@ -10,4 +11,5 @@ __all__ = [
     "client",
     "together_client",
     "JudgmentClient",
+    "LocalEvaluationQueue",
 ]
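
Note: LocalEvaluationQueue (new in src/judgeval/local_eval_queue.py, file 26 in the list above) is now part of the package's public surface. Its methods are not shown in this section, so only the import can be illustrated:

    # The new top-level export; the class's API lives in
    # local_eval_queue.py and is not visible in this diff section.
    from judgeval import LocalEvaluationQueue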
--- /dev/null
+++ judgeval-0.6.0/src/judgeval/cli.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+
+import typer
+from pathlib import Path
+from dotenv import load_dotenv
+from judgeval.common.logger import judgeval_logger
+from judgeval.judgment_client import JudgmentClient
+
+load_dotenv()
+
+app = typer.Typer(
+    no_args_is_help=True,
+    rich_markup_mode=None,
+    rich_help_panel=None,
+    pretty_exceptions_enable=False,
+    pretty_exceptions_show_locals=False,
+    pretty_exceptions_short=False,
+)
+
+
+@app.command("upload_scorer")
+def upload_scorer(
+    scorer_file_path: str,
+    requirements_file_path: str,
+    unique_name: str = typer.Option(
+        None, help="Custom name for the scorer (auto-detected if not provided)"
+    ),
+):
+    # Validate file paths
+    if not Path(scorer_file_path).exists():
+        judgeval_logger.error(f"Scorer file not found: {scorer_file_path}")
+        raise typer.Exit(1)
+
+    if not Path(requirements_file_path).exists():
+        judgeval_logger.error(f"Requirements file not found: {requirements_file_path}")
+        raise typer.Exit(1)
+
+    try:
+        client = JudgmentClient()
+
+        result = client.save_custom_scorer(
+            scorer_file_path=scorer_file_path,
+            requirements_file_path=requirements_file_path,
+            unique_name=unique_name,
+        )
+
+        if not result:
+            judgeval_logger.error("Failed to upload custom scorer")
+            raise typer.Exit(1)
+
+        raise typer.Exit(0)
+    except Exception:
+        raise
+
+
+@app.command()
+def version():
+    """Show version info"""
+    judgeval_logger.info("JudgEval CLI v0.0.0")
+
+
+if __name__ == "__main__":
+    app()
+
+# judgeval upload_scorer /Users/alanzhang/repo/JudgmentLabs/judgeval/src/demo/profile_match_scorer.py /Users/alanzhang/repo/JudgmentLabs/judgeval/src/demo/requirements.txt
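
Note: together with the [project.scripts] entry added in pyproject.toml, this new module gives the package a `judgeval` console command with `upload_scorer` and `version` subcommands. A minimal in-process smoke test of the Typer app, as a sketch (the `version` command writes through judgeval_logger rather than stdout, so the exit code is the reliable signal; `upload_scorer` would additionally need Judgment credentials and real file paths):

    # Sketch: exercise the new CLI via Typer's test runner (typer.testing
    # ships with Typer). Assumes judgeval 0.6.0 is installed.
    from typer.testing import CliRunner
    from judgeval.cli import app

    runner = CliRunner()
    result = runner.invoke(app, ["version"])
    assert result.exit_code == 0  # output goes to the logger, not stdout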
--- judgeval-0.4.0/src/judgeval/clients.py
+++ judgeval-0.6.0/src/judgeval/clients.py
@@ -2,7 +2,6 @@ import os
 from dotenv import load_dotenv
 from openai import OpenAI
 from typing import Optional
-from together import Together, AsyncTogether
 
 PATH_TO_DOTENV = os.path.join(os.path.dirname(__file__), ".env")
 load_dotenv(dotenv_path=PATH_TO_DOTENV)
@@ -28,6 +27,8 @@ async_together_client: Optional["AsyncTogether"] = None
 together_api_key = os.getenv("TOGETHERAI_API_KEY") or os.getenv("TOGETHER_API_KEY")
 if together_api_key:
     try:
+        from together import Together, AsyncTogether
+
         together_client = Together(api_key=together_api_key)
         async_together_client = AsyncTogether(api_key=together_api_key)
     except Exception:
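
Note: the two clients.py hunks move the `together` import from module scope into the guarded try block, so the SDK is only imported once an API key is actually configured; this is what lets `together` leave the required dependencies. The pattern in isolation, as a standalone sketch:

    # Standalone sketch of the lazy optional-import pattern adopted above:
    # a missing `together` package no longer breaks `import judgeval`.
    import os
    from typing import Any, Optional

    together_client: Optional[Any] = None

    api_key = os.getenv("TOGETHERAI_API_KEY") or os.getenv("TOGETHER_API_KEY")
    if api_key:
        try:
            from together import Together  # deferred: package may be absent

            together_client = Together(api_key=api_key)
        except Exception:
            together_client = None  # missing package or init failure: degrade quietly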
--- judgeval-0.4.0/src/judgeval/common/api/api.py
+++ judgeval-0.6.0/src/judgeval/common/api/api.py
@@ -20,13 +20,11 @@ from judgeval.common.api.constants import (
     JUDGMENT_EVAL_DELETE_API_URL,
     JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL,
     JUDGMENT_GET_EVAL_STATUS_API_URL,
-    JUDGMENT_CHECK_EXPERIMENT_TYPE_API_URL,
-    JUDGMENT_EVAL_RUN_NAME_EXISTS_API_URL,
     JUDGMENT_SCORER_SAVE_API_URL,
     JUDGMENT_SCORER_FETCH_API_URL,
     JUDGMENT_SCORER_EXISTS_API_URL,
+    JUDGMENT_CUSTOM_SCORER_UPLOAD_API_URL,
     JUDGMENT_DATASETS_APPEND_TRACES_API_URL,
-    JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL,
 )
 from judgeval.common.api.constants import (
     TraceFetchPayload,
@@ -45,16 +43,14 @@ from judgeval.common.api.constants import (
     DeleteEvalRunRequestBody,
     EvalLogPayload,
     EvalStatusPayload,
-    CheckExperimentTypePayload,
-    EvalRunNameExistsPayload,
     ScorerSavePayload,
     ScorerFetchPayload,
     ScorerExistsPayload,
-    CheckExampleKeysPayload,
+    CustomScorerUploadPayload,
+    CustomScorerTemplateResponse,
 )
 from judgeval.utils.requests import requests
-
-import orjson
+from judgeval.common.api.json_encoder import json_encoder
 
 
 class JudgmentAPIException(exceptions.HTTPError):
@@ -98,22 +94,28 @@ class JudgmentApiClient:
         method: Literal["POST", "PATCH", "GET", "DELETE"],
         url: str,
         payload: Any,
+        timeout: Optional[Union[float, tuple]] = None,
     ) -> Any:
+        # Prepare request kwargs with optional timeout
+        request_kwargs = self._request_kwargs()
+        if timeout is not None:
+            request_kwargs["timeout"] = timeout
+
         if method == "GET":
             r = requests.request(
                 method,
                 url,
                 params=payload,
                 headers=self._headers(),
-                **self._request_kwargs(),
+                **request_kwargs,
             )
         else:
             r = requests.request(
                 method,
                 url,
-                data=self._serialize(payload),
+                json=json_encoder(payload),
                 headers=self._headers(),
-                **self._request_kwargs(),
+                **request_kwargs,
             )
 
         try:
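
Note: the reworked _do_request now serializes non-GET payloads with the new json_encoder and forwards an optional timeout straight into requests.request(), which accepts either a single float for both phases or a (connect, read) tuple; the (10, 300) value used by upload_custom_scorer below means up to 10 s to establish the connection and up to 300 s to read the response. The timeout semantics, illustrated against plain requests:

    # Sketch of the requests timeout convention relied on above
    # (URL and body are placeholders).
    import requests

    resp = requests.request(
        "POST",
        "https://example.com/endpoint",
        json={"ping": "pong"},
        timeout=(10, 300),  # (connect timeout, read timeout) in seconds
    )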
@@ -187,10 +189,10 @@ class JudgmentApiClient:
         payload: EvalLogPayload = {"results": results, "run": run}
         return self._do_request("POST", JUDGMENT_EVAL_LOG_API_URL, payload)
 
-    def fetch_evaluation_results(self, project_name: str, eval_name: str):
+    def fetch_evaluation_results(self, experiment_run_id: str, project_name: str):
         payload: EvalRunRequestBody = {
             "project_name": project_name,
-            "eval_name": eval_name,
+            "experiment_run_id": experiment_run_id,
         }
         return self._do_request("POST", JUDGMENT_EVAL_FETCH_API_URL, payload)
 
@@ -205,43 +207,21 @@ class JudgmentApiClient:
     def add_to_evaluation_queue(self, payload: Dict[str, Any]):
         return self._do_request("POST", JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL, payload)
 
-    def get_evaluation_status(self, eval_name: str, project_name: str):
+    def get_evaluation_status(self, experiment_run_id: str, project_name: str):
         payload: EvalStatusPayload = {
-            "eval_name": eval_name,
+            "experiment_run_id": experiment_run_id,
             "project_name": project_name,
             "judgment_api_key": self.api_key,
         }
         return self._do_request("GET", JUDGMENT_GET_EVAL_STATUS_API_URL, payload)
 
-    def check_experiment_type(self, eval_name: str, project_name: str, is_trace: bool):
-        payload: CheckExperimentTypePayload = {
-            "eval_name": eval_name,
-            "project_name": project_name,
-            "judgment_api_key": self.api_key,
-            "is_trace": is_trace,
-        }
-        return self._do_request("POST", JUDGMENT_CHECK_EXPERIMENT_TYPE_API_URL, payload)
-
-    def check_eval_run_name_exists(self, eval_name: str, project_name: str):
-        payload: EvalRunNameExistsPayload = {
-            "eval_name": eval_name,
-            "project_name": project_name,
-            "judgment_api_key": self.api_key,
-        }
-        return self._do_request("POST", JUDGMENT_EVAL_RUN_NAME_EXISTS_API_URL, payload)
-
-    def check_example_keys(self, keys: List[str], eval_name: str, project_name: str):
-        payload: CheckExampleKeysPayload = {
-            "keys": keys,
-            "eval_name": eval_name,
-            "project_name": project_name,
-        }
-        return self._do_request("POST", JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL, payload)
-
-    def save_scorer(self, name: str, prompt: str, options: Optional[dict] = None):
+    def save_scorer(
+        self, name: str, prompt: str, threshold: float, options: Optional[dict] = None
+    ):
         payload: ScorerSavePayload = {
             "name": name,
             "prompt": prompt,
+            "threshold": threshold,
             "options": options,
         }
         try:
@@ -293,6 +273,31 @@ class JudgmentApiClient:
                 request=e.request,
             )
 
+    def upload_custom_scorer(
+        self,
+        scorer_name: str,
+        scorer_code: str,
+        requirements_text: str,
+    ) -> CustomScorerTemplateResponse:
+        """Upload custom scorer to backend"""
+        payload: CustomScorerUploadPayload = {
+            "scorer_name": scorer_name,
+            "scorer_code": scorer_code,
+            "requirements_text": requirements_text,
+        }
+
+        try:
+            # Use longer timeout for custom scorer upload (5 minutes)
+            response = self._do_request(
+                "POST",
+                JUDGMENT_CUSTOM_SCORER_UPLOAD_API_URL,
+                payload,
+                timeout=(10, 300),
+            )
+            return response
+        except JudgmentAPIException as e:
+            raise e
+
     def push_dataset(
         self,
         dataset_alias: str,
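
Note: a hedged sketch of calling the new wrapper directly; in practice it is reached through `judgeval upload_scorer` via JudgmentClient.save_custom_scorer. The method signature and the CustomScorerTemplateResponse keys come from this diff, while the client construction and file paths are placeholders:

    # Sketch: direct use of upload_custom_scorer (constructor arguments
    # are hypothetical; see api.py for the real ones).
    from judgeval.common.api.api import JudgmentApiClient

    api = JudgmentApiClient(api_key="...", organization_id="...")  # hypothetical args
    response = api.upload_custom_scorer(
        scorer_name="profile_match",
        scorer_code=open("my_scorer.py").read(),
        requirements_text=open("requirements.txt").read(),
    )
    print(response["scorer_name"], response["status"], response["message"])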
@@ -368,16 +373,3 @@ class JudgmentApiClient:
             "verify": True,
             "timeout": 30,
         }
-
-    def _serialize(self, data: Any) -> str:
-        def fallback_encoder(obj):
-            try:
-                return repr(obj)
-            except Exception:
-                try:
-                    return str(obj)
-                except Exception as e:
-                    return f"<Unserializable object of type {type(obj).__name__}: {e}>"
-
-        # orjson returns bytes, so we need to decode to str
-        return orjson.dumps(data, default=fallback_encoder).decode("utf-8")
--- judgeval-0.4.0/src/judgeval/common/api/constants.py
+++ judgeval-0.6.0/src/judgeval/common/api/constants.py
@@ -49,9 +49,9 @@ JUDGMENT_EVAL_DELETE_API_URL = (
 JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
 JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
 JUDGMENT_GET_EVAL_STATUS_API_URL = f"{ROOT_API}/get_evaluation_status/"
-JUDGMENT_CHECK_EXPERIMENT_TYPE_API_URL = f"{ROOT_API}/check_experiment_type/"
-JUDGMENT_EVAL_RUN_NAME_EXISTS_API_URL = f"{ROOT_API}/eval-run-name-exists/"
-JUDGMENT_CHECK_EXAMPLE_KEYS_API_URL = f"{ROOT_API}/check_example_keys/"
+
+# Custom Scorers API
+JUDGMENT_CUSTOM_SCORER_UPLOAD_API_URL = f"{ROOT_API}/build_sandbox_template/"
 
 
 # Evaluation API Payloads
@@ -73,9 +73,9 @@ class EvalLogPayload(TypedDict):
 
 
 class EvalStatusPayload(TypedDict):
-    eval_name: str
-    project_name: str
+    experiment_run_id: str
     judgment_api_key: str
+    project_name: str
 
 
 class CheckExperimentTypePayload(TypedDict):
@@ -162,6 +162,7 @@ JUDGMENT_SCORER_EXISTS_API_URL = f"{ROOT_API}/scorer_exists/"
 class ScorerSavePayload(TypedDict):
     name: str
     prompt: str
+    threshold: float
     options: Optional[dict]
 
 
@@ -171,3 +172,15 @@ class ScorerFetchPayload(TypedDict):
 
 class ScorerExistsPayload(TypedDict):
     name: str
+
+
+class CustomScorerUploadPayload(TypedDict):
+    scorer_name: str
+    scorer_code: str
+    requirements_text: str
+
+
+class CustomScorerTemplateResponse(TypedDict):
+    scorer_name: str
+    status: str
+    message: str
+ message: str