judgeval 0.10.1__tar.gz → 0.11.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124)
  1. {judgeval-0.10.1 → judgeval-0.11.0}/.github/workflows/ci.yaml +2 -0
  2. judgeval-0.11.0/.pre-commit-config.yaml +23 -0
  3. {judgeval-0.10.1 → judgeval-0.11.0}/PKG-INFO +1 -1
  4. {judgeval-0.10.1 → judgeval-0.11.0}/pyproject.toml +1 -1
  5. {judgeval-0.10.1 → judgeval-0.11.0}/scripts/api_generator.py +2 -1
  6. {judgeval-0.10.1 → judgeval-0.11.0}/scripts/openapi_transform.py +2 -1
  7. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/__init__.py +4 -4
  8. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/api/__init__.py +17 -9
  9. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/api/api_types.py +20 -18
  10. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/data/evaluation_run.py +10 -11
  11. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/data/judgment_types.py +25 -14
  12. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/data/result.py +1 -0
  13. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/data/scorer_data.py +1 -26
  14. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/dataset/__init__.py +17 -16
  15. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/env.py +11 -2
  16. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/evaluation/__init__.py +20 -63
  17. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/integrations/langgraph/__init__.py +2 -1
  18. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/scorers/__init__.py +0 -4
  19. judgeval-0.11.0/src/judgeval/scorers/agent_scorer.py +17 -0
  20. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/scorers/api_scorer.py +0 -8
  21. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/scorers/base_scorer.py +2 -2
  22. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -2
  23. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
  24. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -2
  25. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -2
  26. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +3 -5
  27. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/scorers/score.py +1 -1
  28. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/tracer/__init__.py +7 -10
  29. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/tracer/local_eval_queue.py +11 -7
  30. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/tracer/utils.py +2 -2
  31. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/trainer/config.py +1 -1
  32. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/trainer/trainable_model.py +1 -1
  33. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/trainer/trainer.py +8 -6
  34. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/utils/async_utils.py +7 -3
  35. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/utils/testing.py +0 -4
  36. judgeval-0.10.1/.pre-commit-config.yaml +0 -23
  37. judgeval-0.10.1/src/judgeval/data/tool.py +0 -5
  38. judgeval-0.10.1/src/judgeval/scorers/agent_scorer.py +0 -17
  39. {judgeval-0.10.1 → judgeval-0.11.0}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  40. {judgeval-0.10.1 → judgeval-0.11.0}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  41. {judgeval-0.10.1 → judgeval-0.11.0}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  42. {judgeval-0.10.1 → judgeval-0.11.0}/.github/pull_request_template.md +0 -0
  43. {judgeval-0.10.1 → judgeval-0.11.0}/.github/workflows/blocked-pr.yaml +0 -0
  44. {judgeval-0.10.1 → judgeval-0.11.0}/.github/workflows/claude-code-review.yml +0 -0
  45. {judgeval-0.10.1 → judgeval-0.11.0}/.github/workflows/claude.yml +0 -0
  46. {judgeval-0.10.1 → judgeval-0.11.0}/.github/workflows/lint.yaml +0 -0
  47. {judgeval-0.10.1 → judgeval-0.11.0}/.github/workflows/merge-branch-check.yaml +0 -0
  48. {judgeval-0.10.1 → judgeval-0.11.0}/.github/workflows/mypy.yaml +0 -0
  49. {judgeval-0.10.1 → judgeval-0.11.0}/.github/workflows/pre-commit-autoupdate.yaml +0 -0
  50. {judgeval-0.10.1 → judgeval-0.11.0}/.github/workflows/release.yaml +0 -0
  51. {judgeval-0.10.1 → judgeval-0.11.0}/.github/workflows/validate-branch.yaml +0 -0
  52. {judgeval-0.10.1 → judgeval-0.11.0}/.gitignore +0 -0
  53. {judgeval-0.10.1 → judgeval-0.11.0}/LICENSE.md +0 -0
  54. {judgeval-0.10.1 → judgeval-0.11.0}/README.md +0 -0
  55. {judgeval-0.10.1 → judgeval-0.11.0}/assets/Screenshot 2025-05-17 at 8.14.27 PM.png +0 -0
  56. {judgeval-0.10.1 → judgeval-0.11.0}/assets/agent.gif +0 -0
  57. {judgeval-0.10.1 → judgeval-0.11.0}/assets/agent_trace_example.png +0 -0
  58. {judgeval-0.10.1 → judgeval-0.11.0}/assets/data.gif +0 -0
  59. {judgeval-0.10.1 → judgeval-0.11.0}/assets/dataset_clustering_screenshot.png +0 -0
  60. {judgeval-0.10.1 → judgeval-0.11.0}/assets/dataset_clustering_screenshot_dm.png +0 -0
  61. {judgeval-0.10.1 → judgeval-0.11.0}/assets/datasets_preview_screenshot.png +0 -0
  62. {judgeval-0.10.1 → judgeval-0.11.0}/assets/document.gif +0 -0
  63. {judgeval-0.10.1 → judgeval-0.11.0}/assets/error_analysis_dashboard.png +0 -0
  64. {judgeval-0.10.1 → judgeval-0.11.0}/assets/errors.png +0 -0
  65. {judgeval-0.10.1 → judgeval-0.11.0}/assets/experiments_dashboard_screenshot.png +0 -0
  66. {judgeval-0.10.1 → judgeval-0.11.0}/assets/experiments_page.png +0 -0
  67. {judgeval-0.10.1 → judgeval-0.11.0}/assets/experiments_pagev2.png +0 -0
  68. {judgeval-0.10.1 → judgeval-0.11.0}/assets/logo-dark.svg +0 -0
  69. {judgeval-0.10.1 → judgeval-0.11.0}/assets/logo-light.svg +0 -0
  70. {judgeval-0.10.1 → judgeval-0.11.0}/assets/monitoring_screenshot.png +0 -0
  71. {judgeval-0.10.1 → judgeval-0.11.0}/assets/new_darkmode.svg +0 -0
  72. {judgeval-0.10.1 → judgeval-0.11.0}/assets/new_lightmode.svg +0 -0
  73. {judgeval-0.10.1 → judgeval-0.11.0}/assets/online_eval.png +0 -0
  74. {judgeval-0.10.1 → judgeval-0.11.0}/assets/product_shot.png +0 -0
  75. {judgeval-0.10.1 → judgeval-0.11.0}/assets/test.png +0 -0
  76. {judgeval-0.10.1 → judgeval-0.11.0}/assets/tests.png +0 -0
  77. {judgeval-0.10.1 → judgeval-0.11.0}/assets/trace.gif +0 -0
  78. {judgeval-0.10.1 → judgeval-0.11.0}/assets/trace_demo.png +0 -0
  79. {judgeval-0.10.1 → judgeval-0.11.0}/assets/trace_screenshot.png +0 -0
  80. {judgeval-0.10.1 → judgeval-0.11.0}/assets/trace_screenshot_old.png +0 -0
  81. {judgeval-0.10.1 → judgeval-0.11.0}/pytest.ini +0 -0
  82. {judgeval-0.10.1 → judgeval-0.11.0}/scripts/update_types.sh +0 -0
  83. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/cli.py +0 -0
  84. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/constants.py +0 -0
  85. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/data/__init__.py +0 -0
  86. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/data/example.py +0 -0
  87. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/data/scripts/fix_default_factory.py +0 -0
  88. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/data/scripts/openapi_transform.py +0 -0
  89. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/data/trace.py +0 -0
  90. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/exceptions.py +0 -0
  91. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/judges/__init__.py +0 -0
  92. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/judges/base_judge.py +0 -0
  93. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/judges/litellm_judge.py +0 -0
  94. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/judges/together_judge.py +0 -0
  95. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/judges/utils.py +0 -0
  96. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/logger.py +0 -0
  97. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/scorers/example_scorer.py +0 -0
  98. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/scorers/exceptions.py +0 -0
  99. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
  100. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
  101. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/scorers/utils.py +0 -0
  102. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/tracer/constants.py +0 -0
  103. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/tracer/exporters/__init__.py +0 -0
  104. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/tracer/exporters/s3.py +0 -0
  105. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/tracer/exporters/store.py +0 -0
  106. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/tracer/exporters/utils.py +0 -0
  107. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/tracer/keys.py +0 -0
  108. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/tracer/llm/__init__.py +0 -0
  109. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/tracer/llm/providers.py +0 -0
  110. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/tracer/managers.py +0 -0
  111. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/tracer/processors/__init__.py +0 -0
  112. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/trainer/__init__.py +0 -0
  113. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/trainer/console.py +0 -0
  114. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/utils/decorators.py +0 -0
  115. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/utils/file_utils.py +0 -0
  116. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/utils/guards.py +0 -0
  117. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/utils/meta.py +0 -0
  118. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/utils/serialize.py +0 -0
  119. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/utils/url.py +0 -0
  120. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/utils/version_check.py +0 -0
  121. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/version.py +0 -0
  122. {judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/warnings.py +0 -0
  123. {judgeval-0.10.1 → judgeval-0.11.0}/update_version.py +0 -0
  124. {judgeval-0.10.1 → judgeval-0.11.0}/uv.lock +0 -0

{judgeval-0.10.1 → judgeval-0.11.0}/.github/workflows/ci.yaml
@@ -47,6 +47,8 @@ jobs:
       - name: Run tests
         run: |
           cd src
+          export JUDGMENT_API_KEY="$JUDGEVAL_GH_JUDGMENT_API_KEY"
+          export JUDGMENT_ORG_ID="$JUDGEVAL_GH_JUDGMENT_ORG_ID"
           uv run pytest tests
 
   run-e2e-tests-staging:

judgeval-0.11.0/.pre-commit-config.yaml
@@ -0,0 +1,23 @@
+repos:
+  - repo: https://github.com/astral-sh/uv-pre-commit
+    rev: 0.8.17
+    hooks:
+      - id: uv-lock
+
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.13.0
+    hooks:
+      - id: ruff
+        name: ruff (linter)
+        args: [--fix]
+      - id: ruff-format
+        name: ruff (formatter)
+
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v1.17.0
+    hooks:
+      - id: mypy
+        language: system
+        # These next two lines allow commits even if mypy fails, REMOVE once we fix all mypy errors
+        verbose: true
+        entry: bash -c 'mypy src/judgeval/ || true'

{judgeval-0.10.1 → judgeval-0.11.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.10.1
+Version: 0.11.0
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues

{judgeval-0.10.1 → judgeval-0.11.0}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "judgeval"
-version = "0.10.1"
+version = "0.11.0"
 authors = [
     { name = "Andrew Li", email = "andrew@judgmentlabs.ai" },
     { name = "Alex Shan", email = "alex@judgmentlabs.ai" },

{judgeval-0.10.1 → judgeval-0.11.0}/scripts/api_generator.py
@@ -33,12 +33,13 @@ JUDGEVAL_PATHS: List[str] = [
     "/add_to_run_eval_queue/traces",
     "/get_evaluation_status/",
     "/save_scorer/",
-    "/fetch_scorer/",
+    "/fetch_scorers/",
     "/scorer_exists/",
     "/upload_custom_scorer/",
     "/datasets/create_for_judgeval/",
     "/datasets/insert_examples_for_judgeval/",
     "/datasets/pull_for_judgeval/",
+    "/datasets/pull_all_for_judgeval/",
     "/projects/resolve/",
     "/e2e_fetch_trace/",
     "/e2e_fetch_span_score/",

{judgeval-0.10.1 → judgeval-0.11.0}/scripts/openapi_transform.py
@@ -32,12 +32,13 @@ JUDGEVAL_PATHS: List[str] = [
     "/add_to_run_eval_queue/traces",
     "/get_evaluation_status/",
     "/save_scorer/",
-    "/fetch_scorer/",
+    "/fetch_scorers/",
     "/scorer_exists/",
     "/upload_custom_scorer/",
     "/datasets/create_for_judgeval/",
     "/datasets/insert_examples_for_judgeval/",
     "/datasets/pull_for_judgeval/",
+    "/datasets/pull_all_for_judgeval/",
     "/projects/resolve/",
     "/e2e_fetch_trace/",
     "/e2e_fetch_span_score/",

{judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/__init__.py
@@ -6,7 +6,8 @@ from judgeval.data.evaluation_run import ExampleEvaluationRun
 
 
 from typing import List, Optional, Union
-from judgeval.scorers import BaseScorer, ExampleAPIScorerConfig
+from judgeval.scorers import APIScorerConfig
+from judgeval.scorers.example_scorer import ExampleScorer
 from judgeval.data.example import Example
 from judgeval.logger import judgeval_logger
 from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_DEFAULT_GPT_MODEL, JUDGMENT_ORG_ID
@@ -38,7 +39,7 @@ class JudgmentClient(metaclass=SingletonMeta):
     def run_evaluation(
         self,
         examples: List[Example],
-        scorers: List[Union[ExampleAPIScorerConfig, BaseScorer]],
+        scorers: List[Union[APIScorerConfig, ExampleScorer]],
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
         model: str = JUDGMENT_DEFAULT_GPT_MODEL,
@@ -51,10 +52,9 @@ class JudgmentClient(metaclass=SingletonMeta):
             examples=examples,
             scorers=scorers,
             model=model,
-            organization_id=self.organization_id,
         )
 
-        results = run_eval(eval, self.api_key)
+        results = run_eval(eval)
         if assert_test:
             assert_test_results(results)
 
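
A hedged usage sketch of the updated run_evaluation signature above. FaithfulnessScorer (one of the API scorers touched in this release) and the Example field names are assumptions used purely for illustration:

from judgeval import JudgmentClient
from judgeval.data.example import Example
from judgeval.scorers import FaithfulnessScorer  # assumed to be an APIScorerConfig subclass

client = JudgmentClient()  # singleton; credentials come from JUDGMENT_API_KEY / JUDGMENT_ORG_ID
results = client.run_evaluation(
    examples=[Example(input="What is 2 + 2?", actual_output="4")],  # field names assumed
    scorers=[FaithfulnessScorer()],
    project_name="default_project",
    eval_run_name="smoke_test",
)
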
{judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/api/__init__.py
@@ -137,12 +137,13 @@ class JudgmentSyncClient:
             payload,
         )
 
-    def datasets_pull_all_for_judgeval(self, payload: DatasetsFetch) -> List[DatasetInfo]:
+    def datasets_pull_all_for_judgeval(self, payload: DatasetsFetch) -> Any:
         return self._request(
             "POST",
             url_for("/datasets/pull_all_for_judgeval/"),
             payload,
         )
+
     def datasets_create_for_judgeval(self, payload: DatasetCreate) -> Any:
         return self._request(
             "POST",
@@ -180,12 +181,12 @@
             payload,
         )
 
-    def fetch_scorer(
-        self, payload: FetchPromptScorerRequest
-    ) -> FetchPromptScorerResponse:
+    def fetch_scorers(
+        self, payload: FetchPromptScorersRequest
+    ) -> FetchPromptScorersResponse:
         return self._request(
             "POST",
-            url_for("/fetch_scorer/"),
+            url_for("/fetch_scorers/"),
             payload,
         )
 
@@ -345,6 +346,13 @@ class JudgmentAsyncClient:
             payload,
         )
 
+    async def datasets_pull_all_for_judgeval(self, payload: DatasetsFetch) -> Any:
+        return await self._request(
+            "POST",
+            url_for("/datasets/pull_all_for_judgeval/"),
+            payload,
+        )
+
     async def datasets_create_for_judgeval(self, payload: DatasetCreate) -> Any:
         return await self._request(
             "POST",
@@ -384,12 +392,12 @@
             payload,
         )
 
-    async def fetch_scorer(
-        self, payload: FetchPromptScorerRequest
-    ) -> FetchPromptScorerResponse:
+    async def fetch_scorers(
+        self, payload: FetchPromptScorersRequest
+    ) -> FetchPromptScorersResponse:
         return await self._request(
             "POST",
-            url_for("/fetch_scorer/"),
+            url_for("/fetch_scorers/"),
             payload,
         )
 
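A minimal usage sketch of the renamed and newly added client methods, with payload shapes following the TypedDicts in api_types.py below; constructing JudgmentSyncClient from an API key and organization id mirrors its use in dataset/__init__.py later in this diff (the import path and argument order are assumptions):

from judgeval.api import JudgmentSyncClient  # import path assumed

client = JudgmentSyncClient("my-api-key", "my-org-id")

# New in 0.11.0: pull every dataset in a project.
datasets = client.datasets_pull_all_for_judgeval({"project_name": "default_project"})

# Renamed from fetch_scorer: now takes an optional list of names and returns a list of scorers.
response = client.fetch_scorers({"names": ["my_prompt_scorer"]})
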
{judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/api/api_types.py
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 #   filename:  .openapi.json
-#   timestamp: 2025-09-10T17:42:12+00:00
+#   timestamp: 2025-09-12T16:54:35+00:00
 
 from __future__ import annotations
 from typing import Any, Dict, List, Literal, Optional, TypedDict, Union
@@ -19,6 +19,7 @@ class DatasetFetch(TypedDict):
     dataset_name: str
     project_name: str
 
+
 class DatasetsFetch(TypedDict):
     project_name: str
 
@@ -60,8 +61,8 @@ class SavePromptScorerResponse(TypedDict):
     name: str
 
 
-class FetchPromptScorerRequest(TypedDict):
-    name: str
+class FetchPromptScorersRequest(TypedDict):
+    names: NotRequired[Optional[List[str]]]
 
 
 class CustomScorerUploadPayload(TypedDict):
@@ -154,7 +155,7 @@ class ScorerData(TypedDict):
     score: NotRequired[Optional[float]]
     reason: NotRequired[Optional[str]]
     strict_mode: NotRequired[Optional[bool]]
-    evaluation_model: NotRequired[str]
+    evaluation_model: NotRequired[Optional[str]]
     error: NotRequired[Optional[str]]
     additional_metadata: NotRequired[Optional[Dict[str, Any]]]
 
@@ -189,13 +190,13 @@ class OtelTraceSpan(TypedDict):
 
 
 class ExampleEvaluationRun(TypedDict):
-    id: NotRequired[Optional[str]]
-    project_name: NotRequired[Optional[str]]
-    eval_name: NotRequired[Optional[str]]
+    id: NotRequired[str]
+    project_name: str
+    eval_name: str
     custom_scorers: NotRequired[List[BaseScorer]]
     judgment_scorers: NotRequired[List[ScorerConfig]]
     model: str
-    created_at: NotRequired[Optional[str]]
+    created_at: NotRequired[str]
     examples: List[Example]
     trace_span_id: NotRequired[Optional[str]]
     trace_id: NotRequired[Optional[str]]
@@ -206,13 +207,13 @@ class HTTPValidationError(TypedDict):
 
 
 class TraceEvaluationRun(TypedDict):
-    id: NotRequired[Optional[str]]
-    project_name: NotRequired[Optional[str]]
-    eval_name: NotRequired[Optional[str]]
+    id: NotRequired[str]
+    project_name: str
+    eval_name: str
     custom_scorers: NotRequired[List[BaseScorer]]
     judgment_scorers: NotRequired[List[ScorerConfig]]
     model: str
-    created_at: NotRequired[Optional[str]]
+    created_at: NotRequired[str]
     trace_and_span_ids: List[TraceAndSpanId]
     is_offline: NotRequired[bool]
 
@@ -228,30 +229,31 @@ class DatasetReturn(TypedDict):
     project_name: str
     examples: NotRequired[Optional[List[Example]]]
 
+
 class DatasetInfo(TypedDict):
     dataset_id: str
     name: str
     created_at: str
     dataset_kind: DatasetKind
     entries: int
-    creator: str
+    creator: str
 
 
 class DatasetCreate(TypedDict):
     name: str
     dataset_kind: DatasetKind
     project_name: str
-    examples: NotRequired[Optional[List[Example]]]
-    overwrite: NotRequired[Optional[bool]]
+    examples: List[Example]
+    overwrite: bool
 
 
-class FetchPromptScorerResponse(TypedDict):
-    scorer: PromptScorer
+class FetchPromptScorersResponse(TypedDict):
+    scorers: List[PromptScorer]
 
 
 class ScoringResult(TypedDict):
     success: bool
-    scorers_data: Optional[List[ScorerData]]
+    scorers_data: List[ScorerData]
     name: NotRequired[Optional[str]]
    data_object: NotRequired[Optional[Union[OtelTraceSpan, Example]]]
     trace_id: NotRequired[Optional[str]]
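
A sketch of how the tightened payload types read in 0.11.0: examples and overwrite on DatasetCreate are now required rather than Optional, and the scorer-fetch request carries a list of names (the literal values below are placeholders):

from judgeval.api.api_types import DatasetCreate, FetchPromptScorersRequest

create_payload: DatasetCreate = {
    "name": "qa_pairs",                # hypothetical dataset name
    "dataset_kind": "example",
    "project_name": "default_project",
    "examples": [],                    # required in 0.11.0
    "overwrite": False,                # required in 0.11.0
}

fetch_payload: FetchPromptScorersRequest = {"names": ["my_prompt_scorer"]}
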
{judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/data/evaluation_run.py
@@ -1,11 +1,11 @@
 from typing import List, Optional, Union, Tuple
-from litellm.files.main import BaseModel
-from pydantic import field_validator, model_validator, Field
+from pydantic import field_validator, model_validator, Field, BaseModel
 from datetime import datetime, timezone
 import uuid
 
 from judgeval.data import Example
-from judgeval.scorers import BaseScorer, APIScorerConfig
+from judgeval.scorers import APIScorerConfig
+from judgeval.scorers.example_scorer import ExampleScorer
 from judgeval.constants import ACCEPTABLE_MODELS
 from judgeval.data.judgment_types import (
     ExampleEvaluationRun as ExampleEvaluationRunJudgmentType,
@@ -14,19 +14,18 @@ from judgeval.data.judgment_types import (
 
 
 class EvaluationRun(BaseModel):
-    id: Optional[str] = Field(default_factory=lambda: str(uuid.uuid4()))
-    created_at: Optional[str] = Field(
+    id: str = Field(default_factory=lambda: str(uuid.uuid4()))
+    created_at: str = Field(
         default_factory=lambda: datetime.now(timezone.utc).isoformat()
     )
-    organization_id: Optional[str] = None
-    custom_scorers: Optional[List[BaseScorer]] = None
-    judgment_scorers: Optional[List[APIScorerConfig]] = None
-    scorers: Optional[List[Union[BaseScorer, APIScorerConfig]]] = None
+    custom_scorers: List[ExampleScorer] = Field(default_factory=list)
+    judgment_scorers: List[APIScorerConfig] = Field(default_factory=list)
+    scorers: List[Union[ExampleScorer, APIScorerConfig]] = Field(default_factory=list)
     model: str
 
     def __init__(
         self,
-        scorers: Optional[List[Union[BaseScorer, APIScorerConfig]]] = None,
+        scorers: Optional[List[Union[ExampleScorer, APIScorerConfig]]] = None,
         **kwargs,
     ):
         """
@@ -38,7 +37,7 @@ class EvaluationRun(BaseModel):
         """
         if scorers is not None:
             # Automatically sort scorers into appropriate fields
-            custom_scorers = [s for s in scorers if isinstance(s, BaseScorer)]
+            custom_scorers = [s for s in scorers if isinstance(s, ExampleScorer)]
             judgment_scorers = [s for s in scorers if isinstance(s, APIScorerConfig)]
 
             # Always set both fields as lists (even if empty) to satisfy validation
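
A standalone sketch of the scorer-sorting rule above, using stand-in classes so it runs without the package installed; the real EvaluationRun.__init__ partitions on ExampleScorer and APIScorerConfig in exactly this way:

class APIScorerConfig: ...   # stand-in for judgeval.scorers.APIScorerConfig
class ExampleScorer: ...     # stand-in for judgeval.scorers.example_scorer.ExampleScorer

def sort_scorers(scorers):
    # Locally-run example scorers go to custom_scorers, API-backed configs to judgment_scorers.
    custom_scorers = [s for s in scorers if isinstance(s, ExampleScorer)]
    judgment_scorers = [s for s in scorers if isinstance(s, APIScorerConfig)]
    return custom_scorers, judgment_scorers

custom, judgment = sort_scorers([ExampleScorer(), APIScorerConfig()])
assert len(custom) == 1 and len(judgment) == 1
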
{judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/data/judgment_types.py
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 #   filename:  .openapi.json
-#   timestamp: 2025-09-10T17:42:11+00:00
+#   timestamp: 2025-09-12T16:54:34+00:00
 
 from __future__ import annotations
 from typing import Annotated, Any, Dict, List, Optional, Union
@@ -22,6 +22,10 @@ class DatasetFetch(BaseModel):
     project_name: Annotated[str, Field(title="Project Name")]
 
 
+class DatasetsFetch(BaseModel):
+    project_name: Annotated[str, Field(title="Project Name")]
+
+
 class ProjectAdd(BaseModel):
     project_name: Annotated[str, Field(title="Project Name")]
 
@@ -59,8 +63,8 @@ class SavePromptScorerResponse(BaseModel):
     name: Annotated[str, Field(title="Name")]
 
 
-class FetchPromptScorerRequest(BaseModel):
-    name: Annotated[str, Field(title="Name")]
+class FetchPromptScorersRequest(BaseModel):
+    names: Annotated[Optional[List[str]], Field(title="Names")] = None
 
 
 class CustomScorerUploadPayload(BaseModel):
@@ -210,8 +214,8 @@ class OtelTraceSpan(BaseModel):
 
 class ExampleEvaluationRun(BaseModel):
     id: Annotated[Optional[str], Field(title="Id")] = None
-    project_name: Annotated[Optional[str], Field(title="Project Name")] = None
-    eval_name: Annotated[Optional[str], Field(title="Eval Name")] = None
+    project_name: Annotated[str, Field(title="Project Name")]
+    eval_name: Annotated[str, Field(title="Eval Name")]
     custom_scorers: Annotated[
         Optional[List[BaseScorer]], Field(title="Custom Scorers")
     ] = []
@@ -231,8 +235,8 @@ class HTTPValidationError(BaseModel):
 
 class TraceEvaluationRun(BaseModel):
     id: Annotated[Optional[str], Field(title="Id")] = None
-    project_name: Annotated[Optional[str], Field(title="Project Name")] = None
-    eval_name: Annotated[Optional[str], Field(title="Eval Name")] = None
+    project_name: Annotated[str, Field(title="Project Name")]
+    eval_name: Annotated[str, Field(title="Eval Name")]
     custom_scorers: Annotated[
         Optional[List[BaseScorer]], Field(title="Custom Scorers")
     ] = []
@@ -259,23 +263,30 @@ class DatasetReturn(BaseModel):
     examples: Annotated[Optional[List[Example]], Field(title="Examples")] = None
 
 
+class DatasetInfo(BaseModel):
+    dataset_id: Annotated[str, Field(title="Dataset Id")]
+    name: Annotated[str, Field(title="Name")]
+    created_at: Annotated[str, Field(title="Created At")]
+    dataset_kind: DatasetKind
+    entries: Annotated[int, Field(title="Entries")]
+    creator: Annotated[str, Field(title="Creator")]
+
+
 class DatasetCreate(BaseModel):
     name: Annotated[str, Field(title="Name")]
     dataset_kind: DatasetKind
     project_name: Annotated[str, Field(title="Project Name")]
-    examples: Annotated[Optional[List[Example]], Field(title="Examples")] = None
-    overwrite: Annotated[Optional[bool], Field(title="Overwrite")] = False
+    examples: Annotated[List[Example], Field(title="Examples")]
+    overwrite: Annotated[bool, Field(title="Overwrite")]
 
 
-class FetchPromptScorerResponse(BaseModel):
-    scorer: PromptScorer
+class FetchPromptScorersResponse(BaseModel):
+    scorers: Annotated[List[PromptScorer], Field(title="Scorers")]
 
 
 class ScoringResult(BaseModel):
     success: Annotated[bool, Field(title="Success")]
-    scorers_data: Annotated[Optional[List[ScorerData]], Field(title="Scorers Data")] = (
-        None
-    )
+    scorers_data: Annotated[List[ScorerData], Field(title="Scorers Data")]
     name: Annotated[Optional[str], Field(title="Name")] = None
     data_object: Annotated[
         Optional[Union[OtelTraceSpan, Example]], Field(title="Data Object")
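
With project_name and eval_name now required on the generated models, constructing one directly looks roughly like this (a sketch; the field values are placeholders and pydantic validation is assumed to behave as generated above):

from judgeval.data.judgment_types import ExampleEvaluationRun

run = ExampleEvaluationRun(
    project_name="default_project",   # required as of 0.11.0
    eval_name="nightly_regression",   # required as of 0.11.0
    model="gpt-4.1",
    examples=[],
)
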
{judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/data/result.py
@@ -18,6 +18,7 @@ class ScoringResult(JudgmentScoringResult):
 
     # Need to override this so that it uses this repo's Example class
     data_object: Example
+    scorers_data: List[ScorerData]
 
     def model_dump(self, **kwargs):
         data = super().model_dump(**kwargs)
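
Because scorers_data is now a plain list rather than Optional, downstream handling can drop the None guard; a small sketch assuming the ScorerData fields shown elsewhere in this diff:

def summarize(results):
    # ScoringResult.scorers_data is always a list in 0.11.0, so no None check is needed.
    return [(sd.name, sd.score) for r in results for sd in r.scorers_data]
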
{judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/data/scorer_data.py
@@ -6,36 +6,11 @@ ScorerData holds the information related to a single, completed Scorer evaluatio
 
 from __future__ import annotations
 
-from judgeval.data.judgment_types import ScorerData as JudgmentScorerData
+from judgeval.data.judgment_types import ScorerData
 from judgeval.scorers import BaseScorer
 from typing import List
 
 
-class ScorerData(JudgmentScorerData):
-    """
-    ScorerData holds the information related to a single, completed Scorer evaluation run.
-
-    For example, if running the Judgment Faithfulness scorer on an example, the ScorerData
-    object will contain whether the example passed its threshold expectation, as well as more detailed
-    information surrounding the evaluation run such as the claims and verdicts generated by the
-    judge model(s).
-    """
-
-    def to_dict(self) -> dict:
-        """Convert the ScorerData instance to a JSON-serializable dictionary."""
-        return {
-            "name": self.name,
-            "threshold": self.threshold,
-            "success": self.success,
-            "score": self.score,
-            "reason": self.reason,
-            "strict_mode": self.strict_mode,
-            "evaluation_model": self.evaluation_model,
-            "error": self.error,
-            "additional_metadata": self.additional_metadata,
-        }
-
-
 def create_scorer_data(scorer: BaseScorer) -> List[ScorerData]:
     """
     After a `scorer` is run, it contains information about the example that was evaluated
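
With the local ScorerData subclass and its to_dict helper removed, callers fall back to the generated pydantic model; a minimal sketch, assuming name, threshold, and success are the required fields (matching the keys the removed to_dict emitted):

from judgeval.data.scorer_data import ScorerData  # the re-exported generated model

data = ScorerData(name="faithfulness", threshold=0.7, success=True, score=0.92)
print(data.model_dump())  # replaces the removed to_dict()
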
{judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/dataset/__init__.py
@@ -3,7 +3,7 @@ import orjson
 import os
 import yaml
 from dataclasses import dataclass
-from typing import List, Literal, Optional
+from typing import List, Literal
 
 from judgeval.data import Example
 from judgeval.utils.file_utils import get_examples_from_yaml, get_examples_from_json
@@ -13,15 +13,17 @@ from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID
 
 from judgeval.api.api_types import DatasetKind
 
+
 @dataclass
 class DatasetInfo:
     dataset_id: str
-    name: str
+    name: str
     created_at: str
     dataset_kind: DatasetKind
     entries: int
     creator: str
 
+
 @dataclass
 class Dataset:
     examples: List[Example]
@@ -46,9 +48,12 @@ class Dataset:
         if not dataset:
             raise ValueError(f"Dataset {name} not found in project {project_name}")
         examples = dataset.get("examples", [])
+        if examples is None:
+            examples = []
+
         for e in examples:
-            if isinstance(e, dict) and isinstance(e.get("data"), dict):
-                e.update(e.pop("data"))
+            if isinstance(e, dict) and isinstance(e.get("data", {}), dict):
+                e.update(e.pop("data"))  # type: ignore
             e.pop(
                 "example_id"
            )  # TODO: remove once scorer data migraiton is complete
@@ -64,7 +69,7 @@ class Dataset:
         cls,
         name: str,
         project_name: str,
-        examples: Optional[List[Example]] = None,
+        examples: List[Example] = [],
         overwrite: bool = False,
     ):
         if not examples:
@@ -75,7 +80,7 @@ class Dataset:
             {
                 "name": name,
                 "project_name": project_name,
-                "examples": [e.model_dump() for e in examples],
+                "examples": examples,  # type: ignore
                 "dataset_kind": "example",
                 "overwrite": overwrite,
             }
@@ -87,18 +92,14 @@ class Dataset:
             project_name=project_name,
             examples=examples,
         )
+
     @classmethod
-    def list(
-        cls,
-        project_name: str
-    ):
+    def list(cls, project_name: str):
         client = JudgmentSyncClient(cls.judgment_api_key, cls.organization_id)
-        datasets = client.datasets_pull_all_for_judgeval(
-            {"project_name": project_name}
-        )
-
+        datasets = client.datasets_pull_all_for_judgeval({"project_name": project_name})
+
         judgeval_logger.info(f"Fetched all datasets for project {project_name}!")
-
+
         return [DatasetInfo(**dataset_info) for dataset_info in datasets]
 
     def add_from_json(self, file_path: str) -> None:
@@ -147,7 +148,7 @@ class Dataset:
             {
                 "dataset_name": self.name,
                 "project_name": self.project_name,
-                "examples": [e.model_dump() for e in examples],
+                "examples": examples,  # type: ignore
             }
         )
 
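A usage sketch of the new Dataset.list classmethod shown above; it assumes JUDGMENT_API_KEY and JUDGMENT_ORG_ID are set in the environment, which env.py below now enforces at import time:

from judgeval.dataset import Dataset

for info in Dataset.list(project_name="default_project"):
    # Fields come from the DatasetInfo dataclass above.
    print(info.name, info.dataset_kind, info.entries, info.created_at)
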
{judgeval-0.10.1 → judgeval-0.11.0}/src/judgeval/env.py
@@ -19,8 +19,17 @@ def optional_env_var(var_name: str, default: str | None = None) -> str | None:
     return os.getenv(var_name, default)
 
 
-JUDGMENT_API_KEY = optional_env_var("JUDGMENT_API_KEY")
-JUDGMENT_ORG_ID = optional_env_var("JUDGMENT_ORG_ID")
+def required_env_var(var_name: str) -> str:
+    value = os.getenv(var_name)
+    if value is None:
+        raise EnvironmentError(
+            f"Environment variable '{var_name}' is required but not set."
+        )
+    return value
+
+
+JUDGMENT_API_KEY = required_env_var("JUDGMENT_API_KEY")
+JUDGMENT_ORG_ID = required_env_var("JUDGMENT_ORG_ID")
 JUDGMENT_API_URL = optional_env_var("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
 
 JUDGMENT_DEFAULT_GPT_MODEL = optional_env_var("JUDGMENT_DEFAULT_GPT_MODEL", "gpt-4.1")
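
The practical effect of switching to required_env_var (and of the CI change at the top of this diff) is that importing judgeval now fails fast when credentials are missing; a sketch of the expected behavior, with placeholder values:

import os

# Both variables must be set before the first judgeval import,
# otherwise importing judgeval.env raises EnvironmentError.
os.environ["JUDGMENT_API_KEY"] = "placeholder-api-key"
os.environ["JUDGMENT_ORG_ID"] = "placeholder-org-id"

from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID  # succeeds once both are set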