judgeval 0.9.3.tar.gz → 0.10.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (128)
  1. {judgeval-0.9.3 → judgeval-0.10.0}/.github/workflows/ci.yaml +33 -22
  2. {judgeval-0.9.3 → judgeval-0.10.0}/.pre-commit-config.yaml +2 -2
  3. {judgeval-0.9.3 → judgeval-0.10.0}/PKG-INFO +2 -2
  4. {judgeval-0.9.3 → judgeval-0.10.0}/pyproject.toml +3 -2
  5. {judgeval-0.9.3 → judgeval-0.10.0}/scripts/api_generator.py +4 -4
  6. {judgeval-0.9.3 → judgeval-0.10.0}/scripts/openapi_transform.py +2 -3
  7. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/__init__.py +2 -2
  8. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/api/__init__.py +28 -96
  9. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/api/api_types.py +49 -140
  10. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/constants.py +1 -5
  11. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/data/__init__.py +1 -3
  12. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/data/example.py +4 -2
  13. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/data/judgment_types.py +57 -165
  14. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/data/result.py +1 -2
  15. judgeval-0.10.0/src/judgeval/data/trace.py +14 -0
  16. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/dataset/__init__.py +15 -42
  17. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/evaluation/__init__.py +23 -34
  18. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/scorers/__init__.py +9 -7
  19. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/scorers/api_scorer.py +8 -0
  20. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/scorers/base_scorer.py +0 -1
  21. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -10
  22. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -2
  23. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
  24. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -2
  25. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -2
  26. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +43 -4
  27. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/tracer/__init__.py +40 -93
  28. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/tracer/local_eval_queue.py +2 -2
  29. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/tracer/processors/__init__.py +84 -6
  30. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/tracer/utils.py +1 -1
  31. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/trainer/trainer.py +4 -4
  32. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/utils/serialize.py +7 -1
  33. {judgeval-0.9.3 → judgeval-0.10.0}/uv.lock +2591 -2039
  34. judgeval-0.9.3/src/judgeval/data/trace.py +0 -40
  35. judgeval-0.9.3/src/judgeval/data/trace_run.py +0 -39
  36. judgeval-0.9.3/src/judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
  37. judgeval-0.9.3/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
  38. judgeval-0.9.3/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
  39. judgeval-0.9.3/src/judgeval/scorers/trace_api_scorer.py +0 -5
  40. {judgeval-0.9.3 → judgeval-0.10.0}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  41. {judgeval-0.9.3 → judgeval-0.10.0}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  42. {judgeval-0.9.3 → judgeval-0.10.0}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  43. {judgeval-0.9.3 → judgeval-0.10.0}/.github/pull_request_template.md +0 -0
  44. {judgeval-0.9.3 → judgeval-0.10.0}/.github/workflows/blocked-pr.yaml +0 -0
  45. {judgeval-0.9.3 → judgeval-0.10.0}/.github/workflows/claude-code-review.yml +0 -0
  46. {judgeval-0.9.3 → judgeval-0.10.0}/.github/workflows/claude.yml +0 -0
  47. {judgeval-0.9.3 → judgeval-0.10.0}/.github/workflows/lint.yaml +0 -0
  48. {judgeval-0.9.3 → judgeval-0.10.0}/.github/workflows/merge-branch-check.yaml +0 -0
  49. {judgeval-0.9.3 → judgeval-0.10.0}/.github/workflows/mypy.yaml +0 -0
  50. {judgeval-0.9.3 → judgeval-0.10.0}/.github/workflows/pre-commit-autoupdate.yaml +0 -0
  51. {judgeval-0.9.3 → judgeval-0.10.0}/.github/workflows/release.yaml +0 -0
  52. {judgeval-0.9.3 → judgeval-0.10.0}/.github/workflows/validate-branch.yaml +0 -0
  53. {judgeval-0.9.3 → judgeval-0.10.0}/.gitignore +0 -0
  54. {judgeval-0.9.3 → judgeval-0.10.0}/LICENSE.md +0 -0
  55. {judgeval-0.9.3 → judgeval-0.10.0}/README.md +0 -0
  56. {judgeval-0.9.3 → judgeval-0.10.0}/assets/Screenshot 2025-05-17 at 8.14.27 PM.png +0 -0
  57. {judgeval-0.9.3 → judgeval-0.10.0}/assets/agent.gif +0 -0
  58. {judgeval-0.9.3 → judgeval-0.10.0}/assets/agent_trace_example.png +0 -0
  59. {judgeval-0.9.3 → judgeval-0.10.0}/assets/data.gif +0 -0
  60. {judgeval-0.9.3 → judgeval-0.10.0}/assets/dataset_clustering_screenshot.png +0 -0
  61. {judgeval-0.9.3 → judgeval-0.10.0}/assets/dataset_clustering_screenshot_dm.png +0 -0
  62. {judgeval-0.9.3 → judgeval-0.10.0}/assets/datasets_preview_screenshot.png +0 -0
  63. {judgeval-0.9.3 → judgeval-0.10.0}/assets/document.gif +0 -0
  64. {judgeval-0.9.3 → judgeval-0.10.0}/assets/error_analysis_dashboard.png +0 -0
  65. {judgeval-0.9.3 → judgeval-0.10.0}/assets/errors.png +0 -0
  66. {judgeval-0.9.3 → judgeval-0.10.0}/assets/experiments_dashboard_screenshot.png +0 -0
  67. {judgeval-0.9.3 → judgeval-0.10.0}/assets/experiments_page.png +0 -0
  68. {judgeval-0.9.3 → judgeval-0.10.0}/assets/experiments_pagev2.png +0 -0
  69. {judgeval-0.9.3 → judgeval-0.10.0}/assets/logo-dark.svg +0 -0
  70. {judgeval-0.9.3 → judgeval-0.10.0}/assets/logo-light.svg +0 -0
  71. {judgeval-0.9.3 → judgeval-0.10.0}/assets/monitoring_screenshot.png +0 -0
  72. {judgeval-0.9.3 → judgeval-0.10.0}/assets/new_darkmode.svg +0 -0
  73. {judgeval-0.9.3 → judgeval-0.10.0}/assets/new_lightmode.svg +0 -0
  74. {judgeval-0.9.3 → judgeval-0.10.0}/assets/online_eval.png +0 -0
  75. {judgeval-0.9.3 → judgeval-0.10.0}/assets/product_shot.png +0 -0
  76. {judgeval-0.9.3 → judgeval-0.10.0}/assets/test.png +0 -0
  77. {judgeval-0.9.3 → judgeval-0.10.0}/assets/tests.png +0 -0
  78. {judgeval-0.9.3 → judgeval-0.10.0}/assets/trace.gif +0 -0
  79. {judgeval-0.9.3 → judgeval-0.10.0}/assets/trace_demo.png +0 -0
  80. {judgeval-0.9.3 → judgeval-0.10.0}/assets/trace_screenshot.png +0 -0
  81. {judgeval-0.9.3 → judgeval-0.10.0}/assets/trace_screenshot_old.png +0 -0
  82. {judgeval-0.9.3 → judgeval-0.10.0}/pytest.ini +0 -0
  83. {judgeval-0.9.3 → judgeval-0.10.0}/scripts/update_types.sh +0 -0
  84. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/cli.py +0 -0
  85. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/data/evaluation_run.py +0 -0
  86. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/data/scorer_data.py +0 -0
  87. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/data/scripts/fix_default_factory.py +0 -0
  88. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/data/scripts/openapi_transform.py +0 -0
  89. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/data/tool.py +0 -0
  90. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/env.py +0 -0
  91. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/exceptions.py +0 -0
  92. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/integrations/langgraph/__init__.py +0 -0
  93. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/judges/__init__.py +0 -0
  94. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/judges/base_judge.py +0 -0
  95. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/judges/litellm_judge.py +0 -0
  96. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/judges/together_judge.py +0 -0
  97. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/judges/utils.py +0 -0
  98. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/logger.py +0 -0
  99. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/scorers/agent_scorer.py +0 -0
  100. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/scorers/example_scorer.py +0 -0
  101. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/scorers/exceptions.py +0 -0
  102. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
  103. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/scorers/score.py +0 -0
  104. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/scorers/utils.py +0 -0
  105. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/tracer/constants.py +0 -0
  106. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/tracer/exporters/__init__.py +0 -0
  107. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/tracer/exporters/s3.py +0 -0
  108. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/tracer/exporters/store.py +0 -0
  109. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/tracer/exporters/utils.py +0 -0
  110. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/tracer/keys.py +0 -0
  111. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/tracer/llm/__init__.py +0 -0
  112. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/tracer/llm/providers.py +0 -0
  113. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/tracer/managers.py +0 -0
  114. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/trainer/__init__.py +0 -0
  115. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/trainer/config.py +0 -0
  116. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/trainer/console.py +0 -0
  117. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/trainer/trainable_model.py +0 -0
  118. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/utils/async_utils.py +0 -0
  119. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/utils/decorators.py +0 -0
  120. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/utils/file_utils.py +0 -0
  121. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/utils/guards.py +0 -0
  122. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/utils/meta.py +0 -0
  123. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/utils/testing.py +0 -0
  124. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/utils/url.py +0 -0
  125. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/utils/version_check.py +0 -0
  126. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/version.py +0 -0
  127. {judgeval-0.9.3 → judgeval-0.10.0}/src/judgeval/warnings.py +0 -0
  128. {judgeval-0.9.3 → judgeval-0.10.0}/update_version.py +0 -0
--- judgeval-0.9.3/.github/workflows/ci.yaml
+++ judgeval-0.10.0/.github/workflows/ci.yaml
@@ -18,7 +18,10 @@ jobs:
       matrix:
         os: [ubuntu-latest, macos-latest]
         python-version:
+          - "3.10"
           - "3.11"
+          - "3.12"
+          - "3.13"
     name: Unit Tests
     runs-on: ${{ matrix.os }}
     env:
@@ -49,18 +52,19 @@ jobs:
   run-e2e-tests-staging:
     needs: [validate-branch]
     if: "github.base_ref == 'staging' && !contains(github.actor, '[bot]') && (needs.validate-branch.result == 'success' || needs.validate-branch.result == 'skipped')"
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version:
+          - "3.10"
+          - "3.11"
+          - "3.12"
+          - "3.13"
     name: Staging E2E Tests
     runs-on: ubuntu-latest
     env:
       TEST_TIMEOUT_SECONDS: ${{ secrets.TEST_TIMEOUT_SECONDS }}
     steps:
-      - name: Wait for turn
-        uses: softprops/turnstyle@v2
-        with:
-          poll-interval-seconds: 10
-          same-branch-only: false
-          job-to-wait-for: "Staging E2E Tests"
-
       - name: Configure AWS Credentials
         uses: aws-actions/configure-aws-credentials@v4
         with:
@@ -74,7 +78,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
-          python-version: "3.11"
+          python-version: ${{ matrix.python-version }}

       - name: Install judgeval dependencies
         run: |
@@ -93,32 +97,36 @@ jobs:
       - name: Run E2E tests
         working-directory: src
         run: |
-          SECRET_VARS=$(aws secretsmanager get-secret-value --secret-id gh-actions-stg-judgeval/api-keys/judgeval --query SecretString --output text)
+          SECRET_VARS=$(aws secretsmanager get-secret-value --secret-id stg/api-keys/e2e-tests --query SecretString --output text)
           export $(echo "$SECRET_VARS" | jq -r 'to_entries | .[] | "\(.key)=\(.value)"')
-          timeout ${TEST_TIMEOUT_SECONDS}s uv run pytest --durations=0 --cov=. --cov-config=.coveragerc --cov-report=html ./e2etests
+          export JUDGMENT_API_KEY="$JUDGEVAL_GH_JUDGMENT_API_KEY"
+          export JUDGMENT_ORG_ID="$JUDGEVAL_GH_JUDGMENT_ORG_ID"
+          export JUDGMENT_API_URL=https://staging.api.judgmentlabs.ai
+          timeout ${TEST_TIMEOUT_SECONDS}s uv run pytest -n auto --dist=loadfile --durations=0 --cov=. --cov-config=.coveragerc --cov-report=html ./e2etests

       - name: Upload coverage HTML report (staging)
         if: always()
         uses: actions/upload-artifact@v4
         with:
-          name: coverage-html-staging
+          name: coverage-html-staging-${{ matrix.python-version }}
           path: src/htmlcov

   run-e2e-tests-main:
     needs: [validate-branch]
     if: "github.base_ref == 'main' && !contains(github.actor, '[bot]') && needs.validate-branch.result == 'success'"
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version:
+          - "3.10"
+          - "3.11"
+          - "3.12"
+          - "3.13"
     name: Production E2E Tests
     runs-on: ubuntu-latest
     env:
       TEST_TIMEOUT_SECONDS: ${{ secrets.TEST_TIMEOUT_SECONDS }}
     steps:
-      - name: Wait for turn
-        uses: softprops/turnstyle@v2
-        with:
-          poll-interval-seconds: 10
-          same-branch-only: false
-          job-to-wait-for: "Production E2E Tests"
-
       - name: Configure AWS Credentials
         uses: aws-actions/configure-aws-credentials@v4
         with:
@@ -132,7 +140,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v4
         with:
-          python-version: "3.11"
+          python-version: ${{ matrix.python-version }}

       - name: Install judgeval dependencies
         run: |
@@ -151,13 +159,16 @@ jobs:
       - name: Run E2E tests
         working-directory: src
         run: |
-          SECRET_VARS=$(aws secretsmanager get-secret-value --secret-id gh-actions-judgeval/api-keys/judgeval --query SecretString --output text)
+          SECRET_VARS=$(aws secretsmanager get-secret-value --secret-id prod/api-keys/e2e-tests --query SecretString --output text)
           export $(echo "$SECRET_VARS" | jq -r 'to_entries | .[] | "\(.key)=\(.value)"')
-          timeout ${TEST_TIMEOUT_SECONDS}s uv run pytest --durations=0 --cov=. --cov-config=.coveragerc --cov-report=html ./e2etests
+          export JUDGMENT_API_KEY="$JUDGEVAL_GH_JUDGMENT_API_KEY"
+          export JUDGMENT_ORG_ID="$JUDGEVAL_GH_JUDGMENT_ORG_ID"
+          export JUDGMENT_API_URL=https://api.judgmentlabs.ai
+          timeout ${TEST_TIMEOUT_SECONDS}s uv run pytest -n auto --dist=loadfile --durations=0 --cov=. --cov-config=.coveragerc --cov-report=html ./e2etests

       - name: Upload coverage HTML report (production)
         if: always()
         uses: actions/upload-artifact@v4
         with:
-          name: coverage-html-production
+          name: coverage-html-production-${{ matrix.python-version }}
           path: src/htmlcov
--- judgeval-0.9.3/.pre-commit-config.yaml
+++ judgeval-0.10.0/.pre-commit-config.yaml
@@ -1,11 +1,11 @@
 repos:
   - repo: https://github.com/astral-sh/uv-pre-commit
-    rev: 0.8.0
+    rev: 0.8.15
     hooks:
       - id: uv-lock

   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.12.4
+    rev: v0.12.12
     hooks:
       - id: ruff
         name: ruff (linter)
--- judgeval-0.9.3/PKG-INFO
+++ judgeval-0.10.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.9.3
+Version: 0.10.0
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -9,7 +9,7 @@ License-Expression: Apache-2.0
 License-File: LICENSE.md
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
-Requires-Python: >=3.11
+Requires-Python: >=3.10
 Requires-Dist: boto3>=1.40.11
 Requires-Dist: click<8.2.0
 Requires-Dist: dotenv
--- judgeval-0.9.3/pyproject.toml
+++ judgeval-0.10.0/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "judgeval"
-version = "0.9.3"
+version = "0.10.0"
 authors = [
     { name = "Andrew Li", email = "andrew@judgmentlabs.ai" },
     { name = "Alex Shan", email = "alex@judgmentlabs.ai" },
@@ -8,7 +8,7 @@ authors = [
 ]
 description = "Judgeval Package"
 readme = "README.md"
-requires-python = ">=3.11"
+requires-python = ">=3.10"
 classifiers = [
     "Programming Language :: Python :: 3",
     "Operating System :: OS Independent",
@@ -75,6 +75,7 @@ dev = [
     "pytest-cov>=6.2.1",
     "types-tqdm>=4.67.0.20250809",
     "pytest-asyncio>=1.1.0",
+    "pytest-xdist>=3.8.0",
 ]

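The pytest-xdist dev dependency backs the new "-n auto --dist=loadfile" flags in the CI jobs above: tests fan out across worker processes, but loadfile pins every test from one file to the same worker. A small illustrative module (hypothetical, not from the package) showing why that matters for module-scoped fixtures:

import pytest

@pytest.fixture(scope="module")
def shared_state():
    # Under --dist=loadfile this fixture is built once per file on a single
    # worker, so both tests below see the same object. Under plain --dist=load
    # the two tests could land on different workers with separate copies,
    # and test_second would fail.
    return {"runs": 0}

def test_first(shared_state):
    shared_state["runs"] += 1
    assert shared_state["runs"] == 1

def test_second(shared_state):
    shared_state["runs"] += 1
    assert shared_state["runs"] == 2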
--- judgeval-0.9.3/scripts/api_generator.py
+++ judgeval-0.10.0/scripts/api_generator.py
@@ -36,13 +36,13 @@ JUDGEVAL_PATHS: List[str] = [
     "/fetch_scorer/",
     "/scorer_exists/",
     "/upload_custom_scorer/",
-    "/datasets/push/",
-    "/datasets/insert_examples/",
+    "/datasets/create_for_judgeval/",
+    "/datasets/insert_examples_for_judgeval/",
     "/datasets/pull_for_judgeval/",
-    "/datasets/fetch_stats_by_project/",
     "/projects/resolve/",
     "/e2e_fetch_trace/",
     "/e2e_fetch_span_score/",
+    "/e2e_fetch_trace_scorer_span_score/",
 ]


@@ -253,7 +253,7 @@ def generate_client_class(

 def generate_api_file() -> str:
     lines = [
-        "from typing import List, Dict, Any, Mapping, Literal, Optional",
+        "from typing import Dict, Any, Mapping, Literal, Optional",
         "import httpx",
         "from httpx import Response",
         "from judgeval.exceptions import JudgmentAPIError",
--- judgeval-0.9.3/scripts/openapi_transform.py
+++ judgeval-0.10.0/scripts/openapi_transform.py
@@ -35,10 +35,9 @@ JUDGEVAL_PATHS: List[str] = [
     "/fetch_scorer/",
     "/scorer_exists/",
     "/upload_custom_scorer/",
-    "/datasets/push/",
-    "/datasets/insert_examples/",
+    "/datasets/create_for_judgeval/",
+    "/datasets/insert_examples_for_judgeval/",
     "/datasets/pull_for_judgeval/",
-    "/datasets/fetch_stats_by_project/",
     "/projects/resolve/",
     "/e2e_fetch_trace/",
     "/e2e_fetch_span_score/",
--- judgeval-0.9.3/src/judgeval/__init__.py
+++ judgeval-0.10.0/src/judgeval/__init__.py
@@ -6,7 +6,7 @@ from judgeval.data.evaluation_run import ExampleEvaluationRun


 from typing import List, Optional, Union
-from judgeval.scorers import BaseScorer, APIScorerConfig
+from judgeval.scorers import BaseScorer, ExampleAPIScorerConfig
 from judgeval.data.example import Example
 from judgeval.logger import judgeval_logger
 from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_DEFAULT_GPT_MODEL, JUDGMENT_ORG_ID
@@ -38,7 +38,7 @@ class JudgmentClient(metaclass=SingletonMeta):
     def run_evaluation(
         self,
         examples: List[Example],
-        scorers: List[Union[APIScorerConfig, BaseScorer]],
+        scorers: List[Union[ExampleAPIScorerConfig, BaseScorer]],
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
         model: str = JUDGMENT_DEFAULT_GPT_MODEL,
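For SDK users the rename is mostly transparent: run_evaluation keeps its shape, and only the accepted scorer type narrows from APIScorerConfig to ExampleAPIScorerConfig. A minimal usage sketch against the signature above; the Example field names and the FaithfulnessScorer import are assumed from this package's layout, not confirmed by the diff:

from judgeval import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import FaithfulnessScorer  # assumed to subclass ExampleAPIScorerConfig in 0.10.0

client = JudgmentClient()
results = client.run_evaluation(
    examples=[
        Example(
            input="What is the capital of France?",
            actual_output="Paris",
            retrieval_context=["Paris is the capital of France."],
        )
    ],
    scorers=[FaithfulnessScorer(threshold=0.8)],
    project_name="default_project",
    eval_run_name="faithfulness_smoke",
)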
--- judgeval-0.9.3/src/judgeval/api/__init__.py
+++ judgeval-0.10.0/src/judgeval/api/__init__.py
@@ -71,13 +71,6 @@ class JudgmentSyncClient:
             payload,
         )

-    def evaluate_trace(self, payload: TraceRun) -> Any:
-        return self._request(
-            "POST",
-            url_for("/evaluate_trace/"),
-            payload,
-        )
-
     def evaluate_examples(
         self, payload: ExampleEvaluationRun, stream: Optional[str] = None
     ) -> Any:
@@ -128,59 +121,26 @@ class JudgmentSyncClient:
             query_params,
         )

-    def datasets_insert_examples(self, payload: DatasetInsertExamples) -> Any:
+    def datasets_insert_examples_for_judgeval(
+        self, payload: DatasetInsertExamples
+    ) -> Any:
         return self._request(
             "POST",
-            url_for("/datasets/insert_examples/"),
+            url_for("/datasets/insert_examples_for_judgeval/"),
             payload,
         )

-    def datasets_pull_for_judgeval(self, payload: DatasetFetch) -> Any:
+    def datasets_pull_for_judgeval(self, payload: DatasetFetch) -> DatasetReturn:
         return self._request(
             "POST",
             url_for("/datasets/pull_for_judgeval/"),
             payload,
         )

-    def datasets_push(self, payload: DatasetPush) -> Any:
+    def datasets_create_for_judgeval(self, payload: DatasetCreate) -> Any:
         return self._request(
             "POST",
-            url_for("/datasets/push/"),
-            payload,
-        )
-
-    def traces_upsert(self, payload: TraceSave) -> Any:
-        return self._request(
-            "POST",
-            url_for("/traces/upsert/"),
-            payload,
-        )
-
-    def traces_fetch(self, payload: TraceFetch) -> Any:
-        return self._request(
-            "POST",
-            url_for("/traces/fetch/"),
-            payload,
-        )
-
-    def traces_add_to_dataset(self, payload: TraceAddToDataset) -> Any:
-        return self._request(
-            "POST",
-            url_for("/traces/add_to_dataset/"),
-            payload,
-        )
-
-    def traces_spans_batch(self, payload: SpansBatchRequest) -> Any:
-        return self._request(
-            "POST",
-            url_for("/traces/spans/batch/"),
-            payload,
-        )
-
-    def traces_evaluation_runs_batch(self, payload: EvaluationRunsBatchRequest) -> Any:
-        return self._request(
-            "POST",
-            url_for("/traces/evaluation_runs/batch/"),
+            url_for("/datasets/create_for_judgeval/"),
             payload,
         )

@@ -255,6 +215,13 @@ class JudgmentSyncClient:
             payload,
         )

+    def e2e_fetch_trace_scorer_span_score(self, payload: SpanScoreRequest) -> Any:
+        return self._request(
+            "POST",
+            url_for("/e2e_fetch_trace_scorer_span_score/"),
+            payload,
+        )
+

 class JudgmentAsyncClient:
     __slots__ = ("api_key", "organization_id", "client")
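For direct callers of the generated sync client, the dataset endpoints map one-to-one onto the new names. A hedged migration sketch; the constructor arguments are inferred from __slots__, the payload field names are hypothetical, and the payload models (DatasetCreate, DatasetInsertExamples, DatasetFetch, DatasetReturn) come from judgeval.api.api_types:

from judgeval.api import JudgmentSyncClient

client = JudgmentSyncClient(api_key="your-key", organization_id="your-org")

# 0.9.3 name                              -> 0.10.0 name
# client.datasets_push(...)               -> client.datasets_create_for_judgeval(...)
# client.datasets_insert_examples(...)    -> client.datasets_insert_examples_for_judgeval(...)

fetch_payload = {"dataset_alias": "my_dataset", "project_name": "default_project"}  # hypothetical fields
dataset = client.datasets_pull_for_judgeval(fetch_payload)  # now annotated as DatasetReturn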
@@ -304,13 +271,6 @@ class JudgmentAsyncClient:
             payload,
         )

-    async def evaluate_trace(self, payload: TraceRun) -> Any:
-        return await self._request(
-            "POST",
-            url_for("/evaluate_trace/"),
-            payload,
-        )
-
     async def evaluate_examples(
         self, payload: ExampleEvaluationRun, stream: Optional[str] = None
     ) -> Any:
@@ -363,61 +323,26 @@ class JudgmentAsyncClient:
             query_params,
         )

-    async def datasets_insert_examples(self, payload: DatasetInsertExamples) -> Any:
+    async def datasets_insert_examples_for_judgeval(
+        self, payload: DatasetInsertExamples
+    ) -> Any:
         return await self._request(
             "POST",
-            url_for("/datasets/insert_examples/"),
+            url_for("/datasets/insert_examples_for_judgeval/"),
             payload,
         )

-    async def datasets_pull_for_judgeval(self, payload: DatasetFetch) -> Any:
+    async def datasets_pull_for_judgeval(self, payload: DatasetFetch) -> DatasetReturn:
         return await self._request(
             "POST",
             url_for("/datasets/pull_for_judgeval/"),
             payload,
         )

-    async def datasets_push(self, payload: DatasetPush) -> Any:
-        return await self._request(
-            "POST",
-            url_for("/datasets/push/"),
-            payload,
-        )
-
-    async def traces_upsert(self, payload: TraceSave) -> Any:
-        return await self._request(
-            "POST",
-            url_for("/traces/upsert/"),
-            payload,
-        )
-
-    async def traces_fetch(self, payload: TraceFetch) -> Any:
-        return await self._request(
-            "POST",
-            url_for("/traces/fetch/"),
-            payload,
-        )
-
-    async def traces_add_to_dataset(self, payload: TraceAddToDataset) -> Any:
-        return await self._request(
-            "POST",
-            url_for("/traces/add_to_dataset/"),
-            payload,
-        )
-
-    async def traces_spans_batch(self, payload: SpansBatchRequest) -> Any:
-        return await self._request(
-            "POST",
-            url_for("/traces/spans/batch/"),
-            payload,
-        )
-
-    async def traces_evaluation_runs_batch(
-        self, payload: EvaluationRunsBatchRequest
-    ) -> Any:
+    async def datasets_create_for_judgeval(self, payload: DatasetCreate) -> Any:
         return await self._request(
             "POST",
-            url_for("/traces/evaluation_runs/batch/"),
+            url_for("/datasets/create_for_judgeval/"),
             payload,
         )

@@ -494,6 +419,13 @@ class JudgmentAsyncClient:
             payload,
         )

+    async def e2e_fetch_trace_scorer_span_score(self, payload: SpanScoreRequest) -> Any:
+        return await self._request(
+            "POST",
+            url_for("/e2e_fetch_trace_scorer_span_score/"),
+            payload,
+        )
+

 __all__ = [
     "JudgmentSyncClient",