judgeval 0.13.1__tar.gz → 0.14.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. judgeval-0.14.1/.github/workflows/ci.yaml +141 -0
  2. {judgeval-0.13.1 → judgeval-0.14.1}/PKG-INFO +1 -1
  3. {judgeval-0.13.1 → judgeval-0.14.1}/pyproject.toml +1 -1
  4. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/api/api_types.py +5 -1
  5. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/data/judgment_types.py +5 -1
  6. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +26 -2
  7. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/version.py +1 -1
  8. judgeval-0.13.1/.github/workflows/ci.yaml +0 -176
  9. {judgeval-0.13.1 → judgeval-0.14.1}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  10. {judgeval-0.13.1 → judgeval-0.14.1}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  11. {judgeval-0.13.1 → judgeval-0.14.1}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  12. {judgeval-0.13.1 → judgeval-0.14.1}/.github/pull_request_template.md +0 -0
  13. {judgeval-0.13.1 → judgeval-0.14.1}/.github/workflows/blocked-pr.yaml +0 -0
  14. {judgeval-0.13.1 → judgeval-0.14.1}/.github/workflows/claude-code-review.yml +0 -0
  15. {judgeval-0.13.1 → judgeval-0.14.1}/.github/workflows/claude.yml +0 -0
  16. {judgeval-0.13.1 → judgeval-0.14.1}/.github/workflows/lint.yaml +0 -0
  17. {judgeval-0.13.1 → judgeval-0.14.1}/.github/workflows/merge-branch-check.yaml +0 -0
  18. {judgeval-0.13.1 → judgeval-0.14.1}/.github/workflows/mypy.yaml +0 -0
  19. {judgeval-0.13.1 → judgeval-0.14.1}/.github/workflows/pre-commit-autoupdate.yaml +0 -0
  20. {judgeval-0.13.1 → judgeval-0.14.1}/.github/workflows/release.yaml +0 -0
  21. {judgeval-0.13.1 → judgeval-0.14.1}/.github/workflows/validate-branch.yaml +0 -0
  22. {judgeval-0.13.1 → judgeval-0.14.1}/.gitignore +0 -0
  23. {judgeval-0.13.1 → judgeval-0.14.1}/.pre-commit-config.yaml +0 -0
  24. {judgeval-0.13.1 → judgeval-0.14.1}/LICENSE.md +0 -0
  25. {judgeval-0.13.1 → judgeval-0.14.1}/README.md +0 -0
  26. {judgeval-0.13.1 → judgeval-0.14.1}/"assets/Screenshot 2025-05-17 at 8.14.27\342\200\257PM.png" +0 -0
  27. {judgeval-0.13.1 → judgeval-0.14.1}/assets/agent.gif +0 -0
  28. {judgeval-0.13.1 → judgeval-0.14.1}/assets/agent_trace_example.png +0 -0
  29. {judgeval-0.13.1 → judgeval-0.14.1}/assets/brand/company.jpg +0 -0
  30. {judgeval-0.13.1 → judgeval-0.14.1}/assets/brand/company_banner.jpg +0 -0
  31. {judgeval-0.13.1 → judgeval-0.14.1}/assets/brand/darkmode.svg +0 -0
  32. {judgeval-0.13.1 → judgeval-0.14.1}/assets/brand/full_logo.png +0 -0
  33. {judgeval-0.13.1 → judgeval-0.14.1}/assets/brand/icon.png +0 -0
  34. {judgeval-0.13.1 → judgeval-0.14.1}/assets/brand/lightmode.svg +0 -0
  35. {judgeval-0.13.1 → judgeval-0.14.1}/assets/brand/white_background.png +0 -0
  36. {judgeval-0.13.1 → judgeval-0.14.1}/assets/data.gif +0 -0
  37. {judgeval-0.13.1 → judgeval-0.14.1}/assets/dataset_clustering_screenshot.png +0 -0
  38. {judgeval-0.13.1 → judgeval-0.14.1}/assets/dataset_clustering_screenshot_dm.png +0 -0
  39. {judgeval-0.13.1 → judgeval-0.14.1}/assets/datasets_preview_screenshot.png +0 -0
  40. {judgeval-0.13.1 → judgeval-0.14.1}/assets/document.gif +0 -0
  41. {judgeval-0.13.1 → judgeval-0.14.1}/assets/error_analysis_dashboard.png +0 -0
  42. {judgeval-0.13.1 → judgeval-0.14.1}/assets/errors.png +0 -0
  43. {judgeval-0.13.1 → judgeval-0.14.1}/assets/experiments_dashboard_screenshot.png +0 -0
  44. {judgeval-0.13.1 → judgeval-0.14.1}/assets/experiments_page.png +0 -0
  45. {judgeval-0.13.1 → judgeval-0.14.1}/assets/experiments_pagev2.png +0 -0
  46. {judgeval-0.13.1 → judgeval-0.14.1}/assets/logo-dark.svg +0 -0
  47. {judgeval-0.13.1 → judgeval-0.14.1}/assets/logo-light.svg +0 -0
  48. {judgeval-0.13.1 → judgeval-0.14.1}/assets/monitoring_screenshot.png +0 -0
  49. {judgeval-0.13.1 → judgeval-0.14.1}/assets/new_darkmode.svg +0 -0
  50. {judgeval-0.13.1 → judgeval-0.14.1}/assets/new_lightmode.svg +0 -0
  51. {judgeval-0.13.1 → judgeval-0.14.1}/assets/online_eval.png +0 -0
  52. {judgeval-0.13.1 → judgeval-0.14.1}/assets/product_shot.png +0 -0
  53. {judgeval-0.13.1 → judgeval-0.14.1}/assets/test.png +0 -0
  54. {judgeval-0.13.1 → judgeval-0.14.1}/assets/tests.png +0 -0
  55. {judgeval-0.13.1 → judgeval-0.14.1}/assets/trace.gif +0 -0
  56. {judgeval-0.13.1 → judgeval-0.14.1}/assets/trace_demo.png +0 -0
  57. {judgeval-0.13.1 → judgeval-0.14.1}/assets/trace_screenshot.png +0 -0
  58. {judgeval-0.13.1 → judgeval-0.14.1}/assets/trace_screenshot_old.png +0 -0
  59. {judgeval-0.13.1 → judgeval-0.14.1}/pytest.ini +0 -0
  60. {judgeval-0.13.1 → judgeval-0.14.1}/scripts/api_generator.py +0 -0
  61. {judgeval-0.13.1 → judgeval-0.14.1}/scripts/openapi_transform.py +0 -0
  62. {judgeval-0.13.1 → judgeval-0.14.1}/scripts/update_types.sh +0 -0
  63. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/__init__.py +0 -0
  64. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/api/__init__.py +0 -0
  65. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/cli.py +0 -0
  66. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/constants.py +0 -0
  67. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/data/__init__.py +0 -0
  68. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/data/evaluation_run.py +0 -0
  69. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/data/example.py +0 -0
  70. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/data/result.py +0 -0
  71. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/data/scorer_data.py +0 -0
  72. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/data/scripts/fix_default_factory.py +0 -0
  73. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/data/scripts/openapi_transform.py +0 -0
  74. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/data/trace.py +0 -0
  75. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/dataset/__init__.py +0 -0
  76. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/env.py +0 -0
  77. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/evaluation/__init__.py +0 -0
  78. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/exceptions.py +0 -0
  79. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/integrations/langgraph/__init__.py +0 -0
  80. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/judges/__init__.py +0 -0
  81. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/judges/base_judge.py +0 -0
  82. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/judges/litellm_judge.py +0 -0
  83. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/judges/together_judge.py +0 -0
  84. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/judges/utils.py +0 -0
  85. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/logger.py +0 -0
  86. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/scorers/__init__.py +0 -0
  87. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/scorers/agent_scorer.py +0 -0
  88. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/scorers/api_scorer.py +0 -0
  89. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/scorers/base_scorer.py +0 -0
  90. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/scorers/example_scorer.py +0 -0
  91. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/scorers/exceptions.py +0 -0
  92. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
  93. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
  94. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -0
  95. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +0 -0
  96. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -0
  97. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +0 -0
  98. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/scorers/score.py +0 -0
  99. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/scorers/utils.py +0 -0
  100. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/tracer/__init__.py +0 -0
  101. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/tracer/constants.py +0 -0
  102. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/tracer/exporters/__init__.py +0 -0
  103. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/tracer/exporters/s3.py +0 -0
  104. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/tracer/exporters/store.py +0 -0
  105. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/tracer/exporters/utils.py +0 -0
  106. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/tracer/keys.py +0 -0
  107. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/tracer/llm/__init__.py +0 -0
  108. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/tracer/llm/anthropic/__init__.py +0 -0
  109. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/tracer/llm/google/__init__.py +0 -0
  110. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/tracer/llm/groq/__init__.py +0 -0
  111. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/tracer/llm/openai/__init__.py +0 -0
  112. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/tracer/llm/providers.py +0 -0
  113. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/tracer/llm/together/__init__.py +0 -0
  114. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/tracer/local_eval_queue.py +0 -0
  115. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/tracer/managers.py +0 -0
  116. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/tracer/processors/__init__.py +0 -0
  117. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/tracer/utils.py +0 -0
  118. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/trainer/__init__.py +0 -0
  119. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/trainer/config.py +0 -0
  120. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/trainer/console.py +0 -0
  121. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/trainer/trainable_model.py +0 -0
  122. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/trainer/trainer.py +0 -0
  123. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/utils/async_utils.py +0 -0
  124. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/utils/decorators.py +0 -0
  125. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/utils/file_utils.py +0 -0
  126. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/utils/guards.py +0 -0
  127. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/utils/meta.py +0 -0
  128. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/utils/serialize.py +0 -0
  129. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/utils/testing.py +0 -0
  130. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/utils/url.py +0 -0
  131. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/utils/version_check.py +0 -0
  132. {judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/warnings.py +0 -0
  133. {judgeval-0.13.1 → judgeval-0.14.1}/update_version.py +0 -0
  134. {judgeval-0.13.1 → judgeval-0.14.1}/uv.lock +0 -0
judgeval-0.14.1/.github/workflows/ci.yaml (new file)
@@ -0,0 +1,141 @@
+name: CI
+
+on:
+  pull_request:
+    types: [opened, synchronize, reopened]
+
+permissions: read-all
+
+jobs:
+  validate-branch:
+    uses: ./.github/workflows/merge-branch-check.yaml
+
+  run-tests:
+    needs: [validate-branch]
+    if: needs.validate-branch.result == 'success' || needs.validate-branch.result == 'skipped'
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest, macos-latest]
+        python-version:
+          - "3.10"
+          - "3.11"
+          - "3.12"
+          - "3.13"
+    name: Unit Tests
+    runs-on: ${{ matrix.os }}
+    env:
+      PYTHONPATH: "."
+      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+      TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
+      JUDGMENT_DEV: true
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install dependencies
+        run: |
+          pip install uv
+          uv sync --dev
+
+      - name: Run tests
+        run: |
+          cd src
+          export JUDGMENT_API_KEY="$JUDGEVAL_GH_JUDGMENT_API_KEY"
+          export JUDGMENT_ORG_ID="$JUDGEVAL_GH_JUDGMENT_ORG_ID"
+          uv run pytest tests
+
+  run-e2e-tests:
+    needs: [validate-branch]
+    if: "(github.base_ref == 'staging' || github.base_ref == 'main') && !contains(github.actor, '[bot]') && (needs.validate-branch.result == 'success' || needs.validate-branch.result == 'skipped')"
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.10", "3.11", "3.12", "3.13"]
+    name: E2E Tests
+    runs-on: ubuntu-latest
+    env:
+      TEST_TIMEOUT_SECONDS: ${{ secrets.TEST_TIMEOUT_SECONDS }}
+    steps:
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: us-west-1
+
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set env based on branch
+        run: |
+          if [ "${{ github.base_ref }}" = "main" ]; then
+            echo "TARGET_ENV=main" >> "$GITHUB_ENV"
+            echo "BASE_URL=https://api.judgmentlabs.ai" >> "$GITHUB_ENV"
+            echo "SECRETS_PATH=prod/api-keys/e2e-tests" >> "$GITHUB_ENV"
+            echo "COVERAGE_ARTIFACT=coverage-html-production-${{ matrix.python-version }}" >> "$GITHUB_ENV"
+          else
+            echo "TARGET_ENV=staging" >> "$GITHUB_ENV"
+            echo "BASE_URL=https://staging.api.judgmentlabs.ai" >> "$GITHUB_ENV"
+            echo "SECRETS_PATH=stg/api-keys/e2e-tests" >> "$GITHUB_ENV"
+            echo "COVERAGE_ARTIFACT=coverage-html-staging-${{ matrix.python-version }}" >> "$GITHUB_ENV"
+          fi
+
+      - name: Restore uv cache
+        uses: actions/cache/restore@v4
+        id: restore-uv-cache
+        with:
+          path: ~/.cache/uv/
+          key: ${{ runner.os }}-uv-judgment-${{ hashFiles('./**/uv.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-uv-judgment-
+            ${{ runner.os }}-uv-
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      - name: Install judgeval dependencies
+        run: |
+          pip install uv
+          uv sync --dev
+
+      - name: Check if server is running
+        run: |
+          if ! curl -s "$BASE_URL/health" > /dev/null; then
+            echo "Judgment server ($BASE_URL) is not running properly. Check CloudWatch logs."
+            exit 1
+          else
+            echo "Server is running."
+          fi
+
+      - name: Run E2E tests
+        working-directory: src
+        run: |
+          SECRET_VARS=$(aws secretsmanager get-secret-value --secret-id "$SECRETS_PATH" --query SecretString --output text)
+          export $(echo "$SECRET_VARS" | jq -r 'to_entries | .[] | "\(.key)=\(.value)"')
+          export JUDGMENT_API_KEY="$JUDGEVAL_GH_JUDGMENT_API_KEY"
+          export JUDGMENT_ORG_ID="$JUDGEVAL_GH_JUDGMENT_ORG_ID"
+          export JUDGMENT_API_URL="$BASE_URL"
+          timeout ${TEST_TIMEOUT_SECONDS}s uv run pytest -n auto --dist=loadfile --durations=0 --cov=. --cov-config=.coveragerc --cov-report=html ./e2etests
+
+      - name: Upload coverage HTML report
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ env.COVERAGE_ARTIFACT }}
+          path: src/htmlcov
+
+      - name: Save uv cache
+        uses: actions/cache/save@v4
+        if: always() && steps.restore-uv-cache.outputs.cache-hit != 'true'
+        with:
+          path: ~/.cache/uv/
+          key: ${{ runner.os }}-uv-judgment-${{ hashFiles('./**/uv.lock') }}

{judgeval-0.13.1 → judgeval-0.14.1}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.13.1
+Version: 0.14.1
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues

{judgeval-0.13.1 → judgeval-0.14.1}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "judgeval"
-version = "0.13.1"
+version = "0.14.1"
 authors = [
     { name = "Andrew Li", email = "andrew@judgmentlabs.ai" },
     { name = "Alex Shan", email = "alex@judgmentlabs.ai" },

{judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/api/api_types.py
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 #   filename:  .openapi.json
-#   timestamp: 2025-09-24T18:25:18+00:00
+#   timestamp: 2025-09-29T19:54:47+00:00
 
 from __future__ import annotations
 from typing import Any, Dict, List, Literal, Optional, TypedDict, Union
@@ -54,6 +54,8 @@ class SavePromptScorerRequest(TypedDict):
     threshold: float
     model: NotRequired[str]
     is_trace: NotRequired[bool]
+    options: NotRequired[Optional[Dict[str, float]]]
+    description: NotRequired[Optional[str]]
 
 
 class SavePromptScorerResponse(TypedDict):
@@ -143,6 +145,8 @@ class PromptScorer(TypedDict):
     prompt: str
     threshold: float
     model: NotRequired[str]
+    options: NotRequired[Optional[Dict[str, float]]]
+    description: NotRequired[Optional[str]]
     created_at: NotRequired[Optional[str]]
     updated_at: NotRequired[Optional[str]]
     is_trace: NotRequired[Optional[bool]]
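
The two new fields surface in both the save request and the scorer payload. Below is a minimal sketch of building a save request with them, assuming only what the generated TypedDicts above declare; the interpretation of options as a label-to-score mapping is an assumption based on its Dict[str, float] type, and the scorer name, prompt, and values are illustrative:

from judgeval.api.api_types import SavePromptScorerRequest

# Hypothetical payload; "options" and "description" are the fields
# added in 0.14.1, and both are optional (NotRequired).
request: SavePromptScorerRequest = {
    "name": "tone-scorer",
    "prompt": "Rate the tone of the response.",
    "threshold": 0.5,
    "options": {"friendly": 1.0, "neutral": 0.5, "hostile": 0.0},
    "description": "Scores assistant tone between 0 and 1.",
}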

{judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/data/judgment_types.py
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 #   filename:  .openapi.json
-#   timestamp: 2025-09-24T18:25:17+00:00
+#   timestamp: 2025-09-29T19:54:46+00:00
 
 from __future__ import annotations
 from typing import Annotated, Any, Dict, List, Optional, Union
@@ -56,6 +56,8 @@ class SavePromptScorerRequest(BaseModel):
     threshold: Annotated[float, Field(title="Threshold")]
     model: Annotated[Optional[str], Field(title="Model")] = "gpt-5"
     is_trace: Annotated[Optional[bool], Field(title="Is Trace")] = False
+    options: Annotated[Optional[Dict[str, float]], Field(title="Options")] = None
+    description: Annotated[Optional[str], Field(title="Description")] = None
 
 
 class SavePromptScorerResponse(BaseModel):
@@ -156,6 +158,8 @@ class PromptScorer(BaseModel):
     prompt: Annotated[str, Field(title="Prompt")]
     threshold: Annotated[float, Field(title="Threshold")]
     model: Annotated[Optional[str], Field(title="Model")] = "gpt-5"
+    options: Annotated[Optional[Dict[str, float]], Field(title="Options")] = None
+    description: Annotated[Optional[str], Field(title="Description")] = None
     created_at: Annotated[Optional[AwareDatetime], Field(title="Created At")] = None
     updated_at: Annotated[Optional[AwareDatetime], Field(title="Updated At")] = None
     is_trace: Annotated[Optional[bool], Field(title="Is Trace")] = False
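
A quick sketch of the Pydantic counterpart's defaults, assuming name, prompt, and threshold are the only required fields (matching the TypedDict version above); this is illustrative, not the package's documented usage:

from judgeval.data.judgment_types import SavePromptScorerRequest

req = SavePromptScorerRequest(
    name="tone-scorer",
    prompt="Rate the tone of the response.",
    threshold=0.5,
)
# The 0.14.1 additions default to None; model keeps its "gpt-5" default.
print(req.options, req.description, req.model)  # None None gpt-5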

{judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py
@@ -4,19 +4,21 @@ from judgeval.scorers.api_scorer import (
     TraceAPIScorerConfig,
 )
 from judgeval.constants import APIScorerType
-from typing import Dict, Any
+from typing import Dict, Any, Optional
 from judgeval.api import JudgmentSyncClient
 from judgeval.exceptions import JudgmentAPIError
 import os
 from judgeval.logger import judgeval_logger
 from abc import ABC
 from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
+from copy import copy
 
 
 def push_prompt_scorer(
     name: str,
     prompt: str,
     threshold: float,
+    options: Optional[Dict[str, float]] = None,
     model: str = JUDGMENT_DEFAULT_GPT_MODEL,
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
@@ -29,6 +31,7 @@ def push_prompt_scorer(
         "name": name,
         "prompt": prompt,
         "threshold": threshold,
+        "options": options,
         "model": model,
         "is_trace": is_trace,
     }
@@ -98,6 +101,7 @@ def scorer_exists(
 class BasePromptScorer(ABC, APIScorerConfig):
     score_type: APIScorerType
     prompt: str
+    options: Optional[Dict[str, float]] = None
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or ""
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or ""
 
@@ -124,6 +128,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
             name=name,
             prompt=scorer_config["prompt"],
             threshold=scorer_config["threshold"],
+            options=scorer_config.get("options"),
             model=scorer_config.get("model"),
             judgment_api_key=judgment_api_key,
             organization_id=organization_id,
@@ -135,6 +140,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
         name: str,
         prompt: str,
         threshold: float = 0.5,
+        options: Optional[Dict[str, float]] = None,
         model: str = JUDGMENT_DEFAULT_GPT_MODEL,
         judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
         organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
@@ -150,6 +156,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
             name,
             prompt,
             threshold,
+            options,
             model,
             judgment_api_key,
             organization_id,
@@ -161,6 +168,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
             name=name,
             prompt=prompt,
             threshold=threshold,
+            options=options,
             model=model,
             judgment_api_key=judgment_api_key,
             organization_id=organization_id,
@@ -199,6 +207,14 @@ class BasePromptScorer(ABC, APIScorerConfig):
         self.push_prompt_scorer()
         judgeval_logger.info(f"Successfully updated model for {self.name}")
 
+    def set_options(self, options: Optional[Dict[str, float]]):
+        """
+        Updates the options of the scorer.
+        """
+        self.options = options
+        self.push_prompt_scorer()
+        judgeval_logger.info(f"Successfully updated options for {self.name}")
+
     def append_to_prompt(self, prompt_addition: str):
         """
         Appends a string to the prompt.
@@ -226,6 +242,12 @@ class BasePromptScorer(ABC, APIScorerConfig):
         """
         return self.model
 
+    def get_options(self) -> Dict[str, float] | None:
+        """
+        Returns the options of the scorer.
+        """
+        return copy(self.options) if self.options is not None else None
+
     def get_name(self) -> str | None:
         """
         Returns the name of the scorer.
@@ -241,6 +263,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
             "model": self.model,
             "prompt": self.prompt,
             "threshold": self.threshold,
+            "options": self.options,
         }
 
     def push_prompt_scorer(self):
@@ -251,6 +274,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
             self.name,
             self.prompt,
             self.threshold,
+            self.options,
             self.model,
             self.judgment_api_key,
             self.organization_id,
@@ -258,7 +282,7 @@ class BasePromptScorer(ABC, APIScorerConfig):
         )
 
     def __str__(self):
-        return f"PromptScorer(name={self.name}, model={self.model}, prompt={self.prompt}, threshold={self.threshold})"
+        return f"PromptScorer(name={self.name}, model={self.model}, prompt={self.prompt}, threshold={self.threshold}, options={self.options})"
 
     def model_dump(self, *args, **kwargs) -> Dict[str, Any]:
         base = super().model_dump(*args, **kwargs)
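
Taken together, these changes thread an optional options mapping through the whole prompt-scorer flow: the save payload, the class attribute, the getter/setter pair, and __str__. A minimal usage sketch based only on the signatures visible in this diff (treating option values as per-label scores is an assumption, and no return value is relied on):

import os

from judgeval.scorers.judgeval_scorers.api_scorers.prompt_scorer import (
    push_prompt_scorer,
)

# Register a scorer with the options mapping introduced in 0.14.1.
# Credentials fall back to JUDGMENT_API_KEY / JUDGMENT_ORG_ID if omitted.
push_prompt_scorer(
    name="tone-scorer",
    prompt="Classify the tone of the response using the options.",
    threshold=0.5,
    options={"friendly": 1.0, "neutral": 0.5, "hostile": 0.0},
)

Note the copy-on-read design: set_options persists the change immediately by calling push_prompt_scorer, while get_options returns a shallow copy, so mutating the returned dict never silently changes the scorer's state.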

{judgeval-0.13.1 → judgeval-0.14.1}/src/judgeval/version.py
@@ -1,4 +1,4 @@
-__version__ = "0.13.1"
+__version__ = "0.14.1"
 
 
 def get_version() -> str:
@@ -1,176 +0,0 @@
1
- name: CI
2
-
3
- on:
4
- pull_request:
5
- types: [opened, synchronize, reopened]
6
-
7
- permissions: read-all
8
-
9
- jobs:
10
- validate-branch:
11
- uses: ./.github/workflows/merge-branch-check.yaml
12
-
13
- run-tests:
14
- needs: [validate-branch]
15
- if: needs.validate-branch.result == 'success' || needs.validate-branch.result == 'skipped'
16
- strategy:
17
- fail-fast: false
18
- matrix:
19
- os: [ubuntu-latest, macos-latest]
20
- python-version:
21
- - "3.10"
22
- - "3.11"
23
- - "3.12"
24
- - "3.13"
25
- name: Unit Tests
26
- runs-on: ${{ matrix.os }}
27
- env:
28
- PYTHONPATH: "."
29
- OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
30
- TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
31
- JUDGMENT_DEV: true
32
-
33
- steps:
34
- - name: Checkout code
35
- uses: actions/checkout@v4
36
-
37
- - name: Set up Python
38
- uses: actions/setup-python@v4
39
- with:
40
- python-version: ${{ matrix.python-version }}
41
-
42
- - name: Install dependencies
43
- run: |
44
- pip install uv
45
- uv sync --dev
46
-
47
- - name: Run tests
48
- run: |
49
- cd src
50
- export JUDGMENT_API_KEY="$JUDGEVAL_GH_JUDGMENT_API_KEY"
51
- export JUDGMENT_ORG_ID="$JUDGEVAL_GH_JUDGMENT_ORG_ID"
52
- uv run pytest tests
53
-
54
- run-e2e-tests-staging:
55
- needs: [validate-branch]
56
- if: "github.base_ref == 'staging' && !contains(github.actor, '[bot]') && (needs.validate-branch.result == 'success' || needs.validate-branch.result == 'skipped')"
57
- strategy:
58
- fail-fast: false
59
- matrix:
60
- python-version:
61
- - "3.10"
62
- - "3.11"
63
- - "3.12"
64
- - "3.13"
65
- name: Staging E2E Tests
66
- runs-on: ubuntu-latest
67
- env:
68
- TEST_TIMEOUT_SECONDS: ${{ secrets.TEST_TIMEOUT_SECONDS }}
69
- steps:
70
- - name: Configure AWS Credentials
71
- uses: aws-actions/configure-aws-credentials@v4
72
- with:
73
- aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
74
- aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
75
- aws-region: us-west-1
76
-
77
- - name: Checkout code
78
- uses: actions/checkout@v4
79
-
80
- - name: Set up Python
81
- uses: actions/setup-python@v4
82
- with:
83
- python-version: ${{ matrix.python-version }}
84
-
85
- - name: Install judgeval dependencies
86
- run: |
87
- pip install uv
88
- uv sync --dev
89
-
90
- - name: Check if server is running
91
- run: |
92
- if ! curl -s https://staging.api.judgmentlabs.ai/health > /dev/null; then
93
- echo "Staging Judgment server is not running properly. Check logs on AWS CloudWatch for more details."
94
- exit 1
95
- else
96
- echo "Staging server is running."
97
- fi
98
-
99
- - name: Run E2E tests
100
- working-directory: src
101
- run: |
102
- SECRET_VARS=$(aws secretsmanager get-secret-value --secret-id stg/api-keys/e2e-tests --query SecretString --output text)
103
- export $(echo "$SECRET_VARS" | jq -r 'to_entries | .[] | "\(.key)=\(.value)"')
104
- export JUDGMENT_API_KEY="$JUDGEVAL_GH_JUDGMENT_API_KEY"
105
- export JUDGMENT_ORG_ID="$JUDGEVAL_GH_JUDGMENT_ORG_ID"
106
- export JUDGMENT_API_URL=https://staging.api.judgmentlabs.ai
107
- timeout ${TEST_TIMEOUT_SECONDS}s uv run pytest -n auto --dist=loadfile --durations=0 --cov=. --cov-config=.coveragerc --cov-report=html ./e2etests
108
-
109
- - name: Upload coverage HTML report (staging)
110
- if: always()
111
- uses: actions/upload-artifact@v4
112
- with:
113
- name: coverage-html-staging-${{ matrix.python-version }}
114
- path: src/htmlcov
115
-
116
- run-e2e-tests-main:
117
- needs: [validate-branch]
118
- if: "github.base_ref == 'main' && !contains(github.actor, '[bot]') && needs.validate-branch.result == 'success'"
119
- strategy:
120
- fail-fast: false
121
- matrix:
122
- python-version:
123
- - "3.10"
124
- - "3.11"
125
- - "3.12"
126
- - "3.13"
127
- name: Production E2E Tests
128
- runs-on: ubuntu-latest
129
- env:
130
- TEST_TIMEOUT_SECONDS: ${{ secrets.TEST_TIMEOUT_SECONDS }}
131
- steps:
132
- - name: Configure AWS Credentials
133
- uses: aws-actions/configure-aws-credentials@v4
134
- with:
135
- aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
136
- aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
137
- aws-region: us-west-1
138
-
139
- - name: Checkout code
140
- uses: actions/checkout@v4
141
-
142
- - name: Set up Python
143
- uses: actions/setup-python@v4
144
- with:
145
- python-version: ${{ matrix.python-version }}
146
-
147
- - name: Install judgeval dependencies
148
- run: |
149
- pip install uv
150
- uv sync --dev
151
-
152
- - name: Check if server is running
153
- run: |
154
- if ! curl -s https://api.judgmentlabs.ai/health > /dev/null; then
155
- echo "Production Judgment server is not running properly. Check logs on AWS CloudWatch for more details."
156
- exit 1
157
- else
158
- echo "Production server is running."
159
- fi
160
-
161
- - name: Run E2E tests
162
- working-directory: src
163
- run: |
164
- SECRET_VARS=$(aws secretsmanager get-secret-value --secret-id prod/api-keys/e2e-tests --query SecretString --output text)
165
- export $(echo "$SECRET_VARS" | jq -r 'to_entries | .[] | "\(.key)=\(.value)"')
166
- export JUDGMENT_API_KEY="$JUDGEVAL_GH_JUDGMENT_API_KEY"
167
- export JUDGMENT_ORG_ID="$JUDGEVAL_GH_JUDGMENT_ORG_ID"
168
- export JUDGMENT_API_URL=https://api.judgmentlabs.ai
169
- timeout ${TEST_TIMEOUT_SECONDS}s uv run pytest -n auto --dist=loadfile --durations=0 --cov=. --cov-config=.coveragerc --cov-report=html ./e2etests
170
-
171
- - name: Upload coverage HTML report (production)
172
- if: always()
173
- uses: actions/upload-artifact@v4
174
- with:
175
- name: coverage-html-production-${{ matrix.python-version }}
176
- path: src/htmlcov