verdikt-sdk 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- verdikt_sdk-0.1.0/.github/workflows/pre-commit.yml +30 -0
- verdikt_sdk-0.1.0/.github/workflows/publish.yml +86 -0
- verdikt_sdk-0.1.0/.gitignore +163 -0
- verdikt_sdk-0.1.0/.pre-commit-config.yaml +28 -0
- verdikt_sdk-0.1.0/.python-version +1 -0
- verdikt_sdk-0.1.0/Makefile +21 -0
- verdikt_sdk-0.1.0/PKG-INFO +56 -0
- verdikt_sdk-0.1.0/README.md +46 -0
- verdikt_sdk-0.1.0/SDK.md +177 -0
- verdikt_sdk-0.1.0/TESTING.md +107 -0
- verdikt_sdk-0.1.0/pyproject.toml +37 -0
- verdikt_sdk-0.1.0/tests/__init__.py +0 -0
- verdikt_sdk-0.1.0/uv.lock +1774 -0
- verdikt_sdk-0.1.0/verdikt_sdk/__init__.py +5 -0
- verdikt_sdk-0.1.0/verdikt_sdk/auth.py +79 -0
- verdikt_sdk-0.1.0/verdikt_sdk/client.py +198 -0
- verdikt_sdk-0.1.0/verdikt_sdk/http.py +18 -0
- verdikt_sdk-0.1.0/verdikt_sdk/models.py +108 -0
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
name: pre-commit
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
pull_request:
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
pre-commit:
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
|
|
11
|
+
steps:
|
|
12
|
+
- uses: actions/checkout@v4
|
|
13
|
+
|
|
14
|
+
- name: Install uv
|
|
15
|
+
uses: astral-sh/setup-uv@v4
|
|
16
|
+
|
|
17
|
+
- name: Set up Python
|
|
18
|
+
run: uv python install 3.13
|
|
19
|
+
|
|
20
|
+
- name: Install dependencies
|
|
21
|
+
run: uv sync --group dev
|
|
22
|
+
|
|
23
|
+
- name: Run ruff lint
|
|
24
|
+
run: uv run ruff check verdikt_sdk/
|
|
25
|
+
|
|
26
|
+
- name: Run ruff format check
|
|
27
|
+
run: uv run ruff format --check verdikt_sdk/
|
|
28
|
+
|
|
29
|
+
- name: Run mypy
|
|
30
|
+
run: uv run mypy verdikt_sdk/
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
name: publish
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- "v*"
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
build:
|
|
10
|
+
name: Build distribution
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
|
|
13
|
+
steps:
|
|
14
|
+
- uses: actions/checkout@v4
|
|
15
|
+
|
|
16
|
+
- name: Install uv
|
|
17
|
+
uses: astral-sh/setup-uv@v4
|
|
18
|
+
|
|
19
|
+
- name: Set up Python
|
|
20
|
+
run: uv python install 3.13
|
|
21
|
+
|
|
22
|
+
- name: Verify tag matches pyproject.toml version
|
|
23
|
+
run: |
|
|
24
|
+
PKG_VERSION=$(python3 -c "
|
|
25
|
+
import tomllib
|
|
26
|
+
with open('pyproject.toml', 'rb') as f:
|
|
27
|
+
print(tomllib.load(f)['project']['version'])
|
|
28
|
+
")
|
|
29
|
+
TAG_VERSION="${GITHUB_REF_NAME#v}"
|
|
30
|
+
if [ "$PKG_VERSION" != "$TAG_VERSION" ]; then
|
|
31
|
+
echo "Tag version ($TAG_VERSION) does not match pyproject.toml version ($PKG_VERSION)"
|
|
32
|
+
exit 1
|
|
33
|
+
fi
|
|
34
|
+
|
|
35
|
+
- name: Build distribution
|
|
36
|
+
run: uv build
|
|
37
|
+
|
|
38
|
+
- name: Store distribution packages
|
|
39
|
+
uses: actions/upload-artifact@v4
|
|
40
|
+
with:
|
|
41
|
+
name: python-package-distributions
|
|
42
|
+
path: dist/
|
|
43
|
+
|
|
44
|
+
publish-to-pypi:
|
|
45
|
+
name: Publish to PyPI
|
|
46
|
+
needs: build
|
|
47
|
+
runs-on: ubuntu-latest
|
|
48
|
+
|
|
49
|
+
environment:
|
|
50
|
+
name: pypi
|
|
51
|
+
url: https://pypi.org/p/verdikt-sdk
|
|
52
|
+
|
|
53
|
+
permissions:
|
|
54
|
+
id-token: write
|
|
55
|
+
|
|
56
|
+
steps:
|
|
57
|
+
- name: Download distribution packages
|
|
58
|
+
uses: actions/download-artifact@v4
|
|
59
|
+
with:
|
|
60
|
+
name: python-package-distributions
|
|
61
|
+
path: dist/
|
|
62
|
+
|
|
63
|
+
- name: Publish to PyPI
|
|
64
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
65
|
+
|
|
66
|
+
github-release:
|
|
67
|
+
name: Create GitHub Release
|
|
68
|
+
needs: publish-to-pypi
|
|
69
|
+
runs-on: ubuntu-latest
|
|
70
|
+
|
|
71
|
+
permissions:
|
|
72
|
+
contents: write
|
|
73
|
+
|
|
74
|
+
steps:
|
|
75
|
+
- uses: actions/checkout@v4
|
|
76
|
+
|
|
77
|
+
- name: Download distribution packages
|
|
78
|
+
uses: actions/download-artifact@v4
|
|
79
|
+
with:
|
|
80
|
+
name: python-package-distributions
|
|
81
|
+
path: dist/
|
|
82
|
+
|
|
83
|
+
- name: Create GitHub Release
|
|
84
|
+
env:
|
|
85
|
+
GH_TOKEN: ${{ github.token }}
|
|
86
|
+
run: gh release create "$GITHUB_REF_NAME" dist/* --generate-notes
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
### Python template
|
|
2
|
+
# Byte-compiled / optimized / DLL files
|
|
3
|
+
__pycache__/
|
|
4
|
+
*.py[cod]
|
|
5
|
+
*$py.class
|
|
6
|
+
|
|
7
|
+
# C extensions
|
|
8
|
+
*.so
|
|
9
|
+
|
|
10
|
+
# Distribution / packaging
|
|
11
|
+
.Python
|
|
12
|
+
build/
|
|
13
|
+
develop-eggs/
|
|
14
|
+
dist/
|
|
15
|
+
downloads/
|
|
16
|
+
eggs/
|
|
17
|
+
.eggs/
|
|
18
|
+
lib/
|
|
19
|
+
lib64/
|
|
20
|
+
parts/
|
|
21
|
+
sdist/
|
|
22
|
+
var/
|
|
23
|
+
wheels/
|
|
24
|
+
share/python-wheels/
|
|
25
|
+
*.egg-info/
|
|
26
|
+
.installed.cfg
|
|
27
|
+
*.egg
|
|
28
|
+
MANIFEST
|
|
29
|
+
|
|
30
|
+
# PyInstaller
|
|
31
|
+
# Usually these files are written by a python script from a template
|
|
32
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
33
|
+
*.manifest
|
|
34
|
+
*.spec
|
|
35
|
+
|
|
36
|
+
# Installer logs
|
|
37
|
+
pip-log.txt
|
|
38
|
+
pip-delete-this-directory.txt
|
|
39
|
+
|
|
40
|
+
# Unit test / coverage reports
|
|
41
|
+
htmlcov/
|
|
42
|
+
.tox/
|
|
43
|
+
.nox/
|
|
44
|
+
.coverage
|
|
45
|
+
.coverage.*
|
|
46
|
+
.cache
|
|
47
|
+
nosetests.xml
|
|
48
|
+
coverage.xml
|
|
49
|
+
*.cover
|
|
50
|
+
*.py,cover
|
|
51
|
+
.hypothesis/
|
|
52
|
+
.pytest_cache/
|
|
53
|
+
cover/
|
|
54
|
+
|
|
55
|
+
# Translations
|
|
56
|
+
*.mo
|
|
57
|
+
*.pot
|
|
58
|
+
|
|
59
|
+
# Django stuff:
|
|
60
|
+
*.log
|
|
61
|
+
local_settings.py
|
|
62
|
+
db.sqlite3
|
|
63
|
+
db.sqlite3-journal
|
|
64
|
+
|
|
65
|
+
# Flask stuff:
|
|
66
|
+
instance/
|
|
67
|
+
.webassets-cache
|
|
68
|
+
|
|
69
|
+
# Scrapy stuff:
|
|
70
|
+
.scrapy
|
|
71
|
+
|
|
72
|
+
# Sphinx documentation
|
|
73
|
+
docs/_build/
|
|
74
|
+
|
|
75
|
+
# PyBuilder
|
|
76
|
+
.pybuilder/
|
|
77
|
+
target/
|
|
78
|
+
|
|
79
|
+
# Jupyter Notebook
|
|
80
|
+
.ipynb_checkpoints
|
|
81
|
+
|
|
82
|
+
# IPython
|
|
83
|
+
profile_default/
|
|
84
|
+
ipython_config.py
|
|
85
|
+
|
|
86
|
+
# pyenv
|
|
87
|
+
# For a library or package, you might want to ignore these files since the code is
|
|
88
|
+
# intended to run in multiple environments; otherwise, check them in:
|
|
89
|
+
# .python-version
|
|
90
|
+
|
|
91
|
+
# pipenv
|
|
92
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
93
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
94
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
95
|
+
# install all needed dependencies.
|
|
96
|
+
#Pipfile.lock
|
|
97
|
+
|
|
98
|
+
# poetry
|
|
99
|
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
|
100
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
101
|
+
# commonly ignored for libraries.
|
|
102
|
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
|
103
|
+
#poetry.lock
|
|
104
|
+
|
|
105
|
+
# pdm
|
|
106
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
|
107
|
+
#pdm.lock
|
|
108
|
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
|
109
|
+
# in version control.
|
|
110
|
+
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
|
|
111
|
+
.pdm.toml
|
|
112
|
+
.pdm-python
|
|
113
|
+
.pdm-build/
|
|
114
|
+
|
|
115
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
|
116
|
+
__pypackages__/
|
|
117
|
+
|
|
118
|
+
# Celery stuff
|
|
119
|
+
celerybeat-schedule
|
|
120
|
+
celerybeat.pid
|
|
121
|
+
|
|
122
|
+
# SageMath parsed files
|
|
123
|
+
*.sage.py
|
|
124
|
+
|
|
125
|
+
# Environments
|
|
126
|
+
.env
|
|
127
|
+
.venv
|
|
128
|
+
env/
|
|
129
|
+
venv/
|
|
130
|
+
ENV/
|
|
131
|
+
env.bak/
|
|
132
|
+
venv.bak/
|
|
133
|
+
|
|
134
|
+
# Spyder project settings
|
|
135
|
+
.spyderproject
|
|
136
|
+
.spyproject
|
|
137
|
+
|
|
138
|
+
# Rope project settings
|
|
139
|
+
.ropeproject
|
|
140
|
+
|
|
141
|
+
# mkdocs documentation
|
|
142
|
+
/site
|
|
143
|
+
|
|
144
|
+
# mypy
|
|
145
|
+
.mypy_cache/
|
|
146
|
+
.dmypy.json
|
|
147
|
+
dmypy.json
|
|
148
|
+
|
|
149
|
+
# Pyre type checker
|
|
150
|
+
.pyre/
|
|
151
|
+
|
|
152
|
+
# pytype static type analyzer
|
|
153
|
+
.pytype/
|
|
154
|
+
|
|
155
|
+
# Cython debug symbols
|
|
156
|
+
cython_debug/
|
|
157
|
+
|
|
158
|
+
# PyCharm
|
|
159
|
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
|
160
|
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
|
161
|
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
|
162
|
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
|
163
|
+
.idea/
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
repos:
|
|
2
|
+
- repo: local
|
|
3
|
+
hooks:
|
|
4
|
+
- id: ruff-lint
|
|
5
|
+
name: lint
|
|
6
|
+
language: system
|
|
7
|
+
types:
|
|
8
|
+
- python
|
|
9
|
+
entry: make ruff-lint
|
|
10
|
+
pass_filenames: false
|
|
11
|
+
|
|
12
|
+
- id: ruff-format
|
|
13
|
+
name: format
|
|
14
|
+
language: system
|
|
15
|
+
types:
|
|
16
|
+
- python
|
|
17
|
+
entry: make ruff-format
|
|
18
|
+
pass_filenames: false
|
|
19
|
+
|
|
20
|
+
- id: mypy
|
|
21
|
+
name: type-check
|
|
22
|
+
language: system
|
|
23
|
+
types:
|
|
24
|
+
- python
|
|
25
|
+
entry: make mypy
|
|
26
|
+
pass_filenames: false
|
|
27
|
+
|
|
28
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.13
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
.PHONY: help
|
|
2
|
+
help: # Show help for each of the Makefile recipes
|
|
3
|
+
@grep -E '^[a-zA-Z0-9 -]+:.*#' Makefile | sort | while read -r l; do printf "\033[1;32m$$(echo $$l | cut -f 1 -d':')\033[00m: $$(echo $$l | cut -f 2- -d'#')\n"; done
|
|
4
|
+
|
|
5
|
+
TA ?= -v tests/
|
|
6
|
+
|
|
7
|
+
ruff-lint: # Run ruff linter
|
|
8
|
+
uv run ruff check --fix verdikt_sdk/
|
|
9
|
+
|
|
10
|
+
ruff-format: # Run ruff formatter
|
|
11
|
+
uv run ruff format verdikt_sdk/
|
|
12
|
+
|
|
13
|
+
mypy: # Run mypy type checker
|
|
14
|
+
uv run mypy verdikt_sdk/
|
|
15
|
+
|
|
16
|
+
lint: # Run pre-commit
|
|
17
|
+
pre-commit run --all-files
|
|
18
|
+
|
|
19
|
+
test: # Run tests
|
|
20
|
+
uv run pytest $(TA)
|
|
21
|
+
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: verdikt-sdk
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Python SDK for the Verdikt Evaluation API
|
|
5
|
+
Requires-Python: >=3.13
|
|
6
|
+
Requires-Dist: httpx>=0.28.1
|
|
7
|
+
Requires-Dist: pydantic>=2.0
|
|
8
|
+
Requires-Dist: yalc>=0.2.1
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
|
|
11
|
+
# verdikt-sdk
|
|
12
|
+
|
|
13
|
+
Python SDK for [Verdikt](https://github.com/cognitai-labs-dev/verdikt) — a standalone AI evaluation service that decouples evaluation and LLM/human judging from the application being evaluated.
|
|
14
|
+
|
|
15
|
+
## Installation
|
|
16
|
+
|
|
17
|
+
```
|
|
18
|
+
pip install verdikt-sdk
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Usage
|
|
22
|
+
|
|
23
|
+
```python
|
|
24
|
+
from verdikt_sdk import EvaluationClient
|
|
25
|
+
from verdikt_sdk.models import EvaluationType, Question
|
|
26
|
+
from yalc import LLMModel
|
|
27
|
+
|
|
28
|
+
client = EvaluationClient(
|
|
29
|
+
base_url="https://your-verdikt-instance.com",
|
|
30
|
+
client_id="your-client-id",
|
|
31
|
+
client_secret="your-client-secret",
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
# Register your app (idempotent — safe to call on every deploy)
|
|
35
|
+
await client.create_app(slug="my-app", name="My App")
|
|
36
|
+
|
|
37
|
+
# Sync questions to the dataset (idempotent)
|
|
38
|
+
await client.add_questions("my-app", [
|
|
39
|
+
Question(question="What is the capital of France?", human_answer="Paris"),
|
|
40
|
+
])
|
|
41
|
+
|
|
42
|
+
# Run an evaluation cycle
|
|
43
|
+
await client.run_evaluation(
|
|
44
|
+
app_slug="my-app",
|
|
45
|
+
app_version="v1.2.0",
|
|
46
|
+
callback=my_llm_function, # async fn(question: str) -> str
|
|
47
|
+
evaluation_type=EvaluationType.LLM_ONLY,
|
|
48
|
+
llm_judge_models=[LLMModel.gpt_4o_mini],
|
|
49
|
+
)
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
`run_evaluation` calls your `callback` concurrently for every question in the dataset, then submits all answers to Verdikt for judgment.
|
|
53
|
+
|
|
54
|
+
## Authentication
|
|
55
|
+
|
|
56
|
+
The SDK authenticates via Zitadel OAuth2 client credentials. Create a machine user in your Zitadel project and pass its `client_id` and `client_secret` to `EvaluationClient`.
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
# verdikt-sdk
|
|
2
|
+
|
|
3
|
+
Python SDK for [Verdikt](https://github.com/cognitai-labs-dev/verdikt) — a standalone AI evaluation service that decouples evaluation and LLM/human judging from the application being evaluated.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```
|
|
8
|
+
pip install verdikt-sdk
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Usage
|
|
12
|
+
|
|
13
|
+
```python
|
|
14
|
+
from verdikt_sdk import EvaluationClient
|
|
15
|
+
from verdikt_sdk.models import EvaluationType, Question
|
|
16
|
+
from yalc import LLMModel
|
|
17
|
+
|
|
18
|
+
client = EvaluationClient(
|
|
19
|
+
base_url="https://your-verdikt-instance.com",
|
|
20
|
+
client_id="your-client-id",
|
|
21
|
+
client_secret="your-client-secret",
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
# Register your app (idempotent — safe to call on every deploy)
|
|
25
|
+
await client.create_app(slug="my-app", name="My App")
|
|
26
|
+
|
|
27
|
+
# Sync questions to the dataset (idempotent)
|
|
28
|
+
await client.add_questions("my-app", [
|
|
29
|
+
Question(question="What is the capital of France?", human_answer="Paris"),
|
|
30
|
+
])
|
|
31
|
+
|
|
32
|
+
# Run an evaluation cycle
|
|
33
|
+
await client.run_evaluation(
|
|
34
|
+
app_slug="my-app",
|
|
35
|
+
app_version="v1.2.0",
|
|
36
|
+
callback=my_llm_function, # async fn(question: str) -> str
|
|
37
|
+
evaluation_type=EvaluationType.LLM_ONLY,
|
|
38
|
+
llm_judge_models=[LLMModel.gpt_4o_mini],
|
|
39
|
+
)
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
`run_evaluation` calls your `callback` concurrently for every question in the dataset, then submits all answers to Verdikt for judgment.
|
|
43
|
+
|
|
44
|
+
## Authentication
|
|
45
|
+
|
|
46
|
+
The SDK authenticates via Zitadel OAuth2 client credentials. Create a machine user in your Zitadel project and pass its `client_id` and `client_secret` to `EvaluationClient`.
|
verdikt_sdk-0.1.0/SDK.md
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
# Evaluation SDK Spec
|
|
2
|
+
|
|
3
|
+
Python SDK that wraps the evaluation API so integrators only provide a callback — the SDK handles auth, dataset diffing, and evaluation submission.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## Backend changes required (this repo)
|
|
8
|
+
|
|
9
|
+
Four additions are needed before the SDK can be built:
|
|
10
|
+
|
|
11
|
+
### 1. Add `slug` to apps
|
|
12
|
+
|
|
13
|
+
- Add a `slug` column to the `apps` table — unique, not null, URL-safe (lowercase, hyphens)
|
|
14
|
+
- Enforced at the DB level with a unique constraint
|
|
15
|
+
- `POST /v1/app` accepts `slug` alongside `name`
|
|
16
|
+
- New endpoint: `GET /v1/app/by-slug/{slug}` → returns `AppSchema` (404 if not found)
|
|
17
|
+
|
|
18
|
+
This removes the need to fetch all apps and filter them client-side.
|
|
19
|
+
|
|
20
|
+
### 2. `GET /.well-known`
|
|
21
|
+
Returns the Zitadel issuer URL so the SDK can discover it from `base_url` alone.
|
|
22
|
+
|
|
23
|
+
```json
|
|
24
|
+
{ "issuer": "https://my-zitadel.example.com" }
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
### 3. `GET /v1/app/{app_id}/datasets/hashes`
|
|
28
|
+
Lightweight endpoint for SDK diffing — returns hashes only, no full text.
|
|
29
|
+
|
|
30
|
+
```json
|
|
31
|
+
[
|
|
32
|
+
{ "id": 1, "question_hash": "sha256...", "human_answer_hash": "sha256..." },
|
|
33
|
+
{ "id": 2, "question_hash": "sha256...", "human_answer_hash": "sha256..." }
|
|
34
|
+
]
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
Hash algorithm: SHA-256 of the text with leading and trailing whitespace stripped.
|
|
38
|
+
|
|
39
|
+
### 4. `PATCH /v1/app/{app_id}/datasets/{dataset_id}`
|
|
40
|
+
Updates `human_answer` (and optionally `question`) on an existing dataset entry.
|
|
41
|
+
`AppDatasetUpdateSchema` already exists in `src/schemas/app_dataset.py` — it just needs a route.
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
## SDK interface
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
from verdikt_sdk import EvaluationClient
|
|
49
|
+
from typing import Callable, Literal
|
|
50
|
+
|
|
51
|
+
class EvaluationClient:
|
|
52
|
+
def __init__(
|
|
53
|
+
self,
|
|
54
|
+
base_url: str, # e.g. "https://eval.mycompany.com"
|
|
55
|
+
client_id: str, # Zitadel machine user client ID
|
|
56
|
+
client_secret: str, # Zitadel machine user client secret
|
|
57
|
+
) -> None: ...
|
|
58
|
+
|
|
59
|
+
def create_app(self, slug: str, name: str) -> None: ...
|
|
60
|
+
|
|
61
|
+
def add_questions(
|
|
62
|
+
self,
|
|
63
|
+
app_slug: str,
|
|
64
|
+
questions: list[dict], # [{"question": str, "human_answer": str}]
|
|
65
|
+
) -> None: ...
|
|
66
|
+
|
|
67
|
+
def run_evaluation(
|
|
68
|
+
self,
|
|
69
|
+
app_slug: str,
|
|
70
|
+
app_version: str,
|
|
71
|
+
callback: Callable[[str], str],
|
|
72
|
+
evaluation_type: Literal["LLM_ONLY", "HUMAN_AND_LLM"] = "LLM_ONLY",
|
|
73
|
+
llm_judge_models: list[str] | None = None,
|
|
74
|
+
) -> None: ...
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
---
|
|
78
|
+
|
|
79
|
+
## Method details
|
|
80
|
+
|
|
81
|
+
### `create_app(slug, name)`
|
|
82
|
+
Idempotent — safe to call on every deploy.
|
|
83
|
+
|
|
84
|
+
1. `GET /v1/app/by-slug/{slug}` → if 200, app exists → no-op
|
|
85
|
+
2. If 404 → `POST /v1/app` with `{ "slug": slug, "name": name }`
|
|
86
|
+
|
|
87
|
+
### `add_questions(app_slug, questions)`
|
|
88
|
+
Idempotent — safe to call on every deploy. Uses SHA-256 of the question text as the match key so full text is never compared directly (questions can be long).
|
|
89
|
+
|
|
90
|
+
1. Resolve `app_slug` → `app_id` via `GET /v1/app/by-slug/{slug}` (cached per client instance)
|
|
91
|
+
2. `GET /v1/app/{id}/datasets/hashes` → existing hashes
|
|
92
|
+
3. For each incoming question, compute `sha256(question.strip())`:
|
|
93
|
+
- Hash **not found** → `POST /v1/app/{id}/datasets` (new question)
|
|
94
|
+
- Hash found, `human_answer_hash` **differs** → `PATCH /v1/app/{id}/datasets/{dataset_id}` (updated answer)
|
|
95
|
+
- Hash found, `human_answer_hash` **matches** → skip
|
|
96
|
+
|
|
97
|
+
### `run_evaluation(app_slug, app_version, callback, ...)`
|
|
98
|
+
1. Resolve `app_slug` → `app_id` via `GET /v1/app/by-slug/{slug}` (cached per client instance)
|
|
99
|
+
2. `GET /v1/app/{id}/datasets` → full question list
|
|
100
|
+
3. For each dataset item: `answer = callback(item["question"])`
|
|
101
|
+
4. `POST /v1/app/{id}/evaluation` with:
|
|
102
|
+
```json
|
|
103
|
+
{
|
|
104
|
+
"app_version": "<app_version>",
|
|
105
|
+
"evaluation_type": "<evaluation_type>",
|
|
106
|
+
"app_answers": { "<dataset_id>": "<answer>", ... },
|
|
107
|
+
"llm_judge_models": ["gpt-4o-mini"]
|
|
108
|
+
}
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
---
|
|
112
|
+
|
|
113
|
+
## Auth
|
|
114
|
+
|
|
115
|
+
Uses **OAuth2 client credentials grant** against Zitadel.
|
|
116
|
+
|
|
117
|
+
Flow on first API call:
|
|
118
|
+
1. `GET {base_url}/.well-known` → get `issuer`
|
|
119
|
+
2. `POST {issuer}/oauth/v2/token` with `grant_type=client_credentials`, `client_id`, `client_secret`
|
|
120
|
+
3. Cache the token; refresh it automatically once the lifetime given by `expires_in` has elapsed
|
|
121
|
+
|
|
122
|
+
The `issuer` and token are cached on the client instance — no repeated discovery calls.
|
|
123
|
+
|
|
124
|
+
---
|
|
125
|
+
|
|
126
|
+
## Slug → ID caching
|
|
127
|
+
|
|
128
|
+
All three methods resolve `app_slug` → `app_id` via `GET /v1/app/by-slug/{slug}`. The resolved mapping is cached on the client instance so multiple method calls don't repeat the lookup.
|
|
129
|
+
|
|
130
|
+
---
|
|
131
|
+
|
|
132
|
+
## Slug format
|
|
133
|
+
|
|
134
|
+
- Lowercase, alphanumeric, hyphens only — e.g. `"my-app"`, `"gpt-wrapper-v2"`
|
|
135
|
+
- Enforced by the API (422 if invalid format)
|
|
136
|
+
- Chosen by the integrator at `create_app` time; stable forever
|
|
137
|
+
|
|
138
|
+
---
|
|
139
|
+
|
|
140
|
+
## Dependencies
|
|
141
|
+
|
|
142
|
+
- `httpx` — HTTP client
|
|
143
|
+
- `pydantic` — response validation
|
|
144
|
+
|
|
145
|
+
---
|
|
146
|
+
|
|
147
|
+
## Usage example
|
|
148
|
+
|
|
149
|
+
```python
|
|
150
|
+
from verdikt_sdk import EvaluationClient
|
|
151
|
+
|
|
152
|
+
client = EvaluationClient(
|
|
153
|
+
base_url="https://eval.mycompany.com",
|
|
154
|
+
client_id="my-service@myproject.zitadel.cloud",
|
|
155
|
+
client_secret="...",
|
|
156
|
+
)
|
|
157
|
+
|
|
158
|
+
# Idempotent setup — safe to call on every deploy
|
|
159
|
+
client.create_app(slug="my-app", name="My App")
|
|
160
|
+
|
|
161
|
+
client.add_questions("my-app", [
|
|
162
|
+
{"question": "What is the capital of France?", "human_answer": "Paris"},
|
|
163
|
+
{"question": "What is 2 + 2?", "human_answer": "4"},
|
|
164
|
+
])
|
|
165
|
+
|
|
166
|
+
# Run after each inference cycle
|
|
167
|
+
def my_llm(question: str) -> str:
|
|
168
|
+
return my_model.complete(question)
|
|
169
|
+
|
|
170
|
+
client.run_evaluation(
|
|
171
|
+
app_slug="my-app",
|
|
172
|
+
app_version="v1.4.2",
|
|
173
|
+
callback=my_llm,
|
|
174
|
+
evaluation_type="LLM_ONLY",
|
|
175
|
+
llm_judge_models=["gpt-4o-mini"],
|
|
176
|
+
)
|
|
177
|
+
```
|