verdikt-sdk 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,30 @@
1
+ name: pre-commit
2
+
3
+ on:
4
+ push:
5
+ pull_request:
6
+
7
+ jobs:
8
+ pre-commit:
9
+ runs-on: ubuntu-latest
10
+
11
+ steps:
12
+ - uses: actions/checkout@v4
13
+
14
+ - name: Install uv
15
+ uses: astral-sh/setup-uv@v4
16
+
17
+ - name: Set up Python
18
+ run: uv python install 3.13
19
+
20
+ - name: Install dependencies
21
+ run: uv sync --group dev
22
+
23
+ - name: Run ruff lint
24
+ run: uv run ruff check verdikt_sdk/
25
+
26
+ - name: Run ruff format check
27
+ run: uv run ruff format --check verdikt_sdk/
28
+
29
+ - name: Run mypy
30
+ run: uv run mypy verdikt_sdk/
@@ -0,0 +1,86 @@
1
+ name: publish
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - "v*"
7
+
8
+ jobs:
9
+ build:
10
+ name: Build distribution
11
+ runs-on: ubuntu-latest
12
+
13
+ steps:
14
+ - uses: actions/checkout@v4
15
+
16
+ - name: Install uv
17
+ uses: astral-sh/setup-uv@v4
18
+
19
+ - name: Set up Python
20
+ run: uv python install 3.13
21
+
22
+ - name: Verify tag matches pyproject.toml version
23
+ run: |
24
+ PKG_VERSION=$(python3 -c "
25
+ import tomllib
26
+ with open('pyproject.toml', 'rb') as f:
27
+ print(tomllib.load(f)['project']['version'])
28
+ ")
29
+ TAG_VERSION="${GITHUB_REF_NAME#v}"
30
+ if [ "$PKG_VERSION" != "$TAG_VERSION" ]; then
31
+ echo "Tag version ($TAG_VERSION) does not match pyproject.toml version ($PKG_VERSION)"
32
+ exit 1
33
+ fi
34
+
35
+ - name: Build distribution
36
+ run: uv build
37
+
38
+ - name: Store distribution packages
39
+ uses: actions/upload-artifact@v4
40
+ with:
41
+ name: python-package-distributions
42
+ path: dist/
43
+
44
+ publish-to-pypi:
45
+ name: Publish to PyPI
46
+ needs: build
47
+ runs-on: ubuntu-latest
48
+
49
+ environment:
50
+ name: pypi
51
+ url: https://pypi.org/p/verdikt-sdk
52
+
53
+ permissions:
54
+ id-token: write
55
+
56
+ steps:
57
+ - name: Download distribution packages
58
+ uses: actions/download-artifact@v4
59
+ with:
60
+ name: python-package-distributions
61
+ path: dist/
62
+
63
+ - name: Publish to PyPI
64
+ uses: pypa/gh-action-pypi-publish@release/v1
65
+
66
+ github-release:
67
+ name: Create GitHub Release
68
+ needs: publish-to-pypi
69
+ runs-on: ubuntu-latest
70
+
71
+ permissions:
72
+ contents: write
73
+
74
+ steps:
75
+ - uses: actions/checkout@v4
76
+
77
+ - name: Download distribution packages
78
+ uses: actions/download-artifact@v4
79
+ with:
80
+ name: python-package-distributions
81
+ path: dist/
82
+
83
+ - name: Create GitHub Release
84
+ env:
85
+ GH_TOKEN: ${{ github.token }}
86
+ run: gh release create "$GITHUB_REF_NAME" dist/* --generate-notes
@@ -0,0 +1,163 @@
1
+ ### Python template
2
+ # Byte-compiled / optimized / DLL files
3
+ __pycache__/
4
+ *.py[cod]
5
+ *$py.class
6
+
7
+ # C extensions
8
+ *.so
9
+
10
+ # Distribution / packaging
11
+ .Python
12
+ build/
13
+ develop-eggs/
14
+ dist/
15
+ downloads/
16
+ eggs/
17
+ .eggs/
18
+ lib/
19
+ lib64/
20
+ parts/
21
+ sdist/
22
+ var/
23
+ wheels/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+ cover/
54
+
55
+ # Translations
56
+ *.mo
57
+ *.pot
58
+
59
+ # Django stuff:
60
+ *.log
61
+ local_settings.py
62
+ db.sqlite3
63
+ db.sqlite3-journal
64
+
65
+ # Flask stuff:
66
+ instance/
67
+ .webassets-cache
68
+
69
+ # Scrapy stuff:
70
+ .scrapy
71
+
72
+ # Sphinx documentation
73
+ docs/_build/
74
+
75
+ # PyBuilder
76
+ .pybuilder/
77
+ target/
78
+
79
+ # Jupyter Notebook
80
+ .ipynb_checkpoints
81
+
82
+ # IPython
83
+ profile_default/
84
+ ipython_config.py
85
+
86
+ # pyenv
87
+ # For a library or package, you might want to ignore these files since the code is
88
+ # intended to run in multiple environments; otherwise, check them in:
89
+ # .python-version
90
+
91
+ # pipenv
92
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
93
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
94
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
95
+ # install all needed dependencies.
96
+ #Pipfile.lock
97
+
98
+ # poetry
99
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
100
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
101
+ # commonly ignored for libraries.
102
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
103
+ #poetry.lock
104
+
105
+ # pdm
106
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
107
+ #pdm.lock
108
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
109
+ # in version control.
110
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
111
+ .pdm.toml
112
+ .pdm-python
113
+ .pdm-build/
114
+
115
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
116
+ __pypackages__/
117
+
118
+ # Celery stuff
119
+ celerybeat-schedule
120
+ celerybeat.pid
121
+
122
+ # SageMath parsed files
123
+ *.sage.py
124
+
125
+ # Environments
126
+ .env
127
+ .venv
128
+ env/
129
+ venv/
130
+ ENV/
131
+ env.bak/
132
+ venv.bak/
133
+
134
+ # Spyder project settings
135
+ .spyderproject
136
+ .spyproject
137
+
138
+ # Rope project settings
139
+ .ropeproject
140
+
141
+ # mkdocs documentation
142
+ /site
143
+
144
+ # mypy
145
+ .mypy_cache/
146
+ .dmypy.json
147
+ dmypy.json
148
+
149
+ # Pyre type checker
150
+ .pyre/
151
+
152
+ # pytype static type analyzer
153
+ .pytype/
154
+
155
+ # Cython debug symbols
156
+ cython_debug/
157
+
158
+ # PyCharm
159
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
160
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
161
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
162
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
163
+ .idea/
@@ -0,0 +1,28 @@
1
+ repos:
2
+ - repo: local
3
+ hooks:
4
+ - id: ruff-lint
5
+ name: lint
6
+ language: system
7
+ types:
8
+ - python
9
+ entry: make ruff-lint
10
+ pass_filenames: false
11
+
12
+ - id: ruff-format
13
+ name: format
14
+ language: system
15
+ types:
16
+ - python
17
+ entry: make ruff-format
18
+ pass_filenames: false
19
+
20
+ - id: mypy
21
+ name: type-check
22
+ language: system
23
+ types:
24
+ - python
25
+ entry: make mypy
26
+ pass_filenames: false
27
+
28
+
@@ -0,0 +1 @@
1
+ 3.13
@@ -0,0 +1,21 @@
1
+ .PHONY: help
2
+ help: # Show help for each of the Makefile recipes
3
+ @grep -E '^[a-zA-Z0-9 -]+:.*#' Makefile | sort | while read -r l; do printf "\033[1;32m$$(echo $$l | cut -f 1 -d':')\033[00m: $$(echo $$l | cut -f 2- -d'#')\n"; done
4
+
5
+ TA ?= -v tests/
6
+
7
+ ruff-lint: # Run ruff linter
8
+ uv run ruff check --fix verdikt_sdk/
9
+
10
+ ruff-format: # Run ruff formatter
11
+ uv run ruff format verdikt_sdk/
12
+
13
+ mypy: # Run mypy type checker
14
+ uv run mypy verdikt_sdk/
15
+
16
+ lint: # Run pre-commit
17
+ pre-commit run --all-files
18
+
19
+ test: # Run tests
20
+ uv run pytest $(TA)
21
+
@@ -0,0 +1,56 @@
1
+ Metadata-Version: 2.4
2
+ Name: verdikt-sdk
3
+ Version: 0.1.0
4
+ Summary: Python SDK for the Verdikt Evaluation API
5
+ Requires-Python: >=3.13
6
+ Requires-Dist: httpx>=0.28.1
7
+ Requires-Dist: pydantic>=2.0
8
+ Requires-Dist: yalc>=0.2.1
9
+ Description-Content-Type: text/markdown
10
+
11
+ # verdikt-sdk
12
+
13
+ Python SDK for [Verdikt](https://github.com/cognitai-labs-dev/verdikt) — a standalone AI evaluation service that decouples evaluation and LLM/human judging from the application being evaluated.
14
+
15
+ ## Installation
16
+
17
+ ```
18
+ pip install verdikt-sdk
19
+ ```
20
+
21
+ ## Usage
22
+
23
+ ```python
24
+ from verdikt_sdk import EvaluationClient
25
+ from verdikt_sdk.models import EvaluationType, Question
26
+ from yalc import LLMModel
27
+
28
+ client = EvaluationClient(
29
+ base_url="https://your-verdikt-instance.com",
30
+ client_id="your-client-id",
31
+ client_secret="your-client-secret",
32
+ )
33
+
34
+ # Register your app (idempotent — safe to call on every deploy)
35
+ await client.create_app(slug="my-app", name="My App")
36
+
37
+ # Sync questions to the dataset (idempotent)
38
+ await client.add_questions("my-app", [
39
+ Question(question="What is the capital of France?", human_answer="Paris"),
40
+ ])
41
+
42
+ # Run an evaluation cycle
43
+ await client.run_evaluation(
44
+ app_slug="my-app",
45
+ app_version="v1.2.0",
46
+ callback=my_llm_function, # async fn(question: str) -> str
47
+ evaluation_type=EvaluationType.LLM_ONLY,
48
+ llm_judge_models=[LLMModel.gpt_4o_mini],
49
+ )
50
+ ```
51
+
52
+ `run_evaluation` calls your `callback` concurrently for every question in the dataset, then submits all answers to Verdikt for judgment.
53
+
54
+ ## Authentication
55
+
56
+ The SDK authenticates via Zitadel OAuth2 client credentials. Create a machine user in your Zitadel project and pass its `client_id` and `client_secret` to `EvaluationClient`.
@@ -0,0 +1,46 @@
1
+ # verdikt-sdk
2
+
3
+ Python SDK for [Verdikt](https://github.com/cognitai-labs-dev/verdikt) — a standalone AI evaluation service that decouples evaluation and LLM/human judging from the application being evaluated.
4
+
5
+ ## Installation
6
+
7
+ ```
8
+ pip install verdikt-sdk
9
+ ```
10
+
11
+ ## Usage
12
+
13
+ ```python
14
+ from verdikt_sdk import EvaluationClient
15
+ from verdikt_sdk.models import EvaluationType, Question
16
+ from yalc import LLMModel
17
+
18
+ client = EvaluationClient(
19
+ base_url="https://your-verdikt-instance.com",
20
+ client_id="your-client-id",
21
+ client_secret="your-client-secret",
22
+ )
23
+
24
+ # Register your app (idempotent — safe to call on every deploy)
25
+ await client.create_app(slug="my-app", name="My App")
26
+
27
+ # Sync questions to the dataset (idempotent)
28
+ await client.add_questions("my-app", [
29
+ Question(question="What is the capital of France?", human_answer="Paris"),
30
+ ])
31
+
32
+ # Run an evaluation cycle
33
+ await client.run_evaluation(
34
+ app_slug="my-app",
35
+ app_version="v1.2.0",
36
+ callback=my_llm_function, # async fn(question: str) -> str
37
+ evaluation_type=EvaluationType.LLM_ONLY,
38
+ llm_judge_models=[LLMModel.gpt_4o_mini],
39
+ )
40
+ ```
41
+
42
+ `run_evaluation` calls your `callback` concurrently for every question in the dataset, then submits all answers to Verdikt for judgment.
43
+
44
+ ## Authentication
45
+
46
+ The SDK authenticates via Zitadel OAuth2 client credentials. Create a machine user in your Zitadel project and pass its `client_id` and `client_secret` to `EvaluationClient`.
@@ -0,0 +1,177 @@
1
+ # Evaluation SDK Spec
2
+
3
+ Python SDK that wraps the evaluation API so integrators only provide a callback — the SDK handles auth, dataset diffing, and evaluation submission.
4
+
5
+ ---
6
+
7
+ ## Backend changes required (this repo)
8
+
9
+ Four additions are needed before the SDK can be built:
10
+
11
+ ### 1. Add `slug` to apps
12
+
13
+ - Add a `slug` column to the `apps` table — unique, not null, URL-safe (lowercase, hyphens)
14
+ - Enforced at the DB level with a unique constraint
15
+ - `POST /v1/app` accepts `slug` alongside `name`
16
+ - New endpoint: `GET /v1/app/by-slug/{slug}` → returns `AppSchema` (404 if not found)
17
+
18
+ This removes the need to fetch all apps and filter them client-side.
19
+
20
+ ### 2. `GET /.well-known`
21
+ Returns the Zitadel issuer URL so the SDK can discover it from `base_url` alone.
22
+
23
+ ```json
24
+ { "issuer": "https://my-zitadel.example.com" }
25
+ ```
26
+
27
+ ### 3. `GET /v1/app/{app_id}/datasets/hashes`
28
+ Lightweight endpoint for SDK diffing — returns hashes only, no full text.
29
+
30
+ ```json
31
+ [
32
+ { "id": 1, "question_hash": "sha256...", "human_answer_hash": "sha256..." },
33
+ { "id": 2, "question_hash": "sha256...", "human_answer_hash": "sha256..." }
34
+ ]
35
+ ```
36
+
37
+ Hash algorithm: SHA-256 of the text with leading and trailing whitespace stripped.
38
+
39
+ ### 4. `PATCH /v1/app/{app_id}/datasets/{dataset_id}`
40
+ Updates `human_answer` (and optionally `question`) on an existing dataset entry.
41
+ `AppDatasetUpdateSchema` already exists in `src/schemas/app_dataset.py` — just needs a route.
42
+
43
+ ---
44
+
45
+ ## SDK interface
46
+
47
+ ```python
48
+ from eval_sdk import EvaluationClient
49
+ from typing import Callable, Literal
50
+
51
+ class EvaluationClient:
52
+ def __init__(
53
+ self,
54
+ base_url: str, # e.g. "https://eval.mycompany.com"
55
+ client_id: str, # Zitadel machine user client ID
56
+ client_secret: str, # Zitadel machine user client secret
57
+ ) -> None: ...
58
+
59
+ def create_app(self, slug: str, name: str) -> None: ...
60
+
61
+ def add_questions(
62
+ self,
63
+ app_slug: str,
64
+ questions: list[dict], # [{"question": str, "human_answer": str}]
65
+ ) -> None: ...
66
+
67
+ def run_evaluation(
68
+ self,
69
+ app_slug: str,
70
+ app_version: str,
71
+ callback: Callable[[str], str],
72
+ evaluation_type: Literal["LLM_ONLY", "HUMAN_AND_LLM"] = "LLM_ONLY",
73
+ llm_judge_models: list[str] | None = None,
74
+ ) -> None: ...
75
+ ```
76
+
77
+ ---
78
+
79
+ ## Method details
80
+
81
+ ### `create_app(slug, name)`
82
+ Idempotent — safe to call on every deploy.
83
+
84
+ 1. `GET /v1/app/by-slug/{slug}` → if 200, app exists → no-op
85
+ 2. If 404 → `POST /v1/app` with `{ "slug": slug, "name": name }`
86
+
87
+ ### `add_questions(app_slug, questions)`
88
+ Idempotent — safe to call on every deploy. Uses SHA-256 of the question text as the match key so full text is never compared directly (questions can be long).
89
+
90
+ 1. Resolve `app_slug` → `app_id` via `GET /v1/app/by-slug/{slug}` (cached per client instance)
91
+ 2. `GET /v1/app/{id}/datasets/hashes` → existing hashes
92
+ 3. For each incoming question, compute `sha256(question.strip())`:
93
+ - Hash **not found** → `POST /v1/app/{id}/datasets` (new question)
94
+ - Hash found, `human_answer_hash` **differs** → `PATCH /v1/app/{id}/datasets/{dataset_id}` (updated answer)
95
+ - Hash found, `human_answer_hash` **matches** → skip
96
+
97
+ ### `run_evaluation(app_slug, app_version, callback, ...)`
98
+ 1. Resolve `app_slug` → `app_id` via `GET /v1/app/by-slug/{slug}` (cached per client instance)
99
+ 2. `GET /v1/app/{id}/datasets` → full question list
100
+ 3. For each dataset item: `answer = callback(item["question"])`
101
+ 4. `POST /v1/app/{id}/evaluation` with:
102
+ ```json
103
+ {
104
+ "app_version": "<app_version>",
105
+ "evaluation_type": "<evaluation_type>",
106
+ "app_answers": { "<dataset_id>": "<answer>", ... },
107
+ "llm_judge_models": ["gpt-4o-mini"]
108
+ }
109
+ ```
110
+
111
+ ---
112
+
113
+ ## Auth
114
+
115
+ Uses **OAuth2 client credentials grant** against Zitadel.
116
+
117
+ Flow on first API call:
118
+ 1. `GET {base_url}/.well-known` → get `issuer`
119
+ 2. `POST {issuer}/oauth/v2/token` with `grant_type=client_credentials`, `client_id`, `client_secret`
120
+ 3. Cache the token; refresh it automatically once its `expires_in` lifetime has elapsed
121
+
122
+ The `issuer` and token are cached on the client instance — no repeated discovery calls.
123
+
124
+ ---
125
+
126
+ ## Slug → ID caching
127
+
128
+ All three methods resolve `app_slug` → `app_id` via `GET /v1/app/by-slug/{slug}`. The resolved mapping is cached on the client instance so multiple method calls don't repeat the lookup.
129
+
130
+ ---
131
+
132
+ ## Slug format
133
+
134
+ - Lowercase, alphanumeric, hyphens only — e.g. `"my-app"`, `"gpt-wrapper-v2"`
135
+ - Enforced by the API (422 if invalid format)
136
+ - Chosen by the integrator at `create_app` time; stable forever
137
+
138
+ ---
139
+
140
+ ## Dependencies
141
+
142
+ - `httpx` — HTTP client
143
+ - `pydantic` — response validation
144
+
145
+ ---
146
+
147
+ ## Usage example
148
+
149
+ ```python
150
+ from eval_sdk import EvaluationClient
151
+
152
+ client = EvaluationClient(
153
+ base_url="https://eval.mycompany.com",
154
+ client_id="my-service@myproject.zitadel.cloud",
155
+ client_secret="...",
156
+ )
157
+
158
+ # Idempotent setup — safe to call on every deploy
159
+ client.create_app(slug="my-app", name="My App")
160
+
161
+ client.add_questions("my-app", [
162
+ {"question": "What is the capital of France?", "human_answer": "Paris"},
163
+ {"question": "What is 2 + 2?", "human_answer": "4"},
164
+ ])
165
+
166
+ # Run after each inference cycle
167
+ def my_llm(question: str) -> str:
168
+ return my_model.complete(question)
169
+
170
+ client.run_evaluation(
171
+ app_slug="my-app",
172
+ app_version="v1.4.2",
173
+ callback=my_llm,
174
+ evaluation_type="LLM_ONLY",
175
+ llm_judge_models=["gpt-4o-mini"],
176
+ )
177
+ ```