modelab 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- modelab-0.1.0/.github/workflows/publish.yml +43 -0
- modelab-0.1.0/.gitignore +39 -0
- modelab-0.1.0/ARD/001-record-response-api.md +58 -0
- modelab-0.1.0/Dockerfile +26 -0
- modelab-0.1.0/IMPLEMENTATION.md +252 -0
- modelab-0.1.0/PKG-INFO +163 -0
- modelab-0.1.0/README.md +146 -0
- modelab-0.1.0/dashboard/.gitignore +13 -0
- modelab-0.1.0/dashboard/index.html +12 -0
- modelab-0.1.0/dashboard/package-lock.json +2876 -0
- modelab-0.1.0/dashboard/package.json +28 -0
- modelab-0.1.0/dashboard/src/App.tsx +27 -0
- modelab-0.1.0/dashboard/src/index.css +25 -0
- modelab-0.1.0/dashboard/src/lib/api.ts +48 -0
- modelab-0.1.0/dashboard/src/lib/utils.ts +16 -0
- modelab-0.1.0/dashboard/src/main.tsx +13 -0
- modelab-0.1.0/dashboard/src/pages/flag.tsx +192 -0
- modelab-0.1.0/dashboard/src/pages/overview.tsx +72 -0
- modelab-0.1.0/dashboard/tsconfig.json +24 -0
- modelab-0.1.0/dashboard/tsconfig.tsbuildinfo +1 -0
- modelab-0.1.0/dashboard/vite.config.ts +18 -0
- modelab-0.1.0/docker-compose.yml +30 -0
- modelab-0.1.0/modelab/__init__.py +126 -0
- modelab-0.1.0/modelab/_assignment.py +163 -0
- modelab-0.1.0/modelab/_engine.py +40 -0
- modelab-0.1.0/modelab/_errors.py +26 -0
- modelab-0.1.0/modelab/_server_storage.py +102 -0
- modelab-0.1.0/modelab/_state.py +42 -0
- modelab-0.1.0/modelab/_types.py +70 -0
- modelab-0.1.0/modelab/py.typed +0 -0
- modelab-0.1.0/pyproject.toml +34 -0
- modelab-0.1.0/server/__init__.py +0 -0
- modelab-0.1.0/server/app.py +55 -0
- modelab-0.1.0/server/config.py +18 -0
- modelab-0.1.0/server/database.py +100 -0
- modelab-0.1.0/server/models.py +83 -0
- modelab-0.1.0/server/routes/__init__.py +0 -0
- modelab-0.1.0/server/routes/api.py +161 -0
- modelab-0.1.0/server/routes/ingest.py +105 -0
- modelab-0.1.0/tests/__init__.py +0 -0
- modelab-0.1.0/tests/conftest.py +26 -0
- modelab-0.1.0/tests/test_assignment.py +398 -0
- modelab-0.1.0/tests/test_engine.py +297 -0
- modelab-0.1.0/tests/test_integration.py +278 -0
- modelab-0.1.0/tests/test_server_integration.py +431 -0
- modelab-0.1.0/tests/test_server_storage.py +299 -0
- modelab-0.1.0/tests/test_state.py +139 -0
- modelab-0.1.0/tests/test_types.py +171 -0
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
test:
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
steps:
|
|
11
|
+
- uses: actions/checkout@v4
|
|
12
|
+
|
|
13
|
+
- uses: actions/setup-python@v5
|
|
14
|
+
with:
|
|
15
|
+
python-version: "3.12"
|
|
16
|
+
|
|
17
|
+
- name: Install dependencies
|
|
18
|
+
run: pip install -e ".[dev]"
|
|
19
|
+
|
|
20
|
+
- name: Run tests
|
|
21
|
+
run: pytest -m "not docker"
|
|
22
|
+
|
|
23
|
+
publish:
|
|
24
|
+
needs: test
|
|
25
|
+
runs-on: ubuntu-latest
|
|
26
|
+
environment: pypi
|
|
27
|
+
permissions:
|
|
28
|
+
id-token: write
|
|
29
|
+
steps:
|
|
30
|
+
- uses: actions/checkout@v4
|
|
31
|
+
|
|
32
|
+
- uses: actions/setup-python@v5
|
|
33
|
+
with:
|
|
34
|
+
python-version: "3.12"
|
|
35
|
+
|
|
36
|
+
- name: Install build tools
|
|
37
|
+
run: pip install build
|
|
38
|
+
|
|
39
|
+
- name: Build package
|
|
40
|
+
run: python -m build
|
|
41
|
+
|
|
42
|
+
- name: Publish to PyPI
|
|
43
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
modelab-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
.venv
|
|
3
|
+
__pycache__
|
|
4
|
+
*.pyc
|
|
5
|
+
*.pyo
|
|
6
|
+
*.pyd
|
|
7
|
+
.pytest_cache
|
|
8
|
+
.ruff_cache
|
|
9
|
+
*.egg-info
|
|
10
|
+
dist/
|
|
11
|
+
build/
|
|
12
|
+
|
|
13
|
+
# Node.js
|
|
14
|
+
node_modules/
|
|
15
|
+
npm-debug.log*
|
|
16
|
+
yarn-debug.log*
|
|
17
|
+
yarn-error.log*
|
|
18
|
+
|
|
19
|
+
# IDEs
|
|
20
|
+
.vscode
|
|
21
|
+
.idea
|
|
22
|
+
*.swp
|
|
23
|
+
*.swo
|
|
24
|
+
*~
|
|
25
|
+
|
|
26
|
+
# OS
|
|
27
|
+
.DS_Store
|
|
28
|
+
Thumbs.db
|
|
29
|
+
|
|
30
|
+
# Environment
|
|
31
|
+
.env
|
|
32
|
+
.env.local
|
|
33
|
+
.env.development.local
|
|
34
|
+
.env.test.local
|
|
35
|
+
.env.production.local
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# Dashboard build output
|
|
39
|
+
dashboard/dist/
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# ARD-001: Replace `track()` with `record(response)`
|
|
2
|
+
|
|
3
|
+
**Status:** Accepted
|
|
4
|
+
**Date:** 2026-02-16
|
|
5
|
+
|
|
6
|
+
## Context
|
|
7
|
+
|
|
8
|
+
The `track()` context manager forced users to nest their LLM calls inside modelab:
|
|
9
|
+
|
|
10
|
+
```python
|
|
11
|
+
with assignment.track() as t:
|
|
12
|
+
response = client.chat.completions.create(...)
|
|
13
|
+
t.set_tokens(input=usage.prompt_tokens, output=usage.completion_tokens)
|
|
14
|
+
t.set_cost(0.013)
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
This couples user code to the SDK. Users with existing wrappers (LangSmith, Sentry, custom middleware) cannot compose them with `track()` — context managers don't compose cleanly. modelab should observe LLM calls, not own their execution.
|
|
18
|
+
|
|
19
|
+
## Decision
|
|
20
|
+
|
|
21
|
+
Remove `track()` and `_Tracker` entirely. Extend `record()` to accept a provider response object as its first positional argument and auto-extract token usage via duck-typing:
|
|
22
|
+
|
|
23
|
+
```python
|
|
24
|
+
response = client.chat.completions.create(...)
|
|
25
|
+
assignment.record(response, cost=0.013)
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
Duck-typing order:
|
|
29
|
+
1. **OpenAI**: `response.usage.prompt_tokens` / `response.usage.completion_tokens`
|
|
30
|
+
2. **Anthropic**: `response.usage.input_tokens` / `response.usage.output_tokens`
|
|
31
|
+
|
|
32
|
+
Explicit keyword arguments (`input_tokens=`, `output_tokens=`) always override extracted values. The backward-compatible kwargs-only call still works:
|
|
33
|
+
|
|
34
|
+
```python
|
|
35
|
+
assignment.record(input_tokens=50, output_tokens=100, cost=0.01)
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Consequences
|
|
39
|
+
|
|
40
|
+
### Positive
|
|
41
|
+
|
|
42
|
+
- **No coupling**: Users call their LLM however they want, then pass the response to `record()`.
|
|
43
|
+
- **Composable**: Works alongside any other middleware, wrappers, or observability tools.
|
|
44
|
+
- **Zero dependencies**: Duck-typing via `getattr` — no provider imports needed.
|
|
45
|
+
- **Simpler API surface**: One method (`record`) instead of a context manager + 3 setter methods.
|
|
46
|
+
|
|
47
|
+
### Negative
|
|
48
|
+
|
|
49
|
+
- **No auto-latency**: `track()` measured wall-clock time automatically. Users who want latency now measure it externally and pass `latency_ms=`.
|
|
50
|
+
- **No auto-error capture**: `track()` caught exceptions and recorded them. Users now use try/except and pass `error=` + call `mark_failure()`.
|
|
51
|
+
- **Breaking change**: All code using `track()`, `set_tokens()`, `set_cost()`, `set_metadata()` must migrate.
|
|
52
|
+
|
|
53
|
+
### Design decisions within this change
|
|
54
|
+
|
|
55
|
+
- **Cost stays manual**: Provider pricing changes too frequently for reliable auto-calculation.
|
|
56
|
+
- **No auto-latency**: Acceptable tradeoff — users who need it measure externally. Keeps the SDK stateless.
|
|
57
|
+
- **No auto-error capture**: Users call `mark_failure()` for explicit error handling. Cleaner separation of concerns.
|
|
58
|
+
- **Duck-typing order**: OpenAI attributes (`prompt_tokens`/`completion_tokens`) are probed first (larger market share); if absent, the Anthropic attributes (`input_tokens`/`output_tokens`) are tried. Each probe is a cheap `getattr`, so falling through to the second format costs essentially nothing.
|
modelab-0.1.0/Dockerfile
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# Stage 1: Build dashboard
|
|
2
|
+
FROM node:22-slim AS frontend
|
|
3
|
+
WORKDIR /app/dashboard
|
|
4
|
+
COPY dashboard/package.json dashboard/package-lock.json ./
|
|
5
|
+
RUN npm ci
|
|
6
|
+
COPY dashboard/ ./
|
|
7
|
+
RUN npm run build
|
|
8
|
+
|
|
9
|
+
# Stage 2: Python runtime
|
|
10
|
+
FROM python:3.13-slim
|
|
11
|
+
WORKDIR /app
|
|
12
|
+
|
|
13
|
+
# Copy Python source + metadata needed for install
|
|
14
|
+
COPY pyproject.toml README.md ./
|
|
15
|
+
COPY modelab/ modelab/
|
|
16
|
+
COPY server/ server/
|
|
17
|
+
|
|
18
|
+
# Install Python dependencies
|
|
19
|
+
RUN pip install --no-cache-dir ".[server]"
|
|
20
|
+
|
|
21
|
+
# Copy built dashboard
|
|
22
|
+
COPY --from=frontend /app/dashboard/dist dashboard/dist
|
|
23
|
+
|
|
24
|
+
EXPOSE 8100
|
|
25
|
+
|
|
26
|
+
CMD ["python", "-m", "uvicorn", "server.app:app", "--host", "0.0.0.0", "--port", "8100"]
|
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
# modelab — Implementation Plan
|
|
2
|
+
|
|
3
|
+
## Overview
|
|
4
|
+
|
|
5
|
+
**modelab** is a provider-agnostic Python library + self-hosted server for A/B testing LLM systems in production.
|
|
6
|
+
|
|
7
|
+
Two components:
|
|
8
|
+
|
|
9
|
+
1. **Python SDK** — zero-dependency library that developers install in their app (assignment, tracking, events)
|
|
10
|
+
2. **Server + Dashboard** — self-hosted FastAPI + React app for visualization (Docker Compose)
|
|
11
|
+
|
|
12
|
+
---
|
|
13
|
+
|
|
14
|
+
## API
|
|
15
|
+
|
|
16
|
+
```python
|
|
17
|
+
import modelab
|
|
18
|
+
from modelab import Flag, Variant, EvalContext
|
|
19
|
+
|
|
20
|
+
# Initialize
|
|
21
|
+
modelab.init(
|
|
22
|
+
server="http://localhost:8100",
|
|
23
|
+
flags=[
|
|
24
|
+
Flag(
|
|
25
|
+
name="summarizer_v2",
|
|
26
|
+
variants=[
|
|
27
|
+
Variant("control", weight=50, config={"model": "gpt-3.5-turbo", "prompt": "Summarize: {input}"}),
|
|
28
|
+
Variant("treatment", weight=50, config={"model": "gpt-4", "prompt": "Concisely summarize: {input}"}),
|
|
29
|
+
],
|
|
30
|
+
rollout_pct=100,
|
|
31
|
+
),
|
|
32
|
+
],
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
# Assign
|
|
36
|
+
ctx = EvalContext(user_id="123", session_id="abc")
|
|
37
|
+
assignment = modelab.assign("summarizer_v2", ctx)
|
|
38
|
+
|
|
39
|
+
if assignment is None:
|
|
40
|
+
# Outside rollout — default behavior
|
|
41
|
+
response = call_llm(model="gpt-3.5-turbo", prompt=text)
|
|
42
|
+
else:
|
|
43
|
+
# In experiment — use assigned variant
|
|
44
|
+
response = call_llm(
|
|
45
|
+
model=assignment.config["model"],
|
|
46
|
+
prompt=assignment.config["prompt"].format(input=text),
|
|
47
|
+
)
|
|
48
|
+
assignment.record(response, cost=0.013)
|
|
49
|
+
|
|
50
|
+
assignment.mark_success()
|
|
51
|
+
assignment.mark_custom_event("copied")
|
|
52
|
+
|
|
53
|
+
# Evaluate
|
|
54
|
+
results = modelab.evaluate("summarizer_v2")
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
---
|
|
58
|
+
|
|
59
|
+
## System Architecture
|
|
60
|
+
|
|
61
|
+
```
|
|
62
|
+
Developer's App
|
|
63
|
+
│
|
|
64
|
+
├── modelab SDK (pip install modelab)
|
|
65
|
+
│ └── ServerStorage ──HTTP POST──▶ modelab-server
|
|
66
|
+
│
|
|
67
|
+
modelab-server (docker compose up)
|
|
68
|
+
├── FastAPI backend
|
|
69
|
+
├── React dashboard (served as static files)
|
|
70
|
+
└── PostgreSQL
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
---
|
|
74
|
+
|
|
75
|
+
## File Structure
|
|
76
|
+
|
|
77
|
+
```
|
|
78
|
+
modelab/ # Python SDK
|
|
79
|
+
__init__.py # Public API: init(), assign(), evaluate()
|
|
80
|
+
_types.py # Flag, Variant, EvalContext, records, Storage Protocol
|
|
81
|
+
_engine.py # Deterministic hashing + bucketing
|
|
82
|
+
_assignment.py # Assignment class: record(), mark_*()
|
|
83
|
+
_storage.py # Storage Protocol + SQLiteStorage
|
|
84
|
+
_server_storage.py # ServerStorage (HTTP + buffering)
|
|
85
|
+
_aggregator.py # Metrics evaluation queries
|
|
86
|
+
_state.py # Module-level singleton state
|
|
87
|
+
_errors.py # Exception hierarchy
|
|
88
|
+
py.typed # PEP 561 marker
|
|
89
|
+
|
|
90
|
+
server/ # FastAPI backend
|
|
91
|
+
__init__.py
|
|
92
|
+
app.py # FastAPI app + static file mount
|
|
93
|
+
config.py # Settings via env vars
|
|
94
|
+
database.py # Postgres connection, schema, queries
|
|
95
|
+
models.py # Pydantic request/response schemas
|
|
96
|
+
routes/
|
|
97
|
+
__init__.py
|
|
98
|
+
ingest.py # POST /api/v1/ingest/*
|
|
99
|
+
api.py # GET /api/v1/flags/*
|
|
100
|
+
|
|
101
|
+
dashboard/ # React SPA
|
|
102
|
+
package.json
|
|
103
|
+
vite.config.ts
|
|
104
|
+
tsconfig.json
|
|
105
|
+
src/
|
|
106
|
+
main.tsx
|
|
107
|
+
App.tsx
|
|
108
|
+
lib/
|
|
109
|
+
utils.ts # shadcn utils
|
|
110
|
+
api.ts # Fetch wrapper for server API
|
|
111
|
+
components/
|
|
112
|
+
ui/ # shadcn components
|
|
113
|
+
flags-table.tsx
|
|
114
|
+
flag-detail.tsx
|
|
115
|
+
variant-card.tsx
|
|
116
|
+
metrics-chart.tsx
|
|
117
|
+
pages/
|
|
118
|
+
overview.tsx # /
|
|
119
|
+
flag.tsx # /flags/:name
|
|
120
|
+
|
|
121
|
+
tests/
|
|
122
|
+
conftest.py
|
|
123
|
+
test_engine.py
|
|
124
|
+
test_storage.py
|
|
125
|
+
test_assignment.py
|
|
126
|
+
test_aggregator.py
|
|
127
|
+
test_integration.py
|
|
128
|
+
|
|
129
|
+
pyproject.toml
|
|
130
|
+
Dockerfile
|
|
131
|
+
docker-compose.yml
|
|
132
|
+
README.md
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
---
|
|
136
|
+
|
|
137
|
+
## Data Model
|
|
138
|
+
|
|
139
|
+
### assignments
|
|
140
|
+
|
|
141
|
+
| Column | Type | Description |
|
|
142
|
+
| ------------- | ----------- | ----------------------- |
|
|
143
|
+
| assignment_id | UUID PK | Unique ID |
|
|
144
|
+
| flag_name | TEXT | Flag identifier |
|
|
145
|
+
| variant_name | TEXT | Assigned variant |
|
|
146
|
+
| user_id | TEXT | User identifier |
|
|
147
|
+
| session_id | TEXT | Optional session |
|
|
148
|
+
| config_json | JSONB | Variant config snapshot |
|
|
149
|
+
| assigned_at | TIMESTAMPTZ | When assigned |
|
|
150
|
+
|
|
151
|
+
### executions (1:1 with assignments)
|
|
152
|
+
|
|
153
|
+
| Column | Type | Description |
|
|
154
|
+
| ------------- | ----------- | ------------------- |
|
|
155
|
+
| assignment_id | UUID PK FK | Links to assignment |
|
|
156
|
+
| latency_ms | FLOAT | Wall-clock latency |
|
|
157
|
+
| input_tokens | INT | Input token count |
|
|
158
|
+
| output_tokens | INT | Output token count |
|
|
159
|
+
| cost | FLOAT | Dollar cost |
|
|
160
|
+
| error | TEXT | Exception message |
|
|
161
|
+
| metadata_json | JSONB | Arbitrary metadata |
|
|
162
|
+
| recorded_at | TIMESTAMPTZ | When recorded |
|
|
163
|
+
|
|
164
|
+
### events (1:N with assignments)
|
|
165
|
+
|
|
166
|
+
| Column | Type | Description |
|
|
167
|
+
| ------------- | ----------- | -------------------------- |
|
|
168
|
+
| event_id | UUID PK | Unique ID |
|
|
169
|
+
| assignment_id | UUID FK | Links to assignment |
|
|
170
|
+
| event_type | TEXT | success / failure / custom |
|
|
171
|
+
| event_name | TEXT | For custom events |
|
|
172
|
+
| payload_json | JSONB | Optional payload |
|
|
173
|
+
| created_at | TIMESTAMPTZ | When created |
|
|
174
|
+
|
|
175
|
+
### Indexes
|
|
176
|
+
|
|
177
|
+
- `(flag_name, variant_name)` on assignments
|
|
178
|
+
- `(flag_name, user_id)` on assignments
|
|
179
|
+
- `(assigned_at)` on assignments
|
|
180
|
+
- `(assignment_id)` on events
|
|
181
|
+
- `(assignment_id, event_type)` on events
|
|
182
|
+
|
|
183
|
+
---
|
|
184
|
+
|
|
185
|
+
## Server API
|
|
186
|
+
|
|
187
|
+
### Ingestion (from SDK)
|
|
188
|
+
|
|
189
|
+
```
|
|
190
|
+
POST /api/v1/ingest/assignments body: AssignmentRecord[]
|
|
191
|
+
POST /api/v1/ingest/executions body: ExecutionRecord[]
|
|
192
|
+
POST /api/v1/ingest/events body: EventRecord[]
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
All accept batches. Authenticated via `X-API-Key` header.
|
|
196
|
+
|
|
197
|
+
### Dashboard API
|
|
198
|
+
|
|
199
|
+
```
|
|
200
|
+
GET /api/v1/flags → list of flags with summary stats
|
|
201
|
+
GET /api/v1/flags/{name} → detailed per-variant evaluation
|
|
202
|
+
GET /api/v1/flags/{name}/timeline → time-series metrics for charts
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
---
|
|
206
|
+
|
|
207
|
+
## Dashboard Pages
|
|
208
|
+
|
|
209
|
+
### Flags Overview (`/`)
|
|
210
|
+
|
|
211
|
+
- Table: flag name, variants, total assignments, success rate, status
|
|
212
|
+
- Click row → detail page
|
|
213
|
+
|
|
214
|
+
### Flag Detail (`/flags/:name`)
|
|
215
|
+
|
|
216
|
+
- Variant comparison cards (assignments, success rate, avg latency, avg cost)
|
|
217
|
+
- Charts (recharts):
|
|
218
|
+
- Success rate bar chart
|
|
219
|
+
- Latency comparison
|
|
220
|
+
- Cost comparison
|
|
221
|
+
- Assignments over time
|
|
222
|
+
- Events breakdown table
|
|
223
|
+
|
|
224
|
+
---
|
|
225
|
+
|
|
226
|
+
## Key Design Decisions
|
|
227
|
+
|
|
228
|
+
| Decision | Choice | Why |
|
|
229
|
+
| ---------------- | --------------------------------------------------------- | ------------------------------------------ |
|
|
230
|
+
| LLM coupling | None — `assign()` returns config, dev calls their own LLM | Works with any stack |
|
|
231
|
+
| Events target | Assignment object only | Explicit, no ambiguity with multi-flag |
|
|
232
|
+
| Hashing | md5 (stdlib) | Zero deps, excellent distribution |
|
|
233
|
+
| Bucket count | 10,000 | 0.01% rollout granularity |
|
|
234
|
+
| Storage errors | Log warning, don't crash | Feature flags are auxiliary |
|
|
235
|
+
| Python version | 3.10+ | Modern type hints |
|
|
236
|
+
| SDK dependencies | Zero | Maximum adoptability |
|
|
237
|
+
| Server DB | PostgreSQL | Production-grade, JSONB, proper timestamps |
|
|
238
|
+
| Dashboard | React + shadcn/ui | Polished components, good DX |
|
|
239
|
+
| Deployment | Single Dockerfile, docker-compose | One command to self-host |
|
|
240
|
+
|
|
241
|
+
---
|
|
242
|
+
|
|
243
|
+
## Excluded from v1
|
|
244
|
+
|
|
245
|
+
- Async SDK
|
|
246
|
+
- LLM provider integrations
|
|
247
|
+
- Statistical significance / p-values
|
|
248
|
+
- Bayesian testing
|
|
249
|
+
- Multi-armed bandits
|
|
250
|
+
- Remote flag management (flags in code, dashboard is read-only)
|
|
251
|
+
- User auth in dashboard (API key only)
|
|
252
|
+
- Real-time streaming
|
modelab-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: modelab
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Provider-agnostic A/B testing for LLM systems
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
Keywords: ab-testing,experiments,feature-flags,llm
|
|
7
|
+
Requires-Python: >=3.10
|
|
8
|
+
Provides-Extra: dev
|
|
9
|
+
Requires-Dist: httpx>=0.27; extra == 'dev'
|
|
10
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
|
|
11
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
12
|
+
Provides-Extra: server
|
|
13
|
+
Requires-Dist: fastapi>=0.110; extra == 'server'
|
|
14
|
+
Requires-Dist: psycopg[binary,pool]>=3.1; extra == 'server'
|
|
15
|
+
Requires-Dist: uvicorn[standard]>=0.29; extra == 'server'
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
|
|
18
|
+
# modelab
|
|
19
|
+
|
|
20
|
+
Provider-agnostic A/B testing for LLM systems in production.
|
|
21
|
+
|
|
22
|
+
**Two components:**
|
|
23
|
+
1. **Python SDK** — zero-dependency library for assignment, tracking, and evaluation
|
|
24
|
+
2. **Server + Dashboard** — self-hosted FastAPI + React app for visualization (Docker Compose)
|
|
25
|
+
|
|
26
|
+
## Quick Start
|
|
27
|
+
|
|
28
|
+
### SDK (local development)
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
pip install modelab
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
```python
|
|
35
|
+
import modelab
|
|
36
|
+
from modelab import Flag, Variant, EvalContext
|
|
37
|
+
|
|
38
|
+
# Initialize — point to the modelab server
|
|
39
|
+
modelab.init(
|
|
40
|
+
server="http://localhost:8100",
|
|
41
|
+
flags=[
|
|
42
|
+
Flag(
|
|
43
|
+
name="summarizer_v2",
|
|
44
|
+
variants=[
|
|
45
|
+
Variant("control", weight=50, config={"model": "gpt-3.5-turbo", "prompt": "Summarize: {input}"}),
|
|
46
|
+
Variant("treatment", weight=50, config={"model": "gpt-4", "prompt": "Concisely summarize: {input}"}),
|
|
47
|
+
],
|
|
48
|
+
rollout_pct=100,
|
|
49
|
+
),
|
|
50
|
+
],
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
# Assign a variant
|
|
54
|
+
ctx = EvalContext(user_id="user_123", session_id="abc")
|
|
55
|
+
assignment = modelab.assign("summarizer_v2", ctx)
|
|
56
|
+
|
|
57
|
+
if assignment is None:
|
|
58
|
+
# Outside rollout — use default behavior
|
|
59
|
+
response = call_llm(model="gpt-3.5-turbo", prompt=text)
|
|
60
|
+
else:
|
|
61
|
+
# In experiment — use assigned variant config
|
|
62
|
+
response = call_llm(
|
|
63
|
+
model=assignment.config["model"],
|
|
64
|
+
prompt=assignment.config["prompt"].format(input=text),
|
|
65
|
+
)
|
|
66
|
+
assignment.record(response, cost=0.013)
|
|
67
|
+
assignment.mark_success()
|
|
68
|
+
|
|
69
|
+
# Evaluate results
|
|
70
|
+
results = modelab.evaluate("summarizer_v2")
|
|
71
|
+
print(results)
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
### Self-Hosted Server + Dashboard
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
docker compose up
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
This starts:
|
|
81
|
+
- **PostgreSQL** on port 5432
|
|
82
|
+
- **modelab server + dashboard** on port 8100
|
|
83
|
+
|
|
84
|
+
## Concepts
|
|
85
|
+
|
|
86
|
+
### Flags
|
|
87
|
+
An experiment with one or more variants and a rollout percentage (0-100%).
|
|
88
|
+
|
|
89
|
+
### Variants
|
|
90
|
+
Each variant has a name, weight (for traffic splitting), and a config dict you use to parameterize your LLM calls.
|
|
91
|
+
|
|
92
|
+
### Assignment
|
|
93
|
+
Deterministic — the same `(flag_name, user_id)` always maps to the same variant. Uses MD5 hashing into 10,000 buckets for 0.01% rollout granularity.
|
|
94
|
+
|
|
95
|
+
### Recording
|
|
96
|
+
|
|
97
|
+
Use `assignment.record(response)` to capture execution metrics. Token counts are automatically extracted from the response object via duck-typing (supports OpenAI and Anthropic response formats). Cost, latency, error, and arbitrary metadata can be passed as keyword arguments:
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
assignment.record(response, cost=0.013, latency_ms=250.0, model="gpt-4o")
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
You can also record without a response object:
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
assignment.record(input_tokens=50, output_tokens=100, cost=0.01)
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
### Events
|
|
110
|
+
Mark assignments as success/failure or record custom events (e.g., "copied", "thumbs_up").
|
|
111
|
+
|
|
112
|
+
### Evaluation
|
|
113
|
+
`modelab.evaluate(flag_name)` returns per-variant metrics: success rate, avg latency, avg cost, token usage, and custom event counts.
|
|
114
|
+
|
|
115
|
+
## Server API
|
|
116
|
+
|
|
117
|
+
### Ingestion (from SDK)
|
|
118
|
+
```
|
|
119
|
+
POST /api/v1/ingest/assignments (batch)
|
|
120
|
+
POST /api/v1/ingest/executions (batch)
|
|
121
|
+
POST /api/v1/ingest/events (batch)
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
### Dashboard API
|
|
125
|
+
```
|
|
126
|
+
GET /api/v1/flags — list flags with summary stats
|
|
127
|
+
GET /api/v1/flags/{name} — detailed per-variant evaluation
|
|
128
|
+
GET /api/v1/flags/{name}/timeline — time-series metrics
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
## Development
|
|
132
|
+
|
|
133
|
+
```bash
|
|
134
|
+
# Install in dev mode
|
|
135
|
+
pip install -e ".[dev]"
|
|
136
|
+
|
|
137
|
+
# Run tests
|
|
138
|
+
pytest
|
|
139
|
+
|
|
140
|
+
# Run dashboard dev server
|
|
141
|
+
cd dashboard && npm install && npm run dev
|
|
142
|
+
|
|
143
|
+
# Run API server (requires Postgres)
|
|
144
|
+
uvicorn server.app:app --reload --port 8100
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
## Architecture
|
|
148
|
+
|
|
149
|
+
```
|
|
150
|
+
Developer's App
|
|
151
|
+
│
|
|
152
|
+
├── modelab SDK (pip install modelab)
|
|
153
|
+
│ └── ServerStorage ──HTTP POST──▶ modelab-server
|
|
154
|
+
│
|
|
155
|
+
modelab-server (docker compose up)
|
|
156
|
+
├── FastAPI backend
|
|
157
|
+
├── React dashboard (served as static files)
|
|
158
|
+
└── PostgreSQL
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
## License
|
|
162
|
+
|
|
163
|
+
MIT
|