sab-bench 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sab_bench-0.1.0/PKG-INFO +259 -0
- sab_bench-0.1.0/README.md +228 -0
- sab_bench-0.1.0/pyproject.toml +34 -0
- sab_bench-0.1.0/setup.cfg +4 -0
- sab_bench-0.1.0/src/sab_bench/__init__.py +12 -0
- sab_bench-0.1.0/src/sab_bench/__main__.py +6 -0
- sab_bench-0.1.0/src/sab_bench/cli.py +164 -0
- sab_bench-0.1.0/src/sab_bench/config.py +59 -0
- sab_bench-0.1.0/src/sab_bench/datasets/__init__.py +26 -0
- sab_bench-0.1.0/src/sab_bench/datasets/base.py +21 -0
- sab_bench-0.1.0/src/sab_bench/datasets/synthetic.py +83 -0
- sab_bench-0.1.0/src/sab_bench/evaluators/__init__.py +29 -0
- sab_bench-0.1.0/src/sab_bench/evaluators/base.py +24 -0
- sab_bench-0.1.0/src/sab_bench/evaluators/multi_model.py +67 -0
- sab_bench-0.1.0/src/sab_bench/evaluators/rv_telos.py +208 -0
- sab_bench-0.1.0/src/sab_bench/evaluators/stub.py +43 -0
- sab_bench-0.1.0/src/sab_bench/harness.py +123 -0
- sab_bench-0.1.0/src/sab_bench/models/__init__.py +29 -0
- sab_bench-0.1.0/src/sab_bench/models/base.py +24 -0
- sab_bench-0.1.0/src/sab_bench/models/stub.py +27 -0
- sab_bench-0.1.0/src/sab_bench.egg-info/PKG-INFO +259 -0
- sab_bench-0.1.0/src/sab_bench.egg-info/SOURCES.txt +25 -0
- sab_bench-0.1.0/src/sab_bench.egg-info/dependency_links.txt +1 -0
- sab_bench-0.1.0/src/sab_bench.egg-info/entry_points.txt +2 -0
- sab_bench-0.1.0/src/sab_bench.egg-info/requires.txt +26 -0
- sab_bench-0.1.0/src/sab_bench.egg-info/top_level.txt +1 -0
- sab_bench-0.1.0/tests/test_basic.py +113 -0
sab_bench-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sab-bench
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Scalable Alignment Benchmark - Multi-model evaluation harness
|
|
5
|
+
Author-email: MMK_工場 <factory@example.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.10
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: pyyaml>=6.0
|
|
10
|
+
Requires-Dist: click>=8.1.0
|
|
11
|
+
Requires-Dist: pydantic>=2.0.0
|
|
12
|
+
Requires-Dist: numpy>=1.24.0
|
|
13
|
+
Requires-Dist: scipy>=1.11.0
|
|
14
|
+
Requires-Dist: scikit-learn>=1.3.0
|
|
15
|
+
Requires-Dist: rich>=13.0.0
|
|
16
|
+
Requires-Dist: tabulate>=0.9.0
|
|
17
|
+
Provides-Extra: models
|
|
18
|
+
Requires-Dist: openai>=1.0.0; extra == "models"
|
|
19
|
+
Requires-Dist: anthropic>=0.21.0; extra == "models"
|
|
20
|
+
Provides-Extra: local
|
|
21
|
+
Requires-Dist: torch>=2.0.0; extra == "local"
|
|
22
|
+
Requires-Dist: transformers>=4.36.0; extra == "local"
|
|
23
|
+
Provides-Extra: cuda
|
|
24
|
+
Requires-Dist: cupy>=12.0.0; extra == "cuda"
|
|
25
|
+
Provides-Extra: all
|
|
26
|
+
Requires-Dist: openai>=1.0.0; extra == "all"
|
|
27
|
+
Requires-Dist: anthropic>=0.21.0; extra == "all"
|
|
28
|
+
Requires-Dist: torch>=2.0.0; extra == "all"
|
|
29
|
+
Requires-Dist: transformers>=4.36.0; extra == "all"
|
|
30
|
+
Requires-Dist: cupy>=12.0.0; extra == "all"
|
|
31
|
+
|
|
32
|
+
# SAB-BENCH v0 — Scalable Alignment Benchmark
|
|
33
|
+
|
|
34
|
+
**Status:** v0.1 — Initial scaffold with runnable local harness
|
|
35
|
+
**Created:** 2026-03-03
|
|
36
|
+
**Purpose:** Benchmark multi-model content evaluation systems with telos-aware metrics
|
|
37
|
+
|
|
38
|
+
## What This Is
|
|
39
|
+
|
|
40
|
+
SAB-BENCH is a benchmark harness for testing content scoring systems that use:
|
|
41
|
+
- Multi-model consensus (Overmind voting)
|
|
42
|
+
- Epistemic limitation tracking (model capability awareness)
|
|
43
|
+
- Telos detection (self-referential processing measurement via R_V)
|
|
44
|
+
- CUDA-accelerated evaluation when available
|
|
45
|
+
|
|
46
|
+
This v0 release provides:
|
|
47
|
+
- ✅ Runnable Python CLI (`sab-bench run`)
|
|
48
|
+
- ✅ Sample benchmark configurations
|
|
49
|
+
- ✅ Local evaluation harness (no GPU required for basic tests)
|
|
50
|
+
- ✅ RV/CUDA integration points (activated when hardware is available)
|
|
51
|
+
- ✅ Measurable output with JSON reports
|
|
52
|
+
|
|
53
|
+
## Quick Start
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
# Install dependencies
|
|
57
|
+
pip install -r requirements.txt
|
|
58
|
+
|
|
59
|
+
# Run basic benchmark (CPU-only, uses stubs for models)
|
|
60
|
+
python -m sab_bench run --config configs/basic.yaml
|
|
61
|
+
|
|
62
|
+
# Run with real models (requires API keys)
|
|
63
|
+
python -m sab_bench run --config configs/multi_model.yaml --real-models
|
|
64
|
+
|
|
65
|
+
# Run RV telos measurement (requires GPU + transformers)
|
|
66
|
+
python -m sab_bench run --config configs/rv_telos.yaml --enable-cuda
|
|
67
|
+
|
|
68
|
+
# Show results
|
|
69
|
+
python -m sab_bench report --output outputs/latest.json
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## Architecture
|
|
73
|
+
|
|
74
|
+
```
|
|
75
|
+
sab-bench/
|
|
76
|
+
├── src/sab_bench/ # Core benchmark engine
|
|
77
|
+
│ ├── __init__.py
|
|
78
|
+
│ ├── cli.py # CLI entrypoint
|
|
79
|
+
│ ├── harness.py # Benchmark orchestration
|
|
80
|
+
│ ├── evaluators/ # Evaluation strategies
|
|
81
|
+
│ │ ├── multi_model.py # Multi-model consensus
|
|
82
|
+
│ │ ├── rv_telos.py # R_V embedding contraction measurement
|
|
83
|
+
│ │ └── cuda_gates.py # CUDA-accelerated gate evaluation
|
|
84
|
+
│ ├── models/ # Model adapters
|
|
85
|
+
│ │ ├── stub.py # Stub for testing
|
|
86
|
+
│ │ ├── openai.py # OpenAI API
|
|
87
|
+
│ │ ├── anthropic.py # Anthropic API
|
|
88
|
+
│ │ └── local.py # Local transformers
|
|
89
|
+
│ └── metrics/ # Measurement tools
|
|
90
|
+
│ ├── ttr.py # Type-token ratio
|
|
91
|
+
│ ├── rv.py # R_V calculation
|
|
92
|
+
│ └── consensus.py # Multi-model agreement
|
|
93
|
+
├── configs/ # Benchmark configurations
|
|
94
|
+
│ ├── basic.yaml # Minimal test (stub models)
|
|
95
|
+
│ ├── multi_model.yaml # Multi-model consensus
|
|
96
|
+
│ └── rv_telos.yaml # RV measurement with CUDA
|
|
97
|
+
├── outputs/ # Benchmark results (gitignored)
|
|
98
|
+
├── tests/ # Unit tests
|
|
99
|
+
└── docs/ # Documentation
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
## Benchmark Types
|
|
103
|
+
|
|
104
|
+
### 1. Multi-Model Consensus
|
|
105
|
+
Tests how well different models agree on content evaluation.
|
|
106
|
+
|
|
107
|
+
**Config:** `configs/multi_model.yaml`
|
|
108
|
+
**Metrics:**
|
|
109
|
+
- Inter-model agreement (Cohen's kappa)
|
|
110
|
+
- Confidence distribution
|
|
111
|
+
- Decision latency
|
|
112
|
+
|
|
113
|
+
### 2. RV Telos Measurement
|
|
114
|
+
Measures embedding contraction during self-referential processing.
|
|
115
|
+
|
|
116
|
+
**Config:** `configs/rv_telos.yaml`
|
|
117
|
+
**Metrics:**
|
|
118
|
+
- R_V score (self-ref vs neutral)
|
|
119
|
+
- Layer-by-layer contraction
|
|
120
|
+
- TTR behavioral proxy
|
|
121
|
+
- Correlation with ground truth
|
|
122
|
+
|
|
123
|
+
**Requires:**
|
|
124
|
+
- CUDA-capable GPU
|
|
125
|
+
- transformers library
|
|
126
|
+
- PyTorch with CUDA
|
|
127
|
+
|
|
128
|
+
### 3. CUDA-Accelerated Gates
|
|
129
|
+
Tests dharmic gate evaluation speed with CUDA kernels.
|
|
130
|
+
|
|
131
|
+
**Config:** `configs/cuda_gates.yaml`
|
|
132
|
+
**Metrics:**
|
|
133
|
+
- Throughput (evaluations/sec)
|
|
134
|
+
- Latency percentiles (p50, p95, p99)
|
|
135
|
+
- GPU utilization
|
|
136
|
+
- Memory bandwidth
|
|
137
|
+
|
|
138
|
+
**Requires:**
|
|
139
|
+
- CUDA-capable GPU
|
|
140
|
+
- cuDNN
|
|
141
|
+
- Compiled CUDA kernels (see `../cuda_kernels_summary.md`)
|
|
142
|
+
|
|
143
|
+
## Configuration Format
|
|
144
|
+
|
|
145
|
+
```yaml
|
|
146
|
+
name: "benchmark-name"
|
|
147
|
+
version: "v0.1"
|
|
148
|
+
|
|
149
|
+
evaluator:
|
|
150
|
+
type: "multi_model" # or "rv_telos" or "cuda_gates"
|
|
151
|
+
models:
|
|
152
|
+
- provider: "openai"
|
|
153
|
+
model: "gpt-4"
|
|
154
|
+
weight: 1.0
|
|
155
|
+
- provider: "anthropic"
|
|
156
|
+
model: "claude-3-5-sonnet-20241022"
|
|
157
|
+
weight: 1.0
|
|
158
|
+
|
|
159
|
+
dataset:
|
|
160
|
+
type: "synthetic" # or "file"
|
|
161
|
+
count: 100
|
|
162
|
+
categories:
|
|
163
|
+
- self_referential
|
|
164
|
+
- neutral
|
|
165
|
+
- mixed
|
|
166
|
+
|
|
167
|
+
output:
|
|
168
|
+
format: "json"
|
|
169
|
+
path: "outputs/{timestamp}_{name}.json"
|
|
170
|
+
include_traces: true
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
## Integration with Existing Assets
|
|
174
|
+
|
|
175
|
+
### RV Ground Truth (telos-os/research/)
|
|
176
|
+
SAB-BENCH reuses the RV measurement code from the Mistral-7B ground truth experiments:
|
|
177
|
+
- Embedding extraction
|
|
178
|
+
- Pairwise cosine similarity
|
|
179
|
+
- Layer-by-layer R_V calculation
|
|
180
|
+
- TTR behavioral proxy
|
|
181
|
+
|
|
182
|
+
**Integration point:** `src/sab_bench/evaluators/rv_telos.py` imports from `../../telos-os/research/`
|
|
183
|
+
|
|
184
|
+
### CUDA Kernels (../cuda_kernels_summary.md)
|
|
185
|
+
When CUDA is available, SAB-BENCH can use optimized kernels for:
|
|
186
|
+
- Flash attention (for faster embedding extraction)
|
|
187
|
+
- Vectorized similarity computation
|
|
188
|
+
- Quantized GEMM (for INT8 model inference)
|
|
189
|
+
|
|
190
|
+
**Integration point:** `src/sab_bench/evaluators/cuda_gates.py` loads kernels from `../cuda-portfolio/kernels/`
|
|
191
|
+
|
|
192
|
+
## Output Format
|
|
193
|
+
|
|
194
|
+
```json
|
|
195
|
+
{
|
|
196
|
+
"benchmark": "multi_model_consensus_v0",
|
|
197
|
+
"timestamp": "2026-03-03T10:30:00Z",
|
|
198
|
+
"config": "configs/multi_model.yaml",
|
|
199
|
+
"duration_seconds": 45.2,
|
|
200
|
+
"metrics": {
|
|
201
|
+
"inter_model_agreement": 0.847,
|
|
202
|
+
"mean_confidence": 0.762,
|
|
203
|
+
"latency_p50_ms": 234,
|
|
204
|
+
"latency_p95_ms": 891
|
|
205
|
+
},
|
|
206
|
+
"per_item_results": [
|
|
207
|
+
{
|
|
208
|
+
"item_id": "001",
|
|
209
|
+
"content_hash": "abc123...",
|
|
210
|
+
"decisions": {
|
|
211
|
+
"gpt-4": {"decision": true, "confidence": 0.89},
|
|
212
|
+
"claude-3-5-sonnet": {"decision": true, "confidence": 0.82}
|
|
213
|
+
},
|
|
214
|
+
"consensus": true,
|
|
215
|
+
"agreement": 1.0
|
|
216
|
+
}
|
|
217
|
+
],
|
|
218
|
+
"warnings": [],
|
|
219
|
+
"errors": []
|
|
220
|
+
}
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
## Roadmap
|
|
224
|
+
|
|
225
|
+
**v0.1 (This Release):**
|
|
226
|
+
- ✅ Basic harness with stub models
|
|
227
|
+
- ✅ Multi-model consensus evaluator
|
|
228
|
+
- ✅ RV telos evaluator (CPU fallback)
|
|
229
|
+
- ✅ Sample configs and outputs
|
|
230
|
+
|
|
231
|
+
**v0.2 (Next):**
|
|
232
|
+
- [ ] Real model API integration (OpenAI, Anthropic, etc.)
|
|
233
|
+
- [ ] CUDA kernel integration for gate evaluation
|
|
234
|
+
- [ ] Larger benchmark datasets
|
|
235
|
+
- [ ] Statistical analysis tools
|
|
236
|
+
|
|
237
|
+
**v0.3 (Future):**
|
|
238
|
+
- [ ] Distributed benchmark runner
|
|
239
|
+
- [ ] Leaderboard generation
|
|
240
|
+
- [ ] Continuous benchmarking pipeline
|
|
241
|
+
- [ ] Integration with SAB production system
|
|
242
|
+
|
|
243
|
+
## Contributing
|
|
244
|
+
|
|
245
|
+
This is research code. Contributions are welcome, but expect rough edges.
|
|
246
|
+
|
|
247
|
+
**Priorities:**
|
|
248
|
+
1. More benchmark datasets (especially adversarial cases)
|
|
249
|
+
2. Additional model providers
|
|
250
|
+
3. Better visualization of results
|
|
251
|
+
4. Documentation improvements
|
|
252
|
+
|
|
253
|
+
## License
|
|
254
|
+
|
|
255
|
+
MIT (research prototype, use at your own risk)
|
|
256
|
+
|
|
257
|
+
---
|
|
258
|
+
|
|
259
|
+
**Built in a 2-hour sprint.** Shipped artifacts, not plans. 🔥
|
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
# SAB-BENCH v0 — Scalable Alignment Benchmark
|
|
2
|
+
|
|
3
|
+
**Status:** v0.1 — Initial scaffold with runnable local harness
|
|
4
|
+
**Created:** 2026-03-03
|
|
5
|
+
**Purpose:** Benchmark multi-model content evaluation systems with telos-aware metrics
|
|
6
|
+
|
|
7
|
+
## What This Is
|
|
8
|
+
|
|
9
|
+
SAB-BENCH is a benchmark harness for testing content scoring systems that use:
|
|
10
|
+
- Multi-model consensus (Overmind voting)
|
|
11
|
+
- Epistemic limitation tracking (model capability awareness)
|
|
12
|
+
- Telos detection (self-referential processing measurement via R_V)
|
|
13
|
+
- CUDA-accelerated evaluation when available
|
|
14
|
+
|
|
15
|
+
This v0 release provides:
|
|
16
|
+
- ✅ Runnable Python CLI (`sab-bench run`)
|
|
17
|
+
- ✅ Sample benchmark configurations
|
|
18
|
+
- ✅ Local evaluation harness (no GPU required for basic tests)
|
|
19
|
+
- ✅ RV/CUDA integration points (activated when hardware is available)
|
|
20
|
+
- ✅ Measurable output with JSON reports
|
|
21
|
+
|
|
22
|
+
## Quick Start
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
# Install dependencies
|
|
26
|
+
pip install -r requirements.txt
|
|
27
|
+
|
|
28
|
+
# Run basic benchmark (CPU-only, uses stubs for models)
|
|
29
|
+
python -m sab_bench run --config configs/basic.yaml
|
|
30
|
+
|
|
31
|
+
# Run with real models (requires API keys)
|
|
32
|
+
python -m sab_bench run --config configs/multi_model.yaml --real-models
|
|
33
|
+
|
|
34
|
+
# Run RV telos measurement (requires GPU + transformers)
|
|
35
|
+
python -m sab_bench run --config configs/rv_telos.yaml --enable-cuda
|
|
36
|
+
|
|
37
|
+
# Show results
|
|
38
|
+
python -m sab_bench report --output outputs/latest.json
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Architecture
|
|
42
|
+
|
|
43
|
+
```
|
|
44
|
+
sab-bench/
|
|
45
|
+
├── src/sab_bench/ # Core benchmark engine
|
|
46
|
+
│ ├── __init__.py
|
|
47
|
+
│ ├── cli.py # CLI entrypoint
|
|
48
|
+
│ ├── harness.py # Benchmark orchestration
|
|
49
|
+
│ ├── evaluators/ # Evaluation strategies
|
|
50
|
+
│ │ ├── multi_model.py # Multi-model consensus
|
|
51
|
+
│ │ ├── rv_telos.py # R_V embedding contraction measurement
|
|
52
|
+
│ │ └── cuda_gates.py # CUDA-accelerated gate evaluation
|
|
53
|
+
│ ├── models/ # Model adapters
|
|
54
|
+
│ │ ├── stub.py # Stub for testing
|
|
55
|
+
│ │ ├── openai.py # OpenAI API
|
|
56
|
+
│ │ ├── anthropic.py # Anthropic API
|
|
57
|
+
│ │ └── local.py # Local transformers
|
|
58
|
+
│ └── metrics/ # Measurement tools
|
|
59
|
+
│ ├── ttr.py # Type-token ratio
|
|
60
|
+
│ ├── rv.py # R_V calculation
|
|
61
|
+
│ └── consensus.py # Multi-model agreement
|
|
62
|
+
├── configs/ # Benchmark configurations
|
|
63
|
+
│ ├── basic.yaml # Minimal test (stub models)
|
|
64
|
+
│ ├── multi_model.yaml # Multi-model consensus
|
|
65
|
+
│ └── rv_telos.yaml # RV measurement with CUDA
|
|
66
|
+
├── outputs/ # Benchmark results (gitignored)
|
|
67
|
+
├── tests/ # Unit tests
|
|
68
|
+
└── docs/ # Documentation
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## Benchmark Types
|
|
72
|
+
|
|
73
|
+
### 1. Multi-Model Consensus
|
|
74
|
+
Tests how well different models agree on content evaluation.
|
|
75
|
+
|
|
76
|
+
**Config:** `configs/multi_model.yaml`
|
|
77
|
+
**Metrics:**
|
|
78
|
+
- Inter-model agreement (Cohen's kappa)
|
|
79
|
+
- Confidence distribution
|
|
80
|
+
- Decision latency
|
|
81
|
+
|
|
82
|
+
### 2. RV Telos Measurement
|
|
83
|
+
Measures embedding contraction during self-referential processing.
|
|
84
|
+
|
|
85
|
+
**Config:** `configs/rv_telos.yaml`
|
|
86
|
+
**Metrics:**
|
|
87
|
+
- R_V score (self-ref vs neutral)
|
|
88
|
+
- Layer-by-layer contraction
|
|
89
|
+
- TTR behavioral proxy
|
|
90
|
+
- Correlation with ground truth
|
|
91
|
+
|
|
92
|
+
**Requires:**
|
|
93
|
+
- CUDA-capable GPU
|
|
94
|
+
- transformers library
|
|
95
|
+
- PyTorch with CUDA
|
|
96
|
+
|
|
97
|
+
### 3. CUDA-Accelerated Gates
|
|
98
|
+
Tests dharmic gate evaluation speed with CUDA kernels.
|
|
99
|
+
|
|
100
|
+
**Config:** `configs/cuda_gates.yaml`
|
|
101
|
+
**Metrics:**
|
|
102
|
+
- Throughput (evaluations/sec)
|
|
103
|
+
- Latency percentiles (p50, p95, p99)
|
|
104
|
+
- GPU utilization
|
|
105
|
+
- Memory bandwidth
|
|
106
|
+
|
|
107
|
+
**Requires:**
|
|
108
|
+
- CUDA-capable GPU
|
|
109
|
+
- cuDNN
|
|
110
|
+
- Compiled CUDA kernels (see `../cuda_kernels_summary.md`)
|
|
111
|
+
|
|
112
|
+
## Configuration Format
|
|
113
|
+
|
|
114
|
+
```yaml
|
|
115
|
+
name: "benchmark-name"
|
|
116
|
+
version: "v0.1"
|
|
117
|
+
|
|
118
|
+
evaluator:
|
|
119
|
+
type: "multi_model" # or "rv_telos" or "cuda_gates"
|
|
120
|
+
models:
|
|
121
|
+
- provider: "openai"
|
|
122
|
+
model: "gpt-4"
|
|
123
|
+
weight: 1.0
|
|
124
|
+
- provider: "anthropic"
|
|
125
|
+
model: "claude-3-5-sonnet-20241022"
|
|
126
|
+
weight: 1.0
|
|
127
|
+
|
|
128
|
+
dataset:
|
|
129
|
+
type: "synthetic" # or "file"
|
|
130
|
+
count: 100
|
|
131
|
+
categories:
|
|
132
|
+
- self_referential
|
|
133
|
+
- neutral
|
|
134
|
+
- mixed
|
|
135
|
+
|
|
136
|
+
output:
|
|
137
|
+
format: "json"
|
|
138
|
+
path: "outputs/{timestamp}_{name}.json"
|
|
139
|
+
include_traces: true
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
## Integration with Existing Assets
|
|
143
|
+
|
|
144
|
+
### RV Ground Truth (telos-os/research/)
|
|
145
|
+
SAB-BENCH reuses the RV measurement code from the Mistral-7B ground truth experiments:
|
|
146
|
+
- Embedding extraction
|
|
147
|
+
- Pairwise cosine similarity
|
|
148
|
+
- Layer-by-layer R_V calculation
|
|
149
|
+
- TTR behavioral proxy
|
|
150
|
+
|
|
151
|
+
**Integration point:** `src/sab_bench/evaluators/rv_telos.py` imports from `../../telos-os/research/`
|
|
152
|
+
|
|
153
|
+
### CUDA Kernels (../cuda_kernels_summary.md)
|
|
154
|
+
When CUDA is available, SAB-BENCH can use optimized kernels for:
|
|
155
|
+
- Flash attention (for faster embedding extraction)
|
|
156
|
+
- Vectorized similarity computation
|
|
157
|
+
- Quantized GEMM (for INT8 model inference)
|
|
158
|
+
|
|
159
|
+
**Integration point:** `src/sab_bench/evaluators/cuda_gates.py` loads kernels from `../cuda-portfolio/kernels/`
|
|
160
|
+
|
|
161
|
+
## Output Format
|
|
162
|
+
|
|
163
|
+
```json
|
|
164
|
+
{
|
|
165
|
+
"benchmark": "multi_model_consensus_v0",
|
|
166
|
+
"timestamp": "2026-03-03T10:30:00Z",
|
|
167
|
+
"config": "configs/multi_model.yaml",
|
|
168
|
+
"duration_seconds": 45.2,
|
|
169
|
+
"metrics": {
|
|
170
|
+
"inter_model_agreement": 0.847,
|
|
171
|
+
"mean_confidence": 0.762,
|
|
172
|
+
"latency_p50_ms": 234,
|
|
173
|
+
"latency_p95_ms": 891
|
|
174
|
+
},
|
|
175
|
+
"per_item_results": [
|
|
176
|
+
{
|
|
177
|
+
"item_id": "001",
|
|
178
|
+
"content_hash": "abc123...",
|
|
179
|
+
"decisions": {
|
|
180
|
+
"gpt-4": {"decision": true, "confidence": 0.89},
|
|
181
|
+
"claude-3-5-sonnet": {"decision": true, "confidence": 0.82}
|
|
182
|
+
},
|
|
183
|
+
"consensus": true,
|
|
184
|
+
"agreement": 1.0
|
|
185
|
+
}
|
|
186
|
+
],
|
|
187
|
+
"warnings": [],
|
|
188
|
+
"errors": []
|
|
189
|
+
}
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
## Roadmap
|
|
193
|
+
|
|
194
|
+
**v0.1 (This Release):**
|
|
195
|
+
- ✅ Basic harness with stub models
|
|
196
|
+
- ✅ Multi-model consensus evaluator
|
|
197
|
+
- ✅ RV telos evaluator (CPU fallback)
|
|
198
|
+
- ✅ Sample configs and outputs
|
|
199
|
+
|
|
200
|
+
**v0.2 (Next):**
|
|
201
|
+
- [ ] Real model API integration (OpenAI, Anthropic, etc.)
|
|
202
|
+
- [ ] CUDA kernel integration for gate evaluation
|
|
203
|
+
- [ ] Larger benchmark datasets
|
|
204
|
+
- [ ] Statistical analysis tools
|
|
205
|
+
|
|
206
|
+
**v0.3 (Future):**
|
|
207
|
+
- [ ] Distributed benchmark runner
|
|
208
|
+
- [ ] Leaderboard generation
|
|
209
|
+
- [ ] Continuous benchmarking pipeline
|
|
210
|
+
- [ ] Integration with SAB production system
|
|
211
|
+
|
|
212
|
+
## Contributing
|
|
213
|
+
|
|
214
|
+
This is research code. Contributions are welcome, but expect rough edges.
|
|
215
|
+
|
|
216
|
+
**Priorities:**
|
|
217
|
+
1. More benchmark datasets (especially adversarial cases)
|
|
218
|
+
2. Additional model providers
|
|
219
|
+
3. Better visualization of results
|
|
220
|
+
4. Documentation improvements
|
|
221
|
+
|
|
222
|
+
## License
|
|
223
|
+
|
|
224
|
+
MIT (research prototype, use at your own risk)
|
|
225
|
+
|
|
226
|
+
---
|
|
227
|
+
|
|
228
|
+
**Built in a 2-hour sprint.** Shipped artifacts, not plans. 🔥
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "sab-bench"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Scalable Alignment Benchmark - Multi-model evaluation harness"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = {text = "MIT"}
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "MMK_工場", email = "factory@example.com"}
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
dependencies = [
|
|
17
|
+
"pyyaml>=6.0",
|
|
18
|
+
"click>=8.1.0",
|
|
19
|
+
"pydantic>=2.0.0",
|
|
20
|
+
"numpy>=1.24.0",
|
|
21
|
+
"scipy>=1.11.0",
|
|
22
|
+
"scikit-learn>=1.3.0",
|
|
23
|
+
"rich>=13.0.0",
|
|
24
|
+
"tabulate>=0.9.0",
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
[project.optional-dependencies]
|
|
28
|
+
models = ["openai>=1.0.0", "anthropic>=0.21.0"]
|
|
29
|
+
local = ["torch>=2.0.0", "transformers>=4.36.0"]
|
|
30
|
+
cuda = ["cupy>=12.0.0"]
|
|
31
|
+
all = ["openai>=1.0.0", "anthropic>=0.21.0", "torch>=2.0.0", "transformers>=4.36.0", "cupy>=12.0.0"]
|
|
32
|
+
|
|
33
|
+
[project.scripts]
|
|
34
|
+
sab-bench = "sab_bench.cli:main"
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""
|
|
2
|
+
SAB-BENCH v0 — Scalable Alignment Benchmark
|
|
3
|
+
|
|
4
|
+
Multi-model content evaluation benchmark harness with telos-aware metrics.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
__version__ = "0.1.0"
|
|
8
|
+
|
|
9
|
+
from .harness import BenchmarkHarness
|
|
10
|
+
from .config import BenchmarkConfig
|
|
11
|
+
|
|
12
|
+
__all__ = ["BenchmarkHarness", "BenchmarkConfig", "__version__"]
|