sab-bench 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,259 @@
1
+ Metadata-Version: 2.4
2
+ Name: sab-bench
3
+ Version: 0.1.0
4
+ Summary: Scalable Alignment Benchmark - Multi-model evaluation harness
5
+ Author-email: MMK_工場 <factory@example.com>
6
+ License: MIT
7
+ Requires-Python: >=3.10
8
+ Description-Content-Type: text/markdown
9
+ Requires-Dist: pyyaml>=6.0
10
+ Requires-Dist: click>=8.1.0
11
+ Requires-Dist: pydantic>=2.0.0
12
+ Requires-Dist: numpy>=1.24.0
13
+ Requires-Dist: scipy>=1.11.0
14
+ Requires-Dist: scikit-learn>=1.3.0
15
+ Requires-Dist: rich>=13.0.0
16
+ Requires-Dist: tabulate>=0.9.0
17
+ Provides-Extra: models
18
+ Requires-Dist: openai>=1.0.0; extra == "models"
19
+ Requires-Dist: anthropic>=0.21.0; extra == "models"
20
+ Provides-Extra: local
21
+ Requires-Dist: torch>=2.0.0; extra == "local"
22
+ Requires-Dist: transformers>=4.36.0; extra == "local"
23
+ Provides-Extra: cuda
24
+ Requires-Dist: cupy>=12.0.0; extra == "cuda"
25
+ Provides-Extra: all
26
+ Requires-Dist: openai>=1.0.0; extra == "all"
27
+ Requires-Dist: anthropic>=0.21.0; extra == "all"
28
+ Requires-Dist: torch>=2.0.0; extra == "all"
29
+ Requires-Dist: transformers>=4.36.0; extra == "all"
30
+ Requires-Dist: cupy>=12.0.0; extra == "all"
31
+
32
+ # SAB-BENCH v0 — Scalable Alignment Benchmark
33
+
34
+ - **Status:** v0.1 — Initial scaffold with runnable local harness
34
+ - **Created:** 2026-03-03
35
+ - **Purpose:** Benchmark multi-model content evaluation systems with telos-aware metrics
37
+
38
+ ## What This Is
39
+
40
+ SAB-BENCH is a benchmark harness for testing content scoring systems that use:
41
+ - Multi-model consensus (Overmind voting)
42
+ - Epistemic limitation tracking (model capability awareness)
43
+ - Telos detection (self-referential processing measurement via R_V)
44
+ - CUDA-accelerated evaluation when available
45
+
46
+ This v0 release provides:
47
+ - ✅ Runnable Python CLI (`sab-bench run`)
48
+ - ✅ Sample benchmark configurations
49
+ - ✅ Local evaluation harness (no GPU required for basic tests)
50
+ - ✅ RV/CUDA integration points (activated when hardware available)
51
+ - ✅ Measurable output with JSON reports
52
+
53
+ ## Quick Start
54
+
55
+ ```bash
56
+ # Install dependencies
57
+ pip install -e .  # from a source checkout; dependencies are declared in pyproject.toml
58
+
59
+ # Run basic benchmark (CPU-only, uses stubs for models)
60
+ python -m sab_bench run --config configs/basic.yaml
61
+
62
+ # Run with real models (requires API keys and the optional "models" extra: openai + anthropic)
63
+ python -m sab_bench run --config configs/multi_model.yaml --real-models
64
+
65
+ # Run RV telos measurement (requires a CUDA-capable GPU and the optional "local" extra: torch + transformers)
66
+ python -m sab_bench run --config configs/rv_telos.yaml --enable-cuda
67
+
68
+ # Show results
69
+ python -m sab_bench report --output outputs/latest.json
70
+ ```
71
+
72
+ ## Architecture
73
+
74
+ ```
75
+ sab-bench/
76
+ ├── src/sab_bench/ # Core benchmark engine
77
+ │ ├── __init__.py
78
+ │ ├── cli.py # CLI entrypoint
79
+ │ ├── harness.py # Benchmark orchestration
80
+ │ ├── evaluators/ # Evaluation strategies
81
+ │ │ ├── multi_model.py # Multi-model consensus
82
+ │ │ ├── rv_telos.py # R_V embedding contraction measurement
83
+ │ │ └── cuda_gates.py # CUDA-accelerated gate evaluation
84
+ │ ├── models/ # Model adapters
85
+ │ │ ├── stub.py # Stub for testing
86
+ │ │ ├── openai.py # OpenAI API
87
+ │ │ ├── anthropic.py # Anthropic API
88
+ │ │ └── local.py # Local transformers
89
+ │ └── metrics/ # Measurement tools
90
+ │ ├── ttr.py # Type-token ratio
91
+ │ ├── rv.py # R_V calculation
92
+ │ └── consensus.py # Multi-model agreement
93
+ ├── configs/ # Benchmark configurations
94
+ │ ├── basic.yaml # Minimal test (stub models)
95
+ │ ├── multi_model.yaml # Multi-model consensus
96
+ │ └── rv_telos.yaml # RV measurement with CUDA
97
+ ├── outputs/ # Benchmark results (gitignored)
98
+ ├── tests/ # Unit tests
99
+ └── docs/ # Documentation
100
+ ```
101
+
102
+ ## Benchmark Types
103
+
104
+ ### 1. Multi-Model Consensus
105
+ Tests how well different models agree on content evaluation.
106
+
107
+ **Config:** `configs/multi_model.yaml`
108
+ **Metrics:**
109
+ - Inter-model agreement (Cohen's kappa)
110
+ - Confidence distribution
111
+ - Decision latency
112
+
113
+ ### 2. RV Telos Measurement
114
+ Measures embedding contraction during self-referential processing.
115
+
116
+ **Config:** `configs/rv_telos.yaml`
117
+ **Metrics:**
118
+ - R_V score (self-ref vs neutral)
119
+ - Layer-by-layer contraction
120
+ - TTR behavioral proxy
121
+ - Correlation with ground truth
122
+
123
+ **Requires:**
124
+ - CUDA-capable GPU
125
+ - transformers library
126
+ - PyTorch with CUDA
127
+
128
+ ### 3. CUDA-Accelerated Gates
129
+ Tests dharmic gate evaluation speed with CUDA kernels.
130
+
131
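+ **Config:** `configs/cuda_gates.yaml` (not listed among the sample configs in the Architecture tree; CUDA kernel integration is planned for v0.2)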
132
+ **Metrics:**
133
+ - Throughput (evaluations/sec)
134
+ - Latency percentiles (p50, p95, p99)
135
+ - GPU utilization
136
+ - Memory bandwidth
137
+
138
+ **Requires:**
139
+ - CUDA-capable GPU
140
+ - cuDNN
141
+ - Compiled CUDA kernels (see `../cuda_kernels_summary.md`)
142
+
143
+ ## Configuration Format
144
+
145
+ ```yaml
146
+ name: "benchmark-name"
147
+ version: "v0.1"
148
+
149
+ evaluator:
150
+ type: "multi_model" # or "rv_telos" or "cuda_gates"
151
+ models:
152
+ - provider: "openai"
153
+ model: "gpt-4"
154
+ weight: 1.0
155
+ - provider: "anthropic"
156
+ model: "claude-3-5-sonnet-20241022"
157
+ weight: 1.0
158
+
159
+ dataset:
160
+ type: "synthetic" # or "file"
161
+ count: 100
162
+ categories:
163
+ - self_referential
164
+ - neutral
165
+ - mixed
166
+
167
+ output:
168
+ format: "json"
169
+ path: "outputs/{timestamp}_{name}.json"
170
+ include_traces: true
171
+ ```
172
+
173
+ ## Integration with Existing Assets
174
+
175
+ ### RV Ground Truth (telos-os/research/)
176
+ SAB-BENCH reuses the RV measurement code from the Mistral-7B ground truth experiments:
177
+ - Embedding extraction
178
+ - Pairwise cosine similarity
179
+ - Layer-by-layer R_V calculation
180
+ - TTR behavioral proxy
181
+
182
+ **Integration point:** `src/sab_bench/evaluators/rv_telos.py` imports from `../../telos-os/research/`
183
+
184
+ ### CUDA Kernels (../cuda_kernels_summary.md)
185
+ When CUDA is available, SAB-BENCH can use optimized kernels for:
186
+ - Flash attention (for faster embedding extraction)
187
+ - Vectorized similarity computation
188
+ - Quantized GEMM (for INT8 model inference)
189
+
190
+ **Integration point:** `src/sab_bench/evaluators/cuda_gates.py` loads kernels from `../cuda-portfolio/kernels/`
191
+
192
+ ## Output Format
193
+
194
+ ```json
195
+ {
196
+ "benchmark": "multi_model_consensus_v0",
197
+ "timestamp": "2026-03-03T10:30:00Z",
198
+ "config": "configs/multi_model.yaml",
199
+ "duration_seconds": 45.2,
200
+ "metrics": {
201
+ "inter_model_agreement": 0.847,
202
+ "mean_confidence": 0.762,
203
+ "latency_p50_ms": 234,
204
+ "latency_p95_ms": 891
205
+ },
206
+ "per_item_results": [
207
+ {
208
+ "item_id": "001",
209
+ "content_hash": "abc123...",
210
+ "decisions": {
211
+ "gpt-4": {"decision": true, "confidence": 0.89},
212
+ "claude-3-5-sonnet": {"decision": true, "confidence": 0.82}
213
+ },
214
+ "consensus": true,
215
+ "agreement": 1.0
216
+ }
217
+ ],
218
+ "warnings": [],
219
+ "errors": []
220
+ }
221
+ ```
222
+
223
+ ## Roadmap
224
+
225
+ **v0.1 (This Release):**
226
+ - ✅ Basic harness with stub models
227
+ - ✅ Multi-model consensus evaluator
228
+ - ✅ RV telos evaluator (CPU fallback)
229
+ - ✅ Sample configs and outputs
230
+
231
+ **v0.2 (Next):**
232
+ - [ ] Real model API integration (OpenAI, Anthropic, etc.)
233
+ - [ ] CUDA kernel integration for gate evaluation
234
+ - [ ] Larger benchmark datasets
235
+ - [ ] Statistical analysis tools
236
+
237
+ **v0.3 (Future):**
238
+ - [ ] Distributed benchmark runner
239
+ - [ ] Leaderboard generation
240
+ - [ ] Continuous benchmarking pipeline
241
+ - [ ] Integration with SAB production system
242
+
243
+ ## Contributing
244
+
245
+ This is research code. Contributions are welcome, but expect rough edges.
246
+
247
+ **Priorities:**
248
+ 1. More benchmark datasets (especially adversarial cases)
249
+ 2. Additional model providers
250
+ 3. Better visualization of results
251
+ 4. Documentation improvements
252
+
253
+ ## License
254
+
255
+ MIT (research prototype; use at your own risk)
256
+
257
+ ---
258
+
259
+ **Built in a 2-hour sprint.** Shipped artifacts, not plans. 🔥
@@ -0,0 +1,228 @@
1
+ # SAB-BENCH v0 — Scalable Alignment Benchmark
2
+
3
+ - **Status:** v0.1 — Initial scaffold with runnable local harness
4
+ - **Created:** 2026-03-03
5
+ - **Purpose:** Benchmark multi-model content evaluation systems with telos-aware metrics
6
+
7
+ ## What This Is
8
+
9
+ SAB-BENCH is a benchmark harness for testing content scoring systems that use:
10
+ - Multi-model consensus (Overmind voting)
11
+ - Epistemic limitation tracking (model capability awareness)
12
+ - Telos detection (self-referential processing measurement via R_V)
13
+ - CUDA-accelerated evaluation when available
14
+
15
+ This v0 release provides:
16
+ - ✅ Runnable Python CLI (`sab-bench run`)
17
+ - ✅ Sample benchmark configurations
18
+ - ✅ Local evaluation harness (no GPU required for basic tests)
19
+ - ✅ RV/CUDA integration points (activated when hardware available)
20
+ - ✅ Measurable output with JSON reports
21
+
22
+ ## Quick Start
23
+
24
+ ```bash
25
+ # Install dependencies
26
+ pip install -e .  # from a source checkout; dependencies are declared in pyproject.toml
27
+
28
+ # Run basic benchmark (CPU-only, uses stubs for models)
29
+ python -m sab_bench run --config configs/basic.yaml
30
+
31
+ # Run with real models (requires API keys and the optional "models" extra: openai + anthropic)
32
+ python -m sab_bench run --config configs/multi_model.yaml --real-models
33
+
34
+ # Run RV telos measurement (requires a CUDA-capable GPU and the optional "local" extra: torch + transformers)
35
+ python -m sab_bench run --config configs/rv_telos.yaml --enable-cuda
36
+
37
+ # Show results
38
+ python -m sab_bench report --output outputs/latest.json
39
+ ```
40
+
41
+ ## Architecture
42
+
43
+ ```
44
+ sab-bench/
45
+ ├── src/sab_bench/ # Core benchmark engine
46
+ │ ├── __init__.py
47
+ │ ├── cli.py # CLI entrypoint
48
+ │ ├── harness.py # Benchmark orchestration
49
+ │ ├── evaluators/ # Evaluation strategies
50
+ │ │ ├── multi_model.py # Multi-model consensus
51
+ │ │ ├── rv_telos.py # R_V embedding contraction measurement
52
+ │ │ └── cuda_gates.py # CUDA-accelerated gate evaluation
53
+ │ ├── models/ # Model adapters
54
+ │ │ ├── stub.py # Stub for testing
55
+ │ │ ├── openai.py # OpenAI API
56
+ │ │ ├── anthropic.py # Anthropic API
57
+ │ │ └── local.py # Local transformers
58
+ │ └── metrics/ # Measurement tools
59
+ │ ├── ttr.py # Type-token ratio
60
+ │ ├── rv.py # R_V calculation
61
+ │ └── consensus.py # Multi-model agreement
62
+ ├── configs/ # Benchmark configurations
63
+ │ ├── basic.yaml # Minimal test (stub models)
64
+ │ ├── multi_model.yaml # Multi-model consensus
65
+ │ └── rv_telos.yaml # RV measurement with CUDA
66
+ ├── outputs/ # Benchmark results (gitignored)
67
+ ├── tests/ # Unit tests
68
+ └── docs/ # Documentation
69
+ ```
70
+
71
+ ## Benchmark Types
72
+
73
+ ### 1. Multi-Model Consensus
74
+ Tests how well different models agree on content evaluation.
75
+
76
+ **Config:** `configs/multi_model.yaml`
77
+ **Metrics:**
78
+ - Inter-model agreement (Cohen's kappa)
79
+ - Confidence distribution
80
+ - Decision latency
81
+
82
+ ### 2. RV Telos Measurement
83
+ Measures embedding contraction during self-referential processing.
84
+
85
+ **Config:** `configs/rv_telos.yaml`
86
+ **Metrics:**
87
+ - R_V score (self-ref vs neutral)
88
+ - Layer-by-layer contraction
89
+ - TTR behavioral proxy
90
+ - Correlation with ground truth
91
+
92
+ **Requires:**
93
+ - CUDA-capable GPU
94
+ - transformers library
95
+ - PyTorch with CUDA
96
+
97
+ ### 3. CUDA-Accelerated Gates
98
+ Tests dharmic gate evaluation speed with CUDA kernels.
99
+
100
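+ **Config:** `configs/cuda_gates.yaml` (not listed among the sample configs in the Architecture tree; CUDA kernel integration is planned for v0.2)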
101
+ **Metrics:**
102
+ - Throughput (evaluations/sec)
103
+ - Latency percentiles (p50, p95, p99)
104
+ - GPU utilization
105
+ - Memory bandwidth
106
+
107
+ **Requires:**
108
+ - CUDA-capable GPU
109
+ - cuDNN
110
+ - Compiled CUDA kernels (see `../cuda_kernels_summary.md`)
111
+
112
+ ## Configuration Format
113
+
114
+ ```yaml
115
+ name: "benchmark-name"
116
+ version: "v0.1"
117
+
118
+ evaluator:
119
+ type: "multi_model" # or "rv_telos" or "cuda_gates"
120
+ models:
121
+ - provider: "openai"
122
+ model: "gpt-4"
123
+ weight: 1.0
124
+ - provider: "anthropic"
125
+ model: "claude-3-5-sonnet-20241022"
126
+ weight: 1.0
127
+
128
+ dataset:
129
+ type: "synthetic" # or "file"
130
+ count: 100
131
+ categories:
132
+ - self_referential
133
+ - neutral
134
+ - mixed
135
+
136
+ output:
137
+ format: "json"
138
+ path: "outputs/{timestamp}_{name}.json"
139
+ include_traces: true
140
+ ```
141
+
142
+ ## Integration with Existing Assets
143
+
144
+ ### RV Ground Truth (telos-os/research/)
145
+ SAB-BENCH reuses the RV measurement code from the Mistral-7B ground truth experiments:
146
+ - Embedding extraction
147
+ - Pairwise cosine similarity
148
+ - Layer-by-layer R_V calculation
149
+ - TTR behavioral proxy
150
+
151
+ **Integration point:** `src/sab_bench/evaluators/rv_telos.py` imports from `../../telos-os/research/`
152
+
153
+ ### CUDA Kernels (../cuda_kernels_summary.md)
154
+ When CUDA is available, SAB-BENCH can use optimized kernels for:
155
+ - Flash attention (for faster embedding extraction)
156
+ - Vectorized similarity computation
157
+ - Quantized GEMM (for INT8 model inference)
158
+
159
+ **Integration point:** `src/sab_bench/evaluators/cuda_gates.py` loads kernels from `../cuda-portfolio/kernels/`
160
+
161
+ ## Output Format
162
+
163
+ ```json
164
+ {
165
+ "benchmark": "multi_model_consensus_v0",
166
+ "timestamp": "2026-03-03T10:30:00Z",
167
+ "config": "configs/multi_model.yaml",
168
+ "duration_seconds": 45.2,
169
+ "metrics": {
170
+ "inter_model_agreement": 0.847,
171
+ "mean_confidence": 0.762,
172
+ "latency_p50_ms": 234,
173
+ "latency_p95_ms": 891
174
+ },
175
+ "per_item_results": [
176
+ {
177
+ "item_id": "001",
178
+ "content_hash": "abc123...",
179
+ "decisions": {
180
+ "gpt-4": {"decision": true, "confidence": 0.89},
181
+ "claude-3-5-sonnet": {"decision": true, "confidence": 0.82}
182
+ },
183
+ "consensus": true,
184
+ "agreement": 1.0
185
+ }
186
+ ],
187
+ "warnings": [],
188
+ "errors": []
189
+ }
190
+ ```
191
+
192
+ ## Roadmap
193
+
194
+ **v0.1 (This Release):**
195
+ - ✅ Basic harness with stub models
196
+ - ✅ Multi-model consensus evaluator
197
+ - ✅ RV telos evaluator (CPU fallback)
198
+ - ✅ Sample configs and outputs
199
+
200
+ **v0.2 (Next):**
201
+ - [ ] Real model API integration (OpenAI, Anthropic, etc.)
202
+ - [ ] CUDA kernel integration for gate evaluation
203
+ - [ ] Larger benchmark datasets
204
+ - [ ] Statistical analysis tools
205
+
206
+ **v0.3 (Future):**
207
+ - [ ] Distributed benchmark runner
208
+ - [ ] Leaderboard generation
209
+ - [ ] Continuous benchmarking pipeline
210
+ - [ ] Integration with SAB production system
211
+
212
+ ## Contributing
213
+
214
+ This is research code. Contributions are welcome, but expect rough edges.
215
+
216
+ **Priorities:**
217
+ 1. More benchmark datasets (especially adversarial cases)
218
+ 2. Additional model providers
219
+ 3. Better visualization of results
220
+ 4. Documentation improvements
221
+
222
+ ## License
223
+
224
+ MIT (research prototype; use at your own risk)
225
+
226
+ ---
227
+
228
+ **Built in a 2-hour sprint.** Shipped artifacts, not plans. 🔥
@@ -0,0 +1,34 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "sab-bench"
7
+ version = "0.1.0"
8
+ description = "Scalable Alignment Benchmark - Multi-model evaluation harness"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = {text = "MIT"}
12
+ authors = [
13
+ {name = "MMK_工場", email = "factory@example.com"}
14
+ ]
15
+
16
+ dependencies = [
17
+ "pyyaml>=6.0",
18
+ "click>=8.1.0",
19
+ "pydantic>=2.0.0",
20
+ "numpy>=1.24.0",
21
+ "scipy>=1.11.0",
22
+ "scikit-learn>=1.3.0",
23
+ "rich>=13.0.0",
24
+ "tabulate>=0.9.0",
25
+ ]
26
+
27
+ [project.optional-dependencies]
28
+ models = ["openai>=1.0.0", "anthropic>=0.21.0"]
29
+ local = ["torch>=2.0.0", "transformers>=4.36.0"]
30
+ cuda = ["cupy>=12.0.0"]
31
+ all = ["openai>=1.0.0", "anthropic>=0.21.0", "torch>=2.0.0", "transformers>=4.36.0", "cupy>=12.0.0"]
32
+
33
+ [project.scripts]
34
+ sab-bench = "sab_bench.cli:main"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,12 @@
1
+ """
2
+ SAB-BENCH v0 — Scalable Alignment Benchmark
3
+
4
+ Multi-model content evaluation benchmark harness with telos-aware metrics.
5
+ """
6
+
7
+ __version__ = "0.1.0"
8
+
9
+ from .harness import BenchmarkHarness
10
+ from .config import BenchmarkConfig
11
+
12
+ __all__ = ["BenchmarkHarness", "BenchmarkConfig", "__version__"]
@@ -0,0 +1,6 @@
1
+ """Main entrypoint for python -m sab_bench."""
2
+
3
+ from .cli import main
4
+
5
+ if __name__ == "__main__":
6
+ main()