concept-benchmark 0.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. concept_benchmark-0.1.4/.github/workflows/build.yml +88 -0
  2. concept_benchmark-0.1.4/.gitignore +141 -0
  3. concept_benchmark-0.1.4/CITATION.cff +31 -0
  4. concept_benchmark-0.1.4/PKG-INFO +367 -0
  5. concept_benchmark-0.1.4/README.md +324 -0
  6. concept_benchmark-0.1.4/cbm_configurations.csv +718 -0
  7. concept_benchmark-0.1.4/concept_benchmark/__init__.py +3 -0
  8. concept_benchmark-0.1.4/concept_benchmark/_logging.py +44 -0
  9. concept_benchmark-0.1.4/concept_benchmark/alignment.py +241 -0
  10. concept_benchmark-0.1.4/concept_benchmark/benchmarks/__init__.py +4 -0
  11. concept_benchmark-0.1.4/concept_benchmark/benchmarks/_common.py +259 -0
  12. concept_benchmark-0.1.4/concept_benchmark/benchmarks/robot.py +1558 -0
  13. concept_benchmark-0.1.4/concept_benchmark/benchmarks/robot_text.py +798 -0
  14. concept_benchmark-0.1.4/concept_benchmark/benchmarks/sudoku.py +879 -0
  15. concept_benchmark-0.1.4/concept_benchmark/cli.py +245 -0
  16. concept_benchmark-0.1.4/concept_benchmark/concept_descriptions/clip.jsonl +12 -0
  17. concept_benchmark-0.1.4/concept_benchmark/concept_descriptions/gt_concepts.jsonl +7 -0
  18. concept_benchmark-0.1.4/concept_benchmark/concept_descriptions/gt_concepts_subconcept.jsonl +12 -0
  19. concept_benchmark-0.1.4/concept_benchmark/concept_descriptions/llm.jsonl +12 -0
  20. concept_benchmark-0.1.4/concept_benchmark/config.py +679 -0
  21. concept_benchmark-0.1.4/concept_benchmark/cv.py +368 -0
  22. concept_benchmark-0.1.4/concept_benchmark/data.py +1264 -0
  23. concept_benchmark-0.1.4/concept_benchmark/ext/__init__.py +0 -0
  24. concept_benchmark-0.1.4/concept_benchmark/ext/fileutils.py +52 -0
  25. concept_benchmark-0.1.4/concept_benchmark/helper/__init__.py +1 -0
  26. concept_benchmark-0.1.4/concept_benchmark/helper/data_utils.py +171 -0
  27. concept_benchmark-0.1.4/concept_benchmark/intervention.py +796 -0
  28. concept_benchmark-0.1.4/concept_benchmark/kflip.py +268 -0
  29. concept_benchmark-0.1.4/concept_benchmark/lfcbm.py +559 -0
  30. concept_benchmark-0.1.4/concept_benchmark/llm_client.py +260 -0
  31. concept_benchmark-0.1.4/concept_benchmark/metrics.py +23 -0
  32. concept_benchmark-0.1.4/concept_benchmark/models.py +1253 -0
  33. concept_benchmark-0.1.4/concept_benchmark/paths.py +24 -0
  34. concept_benchmark-0.1.4/concept_benchmark/synthetic/__init__.py +4 -0
  35. concept_benchmark-0.1.4/concept_benchmark/synthetic/helper/__init__.py +0 -0
  36. concept_benchmark-0.1.4/concept_benchmark/synthetic/helper/robot_catalog.py +243 -0
  37. concept_benchmark-0.1.4/concept_benchmark/synthetic/helper/robot_draw.py +1152 -0
  38. concept_benchmark-0.1.4/concept_benchmark/synthetic/helper/static/text_templates/HardCorpus.jsonl +89 -0
  39. concept_benchmark-0.1.4/concept_benchmark/synthetic/helper/static/text_templates/HardCorpus_EarsGeneric.jsonl +25 -0
  40. concept_benchmark-0.1.4/concept_benchmark/synthetic/helper/static/text_templates/HardCorpus_FootGeneric.jsonl +70 -0
  41. concept_benchmark-0.1.4/concept_benchmark/synthetic/helper/static/text_templates/HardCorpus_NoAnt.jsonl +40 -0
  42. concept_benchmark-0.1.4/concept_benchmark/synthetic/helper/static/text_templates/Templates.jsonl +40 -0
  43. concept_benchmark-0.1.4/concept_benchmark/synthetic/helper/static/text_templates/Templates.txt +50 -0
  44. concept_benchmark-0.1.4/concept_benchmark/synthetic/helper/static/text_templates/Templates_FootGeneric.jsonl +40 -0
  45. concept_benchmark-0.1.4/concept_benchmark/synthetic/helper/static/text_templates/Templates_simple 2.txt +52 -0
  46. concept_benchmark-0.1.4/concept_benchmark/synthetic/helper/static/text_templates/Templates_simple.txt +52 -0
  47. concept_benchmark-0.1.4/concept_benchmark/synthetic/helper/static/text_templates/concepts.csv +16 -0
  48. concept_benchmark-0.1.4/concept_benchmark/synthetic/helper/static/text_templates/templates_simple.jsonl +40 -0
  49. concept_benchmark-0.1.4/concept_benchmark/synthetic/helper/static/text_templates/templates_simple_FootGeneric.jsonl +40 -0
  50. concept_benchmark-0.1.4/concept_benchmark/synthetic/helper/sudoku_handwriting_helper.py +403 -0
  51. concept_benchmark-0.1.4/concept_benchmark/synthetic/helper/sudoku_helper.py +605 -0
  52. concept_benchmark-0.1.4/concept_benchmark/synthetic/helper/text_concept_detector.py +429 -0
  53. concept_benchmark-0.1.4/concept_benchmark/synthetic/helper/textgen.py +390 -0
  54. concept_benchmark-0.1.4/concept_benchmark/synthetic/helper/utils.py +126 -0
  55. concept_benchmark-0.1.4/concept_benchmark/synthetic/robot.py +476 -0
  56. concept_benchmark-0.1.4/concept_benchmark/synthetic/robot_text/__init__.py +28 -0
  57. concept_benchmark-0.1.4/concept_benchmark/synthetic/robot_text/catalog.py +135 -0
  58. concept_benchmark-0.1.4/concept_benchmark/synthetic/robot_text/corpus.py +192 -0
  59. concept_benchmark-0.1.4/concept_benchmark/synthetic/robot_text/dataset.py +228 -0
  60. concept_benchmark-0.1.4/concept_benchmark/synthetic/robot_text/lfcbm.py +339 -0
  61. concept_benchmark-0.1.4/concept_benchmark/synthetic/sudoku.py +561 -0
  62. concept_benchmark-0.1.4/concept_benchmark/synthetic/sudoku_ocr/__init__.py +0 -0
  63. concept_benchmark-0.1.4/concept_benchmark/synthetic/sudoku_ocr/infer_ocr_fast.py +136 -0
  64. concept_benchmark-0.1.4/concept_benchmark/synthetic/sudoku_ocr/make_ocr_dataset.py +454 -0
  65. concept_benchmark-0.1.4/concept_benchmark/synthetic/sudoku_ocr/ocr_utils.py +170 -0
  66. concept_benchmark-0.1.4/concept_benchmark/synthetic/sudoku_ocr/train_ocr_fast.py +337 -0
  67. concept_benchmark-0.1.4/concept_benchmark/train.py +459 -0
  68. concept_benchmark-0.1.4/docs/assets/logo.svg +90 -0
  69. concept_benchmark-0.1.4/docs/assets/robot_banner.png +0 -0
  70. concept_benchmark-0.1.4/docs/assets/robot_concepts.png +0 -0
  71. concept_benchmark-0.1.4/docs/assets/robot_foot_shapes.png +0 -0
  72. concept_benchmark-0.1.4/docs/assets/sudoku_handwritten.png +0 -0
  73. concept_benchmark-0.1.4/fonts/Pecita.otf +0 -0
  74. concept_benchmark-0.1.4/fonts/pecita/Pecita.txt +120 -0
  75. concept_benchmark-0.1.4/install.sh +86 -0
  76. concept_benchmark-0.1.4/my_experiment.yaml +27 -0
  77. concept_benchmark-0.1.4/pr_description.md +84 -0
  78. concept_benchmark-0.1.4/pyproject.toml +105 -0
  79. concept_benchmark-0.1.4/tests/conftest.py +372 -0
  80. concept_benchmark-0.1.4/tests/test_cli.py +69 -0
  81. concept_benchmark-0.1.4/tests/test_concept_dataset.py +328 -0
  82. concept_benchmark-0.1.4/tests/test_concept_dataset_sample.py +161 -0
  83. concept_benchmark-0.1.4/tests/test_concept_image_dataset_sample.py +76 -0
  84. concept_benchmark-0.1.4/tests/test_config.py +124 -0
  85. concept_benchmark-0.1.4/tests/test_models_detectors.py +81 -0
  86. concept_benchmark-0.1.4/tests/test_propagation.py +110 -0
  87. concept_benchmark-0.1.4/tests/test_robot_pipeline_regression.py +239 -0
  88. concept_benchmark-0.1.4/tests/test_smoke_pipelines.py +167 -0
  89. concept_benchmark-0.1.4/tests/test_train_heads.py +37 -0
  90. concept_benchmark-0.1.4/uv.lock +4858 -0
@@ -0,0 +1,88 @@
1
+ name: build
2
+
3
+ on:
4
+ push:
5
+ branches: [ main ]
6
+ pull_request:
7
+ branches: [ main ]
8
+ types: [opened, reopened, ready_for_review]
9
+ # Allow manual runs from the Actions tab
10
+ workflow_dispatch:
11
+
12
+ permissions:
13
+ contents: read
14
+
15
+ concurrency:
16
+ group: ${{ github.workflow }}-${{ github.ref }}
17
+ cancel-in-progress: true
18
+
19
+ jobs:
20
+ lint:
21
+ runs-on: ubuntu-latest
22
+ timeout-minutes: 5
23
+ steps:
24
+ - uses: actions/checkout@v4
25
+ - uses: astral-sh/setup-uv@v5
26
+ with:
27
+ enable-cache: true
28
+ - name: Set up Python
29
+ uses: actions/setup-python@v5
30
+ with:
31
+ python-version: "3.12"
32
+ - name: Install ruff
33
+ run: uv tool install ruff
34
+ - name: Lint
35
+ run: ruff check concept_benchmark/ tests/
36
+ - name: Format check
37
+ run: ruff format --check concept_benchmark/ tests/
38
+
39
+ test:
40
+ needs: lint
41
+ runs-on: ${{ matrix.os }}
42
+ timeout-minutes: 20
43
+ env:
44
+ MODULE_NAME: concept_benchmark
45
+ PYTHONWARNINGS: error
46
+ strategy:
47
+ fail-fast: false
48
+ matrix:
49
+ os: [ubuntu-latest]
50
+ # Python 3.11 is intentionally left out of the matrix (meson-related build bugs)
51
+ python-version: ["3.10", "3.12", "3.13"]
52
+
53
+ steps:
54
+ - uses: actions/checkout@v4
55
+
56
+ - name: Install pycairo dependencies
57
+ run: |
58
+ sudo apt-get update
59
+ sudo apt-get install -y libcairo2-dev pkg-config libmagickwand-dev
60
+
61
+ - name: Install uv
62
+ uses: astral-sh/setup-uv@v5
63
+ with:
64
+ enable-cache: true
65
+ cache-dependency-glob: "uv.lock"
66
+ cache-local-path: "~/.local"
67
+
68
+ - name: Set up Python ${{ matrix.python-version }}
69
+ uses: actions/setup-python@v5
70
+ with:
71
+ python-version: ${{ matrix.python-version }}
72
+
73
+ - name: Install the project (dev deps)
74
+ run: uv sync --locked --all-extras --dev
75
+
76
+ - name: Run tests
77
+ run: >
78
+ uv run pytest -q -ra
79
+ --maxfail=1
80
+ --durations=20
81
+ --junitxml=./test-results/pytest.xml
82
+
83
+ - name: Upload test results
84
+ if: always()
85
+ uses: actions/upload-artifact@v4
86
+ with:
87
+ name: pytest-${{ matrix.os }}-py${{ matrix.python-version }}
88
+ path: test-results/pytest.xml
@@ -0,0 +1,141 @@
1
+ # binaries
2
+ *.bin
3
+
4
+ # devcontainers
5
+ .devcontainer/
6
+
7
+ # Local contributor guide (not versioned)
8
+ AGENTS.md
9
+ CLAUDE.md
10
+
11
+ # PyCharm
12
+ .idea/
13
+ .RData
14
+ .Rhistory
15
+
16
+ # VS Code
17
+ .vscode
18
+
19
+ # Docker
20
+ !*compose.yml
21
+
22
+ # Data
23
+ data/*.RData
24
+ data/*.pickle
25
+ data/*.data
26
+ data/*/*.data
27
+ data/*/raw
28
+ data/cub/
29
+ data/sudoku/
30
+ data/robot_images/
31
+
32
+ # folders to ignore
33
+ dev/
34
+ results/
35
+ plots/
36
+ reports/
37
+ local/
38
+ paper_reports/
39
+ reporting/templates/figure/
40
+ reporting/templates/*/
41
+
42
+ # sphinx
43
+ docs/_build
44
+
45
+ # scikit-learn
46
+ doc/
47
+ doc/_build/
48
+ doc/auto_examples/
49
+ doc/modules/generated/
50
+ doc/datasets/generated/
51
+ ci_scripts/
52
+ skltemplate/
53
+
54
+ # other
55
+ examples/paper/results
56
+
57
+ # jekyll
58
+ .nojekyll
59
+
60
+ # Notes
61
+ *.ft
62
+
63
+ # odyssey
64
+ *.command
65
+
66
+ # Byte-compiled / optimized / DLL files
67
+ __pycache__/
68
+ *.py[cod]
69
+ *$py.class
70
+
71
+ # General
72
+ *.DS_Store
73
+
74
+ # IPython notebooks
75
+ *.ipynb_checkpoints
76
+
77
+
78
+ # C extensions
79
+ *.so
80
+
81
+ *.egg-info
82
+ temp_results
83
+ temp_results/*
84
+
85
+ # Distribution / packaging
86
+ .pytest_cache
87
+ .Python
88
+ env/
89
+ venv/
90
+ build/
91
+ develop-eggs/
92
+ dist/
93
+ downloads/
94
+ eggs/
95
+ .eggs/
96
+ lib/
97
+ lib64/
98
+ parts/
99
+ sdist/
100
+ var/
101
+ *.egg-info/
102
+ .installed.cfg
103
+ *.egg
104
+
105
+ # PyInstaller
106
+ # Usually these files are written by a python script from a template
107
+ # before PyInstaller builds the exe
108
+ *.manifest
109
+ *.spec
110
+
111
+ # Installer logs
112
+ pip-log.txt
113
+ pip-delete-this-directory.txt
114
+
115
+ # Unit test / coverage reports
116
+
117
+ htmlcov/
118
+ .tox/
119
+ .coverage
120
+ .coverage.*
121
+ .cache
122
+ nosetests.xml
123
+ coverage.xml
124
+ *,cover
125
+ .hypothesis/
126
+
127
+ # Translations
128
+ *.mo
129
+ *.pot
130
+
131
+ # Django stuff:
132
+ *.log
133
+
134
+ # PyBuilder
135
+ target/
136
+
137
+ # IPython notebooks
138
+ examples/.ipynb_checkpoints
139
+
140
+
141
+ /results/
@@ -0,0 +1,31 @@
1
+ cff-version: 1.2.0
2
+ message: "If you use this software, please cite it as below."
3
+ title: "Measuring What Matters: Synthetic Benchmarks for Concept Bottleneck Models"
4
+ type: software
5
+ version: 0.1.4
6
+ date-released: 2026-01-01
7
+ url: "https://github.com/ustunb/concept-benchmark"
8
+ repository-code: "https://github.com/ustunb/concept-benchmark"
9
+ license: MIT
10
+ authors:
11
+ - family-names: Skirzynski
12
+ given-names: Julian
13
+ affiliation: "UC San Diego, CSE"
14
+ - family-names: Cheon
15
+ given-names: Harry
16
+ affiliation: "UC San Diego, CSE"
17
+ - family-names: Kadekodi
18
+ given-names: Shreyas
19
+ affiliation: "UC San Diego, CSE"
20
+ - family-names: Stewart
21
+ given-names: Meredith
22
+ affiliation: "UC San Diego, CSE"
23
+ - family-names: Ustun
24
+ given-names: Berk
25
+ affiliation: "UC San Diego, CSE"
26
+ keywords:
27
+ - Concept Bottleneck Models
28
+ - Safety
29
+ - Interpretability
30
+ - Alignment
31
+ - Benchmarks
@@ -0,0 +1,367 @@
1
+ Metadata-Version: 2.4
2
+ Name: concept-benchmark
3
+ Version: 0.1.4
4
+ Summary: Synthetic benchmarks for evaluating Concept Bottleneck Models.
5
+ Project-URL: Repository, https://github.com/ustunb/concept-benchmark
6
+ Author: Julian Skirzynski, Harry Cheon, Shreyas Kadekodi, Meredith Stewart, Berk Ustun
7
+ License-Expression: MIT
8
+ Classifier: Development Status :: 4 - Beta
9
+ Classifier: Intended Audience :: Science/Research
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Programming Language :: Python :: 3.13
15
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
+ Requires-Python: >=3.10
17
+ Requires-Dist: albumentations
18
+ Requires-Dist: colorir>=2.0.0
19
+ Requires-Dist: cvxpy>=1.4.0
20
+ Requires-Dist: datasets>=4.0.0
21
+ Requires-Dist: dill>=0.3.8
22
+ Requires-Dist: fastparquet>=2024.11.0
23
+ Requires-Dist: google-generativeai>=0.3.0
24
+ Requires-Dist: matplotlib>=3.10.3
25
+ Requires-Dist: numpy>=2.2.6
26
+ Requires-Dist: open-clip-torch>=2.20.0
27
+ Requires-Dist: pandas>=2.3.1
28
+ Requires-Dist: pero>=0.18.0
29
+ Requires-Dist: pillow>=10.0.0
30
+ Requires-Dist: psutil>=7.0.0
31
+ Requires-Dist: pyarrow>=21.0.0
32
+ Requires-Dist: pycairo>=1.28.0
33
+ Requires-Dist: pyyaml>=6.0
34
+ Requires-Dist: scikit-learn>=1.7.0
35
+ Requires-Dist: scipy>=1.15.3
36
+ Requires-Dist: skorch>=1.2.0
37
+ Requires-Dist: torch>=2.7.1
38
+ Requires-Dist: torcheval>=0.0.7
39
+ Requires-Dist: torchvision>=0.23.0
40
+ Requires-Dist: tqdm>=4.67.1
41
+ Requires-Dist: transformers>=4.55.2
42
+ Description-Content-Type: text/markdown
43
+
44
+ # Concept Benchmark
45
+
46
+ [![python](https://img.shields.io/badge/python-3.10%2B-blue)](https://www.python.org)
47
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
48
+
49
+ <p align="center">
50
+ <img src="https://raw.githubusercontent.com/ustunb/concept-benchmark/pretty_branch/docs/assets/logo.svg" width="400" alt="Concept Benchmark logo">
51
+ </p>
52
+
53
+ **Concept Benchmark** is a Python package for benchmarking [concept bottleneck models](https://arxiv.org/abs/2007.04612) (CBMs). It provides synthetic datasets with ground-truth concept labels, allowing users to vary concept granularity, annotation quality, and the labeling rule, and measure how each factor affects model performance and the value of interventions. The package includes two benchmarks -- robot classification (decision support) and Sudoku validation (automation) -- across image, text, and tabular modalities.
54
+
55
+ ## Table of Contents
56
+
57
+ 1. [Installation](#installation)
58
+ 2. [Quick Start](#quick-start)
59
+ 3. [Benchmarks](#benchmarks)
60
+ 4. [CLI Reference](#cli-reference)
61
+ 5. [Citation](#citation)
62
+
63
+ ## Installation
64
+
65
+ The package requires the **cairo** graphics library. Install it first:
66
+
67
+ ```bash
68
+ # macOS
69
+ brew install cairo pkg-config
70
+
71
+ # Ubuntu / Debian
72
+ sudo apt-get install libcairo2-dev pkg-config python3-dev
73
+
74
+ # Fedora / RHEL
75
+ sudo dnf install cairo-devel pkg-config python3-devel
76
+ ```
77
+
78
+ Then install the package:
79
+
80
+ ```bash
81
+ pip install concept-benchmark
82
+ ```
83
+
84
+ Or install from source:
85
+
86
+ ```bash
87
+ git clone https://github.com/ustunb/concept-benchmark.git
88
+ cd concept-benchmark
89
+ ./install.sh
90
+ source venv/bin/activate
91
+ ```
92
+
93
+ Verify the installation:
94
+
95
+ ```bash
96
+ python3 -c "import concept_benchmark; print('OK')"
97
+ ```
98
+
99
+ ## Quick Start
100
+
101
+ A CBM predicts concepts from inputs (e.g., "has pointy feet"), then predicts the label from those concepts. At test time, a user can correct mispredicted concepts -- this is called an *intervention*. The package lets you measure whether correcting *k* concepts improves the label prediction, and how that depends on concept quality and annotation noise.
102
+
103
+ The fastest way to run the benchmark is from the command line. This generates data, trains models, runs interventions, and saves a results CSV — with automatic caching so repeated runs skip completed stages:
104
+
105
+ ```bash
106
+ cbm-benchmark robot --seed 1014 --stages setup cbm dnn intervene collect
107
+ ```
108
+
109
+ Results are saved to `results/robot_ideal_seed1014_2d0aa353_results.csv`. Filter to `model == "cbm"` and `threshold == 0.2` to see accuracy numbers.
110
+
111
+ The same pipeline from Python:
112
+
113
+ ```python
114
+ from concept_benchmark.benchmarks import robot
115
+ from concept_benchmark.config import RobotBenchmarkConfig
116
+
117
+ cfg = RobotBenchmarkConfig(seed=1014)
118
+ robot.run(cfg, stages=["setup", "cbm", "dnn", "intervene", "collect"])
119
+ ```
120
+
121
+ Under the hood, `robot.run()` calls individual functions that you can also use directly to inspect intermediate objects:
122
+
123
+ ```python
124
+ import numpy as np
125
+ from concept_benchmark.benchmarks import robot
126
+ from concept_benchmark.config import RobotBenchmarkConfig
127
+
128
+ cfg = RobotBenchmarkConfig(seed=1014)
129
+ data = robot.setup_dataset(cfg) # generate 32x32 robot images
130
+ cbm = robot.train_cbm(cfg, data) # concept detectors + label predictor
131
+ dnn = robot.train_dnn(cfg, data) # end-to-end baseline (no concepts)
132
+ results = robot.run_interventions(cfg, cbm, data) # measure effect of corrections
133
+
134
+ # CBM baseline (no interventions)
135
+ cbm_acc = float(np.mean(cbm.predict(data.test) == data.test.y))
136
+ print(f"CBM (k=0): {cbm_acc:.4f}")
137
+ # Intervention gains at threshold=0.2
138
+ print(results.query("threshold == 0.2")[["budget", "accuracy"]].to_string(index=False))
139
+ ```
140
+
141
+ Expected output:
142
+ ```
143
+ CBM (k=0): 0.8673
144
+ budget accuracy
145
+ 1 0.9736
146
+ 3 0.9769
147
+ 7 0.9769
148
+ ```
149
+
150
+ See [`scripts/demo_robot.py`](scripts/demo_robot.py) and [`scripts/demo_sudoku.py`](scripts/demo_sudoku.py) for fully-commented examples.
151
+
152
+
153
+ ## Benchmarks
154
+
155
+ The package includes two benchmarks. **Robot classification** is a decision-support task where a human corrects the model's concept predictions to improve accuracy. **Sudoku validation** is an automation task where the system handles routine cases and defers uncertain ones to a human.
156
+
157
+ ### Robot Classification
158
+
159
+ This benchmark targets decision-support settings where a human uses the model's concept predictions to improve their own decisions. The task is to predict the species of a fictional robot -- **Glorp** or **Drent** -- from its body features. Each robot has 9 binary features (mouth type, foot shape, knee presence, etc.). The default labeling rule is: Glorp if mouth is closed, foot is pointy, and robot has knees (all three); Drent otherwise. Which features matter and which are spurious are configurable, mimicking real-world settings where the true relationship between features and labels is unknown. Available as image (`cbm-benchmark robot`) and text (`cbm-benchmark robot-text`) modalities.
160
+
161
+ <p align="center">
162
+ <img src="https://raw.githubusercontent.com/ustunb/concept-benchmark/pretty_branch/docs/assets/robot_concepts.png" width="400" alt="Robot with annotated concepts">
163
+ </p>
164
+
165
+ The following example uses the subconcept variant (which splits foot_shape into 5 fine-grained subtypes, yielding 12 concepts instead of the default 7), and tests whether imposing a sign constraint on the `has_knees` weight preserves or destroys the benefit of interventions.
166
+
167
+ ```python
168
+ import numpy as np
169
+ from concept_benchmark.benchmarks import robot
170
+ from concept_benchmark.config import RobotBenchmarkConfig
171
+
172
+ cfg = RobotBenchmarkConfig(
173
+ seed=1014,
174
+ subconcept=True, # use fine-grained foot subtypes (12 instead of 7)
175
+ intervention_budgets=[1, 3], # correct k=1 or k=3 concepts per sample
176
+ intervention_thresholds=[0.2],
177
+ alignment_constraints={"has_knees": 1}, # force has_knees weight to be positive
178
+ )
179
+
180
+ data = robot.setup_dataset(cfg)
181
+ cbm = robot.train_cbm(cfg, data)
182
+ dnn = robot.train_dnn(cfg, data)
183
+ results = robot.run_interventions(cfg, cbm, data)
184
+ align_stats = robot.align(cfg, cbm, data)
185
+
186
+ cbm_acc = float(np.mean(cbm.predict(data.test) == data.test.y))
187
+ print(f"CBM (k=0): {cbm_acc:.4f}")
188
+ print(results[["budget", "accuracy"]].to_string(index=False))
189
+
190
+ from concept_benchmark.paths import results_dir
191
+ cfg.to_yaml(results_dir / "my_experiment.yaml") # save config for CLI use
192
+ ```
193
+
194
+ Expected output:
195
+ ```
196
+ CBM (k=0): 0.7812
197
+ budget accuracy
198
+ 1 0.9212
199
+ 3 0.9439
200
+ 12 0.9439
201
+ ```
202
+
203
+ To re-run this experiment from the CLI (with automatic caching):
204
+
205
+ ```bash
206
+ cbm-benchmark robot --config results/my_experiment.yaml
207
+ ```
208
+
209
+ The most important parameters used in the config above are listed below. For the full list, see `RobotBenchmarkConfig` in [`concept_benchmark/config.py`](concept_benchmark/config.py) or the fully-commented [`scripts/demo_robot.py`](scripts/demo_robot.py).
210
+
211
+ | Parameter | Default | Description |
212
+ |-----------|---------|-------------|
213
+ | `drop_concepts` | `IDEAL_DROP` | Which concepts to exclude. Two presets are provided: `IDEAL_DROP` for 7 coarse concepts (binary foot_shape), `SUBCONCEPT_DROP` for 12 concepts (5 fine-grained foot subtypes). |
214
+ | `subconcept` | `False` | Shortcut that switches `drop_concepts` to `SUBCONCEPT_DROP`. |
215
+ | `model_rule` | see `config.py` | Python expression defining the labeling rule. Default: Glorp if `(mouth_closed + foot_pointy + has_knees) >= 3`. |
216
+ | `weights` | `{"mouth_type": 5, "foot_shape": 8, "has_knees": -5}` | Concept weights for the stochastic labeling function. |
217
+ | `concept_missing` | `0.0` | Fraction of concept labels masked during training. |
218
+ | `regimes` | `["baseline"]` | How interventions are performed: `baseline` (oracle), `expert` (noisy human), `subjective` (noisy concept labels + noisy human), `machine`/`llm`/`clip` (concepts discovered via [Label-Free CBM](https://arxiv.org/abs/2304.06129)). |
219
+
220
+ <details>
221
+ <summary>Remaining parameters</summary>
222
+
223
+ | Parameter | Default | Description |
224
+ |-----------|---------|-------------|
225
+ | `seed` | `1014` / `1337` | Random seed (image / text) |
226
+ | `size` | `"medium"` | Image resolution: `"small"` (8px), `"medium"` (32px), `"large"` (600px). Image only. |
227
+ | `model_type` | `"stochastic"` | Labeling function: `"deterministic"` or `"stochastic"` |
228
+ | `concept_missing_mech` | `"none"` | Missingness mechanism: `"none"`, `"mcar"`, or `"mnar"` |
229
+ | `intervention_budgets` | `[1, 3]` | Number of concepts to correct per sample |
230
+ | `intervention_thresholds` | `[0.2, 0.4]` | Concepts whose predicted probability is within this distance of 0.5 are candidates for intervention |
231
+ | `intervention_strategy` | `"kflip"` | `"kflip"` (up to *k* concepts) or `"exact_k"` (exactly *k*) |
232
+ | `alignment_constraints` | `{}` | Sign constraints on concept weights (e.g., `{"has_knees": 1}`). Retrains the label predictor and re-evaluates interventions. |
233
+ | `difficulty` | `"hard"` | Corpus difficulty (text only) |
234
+ | `generic_rate` | `0.7` | Fraction of test set using concept-ambiguous text (text only) |
235
+
236
+ </details>
237
+
238
+ > **Note:** The `llm` and `clip` regimes call the Gemini API at intervention time. Set your key before running:
239
+ > ```bash
240
+ > export GEMINI_API_KEY=your_key_here
241
+ > ```
242
+
243
+ ### Sudoku Validation
244
+
245
+ This benchmark targets automation settings where the system handles routine cases and defers uncertain ones to a human. The task is to determine whether a 9x9 Sudoku board is valid, i.e., contains the digits 1-9 exactly once in each row, column, and block. The 27 concepts correspond to the validity of each row, column, and 3x3 block. A board is valid if and only if all 27 concepts are true (AND structure), so a single violated concept is enough to invalidate the board. When the model abstains, a human can verify specific concepts (e.g., "is row 5 valid?") to resolve the uncertainty.
246
+
247
+ <p align="center">
248
+ <img src="https://raw.githubusercontent.com/ustunb/concept-benchmark/pretty_branch/docs/assets/sudoku_handwritten.png" width="400" alt="Sudoku board with handwritten digits and concept annotations">
249
+ </p>
250
+
251
+ The following example generates 1000 boards with handwritten digits, corrupting up to 9 cells in invalid boards. The concept-supervised (CS) model -- the Sudoku equivalent of a CBM -- predicts 27 binary concepts, then a label predictor determines board validity. The selective classification stage finds a confidence threshold that achieves at least 95% accuracy on kept predictions.
252
+
253
+ ```python
254
+ from concept_benchmark.benchmarks import sudoku
255
+ from concept_benchmark.config import SudokuBenchmarkConfig
256
+
257
+ cfg = SudokuBenchmarkConfig(
258
+ seed=171,
259
+ max_corrupt=9, # cells corrupted in invalid boards
260
+ handwriting=True, # render with handwritten digits
261
+ target_accuracy=0.95, # minimum accuracy on kept predictions
262
+ )
263
+
264
+ sudoku.setup_dataset(cfg) # generate boards + handwritten digit images
265
+ sudoku.train_ocr(cfg) # train digit recognizer on cell crops
266
+ cs_model = sudoku.train_cs(cfg) # concept-supervised model (27 concepts -> valid/invalid)
267
+ dnn = sudoku.train_dnn(cfg) # end-to-end baseline (no concepts)
268
+ results = sudoku.run_interventions(cfg, cs_model)
269
+ sel = sudoku.compute_selective_results(cfg) # selective accuracy and coverage
270
+
271
+ # Filter to the target accuracy threshold
272
+ t95 = sel[sel["target_accuracy"] == 0.95]
273
+ print(t95[["model", "selective_acc", "selective_cov"]].to_string(index=False))
274
+
275
+ from concept_benchmark.paths import results_dir
276
+ cfg.to_yaml(results_dir / "my_experiment.yaml") # save config for CLI use
277
+ ```
278
+
279
+ Expected output:
280
+ ```
281
+ model selective_acc selective_cov
282
+ dnn 0.875 0.04
283
+ cs 0.915 1.00
284
+ ```
285
+
286
+ To re-run this experiment from the CLI (with automatic caching):
287
+
288
+ ```bash
289
+ cbm-benchmark sudoku --config results/my_experiment.yaml
290
+ ```
291
+
292
+ The most important parameters are listed below. For the full list, see `SudokuBenchmarkConfig` in [`concept_benchmark/config.py`](concept_benchmark/config.py) or the fully-commented [`scripts/demo_sudoku.py`](scripts/demo_sudoku.py).
293
+
294
+ | Parameter | Default | Description |
295
+ |-----------|---------|-------------|
296
+ | `max_corrupt` | `9` | Maximum number of cells corrupted in invalid boards (higher values produce subtler errors) |
297
+ | `data_type` | `"image"` | `"image"` evaluates on OCR-inferred digits (adds OCR stage); `"tabular"` evaluates on ground-truth digit values (no OCR). Training always uses ground-truth values. |
298
+ | `handwriting` | `True` | Render digits in handwritten style (only applies when `data_type="image"`) |
299
+ | `target_accuracy` | `0.9` | Minimum accuracy required on kept predictions |
300
+
301
+ <details>
302
+ <summary>Remaining parameters</summary>
303
+
304
+ | Parameter | Default | Description |
305
+ |-----------|---------|-------------|
306
+ | `seed` | `171` | Random seed |
307
+ | `n_samples` | `1000` | Number of boards to generate |
308
+ | `valid_ratio` | `0.5` | Fraction of valid boards |
309
+ | `intervention_thresholds` | `[0.2, 0.4, 0.6, 0.8]` | Concept confidence thresholds that determine which concepts are candidates for verification |
310
+
311
+ </details>
312
+
313
+ ## CLI Reference
314
+
315
+ All benchmarks are run via `cbm-benchmark <benchmark>`. Use `cbm-benchmark <benchmark> --help` to see all options. All outputs (datasets, model weights, intervention CSVs, summary tables) are saved under `results/`.
316
+
317
+ ### Pipeline Stages
318
+
319
+ Each benchmark runs a sequence of stages. Use `--stages` to run a subset. The `setup` stage generates the synthetic dataset. The `collect` stage produces a single results table (e.g., `results/robot_ideal_seed1014_2d0aa353_results.csv`) with all accuracy numbers across models, intervention budgets, and alignment variants.
320
+
321
+ ```bash
322
+ # retrain models on existing data (skip data generation)
323
+ cbm-benchmark robot --stages cbm dnn intervene align collect
324
+
325
+ # rerun interventions with different regimes (models already trained)
326
+ cbm-benchmark robot --subconcept --regimes baseline expert --stages intervene collect
327
+ ```
328
+
329
+ | Benchmark | Stages (in order) |
330
+ |-----------|-------------------|
331
+ | `robot` | `setup` · `cbm` · `dnn` · `intervene` · `align` · `collect` |
332
+ | `sudoku` | `setup` · `ocr` · `cs` · `dnn` · `intervene` · `selective` · `align` · `collect` |
333
+ | `robot-text` | `setup` · `cbm` · `dnn` · `lfcbm` · `intervene` · `align` · `collect` |
334
+
335
+ ### Flags
336
+
337
+ | Flag | Benchmarks | Description |
338
+ |------|-----------|-------------|
339
+ | `--seed` | all | Random seed (defaults: robot 1014, sudoku 171, robot-text 1337) |
340
+ | `--stages` | all | Which stages to run (default: all) |
341
+ | `--config` | all | Path to YAML config file. CLI flags like `--regimes` and `--strategy` can further override values loaded from the file. |
342
+ | `--subconcept` | robot | Use subconcept variant (12 concepts with fine-grained foot subtypes instead of 7 coarse) |
343
+ | `--regimes` | robot, robot-text | Intervention regimes: `baseline`, `expert`, `subjective`, `machine`, `llm`, `clip` |
344
+ | `--strategy` | robot, robot-text | `kflip` (up to *k*) or `exact_k` (exactly *k* concepts) |
345
+ | `--concept-missing` | robot | Fraction of concept labels to mask (e.g. `0.2`) |
346
+ | `--concept-missing-mech` | robot | Missingness mechanism: `none`, `mcar`, or `mnar` |
347
+ | `--data-type` | sudoku | Data modality: `tabular` (ground-truth digits) or `image` (OCR from board images) |
348
+ | `--handwriting` | sudoku | Use handwritten digits (only applies with `--data-type image`) |
349
+ | `--no-handwriting` | sudoku | Use printed digits (only applies with `--data-type image`) |
350
+ | `--force-setup` | all | Regenerate all data (images, boards) from scratch, even if cached |
351
+ | `--force-retrain` | robot | Retrain LFCBM/subjective models even if cached |
352
+ | `--lfcbm` | robot-text | Also run the Label-Free CBM variant |
353
+ | `--llm-api-key` | robot | API key for LLM provider (alternative to `GEMINI_API_KEY` env var) |
354
+ | `--dry-run` | all | Print configuration and exit without running |
355
+ | `-v` / `-q` | all | Verbose / quiet output |
356
+
357
+ ## Citation
358
+
359
+ If you use this package in your research, please cite:
360
+
361
+ ```bibtex
362
+ @article{skirzynski2026concept,
363
+ title={Measuring What Matters: Synthetic Benchmarks for Concept Bottleneck Models},
364
+ author={Skirzy\'{n}ski, Julian and Cheon, Harry and Kadekodi, Shreyas and Stewart, Meredith and Ustun, Berk},
365
+ year={2026},
366
+ }
367
+ ```