codebook-lab 1.1.0__tar.gz → 1.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {codebook_lab-1.1.0 → codebook_lab-1.2.0}/PKG-INFO +51 -5
- {codebook_lab-1.1.0 → codebook_lab-1.2.0}/README.md +50 -4
- {codebook_lab-1.1.0 → codebook_lab-1.2.0}/codebook_lab/__init__.py +15 -1
- {codebook_lab-1.1.0 → codebook_lab-1.2.0}/codebook_lab/annotate.py +3 -1
- codebook_lab-1.2.0/codebook_lab/human_reliability.py +915 -0
- {codebook_lab-1.1.0 → codebook_lab-1.2.0}/codebook_lab/types.py +40 -0
- {codebook_lab-1.1.0 → codebook_lab-1.2.0}/codebook_lab.egg-info/PKG-INFO +51 -5
- {codebook_lab-1.1.0 → codebook_lab-1.2.0}/codebook_lab.egg-info/SOURCES.txt +2 -4
- {codebook_lab-1.1.0 → codebook_lab-1.2.0}/pyproject.toml +1 -1
- codebook_lab-1.2.0/tests/test_human_reliability.py +275 -0
- {codebook_lab-1.1.0 → codebook_lab-1.2.0}/tests/test_types.py +27 -0
- codebook_lab-1.1.0/scripts/multi_run_example.py +0 -41
- codebook_lab-1.1.0/scripts/single_run_example.py +0 -48
- codebook_lab-1.1.0/tests/__init__.py +0 -0
- codebook_lab-1.1.0/tests/conftest.py +0 -13
- {codebook_lab-1.1.0 → codebook_lab-1.2.0}/LICENSE +0 -0
- {codebook_lab-1.1.0 → codebook_lab-1.2.0}/codebook_lab/conditions.py +0 -0
- {codebook_lab-1.1.0 → codebook_lab-1.2.0}/codebook_lab/examples.py +0 -0
- {codebook_lab-1.1.0 → codebook_lab-1.2.0}/codebook_lab/experiments.py +0 -0
- {codebook_lab-1.1.0 → codebook_lab-1.2.0}/codebook_lab/metrics.py +0 -0
- {codebook_lab-1.1.0 → codebook_lab-1.2.0}/codebook_lab/ollama.py +0 -0
- {codebook_lab-1.1.0 → codebook_lab-1.2.0}/codebook_lab/prompts.py +0 -0
- {codebook_lab-1.1.0 → codebook_lab-1.2.0}/codebook_lab/py.typed +0 -0
- {codebook_lab-1.1.0 → codebook_lab-1.2.0}/codebook_lab/tasks/__init__.py +0 -0
- {codebook_lab-1.1.0 → codebook_lab-1.2.0}/codebook_lab/tasks/policy-sentiment/codebook.json +0 -0
- {codebook_lab-1.1.0 → codebook_lab-1.2.0}/codebook_lab/tasks/policy-sentiment/ground-truth.csv +0 -0
- {codebook_lab-1.1.0 → codebook_lab-1.2.0}/codebook_lab.egg-info/dependency_links.txt +0 -0
- {codebook_lab-1.1.0 → codebook_lab-1.2.0}/codebook_lab.egg-info/requires.txt +0 -0
- {codebook_lab-1.1.0 → codebook_lab-1.2.0}/codebook_lab.egg-info/top_level.txt +0 -0
- {codebook_lab-1.1.0 → codebook_lab-1.2.0}/setup.cfg +0 -0
- {codebook_lab-1.1.0 → codebook_lab-1.2.0}/tests/test_conditions.py +0 -0
- {codebook_lab-1.1.0 → codebook_lab-1.2.0}/tests/test_examples.py +0 -0
- {codebook_lab-1.1.0 → codebook_lab-1.2.0}/tests/test_experiments.py +0 -0
- {codebook_lab-1.1.0 → codebook_lab-1.2.0}/tests/test_metrics_summary.py +0 -0
- {codebook_lab-1.1.0 → codebook_lab-1.2.0}/tests/test_package_import.py +0 -0
- {codebook_lab-1.1.0 → codebook_lab-1.2.0}/tests/test_prompts.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: codebook-lab
|
|
3
|
-
Version: 1.1.0
|
|
3
|
+
Version: 1.2.0
|
|
4
4
|
Summary: An LLM annotation experiment pipeline for computational social science.
|
|
5
5
|
Author: Lorcan McLaren
|
|
6
6
|
License-Expression: AGPL-3.0-only
|
|
@@ -45,7 +45,7 @@ Dynamic: license-file
|
|
|
45
45
|
|
|
46
46
|
# CodeBook Lab
|
|
47
47
|
|
|
48
|
-
[](https://doi.org/10.5281/zenodo.19185921)
|
|
48
|
+
[](https://doi.org/10.5281/zenodo.19185921) [](https://pypi.org/project/codebook-lab/) [](https://pypi.org/project/codebook-lab/) [](https://pypi.org/project/codebook-lab/)
|
|
49
49
|
|
|
50
50
|
CodeBook Lab is an LLM annotation experiment pipeline for computational social science. It takes a codebook and labelled dataset from [CodeBook Studio](https://codebook.streamlit.app/) ([source](https://github.com/LorcanMcLaren/codebook-studio)) and runs structured experiments across the dimensions that matter for text-as-data research: model choice, model size, prompt style, zero-shot versus few-shot learning, and sampling hyperparameters — all benchmarked against human labels.
|
|
51
51
|
|
|
@@ -98,6 +98,7 @@ The package is organized around a small set of importable modules:
|
|
|
98
98
|
- `codebook_lab.experiments`: high-level functions for single experiments and multi-run comparisons
|
|
99
99
|
- `codebook_lab.annotate`: lower-level annotation functions
|
|
100
100
|
- `codebook_lab.metrics`: evaluation and metrics functions
|
|
101
|
+
- `codebook_lab.human_reliability`: human coder validation, ICR, disagreement, and ground-truth helpers
|
|
101
102
|
- `codebook_lab.prompts`: prompt wrapper registry for built-in and custom prompt styles
|
|
102
103
|
- `codebook_lab.examples`: helpers for bundled example tasks
|
|
103
104
|
- `codebook_lab.types`: dataclasses for experiment specifications and result objects
|
|
@@ -281,6 +282,47 @@ Add multiple values to any field and the package sweeps them automatically. For
|
|
|
281
282
|
|
|
282
283
|
If you are still designing a task and do not yet have human-coded labels, you can run annotation with `codebook_lab.run_annotation(...)` on an unlabeled CSV and add `ground-truth.csv` later when you want to score model performance with `codebook_lab.run_metrics(...)`.
|
|
283
284
|
|
|
285
|
+
## Human Reliability And Adjudication
|
|
286
|
+
|
|
287
|
+
When multiple human coders annotate the same items, CodeBook Lab can validate the coder CSVs, calculate inter-coder reliability, find disagreements, and build a consensus `ground-truth.csv`.
|
|
288
|
+
|
|
289
|
+
```python
|
|
290
|
+
from codebook_lab import build_human_ground_truth, calculate_human_reliability
|
|
291
|
+
|
|
292
|
+
coder_csvs = {
|
|
293
|
+
"coder1": "annotations/coder1.csv",
|
|
294
|
+
"coder2": "annotations/coder2.csv",
|
|
295
|
+
"coder3": "annotations/coder3.csv",
|
|
296
|
+
}
|
|
297
|
+
|
|
298
|
+
reliability = calculate_human_reliability(
|
|
299
|
+
codebook_path="codebook.json",
|
|
300
|
+
coder_csvs=coder_csvs,
|
|
301
|
+
output_dir="outputs/human_reliability",
|
|
302
|
+
)
|
|
303
|
+
|
|
304
|
+
ground_truth = build_human_ground_truth(
|
|
305
|
+
codebook_path="codebook.json",
|
|
306
|
+
coder_csvs=coder_csvs,
|
|
307
|
+
output_dir="outputs/ground_truth",
|
|
308
|
+
)
|
|
309
|
+
```
|
|
310
|
+
|
|
311
|
+
Each coder CSV must contain a stable item identifier column. The default is `sample_id`; pass `id_column="..."` to use a different column. By default, coder assignments are inferred from the submitted files. To validate expected coverage, pass an optional assignment CSV in either long format (`sample_id,coder_id`) or wide format (`sample_id,ra_1,ra_2,...`).
|
|
312
|
+
|
|
313
|
+
Reliability outputs include `validation_issues.csv`, `pairwise_icr.csv`, `multirater_icr.csv`, `disagreements.csv`, and `summary.md`. Ground-truth outputs include `ground-truth.csv`, `adjudication_queue.csv`, and `validation_issues.csv`.
|
|
314
|
+
|
|
315
|
+
Rows without a strict majority are written to `adjudication_queue.csv`. Open that queue in CodeBook Studio's adjudication mode, fill the unresolved blanks, export the completed queue, then rebuild:
|
|
316
|
+
|
|
317
|
+
```python
|
|
318
|
+
resolved = build_human_ground_truth(
|
|
319
|
+
codebook_path="codebook.json",
|
|
320
|
+
coder_csvs=coder_csvs,
|
|
321
|
+
adjudications_csv="adjudication_queue.csv",
|
|
322
|
+
output_dir="outputs/ground_truth_resolved",
|
|
323
|
+
)
|
|
324
|
+
```
|
|
325
|
+
|
|
284
326
|
## Advanced Customization
|
|
285
327
|
|
|
286
328
|
If you want to go beyond the default wrappers and hyperparameters, `codebook_lab/annotate.py` and `codebook_lab/prompts.py` are the main extension points.
|
|
@@ -297,7 +339,7 @@ This project is licensed under the [GNU Affero General Public License v3.0](http
|
|
|
297
339
|
If you use CodeBook Lab in research, please cite both:
|
|
298
340
|
|
|
299
341
|
- this software package
|
|
300
|
-
- the associated preprint
|
|
342
|
+
- the associated arXiv preprint
|
|
301
343
|
|
|
302
344
|
Citation metadata is also available in the project's [`CITATION.cff`](https://github.com/LorcanMcLaren/codebook-lab/blob/main/CITATION.cff).
|
|
303
345
|
|
|
@@ -324,7 +366,7 @@ BibTeX:
|
|
|
324
366
|
|
|
325
367
|
APSR style:
|
|
326
368
|
|
|
327
|
-
McLaren, Lorcan, James P. Cross, Zuzanna Krakowska, Robin Rauner, and Martijn Schoonvelde. 2026. *Magic Words or Methodical Work? Challenging Conventional Wisdom in LLM-Based Political Text Annotation*.
|
|
369
|
+
McLaren, Lorcan, James P. Cross, Zuzanna Krakowska, Robin Rauner, and Martijn Schoonvelde. 2026. *Magic Words or Methodical Work? Challenging Conventional Wisdom in LLM-Based Political Text Annotation*. arXiv preprint arXiv:2603.26898. [https://arxiv.org/abs/2603.26898](https://arxiv.org/abs/2603.26898).
|
|
328
370
|
|
|
329
371
|
BibTeX:
|
|
330
372
|
|
|
@@ -333,6 +375,10 @@ BibTeX:
|
|
|
333
375
|
author = {McLaren, Lorcan and Cross, James P. and Krakowska, Zuzanna and Rauner, Robin and Schoonvelde, Martijn},
|
|
334
376
|
title = {Magic Words or Methodical Work? Challenging Conventional Wisdom in LLM-Based Political Text Annotation},
|
|
335
377
|
year = {2026},
|
|
336
|
-
|
|
378
|
+
eprint = {2603.26898},
|
|
379
|
+
archivePrefix = {arXiv},
|
|
380
|
+
primaryClass = {cs.CL},
|
|
381
|
+
doi = {10.48550/arXiv.2603.26898},
|
|
382
|
+
url = {https://arxiv.org/abs/2603.26898}
|
|
337
383
|
}
|
|
338
384
|
```
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# CodeBook Lab
|
|
2
2
|
|
|
3
|
-
[](https://doi.org/10.5281/zenodo.19185921)
|
|
3
|
+
[](https://doi.org/10.5281/zenodo.19185921) [](https://pypi.org/project/codebook-lab/) [](https://pypi.org/project/codebook-lab/) [](https://pypi.org/project/codebook-lab/)
|
|
4
4
|
|
|
5
5
|
CodeBook Lab is an LLM annotation experiment pipeline for computational social science. It takes a codebook and labelled dataset from [CodeBook Studio](https://codebook.streamlit.app/) ([source](https://github.com/LorcanMcLaren/codebook-studio)) and runs structured experiments across the dimensions that matter for text-as-data research: model choice, model size, prompt style, zero-shot versus few-shot learning, and sampling hyperparameters — all benchmarked against human labels.
|
|
6
6
|
|
|
@@ -53,6 +53,7 @@ The package is organized around a small set of importable modules:
|
|
|
53
53
|
- `codebook_lab.experiments`: high-level functions for single experiments and multi-run comparisons
|
|
54
54
|
- `codebook_lab.annotate`: lower-level annotation functions
|
|
55
55
|
- `codebook_lab.metrics`: evaluation and metrics functions
|
|
56
|
+
- `codebook_lab.human_reliability`: human coder validation, ICR, disagreement, and ground-truth helpers
|
|
56
57
|
- `codebook_lab.prompts`: prompt wrapper registry for built-in and custom prompt styles
|
|
57
58
|
- `codebook_lab.examples`: helpers for bundled example tasks
|
|
58
59
|
- `codebook_lab.types`: dataclasses for experiment specifications and result objects
|
|
@@ -236,6 +237,47 @@ Add multiple values to any field and the package sweeps them automatically. For
|
|
|
236
237
|
|
|
237
238
|
If you are still designing a task and do not yet have human-coded labels, you can run annotation with `codebook_lab.run_annotation(...)` on an unlabeled CSV and add `ground-truth.csv` later when you want to score model performance with `codebook_lab.run_metrics(...)`.
|
|
238
239
|
|
|
240
|
+
## Human Reliability And Adjudication
|
|
241
|
+
|
|
242
|
+
When multiple human coders annotate the same items, CodeBook Lab can validate the coder CSVs, calculate inter-coder reliability, find disagreements, and build a consensus `ground-truth.csv`.
|
|
243
|
+
|
|
244
|
+
```python
|
|
245
|
+
from codebook_lab import build_human_ground_truth, calculate_human_reliability
|
|
246
|
+
|
|
247
|
+
coder_csvs = {
|
|
248
|
+
"coder1": "annotations/coder1.csv",
|
|
249
|
+
"coder2": "annotations/coder2.csv",
|
|
250
|
+
"coder3": "annotations/coder3.csv",
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
reliability = calculate_human_reliability(
|
|
254
|
+
codebook_path="codebook.json",
|
|
255
|
+
coder_csvs=coder_csvs,
|
|
256
|
+
output_dir="outputs/human_reliability",
|
|
257
|
+
)
|
|
258
|
+
|
|
259
|
+
ground_truth = build_human_ground_truth(
|
|
260
|
+
codebook_path="codebook.json",
|
|
261
|
+
coder_csvs=coder_csvs,
|
|
262
|
+
output_dir="outputs/ground_truth",
|
|
263
|
+
)
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
Each coder CSV must contain a stable item identifier column. The default is `sample_id`; pass `id_column="..."` to use a different column. By default, coder assignments are inferred from the submitted files. To validate expected coverage, pass an optional assignment CSV in either long format (`sample_id,coder_id`) or wide format (`sample_id,ra_1,ra_2,...`).
|
|
267
|
+
|
|
268
|
+
Reliability outputs include `validation_issues.csv`, `pairwise_icr.csv`, `multirater_icr.csv`, `disagreements.csv`, and `summary.md`. Ground-truth outputs include `ground-truth.csv`, `adjudication_queue.csv`, and `validation_issues.csv`.
|
|
269
|
+
|
|
270
|
+
Rows without a strict majority are written to `adjudication_queue.csv`. Open that queue in CodeBook Studio's adjudication mode, fill the unresolved blanks, export the completed queue, then rebuild:
|
|
271
|
+
|
|
272
|
+
```python
|
|
273
|
+
resolved = build_human_ground_truth(
|
|
274
|
+
codebook_path="codebook.json",
|
|
275
|
+
coder_csvs=coder_csvs,
|
|
276
|
+
adjudications_csv="adjudication_queue.csv",
|
|
277
|
+
output_dir="outputs/ground_truth_resolved",
|
|
278
|
+
)
|
|
279
|
+
```
|
|
280
|
+
|
|
239
281
|
## Advanced Customization
|
|
240
282
|
|
|
241
283
|
If you want to go beyond the default wrappers and hyperparameters, `codebook_lab/annotate.py` and `codebook_lab/prompts.py` are the main extension points.
|
|
@@ -252,7 +294,7 @@ This project is licensed under the [GNU Affero General Public License v3.0](http
|
|
|
252
294
|
If you use CodeBook Lab in research, please cite both:
|
|
253
295
|
|
|
254
296
|
- this software package
|
|
255
|
-
- the associated preprint
|
|
297
|
+
- the associated arXiv preprint
|
|
256
298
|
|
|
257
299
|
Citation metadata is also available in the project's [`CITATION.cff`](https://github.com/LorcanMcLaren/codebook-lab/blob/main/CITATION.cff).
|
|
258
300
|
|
|
@@ -279,7 +321,7 @@ BibTeX:
|
|
|
279
321
|
|
|
280
322
|
APSR style:
|
|
281
323
|
|
|
282
|
-
McLaren, Lorcan, James P. Cross, Zuzanna Krakowska, Robin Rauner, and Martijn Schoonvelde. 2026. *Magic Words or Methodical Work? Challenging Conventional Wisdom in LLM-Based Political Text Annotation*.
|
|
324
|
+
McLaren, Lorcan, James P. Cross, Zuzanna Krakowska, Robin Rauner, and Martijn Schoonvelde. 2026. *Magic Words or Methodical Work? Challenging Conventional Wisdom in LLM-Based Political Text Annotation*. arXiv preprint arXiv:2603.26898. [https://arxiv.org/abs/2603.26898](https://arxiv.org/abs/2603.26898).
|
|
283
325
|
|
|
284
326
|
BibTeX:
|
|
285
327
|
|
|
@@ -288,6 +330,10 @@ BibTeX:
|
|
|
288
330
|
author = {McLaren, Lorcan and Cross, James P. and Krakowska, Zuzanna and Rauner, Robin and Schoonvelde, Martijn},
|
|
289
331
|
title = {Magic Words or Methodical Work? Challenging Conventional Wisdom in LLM-Based Political Text Annotation},
|
|
290
332
|
year = {2026},
|
|
291
|
-
|
|
333
|
+
eprint = {2603.26898},
|
|
334
|
+
archivePrefix = {arXiv},
|
|
335
|
+
primaryClass = {cs.CL},
|
|
336
|
+
doi = {10.48550/arXiv.2603.26898},
|
|
337
|
+
url = {https://arxiv.org/abs/2603.26898}
|
|
292
338
|
}
|
|
293
339
|
```
|
|
@@ -12,11 +12,19 @@ from .prompts import (
|
|
|
12
12
|
list_prompt_wrappers,
|
|
13
13
|
register_prompt_wrapper,
|
|
14
14
|
)
|
|
15
|
-
from .types import AnnotationRunResult, ExperimentRunResult, ExperimentSpec, MetricsRunResult
|
|
15
|
+
from .types import (
|
|
16
|
+
AnnotationRunResult,
|
|
17
|
+
ExperimentRunResult,
|
|
18
|
+
ExperimentSpec,
|
|
19
|
+
HumanGroundTruthResult,
|
|
20
|
+
HumanReliabilityResult,
|
|
21
|
+
MetricsRunResult,
|
|
22
|
+
)
|
|
16
23
|
|
|
17
24
|
if TYPE_CHECKING:
|
|
18
25
|
from .annotate import run_annotation
|
|
19
26
|
from .experiments import expand_param_grid, resolve_task_dir, run_experiment, run_experiment_grid
|
|
27
|
+
from .human_reliability import build_human_ground_truth, calculate_human_reliability
|
|
20
28
|
from .metrics import run_metrics
|
|
21
29
|
|
|
22
30
|
try:
|
|
@@ -30,6 +38,8 @@ _LAZY_EXPORTS = {
|
|
|
30
38
|
"run_annotation": (".annotate", "run_annotation"),
|
|
31
39
|
"run_experiment": (".experiments", "run_experiment"),
|
|
32
40
|
"run_experiment_grid": (".experiments", "run_experiment_grid"),
|
|
41
|
+
"build_human_ground_truth": (".human_reliability", "build_human_ground_truth"),
|
|
42
|
+
"calculate_human_reliability": (".human_reliability", "calculate_human_reliability"),
|
|
33
43
|
"run_metrics": (".metrics", "run_metrics"),
|
|
34
44
|
}
|
|
35
45
|
|
|
@@ -38,7 +48,11 @@ __all__ = [
|
|
|
38
48
|
"AnnotationRunResult",
|
|
39
49
|
"ExperimentRunResult",
|
|
40
50
|
"ExperimentSpec",
|
|
51
|
+
"HumanGroundTruthResult",
|
|
52
|
+
"HumanReliabilityResult",
|
|
41
53
|
"MetricsRunResult",
|
|
54
|
+
"build_human_ground_truth",
|
|
55
|
+
"calculate_human_reliability",
|
|
42
56
|
"copy_example_task",
|
|
43
57
|
"ensure_ollama_available",
|
|
44
58
|
"ensure_ollama_model",
|
|
@@ -205,7 +205,9 @@ def generate_response(chain, prompt, char_counts, timing_data, row_num=None, ann
|
|
|
205
205
|
|
|
206
206
|
structured_chain = (
|
|
207
207
|
_PROMPT_TEMPLATE
|
|
208
|
-
| chain.with_structured_output(
|
|
208
|
+
| chain.with_structured_output(
|
|
209
|
+
AnnotationResponse, method="json_schema", include_raw=True
|
|
210
|
+
)
|
|
209
211
|
)
|
|
210
212
|
|
|
211
213
|
start_time = time.time()
|