codebook-lab 1.1.0__tar.gz → 1.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. {codebook_lab-1.1.0 → codebook_lab-1.2.0}/PKG-INFO +51 -5
  2. {codebook_lab-1.1.0 → codebook_lab-1.2.0}/README.md +50 -4
  3. {codebook_lab-1.1.0 → codebook_lab-1.2.0}/codebook_lab/__init__.py +15 -1
  4. {codebook_lab-1.1.0 → codebook_lab-1.2.0}/codebook_lab/annotate.py +3 -1
  5. codebook_lab-1.2.0/codebook_lab/human_reliability.py +915 -0
  6. {codebook_lab-1.1.0 → codebook_lab-1.2.0}/codebook_lab/types.py +40 -0
  7. {codebook_lab-1.1.0 → codebook_lab-1.2.0}/codebook_lab.egg-info/PKG-INFO +51 -5
  8. {codebook_lab-1.1.0 → codebook_lab-1.2.0}/codebook_lab.egg-info/SOURCES.txt +2 -4
  9. {codebook_lab-1.1.0 → codebook_lab-1.2.0}/pyproject.toml +1 -1
  10. codebook_lab-1.2.0/tests/test_human_reliability.py +275 -0
  11. {codebook_lab-1.1.0 → codebook_lab-1.2.0}/tests/test_types.py +27 -0
  12. codebook_lab-1.1.0/scripts/multi_run_example.py +0 -41
  13. codebook_lab-1.1.0/scripts/single_run_example.py +0 -48
  14. codebook_lab-1.1.0/tests/__init__.py +0 -0
  15. codebook_lab-1.1.0/tests/conftest.py +0 -13
  16. {codebook_lab-1.1.0 → codebook_lab-1.2.0}/LICENSE +0 -0
  17. {codebook_lab-1.1.0 → codebook_lab-1.2.0}/codebook_lab/conditions.py +0 -0
  18. {codebook_lab-1.1.0 → codebook_lab-1.2.0}/codebook_lab/examples.py +0 -0
  19. {codebook_lab-1.1.0 → codebook_lab-1.2.0}/codebook_lab/experiments.py +0 -0
  20. {codebook_lab-1.1.0 → codebook_lab-1.2.0}/codebook_lab/metrics.py +0 -0
  21. {codebook_lab-1.1.0 → codebook_lab-1.2.0}/codebook_lab/ollama.py +0 -0
  22. {codebook_lab-1.1.0 → codebook_lab-1.2.0}/codebook_lab/prompts.py +0 -0
  23. {codebook_lab-1.1.0 → codebook_lab-1.2.0}/codebook_lab/py.typed +0 -0
  24. {codebook_lab-1.1.0 → codebook_lab-1.2.0}/codebook_lab/tasks/__init__.py +0 -0
  25. {codebook_lab-1.1.0 → codebook_lab-1.2.0}/codebook_lab/tasks/policy-sentiment/codebook.json +0 -0
  26. {codebook_lab-1.1.0 → codebook_lab-1.2.0}/codebook_lab/tasks/policy-sentiment/ground-truth.csv +0 -0
  27. {codebook_lab-1.1.0 → codebook_lab-1.2.0}/codebook_lab.egg-info/dependency_links.txt +0 -0
  28. {codebook_lab-1.1.0 → codebook_lab-1.2.0}/codebook_lab.egg-info/requires.txt +0 -0
  29. {codebook_lab-1.1.0 → codebook_lab-1.2.0}/codebook_lab.egg-info/top_level.txt +0 -0
  30. {codebook_lab-1.1.0 → codebook_lab-1.2.0}/setup.cfg +0 -0
  31. {codebook_lab-1.1.0 → codebook_lab-1.2.0}/tests/test_conditions.py +0 -0
  32. {codebook_lab-1.1.0 → codebook_lab-1.2.0}/tests/test_examples.py +0 -0
  33. {codebook_lab-1.1.0 → codebook_lab-1.2.0}/tests/test_experiments.py +0 -0
  34. {codebook_lab-1.1.0 → codebook_lab-1.2.0}/tests/test_metrics_summary.py +0 -0
  35. {codebook_lab-1.1.0 → codebook_lab-1.2.0}/tests/test_package_import.py +0 -0
  36. {codebook_lab-1.1.0 → codebook_lab-1.2.0}/tests/test_prompts.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: codebook-lab
3
- Version: 1.1.0
3
+ Version: 1.2.0
4
4
  Summary: An LLM annotation experiment pipeline for computational social science.
5
5
  Author: Lorcan McLaren
6
6
  License-Expression: AGPL-3.0-only
@@ -45,7 +45,7 @@ Dynamic: license-file
45
45
 
46
46
  # CodeBook Lab
47
47
 
48
- [![DOI](https://zenodo.org/badge/1186234207.svg)](https://doi.org/10.5281/zenodo.19185921)
48
+ [![DOI](https://zenodo.org/badge/1186234207.svg)](https://doi.org/10.5281/zenodo.19185921) [![PyPI](https://img.shields.io/pypi/v/codebook-lab)](https://pypi.org/project/codebook-lab/) [![Python](https://img.shields.io/pypi/pyversions/codebook-lab)](https://pypi.org/project/codebook-lab/) [![License](https://img.shields.io/pypi/l/codebook-lab)](https://pypi.org/project/codebook-lab/)
49
49
 
50
50
  CodeBook Lab is an LLM annotation experiment pipeline for computational social science. It takes a codebook and labelled dataset from [CodeBook Studio](https://codebook.streamlit.app/) ([source](https://github.com/LorcanMcLaren/codebook-studio)) and runs structured experiments across the dimensions that matter for text-as-data research: model choice, model size, prompt style, zero-shot versus few-shot learning, and sampling hyperparameters — all benchmarked against human labels.
51
51
 
@@ -98,6 +98,7 @@ The package is organized around a small set of importable modules:
98
98
  - `codebook_lab.experiments`: high-level functions for single experiments and multi-run comparisons
99
99
  - `codebook_lab.annotate`: lower-level annotation functions
100
100
  - `codebook_lab.metrics`: evaluation and metrics functions
101
+ - `codebook_lab.human_reliability`: human coder validation, inter-coder reliability (ICR), disagreement, and ground-truth helpers
101
102
  - `codebook_lab.prompts`: prompt wrapper registry for built-in and custom prompt styles
102
103
  - `codebook_lab.examples`: helpers for bundled example tasks
103
104
  - `codebook_lab.types`: dataclasses for experiment specifications and result objects
@@ -281,6 +282,47 @@ Add multiple values to any field and the package sweeps them automatically. For
281
282
 
282
283
  If you are still designing a task and do not yet have human-coded labels, you can run annotation with `codebook_lab.run_annotation(...)` on an unlabeled CSV and add `ground-truth.csv` later when you want to score model performance with `codebook_lab.run_metrics(...)`.
283
284
 
285
+ ## Human Reliability And Adjudication
286
+
287
+ When multiple human coders annotate the same items, CodeBook Lab can validate the coder CSVs, calculate inter-coder reliability, find disagreements, and build a consensus `ground-truth.csv`.
288
+
289
+ ```python
290
+ from codebook_lab import build_human_ground_truth, calculate_human_reliability
291
+
292
+ coder_csvs = {
293
+ "coder1": "annotations/coder1.csv",
294
+ "coder2": "annotations/coder2.csv",
295
+ "coder3": "annotations/coder3.csv",
296
+ }
297
+
298
+ reliability = calculate_human_reliability(
299
+ codebook_path="codebook.json",
300
+ coder_csvs=coder_csvs,
301
+ output_dir="outputs/human_reliability",
302
+ )
303
+
304
+ ground_truth = build_human_ground_truth(
305
+ codebook_path="codebook.json",
306
+ coder_csvs=coder_csvs,
307
+ output_dir="outputs/ground_truth",
308
+ )
309
+ ```
310
+
311
+ Each coder CSV must contain a stable item identifier column. The default is `sample_id`; pass `id_column="..."` to use a different column. By default, coder assignments are inferred from the submitted files. To validate expected coverage, pass an optional assignment CSV in either long format (`sample_id,coder_id`) or wide format (`sample_id,ra_1,ra_2,...`).
312
+
313
+ Reliability outputs include `validation_issues.csv`, `pairwise_icr.csv`, `multirater_icr.csv`, `disagreements.csv`, and `summary.md`. Ground-truth outputs include `ground-truth.csv`, `adjudication_queue.csv`, and `validation_issues.csv`.
314
+
315
+ Rows without a strict majority are written to `adjudication_queue.csv`. Open that queue in CodeBook Studio's adjudication mode, fill in the unresolved blanks, export the completed queue, then rebuild:
316
+
317
+ ```python
318
+ resolved = build_human_ground_truth(
319
+ codebook_path="codebook.json",
320
+ coder_csvs=coder_csvs,
321
+ adjudications_csv="adjudication_queue.csv",
322
+ output_dir="outputs/ground_truth_resolved",
323
+ )
324
+ ```
325
+
284
326
  ## Advanced Customization
285
327
 
286
328
  If you want to go beyond the default wrappers and hyperparameters, `codebook_lab/annotate.py` and `codebook_lab/prompts.py` are the main extension points.
@@ -297,7 +339,7 @@ This project is licensed under the [GNU Affero General Public License v3.0](http
297
339
  If you use CodeBook Lab in research, please cite both:
298
340
 
299
341
  - this software package
300
- - the associated preprint
342
+ - the associated arXiv preprint
301
343
 
302
344
  Citation metadata is also available in the project's [`CITATION.cff`](https://github.com/LorcanMcLaren/codebook-lab/blob/main/CITATION.cff).
303
345
 
@@ -324,7 +366,7 @@ BibTeX:
324
366
 
325
367
  APSR style:
326
368
 
327
- McLaren, Lorcan, James P. Cross, Zuzanna Krakowska, Robin Rauner, and Martijn Schoonvelde. 2026. *Magic Words or Methodical Work? Challenging Conventional Wisdom in LLM-Based Political Text Annotation*. Preprint.
369
+ McLaren, Lorcan, James P. Cross, Zuzanna Krakowska, Robin Rauner, and Martijn Schoonvelde. 2026. *Magic Words or Methodical Work? Challenging Conventional Wisdom in LLM-Based Political Text Annotation*. arXiv preprint arXiv:2603.26898. [https://arxiv.org/abs/2603.26898](https://arxiv.org/abs/2603.26898).
328
370
 
329
371
  BibTeX:
330
372
 
@@ -333,6 +375,10 @@ BibTeX:
333
375
  author = {McLaren, Lorcan and Cross, James P. and Krakowska, Zuzanna and Rauner, Robin and Schoonvelde, Martijn},
334
376
  title = {Magic Words or Methodical Work? Challenging Conventional Wisdom in LLM-Based Political Text Annotation},
335
377
  year = {2026},
336
- note = {Preprint}
378
+ eprint = {2603.26898},
379
+ archivePrefix = {arXiv},
380
+ primaryClass = {cs.CL},
381
+ doi = {10.48550/arXiv.2603.26898},
382
+ url = {https://arxiv.org/abs/2603.26898}
337
383
  }
338
384
  ```
@@ -1,6 +1,6 @@
1
1
  # CodeBook Lab
2
2
 
3
- [![DOI](https://zenodo.org/badge/1186234207.svg)](https://doi.org/10.5281/zenodo.19185921)
3
+ [![DOI](https://zenodo.org/badge/1186234207.svg)](https://doi.org/10.5281/zenodo.19185921) [![PyPI](https://img.shields.io/pypi/v/codebook-lab)](https://pypi.org/project/codebook-lab/) [![Python](https://img.shields.io/pypi/pyversions/codebook-lab)](https://pypi.org/project/codebook-lab/) [![License](https://img.shields.io/pypi/l/codebook-lab)](https://pypi.org/project/codebook-lab/)
4
4
 
5
5
  CodeBook Lab is an LLM annotation experiment pipeline for computational social science. It takes a codebook and labelled dataset from [CodeBook Studio](https://codebook.streamlit.app/) ([source](https://github.com/LorcanMcLaren/codebook-studio)) and runs structured experiments across the dimensions that matter for text-as-data research: model choice, model size, prompt style, zero-shot versus few-shot learning, and sampling hyperparameters — all benchmarked against human labels.
6
6
 
@@ -53,6 +53,7 @@ The package is organized around a small set of importable modules:
53
53
  - `codebook_lab.experiments`: high-level functions for single experiments and multi-run comparisons
54
54
  - `codebook_lab.annotate`: lower-level annotation functions
55
55
  - `codebook_lab.metrics`: evaluation and metrics functions
56
+ - `codebook_lab.human_reliability`: human coder validation, inter-coder reliability (ICR), disagreement, and ground-truth helpers
56
57
  - `codebook_lab.prompts`: prompt wrapper registry for built-in and custom prompt styles
57
58
  - `codebook_lab.examples`: helpers for bundled example tasks
58
59
  - `codebook_lab.types`: dataclasses for experiment specifications and result objects
@@ -236,6 +237,47 @@ Add multiple values to any field and the package sweeps them automatically. For
236
237
 
237
238
  If you are still designing a task and do not yet have human-coded labels, you can run annotation with `codebook_lab.run_annotation(...)` on an unlabeled CSV and add `ground-truth.csv` later when you want to score model performance with `codebook_lab.run_metrics(...)`.
238
239
 
240
+ ## Human Reliability And Adjudication
241
+
242
+ When multiple human coders annotate the same items, CodeBook Lab can validate the coder CSVs, calculate inter-coder reliability, find disagreements, and build a consensus `ground-truth.csv`.
243
+
244
+ ```python
245
+ from codebook_lab import build_human_ground_truth, calculate_human_reliability
246
+
247
+ coder_csvs = {
248
+ "coder1": "annotations/coder1.csv",
249
+ "coder2": "annotations/coder2.csv",
250
+ "coder3": "annotations/coder3.csv",
251
+ }
252
+
253
+ reliability = calculate_human_reliability(
254
+ codebook_path="codebook.json",
255
+ coder_csvs=coder_csvs,
256
+ output_dir="outputs/human_reliability",
257
+ )
258
+
259
+ ground_truth = build_human_ground_truth(
260
+ codebook_path="codebook.json",
261
+ coder_csvs=coder_csvs,
262
+ output_dir="outputs/ground_truth",
263
+ )
264
+ ```
265
+
266
+ Each coder CSV must contain a stable item identifier column. The default is `sample_id`; pass `id_column="..."` to use a different column. By default, coder assignments are inferred from the submitted files. To validate expected coverage, pass an optional assignment CSV in either long format (`sample_id,coder_id`) or wide format (`sample_id,ra_1,ra_2,...`).
267
+
268
+ Reliability outputs include `validation_issues.csv`, `pairwise_icr.csv`, `multirater_icr.csv`, `disagreements.csv`, and `summary.md`. Ground-truth outputs include `ground-truth.csv`, `adjudication_queue.csv`, and `validation_issues.csv`.
269
+
270
+ Rows without a strict majority are written to `adjudication_queue.csv`. Open that queue in CodeBook Studio's adjudication mode, fill in the unresolved blanks, export the completed queue, then rebuild:
271
+
272
+ ```python
273
+ resolved = build_human_ground_truth(
274
+ codebook_path="codebook.json",
275
+ coder_csvs=coder_csvs,
276
+ adjudications_csv="adjudication_queue.csv",
277
+ output_dir="outputs/ground_truth_resolved",
278
+ )
279
+ ```
280
+
239
281
  ## Advanced Customization
240
282
 
241
283
  If you want to go beyond the default wrappers and hyperparameters, `codebook_lab/annotate.py` and `codebook_lab/prompts.py` are the main extension points.
@@ -252,7 +294,7 @@ This project is licensed under the [GNU Affero General Public License v3.0](http
252
294
  If you use CodeBook Lab in research, please cite both:
253
295
 
254
296
  - this software package
255
- - the associated preprint
297
+ - the associated arXiv preprint
256
298
 
257
299
  Citation metadata is also available in the project's [`CITATION.cff`](https://github.com/LorcanMcLaren/codebook-lab/blob/main/CITATION.cff).
258
300
 
@@ -279,7 +321,7 @@ BibTeX:
279
321
 
280
322
  APSR style:
281
323
 
282
- McLaren, Lorcan, James P. Cross, Zuzanna Krakowska, Robin Rauner, and Martijn Schoonvelde. 2026. *Magic Words or Methodical Work? Challenging Conventional Wisdom in LLM-Based Political Text Annotation*. Preprint.
324
+ McLaren, Lorcan, James P. Cross, Zuzanna Krakowska, Robin Rauner, and Martijn Schoonvelde. 2026. *Magic Words or Methodical Work? Challenging Conventional Wisdom in LLM-Based Political Text Annotation*. arXiv preprint arXiv:2603.26898. [https://arxiv.org/abs/2603.26898](https://arxiv.org/abs/2603.26898).
283
325
 
284
326
  BibTeX:
285
327
 
@@ -288,6 +330,10 @@ BibTeX:
288
330
  author = {McLaren, Lorcan and Cross, James P. and Krakowska, Zuzanna and Rauner, Robin and Schoonvelde, Martijn},
289
331
  title = {Magic Words or Methodical Work? Challenging Conventional Wisdom in LLM-Based Political Text Annotation},
290
332
  year = {2026},
291
- note = {Preprint}
333
+ eprint = {2603.26898},
334
+ archivePrefix = {arXiv},
335
+ primaryClass = {cs.CL},
336
+ doi = {10.48550/arXiv.2603.26898},
337
+ url = {https://arxiv.org/abs/2603.26898}
292
338
  }
293
339
  ```
@@ -12,11 +12,19 @@ from .prompts import (
12
12
  list_prompt_wrappers,
13
13
  register_prompt_wrapper,
14
14
  )
15
- from .types import AnnotationRunResult, ExperimentRunResult, ExperimentSpec, MetricsRunResult
15
+ from .types import (
16
+ AnnotationRunResult,
17
+ ExperimentRunResult,
18
+ ExperimentSpec,
19
+ HumanGroundTruthResult,
20
+ HumanReliabilityResult,
21
+ MetricsRunResult,
22
+ )
16
23
 
17
24
  if TYPE_CHECKING:
18
25
  from .annotate import run_annotation
19
26
  from .experiments import expand_param_grid, resolve_task_dir, run_experiment, run_experiment_grid
27
+ from .human_reliability import build_human_ground_truth, calculate_human_reliability
20
28
  from .metrics import run_metrics
21
29
 
22
30
  try:
@@ -30,6 +38,8 @@ _LAZY_EXPORTS = {
30
38
  "run_annotation": (".annotate", "run_annotation"),
31
39
  "run_experiment": (".experiments", "run_experiment"),
32
40
  "run_experiment_grid": (".experiments", "run_experiment_grid"),
41
+ "build_human_ground_truth": (".human_reliability", "build_human_ground_truth"),
42
+ "calculate_human_reliability": (".human_reliability", "calculate_human_reliability"),
33
43
  "run_metrics": (".metrics", "run_metrics"),
34
44
  }
35
45
 
@@ -38,7 +48,11 @@ __all__ = [
38
48
  "AnnotationRunResult",
39
49
  "ExperimentRunResult",
40
50
  "ExperimentSpec",
51
+ "HumanGroundTruthResult",
52
+ "HumanReliabilityResult",
41
53
  "MetricsRunResult",
54
+ "build_human_ground_truth",
55
+ "calculate_human_reliability",
42
56
  "copy_example_task",
43
57
  "ensure_ollama_available",
44
58
  "ensure_ollama_model",
@@ -205,7 +205,9 @@ def generate_response(chain, prompt, char_counts, timing_data, row_num=None, ann
205
205
 
206
206
  structured_chain = (
207
207
  _PROMPT_TEMPLATE
208
- | chain.with_structured_output(AnnotationResponse, include_raw=True)
208
+ | chain.with_structured_output(
209
+ AnnotationResponse, method="json_schema", include_raw=True
210
+ )
209
211
  )
210
212
 
211
213
  start_time = time.time()