codebook-lab 1.1.1__tar.gz → 1.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. {codebook_lab-1.1.1 → codebook_lab-1.2.0}/PKG-INFO +43 -1
  2. {codebook_lab-1.1.1 → codebook_lab-1.2.0}/README.md +42 -0
  3. {codebook_lab-1.1.1 → codebook_lab-1.2.0}/codebook_lab/__init__.py +15 -1
  4. codebook_lab-1.2.0/codebook_lab/human_reliability.py +915 -0
  5. {codebook_lab-1.1.1 → codebook_lab-1.2.0}/codebook_lab/types.py +40 -0
  6. {codebook_lab-1.1.1 → codebook_lab-1.2.0}/codebook_lab.egg-info/PKG-INFO +43 -1
  7. {codebook_lab-1.1.1 → codebook_lab-1.2.0}/codebook_lab.egg-info/SOURCES.txt +2 -0
  8. {codebook_lab-1.1.1 → codebook_lab-1.2.0}/pyproject.toml +1 -1
  9. codebook_lab-1.2.0/tests/test_human_reliability.py +275 -0
  10. {codebook_lab-1.1.1 → codebook_lab-1.2.0}/tests/test_types.py +27 -0
  11. {codebook_lab-1.1.1 → codebook_lab-1.2.0}/LICENSE +0 -0
  12. {codebook_lab-1.1.1 → codebook_lab-1.2.0}/codebook_lab/annotate.py +0 -0
  13. {codebook_lab-1.1.1 → codebook_lab-1.2.0}/codebook_lab/conditions.py +0 -0
  14. {codebook_lab-1.1.1 → codebook_lab-1.2.0}/codebook_lab/examples.py +0 -0
  15. {codebook_lab-1.1.1 → codebook_lab-1.2.0}/codebook_lab/experiments.py +0 -0
  16. {codebook_lab-1.1.1 → codebook_lab-1.2.0}/codebook_lab/metrics.py +0 -0
  17. {codebook_lab-1.1.1 → codebook_lab-1.2.0}/codebook_lab/ollama.py +0 -0
  18. {codebook_lab-1.1.1 → codebook_lab-1.2.0}/codebook_lab/prompts.py +0 -0
  19. {codebook_lab-1.1.1 → codebook_lab-1.2.0}/codebook_lab/py.typed +0 -0
  20. {codebook_lab-1.1.1 → codebook_lab-1.2.0}/codebook_lab/tasks/__init__.py +0 -0
  21. {codebook_lab-1.1.1 → codebook_lab-1.2.0}/codebook_lab/tasks/policy-sentiment/codebook.json +0 -0
  22. {codebook_lab-1.1.1 → codebook_lab-1.2.0}/codebook_lab/tasks/policy-sentiment/ground-truth.csv +0 -0
  23. {codebook_lab-1.1.1 → codebook_lab-1.2.0}/codebook_lab.egg-info/dependency_links.txt +0 -0
  24. {codebook_lab-1.1.1 → codebook_lab-1.2.0}/codebook_lab.egg-info/requires.txt +0 -0
  25. {codebook_lab-1.1.1 → codebook_lab-1.2.0}/codebook_lab.egg-info/top_level.txt +0 -0
  26. {codebook_lab-1.1.1 → codebook_lab-1.2.0}/setup.cfg +0 -0
  27. {codebook_lab-1.1.1 → codebook_lab-1.2.0}/tests/test_conditions.py +0 -0
  28. {codebook_lab-1.1.1 → codebook_lab-1.2.0}/tests/test_examples.py +0 -0
  29. {codebook_lab-1.1.1 → codebook_lab-1.2.0}/tests/test_experiments.py +0 -0
  30. {codebook_lab-1.1.1 → codebook_lab-1.2.0}/tests/test_metrics_summary.py +0 -0
  31. {codebook_lab-1.1.1 → codebook_lab-1.2.0}/tests/test_package_import.py +0 -0
  32. {codebook_lab-1.1.1 → codebook_lab-1.2.0}/tests/test_prompts.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: codebook-lab
3
- Version: 1.1.1
3
+ Version: 1.2.0
4
4
  Summary: An LLM annotation experiment pipeline for computational social science.
5
5
  Author: Lorcan McLaren
6
6
  License-Expression: AGPL-3.0-only
@@ -98,6 +98,7 @@ The package is organized around a small set of importable modules:
98
98
  - `codebook_lab.experiments`: high-level functions for single experiments and multi-run comparisons
99
99
  - `codebook_lab.annotate`: lower-level annotation functions
100
100
  - `codebook_lab.metrics`: evaluation and metrics functions
101
+ - `codebook_lab.human_reliability`: human coder validation, inter-coder reliability (ICR), disagreement detection, and ground-truth helpers
101
102
  - `codebook_lab.prompts`: prompt wrapper registry for built-in and custom prompt styles
102
103
  - `codebook_lab.examples`: helpers for bundled example tasks
103
104
  - `codebook_lab.types`: dataclasses for experiment specifications and result objects
@@ -281,6 +282,47 @@ Add multiple values to any field and the package sweeps them automatically. For
281
282
 
282
283
  If you are still designing a task and do not yet have human-coded labels, you can run annotation with `codebook_lab.run_annotation(...)` on an unlabeled CSV and add `ground-truth.csv` later when you want to score model performance with `codebook_lab.run_metrics(...)`.
283
284
 
285
+ ## Human Reliability And Adjudication
286
+
287
+ When multiple human coders annotate the same items, CodeBook Lab can validate the coder CSVs, calculate inter-coder reliability, find disagreements, and build a consensus `ground-truth.csv`.
288
+
289
+ ```python
290
+ from codebook_lab import build_human_ground_truth, calculate_human_reliability
291
+
292
+ coder_csvs = {
293
+ "coder1": "annotations/coder1.csv",
294
+ "coder2": "annotations/coder2.csv",
295
+ "coder3": "annotations/coder3.csv",
296
+ }
297
+
298
+ reliability = calculate_human_reliability(
299
+ codebook_path="codebook.json",
300
+ coder_csvs=coder_csvs,
301
+ output_dir="outputs/human_reliability",
302
+ )
303
+
304
+ ground_truth = build_human_ground_truth(
305
+ codebook_path="codebook.json",
306
+ coder_csvs=coder_csvs,
307
+ output_dir="outputs/ground_truth",
308
+ )
309
+ ```
310
+
311
+ Each coder CSV must contain a stable item identifier column. The default is `sample_id`; pass `id_column="..."` to use a different column. By default, coder assignments are inferred from the submitted files. To validate expected coverage, pass an optional assignment CSV in either long format (`sample_id,coder_id`) or wide format (`sample_id,ra_1,ra_2,...`).
312
+
313
+ Reliability outputs include `validation_issues.csv`, `pairwise_icr.csv`, `multirater_icr.csv`, `disagreements.csv`, and `summary.md`. Ground-truth outputs include `ground-truth.csv`, `adjudication_queue.csv`, and `validation_issues.csv`.
314
+
315
+ Rows without a strict majority are written to `adjudication_queue.csv`. Open that queue in CodeBook Studio's adjudication mode, fill the unresolved blanks, export the completed queue, then rebuild:
316
+
317
+ ```python
318
+ resolved = build_human_ground_truth(
319
+ codebook_path="codebook.json",
320
+ coder_csvs=coder_csvs,
321
+ adjudications_csv="adjudication_queue.csv",
322
+ output_dir="outputs/ground_truth_resolved",
323
+ )
324
+ ```
325
+
284
326
  ## Advanced Customization
285
327
 
286
328
  If you want to go beyond the default wrappers and hyperparameters, `codebook_lab/annotate.py` and `codebook_lab/prompts.py` are the main extension points.
@@ -53,6 +53,7 @@ The package is organized around a small set of importable modules:
53
53
  - `codebook_lab.experiments`: high-level functions for single experiments and multi-run comparisons
54
54
  - `codebook_lab.annotate`: lower-level annotation functions
55
55
  - `codebook_lab.metrics`: evaluation and metrics functions
56
+ - `codebook_lab.human_reliability`: human coder validation, inter-coder reliability (ICR), disagreement detection, and ground-truth helpers
56
57
  - `codebook_lab.prompts`: prompt wrapper registry for built-in and custom prompt styles
57
58
  - `codebook_lab.examples`: helpers for bundled example tasks
58
59
  - `codebook_lab.types`: dataclasses for experiment specifications and result objects
@@ -236,6 +237,47 @@ Add multiple values to any field and the package sweeps them automatically. For
236
237
 
237
238
  If you are still designing a task and do not yet have human-coded labels, you can run annotation with `codebook_lab.run_annotation(...)` on an unlabeled CSV and add `ground-truth.csv` later when you want to score model performance with `codebook_lab.run_metrics(...)`.
238
239
 
240
+ ## Human Reliability And Adjudication
241
+
242
+ When multiple human coders annotate the same items, CodeBook Lab can validate the coder CSVs, calculate inter-coder reliability, find disagreements, and build a consensus `ground-truth.csv`.
243
+
244
+ ```python
245
+ from codebook_lab import build_human_ground_truth, calculate_human_reliability
246
+
247
+ coder_csvs = {
248
+ "coder1": "annotations/coder1.csv",
249
+ "coder2": "annotations/coder2.csv",
250
+ "coder3": "annotations/coder3.csv",
251
+ }
252
+
253
+ reliability = calculate_human_reliability(
254
+ codebook_path="codebook.json",
255
+ coder_csvs=coder_csvs,
256
+ output_dir="outputs/human_reliability",
257
+ )
258
+
259
+ ground_truth = build_human_ground_truth(
260
+ codebook_path="codebook.json",
261
+ coder_csvs=coder_csvs,
262
+ output_dir="outputs/ground_truth",
263
+ )
264
+ ```
265
+
266
+ Each coder CSV must contain a stable item identifier column. The default is `sample_id`; pass `id_column="..."` to use a different column. By default, coder assignments are inferred from the submitted files. To validate expected coverage, pass an optional assignment CSV in either long format (`sample_id,coder_id`) or wide format (`sample_id,ra_1,ra_2,...`).
267
+
268
+ Reliability outputs include `validation_issues.csv`, `pairwise_icr.csv`, `multirater_icr.csv`, `disagreements.csv`, and `summary.md`. Ground-truth outputs include `ground-truth.csv`, `adjudication_queue.csv`, and `validation_issues.csv`.
269
+
270
+ Rows without a strict majority are written to `adjudication_queue.csv`. Open that queue in CodeBook Studio's adjudication mode, fill the unresolved blanks, export the completed queue, then rebuild:
271
+
272
+ ```python
273
+ resolved = build_human_ground_truth(
274
+ codebook_path="codebook.json",
275
+ coder_csvs=coder_csvs,
276
+ adjudications_csv="adjudication_queue.csv",
277
+ output_dir="outputs/ground_truth_resolved",
278
+ )
279
+ ```
280
+
239
281
  ## Advanced Customization
240
282
 
241
283
  If you want to go beyond the default wrappers and hyperparameters, `codebook_lab/annotate.py` and `codebook_lab/prompts.py` are the main extension points.
@@ -12,11 +12,19 @@ from .prompts import (
12
12
  list_prompt_wrappers,
13
13
  register_prompt_wrapper,
14
14
  )
15
- from .types import AnnotationRunResult, ExperimentRunResult, ExperimentSpec, MetricsRunResult
15
+ from .types import (
16
+ AnnotationRunResult,
17
+ ExperimentRunResult,
18
+ ExperimentSpec,
19
+ HumanGroundTruthResult,
20
+ HumanReliabilityResult,
21
+ MetricsRunResult,
22
+ )
16
23
 
17
24
  if TYPE_CHECKING:
18
25
  from .annotate import run_annotation
19
26
  from .experiments import expand_param_grid, resolve_task_dir, run_experiment, run_experiment_grid
27
+ from .human_reliability import build_human_ground_truth, calculate_human_reliability
20
28
  from .metrics import run_metrics
21
29
 
22
30
  try:
@@ -30,6 +38,8 @@ _LAZY_EXPORTS = {
30
38
  "run_annotation": (".annotate", "run_annotation"),
31
39
  "run_experiment": (".experiments", "run_experiment"),
32
40
  "run_experiment_grid": (".experiments", "run_experiment_grid"),
41
+ "build_human_ground_truth": (".human_reliability", "build_human_ground_truth"),
42
+ "calculate_human_reliability": (".human_reliability", "calculate_human_reliability"),
33
43
  "run_metrics": (".metrics", "run_metrics"),
34
44
  }
35
45
 
@@ -38,7 +48,11 @@ __all__ = [
38
48
  "AnnotationRunResult",
39
49
  "ExperimentRunResult",
40
50
  "ExperimentSpec",
51
+ "HumanGroundTruthResult",
52
+ "HumanReliabilityResult",
41
53
  "MetricsRunResult",
54
+ "build_human_ground_truth",
55
+ "calculate_human_reliability",
42
56
  "copy_example_task",
43
57
  "ensure_ollama_available",
44
58
  "ensure_ollama_model",