codebook-lab 1.1.1__tar.gz → 1.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {codebook_lab-1.1.1 → codebook_lab-1.2.0}/PKG-INFO +43 -1
- {codebook_lab-1.1.1 → codebook_lab-1.2.0}/README.md +42 -0
- {codebook_lab-1.1.1 → codebook_lab-1.2.0}/codebook_lab/__init__.py +15 -1
- codebook_lab-1.2.0/codebook_lab/human_reliability.py +915 -0
- {codebook_lab-1.1.1 → codebook_lab-1.2.0}/codebook_lab/types.py +40 -0
- {codebook_lab-1.1.1 → codebook_lab-1.2.0}/codebook_lab.egg-info/PKG-INFO +43 -1
- {codebook_lab-1.1.1 → codebook_lab-1.2.0}/codebook_lab.egg-info/SOURCES.txt +2 -0
- {codebook_lab-1.1.1 → codebook_lab-1.2.0}/pyproject.toml +1 -1
- codebook_lab-1.2.0/tests/test_human_reliability.py +275 -0
- {codebook_lab-1.1.1 → codebook_lab-1.2.0}/tests/test_types.py +27 -0
- {codebook_lab-1.1.1 → codebook_lab-1.2.0}/LICENSE +0 -0
- {codebook_lab-1.1.1 → codebook_lab-1.2.0}/codebook_lab/annotate.py +0 -0
- {codebook_lab-1.1.1 → codebook_lab-1.2.0}/codebook_lab/conditions.py +0 -0
- {codebook_lab-1.1.1 → codebook_lab-1.2.0}/codebook_lab/examples.py +0 -0
- {codebook_lab-1.1.1 → codebook_lab-1.2.0}/codebook_lab/experiments.py +0 -0
- {codebook_lab-1.1.1 → codebook_lab-1.2.0}/codebook_lab/metrics.py +0 -0
- {codebook_lab-1.1.1 → codebook_lab-1.2.0}/codebook_lab/ollama.py +0 -0
- {codebook_lab-1.1.1 → codebook_lab-1.2.0}/codebook_lab/prompts.py +0 -0
- {codebook_lab-1.1.1 → codebook_lab-1.2.0}/codebook_lab/py.typed +0 -0
- {codebook_lab-1.1.1 → codebook_lab-1.2.0}/codebook_lab/tasks/__init__.py +0 -0
- {codebook_lab-1.1.1 → codebook_lab-1.2.0}/codebook_lab/tasks/policy-sentiment/codebook.json +0 -0
- {codebook_lab-1.1.1 → codebook_lab-1.2.0}/codebook_lab/tasks/policy-sentiment/ground-truth.csv +0 -0
- {codebook_lab-1.1.1 → codebook_lab-1.2.0}/codebook_lab.egg-info/dependency_links.txt +0 -0
- {codebook_lab-1.1.1 → codebook_lab-1.2.0}/codebook_lab.egg-info/requires.txt +0 -0
- {codebook_lab-1.1.1 → codebook_lab-1.2.0}/codebook_lab.egg-info/top_level.txt +0 -0
- {codebook_lab-1.1.1 → codebook_lab-1.2.0}/setup.cfg +0 -0
- {codebook_lab-1.1.1 → codebook_lab-1.2.0}/tests/test_conditions.py +0 -0
- {codebook_lab-1.1.1 → codebook_lab-1.2.0}/tests/test_examples.py +0 -0
- {codebook_lab-1.1.1 → codebook_lab-1.2.0}/tests/test_experiments.py +0 -0
- {codebook_lab-1.1.1 → codebook_lab-1.2.0}/tests/test_metrics_summary.py +0 -0
- {codebook_lab-1.1.1 → codebook_lab-1.2.0}/tests/test_package_import.py +0 -0
- {codebook_lab-1.1.1 → codebook_lab-1.2.0}/tests/test_prompts.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: codebook-lab
|
|
3
|
-
Version: 1.1.1
|
|
3
|
+
Version: 1.2.0
|
|
4
4
|
Summary: An LLM annotation experiment pipeline for computational social science.
|
|
5
5
|
Author: Lorcan McLaren
|
|
6
6
|
License-Expression: AGPL-3.0-only
|
|
@@ -98,6 +98,7 @@ The package is organized around a small set of importable modules:
|
|
|
98
98
|
- `codebook_lab.experiments`: high-level functions for single experiments and multi-run comparisons
|
|
99
99
|
- `codebook_lab.annotate`: lower-level annotation functions
|
|
100
100
|
- `codebook_lab.metrics`: evaluation and metrics functions
|
|
101
|
+
- `codebook_lab.human_reliability`: human coder validation, ICR, disagreement, and ground-truth helpers
|
|
101
102
|
- `codebook_lab.prompts`: prompt wrapper registry for built-in and custom prompt styles
|
|
102
103
|
- `codebook_lab.examples`: helpers for bundled example tasks
|
|
103
104
|
- `codebook_lab.types`: dataclasses for experiment specifications and result objects
|
|
@@ -281,6 +282,47 @@ Add multiple values to any field and the package sweeps them automatically. For
|
|
|
281
282
|
|
|
282
283
|
If you are still designing a task and do not yet have human-coded labels, you can run annotation with `codebook_lab.run_annotation(...)` on an unlabeled CSV and add `ground-truth.csv` later when you want to score model performance with `codebook_lab.run_metrics(...)`.
|
|
283
284
|
|
|
285
|
+
## Human Reliability And Adjudication
|
|
286
|
+
|
|
287
|
+
When multiple human coders annotate the same items, CodeBook Lab can validate the coder CSVs, calculate inter-coder reliability, find disagreements, and build a consensus `ground-truth.csv`.
|
|
288
|
+
|
|
289
|
+
```python
|
|
290
|
+
from codebook_lab import build_human_ground_truth, calculate_human_reliability
|
|
291
|
+
|
|
292
|
+
coder_csvs = {
|
|
293
|
+
"coder1": "annotations/coder1.csv",
|
|
294
|
+
"coder2": "annotations/coder2.csv",
|
|
295
|
+
"coder3": "annotations/coder3.csv",
|
|
296
|
+
}
|
|
297
|
+
|
|
298
|
+
reliability = calculate_human_reliability(
|
|
299
|
+
codebook_path="codebook.json",
|
|
300
|
+
coder_csvs=coder_csvs,
|
|
301
|
+
output_dir="outputs/human_reliability",
|
|
302
|
+
)
|
|
303
|
+
|
|
304
|
+
ground_truth = build_human_ground_truth(
|
|
305
|
+
codebook_path="codebook.json",
|
|
306
|
+
coder_csvs=coder_csvs,
|
|
307
|
+
output_dir="outputs/ground_truth",
|
|
308
|
+
)
|
|
309
|
+
```
|
|
310
|
+
|
|
311
|
+
Each coder CSV must contain a stable item identifier column. The default is `sample_id`; pass `id_column="..."` to use a different column. By default, coder assignments are inferred from the submitted files. To validate expected coverage, pass an optional assignment CSV in either long format (`sample_id,coder_id`) or wide format (`sample_id,ra_1,ra_2,...`).
|
|
312
|
+
|
|
313
|
+
Reliability outputs include `validation_issues.csv`, `pairwise_icr.csv`, `multirater_icr.csv`, `disagreements.csv`, and `summary.md`. Ground-truth outputs include `ground-truth.csv`, `adjudication_queue.csv`, and `validation_issues.csv`.
|
|
314
|
+
|
|
315
|
+
Rows without a strict majority are written to `adjudication_queue.csv`. Open that queue in CodeBook Studio's adjudication mode, fill the unresolved blanks, export the completed queue, then rebuild:
|
|
316
|
+
|
|
317
|
+
```python
|
|
318
|
+
resolved = build_human_ground_truth(
|
|
319
|
+
codebook_path="codebook.json",
|
|
320
|
+
coder_csvs=coder_csvs,
|
|
321
|
+
adjudications_csv="adjudication_queue.csv",
|
|
322
|
+
output_dir="outputs/ground_truth_resolved",
|
|
323
|
+
)
|
|
324
|
+
```
|
|
325
|
+
|
|
284
326
|
## Advanced Customization
|
|
285
327
|
|
|
286
328
|
If you want to go beyond the default wrappers and hyperparameters, `codebook_lab/annotate.py` and `codebook_lab/prompts.py` are the main extension points.
|
|
@@ -53,6 +53,7 @@ The package is organized around a small set of importable modules:
|
|
|
53
53
|
- `codebook_lab.experiments`: high-level functions for single experiments and multi-run comparisons
|
|
54
54
|
- `codebook_lab.annotate`: lower-level annotation functions
|
|
55
55
|
- `codebook_lab.metrics`: evaluation and metrics functions
|
|
56
|
+
- `codebook_lab.human_reliability`: human coder validation, ICR, disagreement, and ground-truth helpers
|
|
56
57
|
- `codebook_lab.prompts`: prompt wrapper registry for built-in and custom prompt styles
|
|
57
58
|
- `codebook_lab.examples`: helpers for bundled example tasks
|
|
58
59
|
- `codebook_lab.types`: dataclasses for experiment specifications and result objects
|
|
@@ -236,6 +237,47 @@ Add multiple values to any field and the package sweeps them automatically. For
|
|
|
236
237
|
|
|
237
238
|
If you are still designing a task and do not yet have human-coded labels, you can run annotation with `codebook_lab.run_annotation(...)` on an unlabeled CSV and add `ground-truth.csv` later when you want to score model performance with `codebook_lab.run_metrics(...)`.
|
|
238
239
|
|
|
240
|
+
## Human Reliability And Adjudication
|
|
241
|
+
|
|
242
|
+
When multiple human coders annotate the same items, CodeBook Lab can validate the coder CSVs, calculate inter-coder reliability, find disagreements, and build a consensus `ground-truth.csv`.
|
|
243
|
+
|
|
244
|
+
```python
|
|
245
|
+
from codebook_lab import build_human_ground_truth, calculate_human_reliability
|
|
246
|
+
|
|
247
|
+
coder_csvs = {
|
|
248
|
+
"coder1": "annotations/coder1.csv",
|
|
249
|
+
"coder2": "annotations/coder2.csv",
|
|
250
|
+
"coder3": "annotations/coder3.csv",
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
reliability = calculate_human_reliability(
|
|
254
|
+
codebook_path="codebook.json",
|
|
255
|
+
coder_csvs=coder_csvs,
|
|
256
|
+
output_dir="outputs/human_reliability",
|
|
257
|
+
)
|
|
258
|
+
|
|
259
|
+
ground_truth = build_human_ground_truth(
|
|
260
|
+
codebook_path="codebook.json",
|
|
261
|
+
coder_csvs=coder_csvs,
|
|
262
|
+
output_dir="outputs/ground_truth",
|
|
263
|
+
)
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
Each coder CSV must contain a stable item identifier column. The default is `sample_id`; pass `id_column="..."` to use a different column. By default, coder assignments are inferred from the submitted files. To validate expected coverage, pass an optional assignment CSV in either long format (`sample_id,coder_id`) or wide format (`sample_id,ra_1,ra_2,...`).
|
|
267
|
+
|
|
268
|
+
Reliability outputs include `validation_issues.csv`, `pairwise_icr.csv`, `multirater_icr.csv`, `disagreements.csv`, and `summary.md`. Ground-truth outputs include `ground-truth.csv`, `adjudication_queue.csv`, and `validation_issues.csv`.
|
|
269
|
+
|
|
270
|
+
Rows without a strict majority are written to `adjudication_queue.csv`. Open that queue in CodeBook Studio's adjudication mode, fill the unresolved blanks, export the completed queue, then rebuild:
|
|
271
|
+
|
|
272
|
+
```python
|
|
273
|
+
resolved = build_human_ground_truth(
|
|
274
|
+
codebook_path="codebook.json",
|
|
275
|
+
coder_csvs=coder_csvs,
|
|
276
|
+
adjudications_csv="adjudication_queue.csv",
|
|
277
|
+
output_dir="outputs/ground_truth_resolved",
|
|
278
|
+
)
|
|
279
|
+
```
|
|
280
|
+
|
|
239
281
|
## Advanced Customization
|
|
240
282
|
|
|
241
283
|
If you want to go beyond the default wrappers and hyperparameters, `codebook_lab/annotate.py` and `codebook_lab/prompts.py` are the main extension points.
|
|
@@ -12,11 +12,19 @@ from .prompts import (
|
|
|
12
12
|
list_prompt_wrappers,
|
|
13
13
|
register_prompt_wrapper,
|
|
14
14
|
)
|
|
15
|
-
from .types import
|
|
15
|
+
from .types import (
|
|
16
|
+
AnnotationRunResult,
|
|
17
|
+
ExperimentRunResult,
|
|
18
|
+
ExperimentSpec,
|
|
19
|
+
HumanGroundTruthResult,
|
|
20
|
+
HumanReliabilityResult,
|
|
21
|
+
MetricsRunResult,
|
|
22
|
+
)
|
|
16
23
|
|
|
17
24
|
if TYPE_CHECKING:
|
|
18
25
|
from .annotate import run_annotation
|
|
19
26
|
from .experiments import expand_param_grid, resolve_task_dir, run_experiment, run_experiment_grid
|
|
27
|
+
from .human_reliability import build_human_ground_truth, calculate_human_reliability
|
|
20
28
|
from .metrics import run_metrics
|
|
21
29
|
|
|
22
30
|
try:
|
|
@@ -30,6 +38,8 @@ _LAZY_EXPORTS = {
|
|
|
30
38
|
"run_annotation": (".annotate", "run_annotation"),
|
|
31
39
|
"run_experiment": (".experiments", "run_experiment"),
|
|
32
40
|
"run_experiment_grid": (".experiments", "run_experiment_grid"),
|
|
41
|
+
"build_human_ground_truth": (".human_reliability", "build_human_ground_truth"),
|
|
42
|
+
"calculate_human_reliability": (".human_reliability", "calculate_human_reliability"),
|
|
33
43
|
"run_metrics": (".metrics", "run_metrics"),
|
|
34
44
|
}
|
|
35
45
|
|
|
@@ -38,7 +48,11 @@ __all__ = [
|
|
|
38
48
|
"AnnotationRunResult",
|
|
39
49
|
"ExperimentRunResult",
|
|
40
50
|
"ExperimentSpec",
|
|
51
|
+
"HumanGroundTruthResult",
|
|
52
|
+
"HumanReliabilityResult",
|
|
41
53
|
"MetricsRunResult",
|
|
54
|
+
"build_human_ground_truth",
|
|
55
|
+
"calculate_human_reliability",
|
|
42
56
|
"copy_example_task",
|
|
43
57
|
"ensure_ollama_available",
|
|
44
58
|
"ensure_ollama_model",
|