codebook-lab 1.0.0__tar.gz → 1.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {codebook_lab-1.0.0 → codebook_lab-1.1.1}/PKG-INFO +9 -5
- {codebook_lab-1.0.0 → codebook_lab-1.1.1}/README.md +8 -4
- {codebook_lab-1.0.0 → codebook_lab-1.1.1}/codebook_lab/annotate.py +143 -105
- codebook_lab-1.1.1/codebook_lab/conditions.py +154 -0
- {codebook_lab-1.0.0 → codebook_lab-1.1.1}/codebook_lab/metrics.py +90 -30
- {codebook_lab-1.0.0 → codebook_lab-1.1.1}/codebook_lab.egg-info/PKG-INFO +9 -5
- {codebook_lab-1.0.0 → codebook_lab-1.1.1}/codebook_lab.egg-info/SOURCES.txt +2 -4
- {codebook_lab-1.0.0 → codebook_lab-1.1.1}/pyproject.toml +1 -1
- codebook_lab-1.1.1/tests/test_conditions.py +144 -0
- codebook_lab-1.0.0/scripts/multi_run_example.py +0 -41
- codebook_lab-1.0.0/scripts/single_run_example.py +0 -48
- codebook_lab-1.0.0/tests/__init__.py +0 -0
- codebook_lab-1.0.0/tests/conftest.py +0 -13
- {codebook_lab-1.0.0 → codebook_lab-1.1.1}/LICENSE +0 -0
- {codebook_lab-1.0.0 → codebook_lab-1.1.1}/codebook_lab/__init__.py +0 -0
- {codebook_lab-1.0.0 → codebook_lab-1.1.1}/codebook_lab/examples.py +0 -0
- {codebook_lab-1.0.0 → codebook_lab-1.1.1}/codebook_lab/experiments.py +0 -0
- {codebook_lab-1.0.0 → codebook_lab-1.1.1}/codebook_lab/ollama.py +0 -0
- {codebook_lab-1.0.0 → codebook_lab-1.1.1}/codebook_lab/prompts.py +0 -0
- {codebook_lab-1.0.0 → codebook_lab-1.1.1}/codebook_lab/py.typed +0 -0
- {codebook_lab-1.0.0 → codebook_lab-1.1.1}/codebook_lab/tasks/__init__.py +0 -0
- {codebook_lab-1.0.0 → codebook_lab-1.1.1}/codebook_lab/tasks/policy-sentiment/codebook.json +0 -0
- {codebook_lab-1.0.0 → codebook_lab-1.1.1}/codebook_lab/tasks/policy-sentiment/ground-truth.csv +0 -0
- {codebook_lab-1.0.0 → codebook_lab-1.1.1}/codebook_lab/types.py +0 -0
- {codebook_lab-1.0.0 → codebook_lab-1.1.1}/codebook_lab.egg-info/dependency_links.txt +0 -0
- {codebook_lab-1.0.0 → codebook_lab-1.1.1}/codebook_lab.egg-info/requires.txt +0 -0
- {codebook_lab-1.0.0 → codebook_lab-1.1.1}/codebook_lab.egg-info/top_level.txt +0 -0
- {codebook_lab-1.0.0 → codebook_lab-1.1.1}/setup.cfg +0 -0
- {codebook_lab-1.0.0 → codebook_lab-1.1.1}/tests/test_examples.py +0 -0
- {codebook_lab-1.0.0 → codebook_lab-1.1.1}/tests/test_experiments.py +0 -0
- {codebook_lab-1.0.0 → codebook_lab-1.1.1}/tests/test_metrics_summary.py +0 -0
- {codebook_lab-1.0.0 → codebook_lab-1.1.1}/tests/test_package_import.py +0 -0
- {codebook_lab-1.0.0 → codebook_lab-1.1.1}/tests/test_prompts.py +0 -0
- {codebook_lab-1.0.0 → codebook_lab-1.1.1}/tests/test_types.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: codebook-lab
|
|
3
|
-
Version: 1.0.0
|
|
3
|
+
Version: 1.1.1
|
|
4
4
|
Summary: An LLM annotation experiment pipeline for computational social science.
|
|
5
5
|
Author: Lorcan McLaren
|
|
6
6
|
License-Expression: AGPL-3.0-only
|
|
@@ -45,7 +45,7 @@ Dynamic: license-file
|
|
|
45
45
|
|
|
46
46
|
# CodeBook Lab
|
|
47
47
|
|
|
48
|
-
[](https://doi.org/10.5281/zenodo.19185921)
|
|
48
|
+
[](https://doi.org/10.5281/zenodo.19185921) [](https://pypi.org/project/codebook-lab/) [](https://pypi.org/project/codebook-lab/) [](https://pypi.org/project/codebook-lab/)
|
|
49
49
|
|
|
50
50
|
CodeBook Lab is an LLM annotation experiment pipeline for computational social science. It takes a codebook and labelled dataset from [CodeBook Studio](https://codebook.streamlit.app/) ([source](https://github.com/LorcanMcLaren/codebook-studio)) and runs structured experiments across the dimensions that matter for text-as-data research: model choice, model size, prompt style, zero-shot versus few-shot learning, and sampling hyperparameters — all benchmarked against human labels.
|
|
51
51
|
|
|
@@ -297,7 +297,7 @@ This project is licensed under the [GNU Affero General Public License v3.0](http
|
|
|
297
297
|
If you use CodeBook Lab in research, please cite both:
|
|
298
298
|
|
|
299
299
|
- this software package
|
|
300
|
-
- the associated preprint
|
|
300
|
+
- the associated arXiv preprint
|
|
301
301
|
|
|
302
302
|
Citation metadata is also available in the project's [`CITATION.cff`](https://github.com/LorcanMcLaren/codebook-lab/blob/main/CITATION.cff).
|
|
303
303
|
|
|
@@ -324,7 +324,7 @@ BibTeX:
|
|
|
324
324
|
|
|
325
325
|
APSR style:
|
|
326
326
|
|
|
327
|
-
McLaren, Lorcan, James P. Cross, Zuzanna Krakowska, Robin Rauner, and Martijn Schoonvelde. 2026. *Magic Words or Methodical Work? Challenging Conventional Wisdom in LLM-Based Political Text Annotation*.
|
|
327
|
+
McLaren, Lorcan, James P. Cross, Zuzanna Krakowska, Robin Rauner, and Martijn Schoonvelde. 2026. *Magic Words or Methodical Work? Challenging Conventional Wisdom in LLM-Based Political Text Annotation*. arXiv preprint arXiv:2603.26898. [https://arxiv.org/abs/2603.26898](https://arxiv.org/abs/2603.26898).
|
|
328
328
|
|
|
329
329
|
BibTeX:
|
|
330
330
|
|
|
@@ -333,6 +333,10 @@ BibTeX:
|
|
|
333
333
|
author = {McLaren, Lorcan and Cross, James P. and Krakowska, Zuzanna and Rauner, Robin and Schoonvelde, Martijn},
|
|
334
334
|
title = {Magic Words or Methodical Work? Challenging Conventional Wisdom in LLM-Based Political Text Annotation},
|
|
335
335
|
year = {2026},
|
|
336
|
-
|
|
336
|
+
eprint = {2603.26898},
|
|
337
|
+
archivePrefix = {arXiv},
|
|
338
|
+
primaryClass = {cs.CL},
|
|
339
|
+
doi = {10.48550/arXiv.2603.26898},
|
|
340
|
+
url = {https://arxiv.org/abs/2603.26898}
|
|
337
341
|
}
|
|
338
342
|
```
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# CodeBook Lab
|
|
2
2
|
|
|
3
|
-
[](https://doi.org/10.5281/zenodo.19185921)
|
|
3
|
+
[](https://doi.org/10.5281/zenodo.19185921) [](https://pypi.org/project/codebook-lab/) [](https://pypi.org/project/codebook-lab/) [](https://pypi.org/project/codebook-lab/)
|
|
4
4
|
|
|
5
5
|
CodeBook Lab is an LLM annotation experiment pipeline for computational social science. It takes a codebook and labelled dataset from [CodeBook Studio](https://codebook.streamlit.app/) ([source](https://github.com/LorcanMcLaren/codebook-studio)) and runs structured experiments across the dimensions that matter for text-as-data research: model choice, model size, prompt style, zero-shot versus few-shot learning, and sampling hyperparameters — all benchmarked against human labels.
|
|
6
6
|
|
|
@@ -252,7 +252,7 @@ This project is licensed under the [GNU Affero General Public License v3.0](http
|
|
|
252
252
|
If you use CodeBook Lab in research, please cite both:
|
|
253
253
|
|
|
254
254
|
- this software package
|
|
255
|
-
- the associated preprint
|
|
255
|
+
- the associated arXiv preprint
|
|
256
256
|
|
|
257
257
|
Citation metadata is also available in the project's [`CITATION.cff`](https://github.com/LorcanMcLaren/codebook-lab/blob/main/CITATION.cff).
|
|
258
258
|
|
|
@@ -279,7 +279,7 @@ BibTeX:
|
|
|
279
279
|
|
|
280
280
|
APSR style:
|
|
281
281
|
|
|
282
|
-
McLaren, Lorcan, James P. Cross, Zuzanna Krakowska, Robin Rauner, and Martijn Schoonvelde. 2026. *Magic Words or Methodical Work? Challenging Conventional Wisdom in LLM-Based Political Text Annotation*.
|
|
282
|
+
McLaren, Lorcan, James P. Cross, Zuzanna Krakowska, Robin Rauner, and Martijn Schoonvelde. 2026. *Magic Words or Methodical Work? Challenging Conventional Wisdom in LLM-Based Political Text Annotation*. arXiv preprint arXiv:2603.26898. [https://arxiv.org/abs/2603.26898](https://arxiv.org/abs/2603.26898).
|
|
283
283
|
|
|
284
284
|
BibTeX:
|
|
285
285
|
|
|
@@ -288,6 +288,10 @@ BibTeX:
|
|
|
288
288
|
author = {McLaren, Lorcan and Cross, James P. and Krakowska, Zuzanna and Rauner, Robin and Schoonvelde, Martijn},
|
|
289
289
|
title = {Magic Words or Methodical Work? Challenging Conventional Wisdom in LLM-Based Political Text Annotation},
|
|
290
290
|
year = {2026},
|
|
291
|
-
|
|
291
|
+
eprint = {2603.26898},
|
|
292
|
+
archivePrefix = {arXiv},
|
|
293
|
+
primaryClass = {cs.CL},
|
|
294
|
+
doi = {10.48550/arXiv.2603.26898},
|
|
295
|
+
url = {https://arxiv.org/abs/2603.26898}
|
|
292
296
|
}
|
|
293
297
|
```
|
|
@@ -8,9 +8,24 @@ import pandas as pd
|
|
|
8
8
|
import regex
|
|
9
9
|
from codecarbon import OfflineEmissionsTracker
|
|
10
10
|
from langchain_core.prompts import ChatPromptTemplate
|
|
11
|
-
from langchain_ollama.
|
|
12
|
-
|
|
11
|
+
from langchain_ollama.chat_models import ChatOllama
|
|
12
|
+
from pydantic import BaseModel
|
|
13
|
+
|
|
14
|
+
from .conditions import (
|
|
15
|
+
get_annotation_column_name,
|
|
16
|
+
get_annotation_entries,
|
|
17
|
+
is_annotation_applicable,
|
|
18
|
+
normalize_annotation_response_value,
|
|
19
|
+
)
|
|
13
20
|
from .ollama import ensure_ollama_available
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class AnnotationResponse(BaseModel):
|
|
24
|
+
"""Schema used by ChatOllama structured output to guarantee valid JSON."""
|
|
25
|
+
response: str
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
_PROMPT_TEMPLATE = ChatPromptTemplate.from_template("""{question}""")
|
|
14
29
|
from .prompts import PromptContext, get_prompt_type_name, render_prompt
|
|
15
30
|
from .types import AnnotationRunResult
|
|
16
31
|
|
|
@@ -55,17 +70,20 @@ class _AnnotationProgressBar:
|
|
|
55
70
|
sys.stderr.write("\n")
|
|
56
71
|
sys.stderr.flush()
|
|
57
72
|
|
|
73
|
+
def skip(self, count: int = 1) -> None:
|
|
74
|
+
"""Reduce the remaining work estimate when prompts are skipped."""
|
|
75
|
+
if count <= 0:
|
|
76
|
+
return
|
|
77
|
+
self.total_steps = max(self.completed_steps, self.total_steps - count)
|
|
78
|
+
|
|
58
79
|
|
|
59
80
|
def _count_annotations(codebook, process_textbox=False):
|
|
60
|
-
"""Count
|
|
81
|
+
"""Count the maximum number of annotation prompts that could be issued for one row."""
|
|
61
82
|
count = 0
|
|
62
|
-
for
|
|
63
|
-
if
|
|
83
|
+
for _, _, _, annotation in get_annotation_entries(codebook):
|
|
84
|
+
if annotation.get("type") == "textbox" and not process_textbox:
|
|
64
85
|
continue
|
|
65
|
-
|
|
66
|
-
if annotation.get("type") == "textbox" and not process_textbox:
|
|
67
|
-
continue
|
|
68
|
-
count += 1
|
|
86
|
+
count += 1
|
|
69
87
|
return count
|
|
70
88
|
|
|
71
89
|
def load_codebook(codebook_path):
|
|
@@ -90,19 +108,10 @@ def get_annotation_column_names(codebook):
|
|
|
90
108
|
Returns:
|
|
91
109
|
List of column names in ``<section_name>_<annotation_name>`` format.
|
|
92
110
|
"""
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
continue
|
|
98
|
-
|
|
99
|
-
section_name = section["section_name"]
|
|
100
|
-
annotations = section.get("annotations", {})
|
|
101
|
-
|
|
102
|
-
for annotation in annotations.values():
|
|
103
|
-
annotation_columns.append(f"{section_name}_{annotation['name']}")
|
|
104
|
-
|
|
105
|
-
return annotation_columns
|
|
111
|
+
return [
|
|
112
|
+
get_annotation_column_name(section_content, annotation)
|
|
113
|
+
for _, section_content, _, annotation in get_annotation_entries(codebook)
|
|
114
|
+
]
|
|
106
115
|
|
|
107
116
|
def load_input_dataframe(csv_path, codebook):
|
|
108
117
|
"""Load the input CSV and remove any existing annotation label columns.
|
|
@@ -161,24 +170,23 @@ def setup_model(model_name, temperature=None, top_p=None):
|
|
|
161
170
|
top_p: Optional nucleus-sampling value.
|
|
162
171
|
|
|
163
172
|
Returns:
|
|
164
|
-
|
|
173
|
+
``ChatOllama`` instance. The caller builds structured-output chains
|
|
174
|
+
from this model as needed.
|
|
165
175
|
"""
|
|
166
176
|
model_kwargs = {}
|
|
167
177
|
if temperature is not None:
|
|
168
178
|
model_kwargs['temperature'] = float(temperature)
|
|
169
179
|
if top_p is not None:
|
|
170
180
|
model_kwargs['top_p'] = float(top_p)
|
|
171
|
-
|
|
172
|
-
llm =
|
|
173
|
-
|
|
174
|
-
chain = prompt_template | llm
|
|
175
|
-
return chain
|
|
181
|
+
|
|
182
|
+
llm = ChatOllama(model=model_name, **model_kwargs)
|
|
183
|
+
return llm
|
|
176
184
|
|
|
177
185
|
def generate_response(chain, prompt, char_counts, timing_data, row_num=None, annotation_name=None):
|
|
178
186
|
"""Run one prompt through the model and update timing/count statistics.
|
|
179
187
|
|
|
180
188
|
Args:
|
|
181
|
-
chain:
|
|
189
|
+
chain: ``ChatOllama`` instance returned by :func:`setup_model`.
|
|
182
190
|
prompt: Fully rendered prompt string.
|
|
183
191
|
char_counts: Mutable dict with ``input_chars`` and ``output_chars`` integers.
|
|
184
192
|
timing_data: Mutable dict with inference timing counters.
|
|
@@ -191,28 +199,42 @@ def generate_response(chain, prompt, char_counts, timing_data, row_num=None, ann
|
|
|
191
199
|
try:
|
|
192
200
|
# Track input characters
|
|
193
201
|
char_counts['input_chars'] += len(prompt)
|
|
194
|
-
|
|
202
|
+
|
|
195
203
|
if row_num and annotation_name:
|
|
196
204
|
logger.info("[Row %s] Sending request for: %s...", row_num, annotation_name)
|
|
197
205
|
|
|
206
|
+
structured_chain = (
|
|
207
|
+
_PROMPT_TEMPLATE
|
|
208
|
+
| chain.with_structured_output(
|
|
209
|
+
AnnotationResponse, method="json_schema", include_raw=True
|
|
210
|
+
)
|
|
211
|
+
)
|
|
212
|
+
|
|
198
213
|
start_time = time.time()
|
|
199
|
-
|
|
214
|
+
result = structured_chain.invoke({"question": prompt})
|
|
200
215
|
end_time = time.time()
|
|
201
216
|
inference_time = end_time - start_time
|
|
202
217
|
timing_data['total_inference_time'] += inference_time
|
|
203
218
|
timing_data['inference_count'] += 1
|
|
204
219
|
|
|
220
|
+
if result.get("parsed") is not None:
|
|
221
|
+
response = result["parsed"].model_dump_json()
|
|
222
|
+
else:
|
|
223
|
+
raw = result.get("raw")
|
|
224
|
+
response = raw.content if raw else ""
|
|
225
|
+
logger.debug("Structured parsing failed, using raw response for %s", annotation_name)
|
|
226
|
+
|
|
205
227
|
char_counts['output_chars'] += len(response)
|
|
206
228
|
|
|
207
229
|
if row_num and annotation_name:
|
|
208
230
|
logger.info("[Row %s] %s done (%.1fs)", row_num, annotation_name, inference_time)
|
|
209
|
-
|
|
231
|
+
|
|
210
232
|
return response
|
|
211
233
|
except Exception as e:
|
|
212
234
|
logger.warning("Error generating response: %s", e)
|
|
213
235
|
return ""
|
|
214
236
|
|
|
215
|
-
def extract_json_response(response, annotation_type, min_value=None, max_value=None):
|
|
237
|
+
def extract_json_response(response, annotation_type, min_value=None, max_value=None, options=None):
|
|
216
238
|
"""
|
|
217
239
|
Extract and validate JSON response based on annotation type
|
|
218
240
|
|
|
@@ -221,12 +243,22 @@ def extract_json_response(response, annotation_type, min_value=None, max_value=N
|
|
|
221
243
|
annotation_type: Annotation type string such as ``"dropdown"`` or ``"likert"``.
|
|
222
244
|
min_value: Optional integer lower bound for Likert annotations.
|
|
223
245
|
max_value: Optional integer upper bound for Likert annotations.
|
|
246
|
+
options: Optional dropdown option list used to normalize categorical labels.
|
|
224
247
|
|
|
225
248
|
Returns:
|
|
226
249
|
Parsed response value coerced into the expected annotation format.
|
|
227
250
|
"""
|
|
228
251
|
pattern = regex.compile(r'\{(?:[^{}]|(?R))*\}')
|
|
229
252
|
json_strings = pattern.findall(response)
|
|
253
|
+
|
|
254
|
+
def normalize_dropdown_value(value):
|
|
255
|
+
return normalize_annotation_response_value(
|
|
256
|
+
{
|
|
257
|
+
"type": "dropdown",
|
|
258
|
+
"options": options or [],
|
|
259
|
+
},
|
|
260
|
+
value,
|
|
261
|
+
)
|
|
230
262
|
|
|
231
263
|
for json_string in json_strings:
|
|
232
264
|
try:
|
|
@@ -235,7 +267,7 @@ def extract_json_response(response, annotation_type, min_value=None, max_value=N
|
|
|
235
267
|
|
|
236
268
|
# Validate and format based on annotation type
|
|
237
269
|
if annotation_type == "dropdown":
|
|
238
|
-
return response_value
|
|
270
|
+
return normalize_dropdown_value(response_value)
|
|
239
271
|
elif annotation_type == "checkbox":
|
|
240
272
|
# Convert to 1 or 0
|
|
241
273
|
if isinstance(response_value, bool):
|
|
@@ -251,7 +283,7 @@ def extract_json_response(response, annotation_type, min_value=None, max_value=N
|
|
|
251
283
|
return 0
|
|
252
284
|
elif annotation_type == "textbox":
|
|
253
285
|
# Return as string
|
|
254
|
-
return str(response_value)
|
|
286
|
+
return str(response_value).strip()
|
|
255
287
|
elif annotation_type == "likert":
|
|
256
288
|
# Validate is within range and convert to int
|
|
257
289
|
try:
|
|
@@ -266,12 +298,16 @@ def extract_json_response(response, annotation_type, min_value=None, max_value=N
|
|
|
266
298
|
return response_value
|
|
267
299
|
|
|
268
300
|
# Fallback
|
|
269
|
-
return response_value
|
|
301
|
+
return str(response_value).strip() if isinstance(response_value, str) else response_value
|
|
270
302
|
except json.JSONDecodeError as e:
|
|
271
303
|
logger.debug("Error parsing JSON: %s", e)
|
|
272
304
|
|
|
273
305
|
# If no valid JSON, try to extract direct response
|
|
274
|
-
|
|
306
|
+
stripped_response = response.strip()
|
|
307
|
+
|
|
308
|
+
if annotation_type == "dropdown":
|
|
309
|
+
return normalize_dropdown_value(stripped_response)
|
|
310
|
+
elif annotation_type == "checkbox":
|
|
275
311
|
if "yes" in response.lower() or "true" in response.lower():
|
|
276
312
|
return 1
|
|
277
313
|
elif "no" in response.lower() or "false" in response.lower():
|
|
@@ -288,8 +324,10 @@ def extract_json_response(response, annotation_type, min_value=None, max_value=N
|
|
|
288
324
|
except ValueError:
|
|
289
325
|
continue
|
|
290
326
|
return (min_value + max_value) // 2 # Default to middle value
|
|
327
|
+
elif annotation_type == "textbox":
|
|
328
|
+
return stripped_response
|
|
291
329
|
|
|
292
|
-
return
|
|
330
|
+
return None
|
|
293
331
|
|
|
294
332
|
def format_prompt(section_name, section_instruction, name, tooltip, annotation_type,
|
|
295
333
|
options=None, min_value=None, max_value=None, example=None,
|
|
@@ -466,73 +504,73 @@ def classify_text(chain, text, codebook, prompt_type="standard", use_examples=Fa
|
|
|
466
504
|
if timing_data is None:
|
|
467
505
|
timing_data = {'total_inference_time': 0, 'inference_count': 0}
|
|
468
506
|
|
|
469
|
-
for
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
507
|
+
for section_key, section, annotation_key, annotation in get_annotation_entries(codebook):
|
|
508
|
+
section_name = section['section_name']
|
|
509
|
+
section_instruction = section.get('section_instruction', '')
|
|
510
|
+
name = annotation['name']
|
|
511
|
+
annotation_type = annotation['type']
|
|
512
|
+
annotation_full_name = f"{section_name}_{name}"
|
|
513
|
+
column_name = get_annotation_column_name(section, annotation)
|
|
514
|
+
|
|
515
|
+
if annotation_type == "textbox" and not process_textbox:
|
|
516
|
+
if progress_bar is not None:
|
|
517
|
+
progress_bar.skip()
|
|
518
|
+
continue
|
|
519
|
+
|
|
520
|
+
if not is_annotation_applicable(codebook, section_key, annotation_key, responses):
|
|
521
|
+
responses[column_name] = None
|
|
522
|
+
if progress_bar is not None:
|
|
523
|
+
progress_bar.skip()
|
|
524
|
+
continue
|
|
525
|
+
|
|
526
|
+
tooltip = annotation.get('tooltip', '')
|
|
527
|
+
example = annotation.get('example', '')
|
|
528
|
+
|
|
529
|
+
options = None
|
|
530
|
+
min_value = None
|
|
531
|
+
max_value = None
|
|
532
|
+
|
|
533
|
+
if annotation_type == "dropdown":
|
|
534
|
+
options = annotation.get('options', [])
|
|
535
|
+
elif annotation_type == "likert":
|
|
536
|
+
min_value = annotation.get('min_value')
|
|
537
|
+
max_value = annotation.get('max_value')
|
|
538
|
+
|
|
539
|
+
prompt = format_prompt(
|
|
540
|
+
section_name,
|
|
541
|
+
section_instruction,
|
|
542
|
+
name,
|
|
543
|
+
tooltip,
|
|
544
|
+
annotation_type,
|
|
545
|
+
options,
|
|
546
|
+
min_value,
|
|
547
|
+
max_value,
|
|
548
|
+
example,
|
|
549
|
+
text,
|
|
550
|
+
prompt_type=prompt_type,
|
|
551
|
+
use_examples=use_examples
|
|
552
|
+
)
|
|
553
|
+
|
|
554
|
+
response_text = generate_response(
|
|
555
|
+
chain,
|
|
556
|
+
prompt,
|
|
557
|
+
char_counts,
|
|
558
|
+
timing_data,
|
|
559
|
+
row_num=row_num,
|
|
560
|
+
annotation_name=annotation_full_name
|
|
561
|
+
)
|
|
562
|
+
response_value = extract_json_response(
|
|
563
|
+
response_text,
|
|
564
|
+
annotation_type,
|
|
565
|
+
min_value,
|
|
566
|
+
max_value,
|
|
567
|
+
options=options,
|
|
568
|
+
)
|
|
569
|
+
|
|
570
|
+
responses[column_name] = response_value if response_value is not None else None
|
|
571
|
+
|
|
572
|
+
if progress_bar is not None and row_num is not None and total_rows is not None:
|
|
573
|
+
progress_bar.update(row_num, total_rows, annotation_full_name)
|
|
536
574
|
|
|
537
575
|
return responses, char_counts, timing_data
|
|
538
576
|
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def get_sorted_annotation_keys(section_content: dict[str, Any]) -> list[str]:
|
|
9
|
+
"""Return annotation keys in the same stable order used by CodeBook Studio."""
|
|
10
|
+
|
|
11
|
+
def sort_key(annotation_key: str) -> tuple[int, int | str]:
|
|
12
|
+
suffix = annotation_key.split("_")[-1]
|
|
13
|
+
return (0, int(suffix)) if suffix.isdigit() else (1, annotation_key)
|
|
14
|
+
|
|
15
|
+
return sorted(section_content.get("annotations", {}).keys(), key=sort_key)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def get_annotation_column_name(section_content: dict[str, Any], annotation: dict[str, Any]) -> str:
|
|
19
|
+
"""Return the canonical CSV column name for an annotation."""
|
|
20
|
+
return f"{section_content['section_name']}_{annotation['name']}"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def get_annotation_entries(codebook: dict[str, Any]) -> list[tuple[str, dict[str, Any], str, dict[str, Any]]]:
|
|
24
|
+
"""Return all section/annotation entries in display order."""
|
|
25
|
+
entries: list[tuple[str, dict[str, Any], str, dict[str, Any]]] = []
|
|
26
|
+
|
|
27
|
+
for section_key, section_content in codebook.items():
|
|
28
|
+
if not section_key.startswith("section_"):
|
|
29
|
+
continue
|
|
30
|
+
for annotation_key in get_sorted_annotation_keys(section_content):
|
|
31
|
+
annotation = section_content.get("annotations", {}).get(annotation_key, {})
|
|
32
|
+
entries.append((section_key, section_content, annotation_key, annotation))
|
|
33
|
+
|
|
34
|
+
return entries
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def get_annotation_lookup(
|
|
38
|
+
codebook: dict[str, Any],
|
|
39
|
+
) -> dict[tuple[str, str], tuple[dict[str, Any], dict[str, Any]]]:
|
|
40
|
+
"""Build a lookup from stable section/annotation keys to annotation metadata."""
|
|
41
|
+
return {
|
|
42
|
+
(section_key, annotation_key): (section_content, annotation)
|
|
43
|
+
for section_key, section_content, annotation_key, annotation in get_annotation_entries(codebook)
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def get_annotation_condition(annotation: dict[str, Any]) -> dict[str, Any] | None:
|
|
48
|
+
"""Return a normalized condition block when one is present."""
|
|
49
|
+
condition = annotation.get("condition")
|
|
50
|
+
if not isinstance(condition, dict):
|
|
51
|
+
return None
|
|
52
|
+
|
|
53
|
+
section_key = condition.get("section_key")
|
|
54
|
+
annotation_key = condition.get("annotation_key")
|
|
55
|
+
if not section_key or not annotation_key:
|
|
56
|
+
return None
|
|
57
|
+
|
|
58
|
+
return {
|
|
59
|
+
"section_key": section_key,
|
|
60
|
+
"annotation_key": annotation_key,
|
|
61
|
+
"value": condition.get("value"),
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def normalize_annotation_response_value(annotation: dict[str, Any], value: Any) -> Any:
|
|
66
|
+
"""Coerce stored responses into stable comparable values."""
|
|
67
|
+
if pd.isna(value):
|
|
68
|
+
return None
|
|
69
|
+
|
|
70
|
+
annotation_type = annotation.get("type", "dropdown")
|
|
71
|
+
if annotation_type == "dropdown":
|
|
72
|
+
normalized = str(value).strip().strip("`").strip()
|
|
73
|
+
if normalized == "":
|
|
74
|
+
return None
|
|
75
|
+
|
|
76
|
+
options = annotation.get("options") or []
|
|
77
|
+
if not options:
|
|
78
|
+
return normalized
|
|
79
|
+
|
|
80
|
+
option_lookup = {str(option).strip().casefold(): option for option in options}
|
|
81
|
+
return option_lookup.get(normalized.casefold())
|
|
82
|
+
|
|
83
|
+
if annotation_type == "checkbox":
|
|
84
|
+
lowered = str(value).strip().lower()
|
|
85
|
+
if lowered in {"1", "true", "yes"}:
|
|
86
|
+
return 1
|
|
87
|
+
if lowered in {"0", "false", "no"}:
|
|
88
|
+
return 0
|
|
89
|
+
return value
|
|
90
|
+
|
|
91
|
+
if annotation_type == "likert":
|
|
92
|
+
try:
|
|
93
|
+
return int(value)
|
|
94
|
+
except (TypeError, ValueError):
|
|
95
|
+
return value
|
|
96
|
+
|
|
97
|
+
if annotation_type == "textbox":
|
|
98
|
+
return str(value).strip()
|
|
99
|
+
|
|
100
|
+
return str(value).strip()
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def is_annotation_applicable(
|
|
104
|
+
codebook: dict[str, Any],
|
|
105
|
+
section_key: str,
|
|
106
|
+
annotation_key: str,
|
|
107
|
+
response_values: dict[str, Any],
|
|
108
|
+
lookup: dict[tuple[str, str], tuple[dict[str, Any], dict[str, Any]]] | None = None,
|
|
109
|
+
visited: set[tuple[str, str]] | None = None,
|
|
110
|
+
) -> bool:
|
|
111
|
+
"""Return whether an annotation should be shown/generate for the current responses."""
|
|
112
|
+
lookup = lookup or get_annotation_lookup(codebook)
|
|
113
|
+
current_entry = lookup.get((section_key, annotation_key))
|
|
114
|
+
if not current_entry:
|
|
115
|
+
return True
|
|
116
|
+
|
|
117
|
+
_, annotation = current_entry
|
|
118
|
+
condition = get_annotation_condition(annotation)
|
|
119
|
+
if not condition:
|
|
120
|
+
return True
|
|
121
|
+
|
|
122
|
+
target_key = (condition["section_key"], condition["annotation_key"])
|
|
123
|
+
if target_key == (section_key, annotation_key):
|
|
124
|
+
return True
|
|
125
|
+
|
|
126
|
+
target_entry = lookup.get(target_key)
|
|
127
|
+
if not target_entry:
|
|
128
|
+
return True
|
|
129
|
+
|
|
130
|
+
visited = visited or set()
|
|
131
|
+
if (section_key, annotation_key) in visited:
|
|
132
|
+
return True
|
|
133
|
+
|
|
134
|
+
target_section_content, target_annotation = target_entry
|
|
135
|
+
if not is_annotation_applicable(
|
|
136
|
+
codebook,
|
|
137
|
+
condition["section_key"],
|
|
138
|
+
condition["annotation_key"],
|
|
139
|
+
response_values,
|
|
140
|
+
lookup=lookup,
|
|
141
|
+
visited=visited | {(section_key, annotation_key)},
|
|
142
|
+
):
|
|
143
|
+
return False
|
|
144
|
+
|
|
145
|
+
target_column_name = get_annotation_column_name(target_section_content, target_annotation)
|
|
146
|
+
actual_value = normalize_annotation_response_value(target_annotation, response_values.get(target_column_name))
|
|
147
|
+
expected_value = normalize_annotation_response_value(target_annotation, condition.get("value"))
|
|
148
|
+
|
|
149
|
+
if actual_value is None:
|
|
150
|
+
return False
|
|
151
|
+
if target_annotation.get("type") == "textbox" and actual_value == "":
|
|
152
|
+
return False
|
|
153
|
+
|
|
154
|
+
return actual_value == expected_value
|
|
@@ -16,6 +16,13 @@ import krippendorff
|
|
|
16
16
|
from scipy.stats import spearmanr
|
|
17
17
|
from sklearn.metrics import confusion_matrix
|
|
18
18
|
|
|
19
|
+
from .conditions import (
|
|
20
|
+
get_annotation_column_name,
|
|
21
|
+
get_annotation_condition,
|
|
22
|
+
get_annotation_entries,
|
|
23
|
+
get_annotation_lookup,
|
|
24
|
+
normalize_annotation_response_value,
|
|
25
|
+
)
|
|
19
26
|
from .types import MetricsRunResult
|
|
20
27
|
|
|
21
28
|
logger = logging.getLogger(__name__)
|
|
@@ -82,38 +89,90 @@ def extract_column_info_from_codebook(codebook_path):
|
|
|
82
89
|
"""
|
|
83
90
|
with open(codebook_path, 'r') as file:
|
|
84
91
|
codebook = json.load(file)
|
|
85
|
-
|
|
92
|
+
|
|
93
|
+
lookup = get_annotation_lookup(codebook)
|
|
86
94
|
column_info = {}
|
|
87
|
-
|
|
88
|
-
for
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
95
|
+
|
|
96
|
+
for section_key, section, annotation_key, annotation in get_annotation_entries(codebook):
|
|
97
|
+
column_name = get_annotation_column_name(section, annotation)
|
|
98
|
+
annotation_type = annotation.get('type', 'dropdown')
|
|
99
|
+
|
|
100
|
+
properties = {
|
|
101
|
+
'type': annotation_type,
|
|
102
|
+
'section_key': section_key,
|
|
103
|
+
'annotation_key': annotation_key,
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
if annotation_type == 'dropdown':
|
|
107
|
+
properties['options'] = annotation.get('options', [])
|
|
108
|
+
elif annotation_type == 'likert':
|
|
109
|
+
properties['min_value'] = annotation.get('min_value', 0)
|
|
110
|
+
properties['max_value'] = annotation.get('max_value', 5)
|
|
111
|
+
|
|
112
|
+
condition = get_annotation_condition(annotation)
|
|
113
|
+
if condition:
|
|
114
|
+
source_entry = lookup.get((condition['section_key'], condition['annotation_key']))
|
|
115
|
+
if source_entry:
|
|
116
|
+
source_section, source_annotation = source_entry
|
|
117
|
+
properties['condition'] = {
|
|
118
|
+
'source_column': get_annotation_column_name(source_section, source_annotation),
|
|
119
|
+
'source_type': source_annotation.get('type', 'dropdown'),
|
|
120
|
+
'value': normalize_annotation_response_value(source_annotation, condition.get('value')),
|
|
103
121
|
}
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
if annotation_type == 'dropdown':
|
|
107
|
-
properties['options'] = annotation.get('options', [])
|
|
108
|
-
elif annotation_type == 'likert':
|
|
109
|
-
properties['min_value'] = annotation.get('min_value', 0)
|
|
110
|
-
properties['max_value'] = annotation.get('max_value', 5)
|
|
111
|
-
|
|
112
|
-
column_info[column_name] = properties
|
|
122
|
+
|
|
123
|
+
column_info[column_name] = properties
|
|
113
124
|
|
|
114
125
|
logger.debug("Extracted column info from codebook: %s", column_info)
|
|
115
126
|
return column_info
|
|
116
127
|
|
|
128
|
+
|
|
129
|
+
def _is_row_applicable_for_column(merged_row, column, column_info, side="gt", visited=None):
|
|
130
|
+
"""Return whether a conditional annotation is applicable for one merged row."""
|
|
131
|
+
info = column_info.get(column, {})
|
|
132
|
+
condition = info.get("condition")
|
|
133
|
+
if not condition:
|
|
134
|
+
return True
|
|
135
|
+
|
|
136
|
+
source_column = condition.get("source_column")
|
|
137
|
+
if not source_column:
|
|
138
|
+
return True
|
|
139
|
+
|
|
140
|
+
visited = visited or set()
|
|
141
|
+
if column in visited:
|
|
142
|
+
return True
|
|
143
|
+
|
|
144
|
+
if source_column in column_info and not _is_row_applicable_for_column(
|
|
145
|
+
merged_row,
|
|
146
|
+
source_column,
|
|
147
|
+
column_info,
|
|
148
|
+
side=side,
|
|
149
|
+
visited=visited | {column},
|
|
150
|
+
):
|
|
151
|
+
return False
|
|
152
|
+
|
|
153
|
+
source_value = merged_row.get(f"{source_column}_{side}")
|
|
154
|
+
source_annotation = {"type": condition.get("source_type", "dropdown")}
|
|
155
|
+
actual_value = normalize_annotation_response_value(source_annotation, source_value)
|
|
156
|
+
expected_value = normalize_annotation_response_value(source_annotation, condition.get("value"))
|
|
157
|
+
|
|
158
|
+
if actual_value is None:
|
|
159
|
+
return False
|
|
160
|
+
if condition.get("source_type") == "textbox" and actual_value == "":
|
|
161
|
+
return False
|
|
162
|
+
|
|
163
|
+
return actual_value == expected_value
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def _get_applicable_row_mask(merged_df, column, column_info, side="gt"):
    """Return a boolean mask for rows where an annotation is applicable.

    Args:
        merged_df: DataFrame whose rows are passed one by one to
            ``_is_row_applicable_for_column``.
        column: Name of the annotation column being evaluated.
        column_info: Per-column metadata; a column without a ``"condition"``
            entry is applicable on every row.
        side: Suffix forwarded to ``_is_row_applicable_for_column``.

    Returns:
        A ``pd.Series`` of booleans aligned on ``merged_df.index``.
    """
    if "condition" not in column_info.get(column, {}):
        return pd.Series(True, index=merged_df.index)

    # Build the mask explicitly rather than through DataFrame.apply(axis=1):
    # apply takes a special path for frames with no rows and does not hand back
    # a boolean Series there, which makes the result unusable as a .loc mask.
    flags = [
        _is_row_applicable_for_column(row, column, column_info, side=side)
        for _, row in merged_df.iterrows()
    ]
    return pd.Series(flags, index=merged_df.index, dtype=bool)
|
|
175
|
+
|
|
117
176
|
def load_data(ground_truth_path, llm_output_path, columns_to_compare):
|
|
118
177
|
"""Load and align ground-truth and model-output CSV files for evaluation.
|
|
119
178
|
|
|
@@ -413,8 +472,9 @@ def evaluate_performance(merged_df, columns_to_compare, column_info, process_tex
|
|
|
413
472
|
reports[column] = "Textbox processing skipped."
|
|
414
473
|
continue
|
|
415
474
|
|
|
416
|
-
|
|
417
|
-
|
|
475
|
+
applicable_mask = _get_applicable_row_mask(merged_df, column, column_info, side="gt")
|
|
476
|
+
y_true = merged_df.loc[applicable_mask, column_gt]
|
|
477
|
+
y_pred = merged_df.loc[applicable_mask, column_llm]
|
|
418
478
|
|
|
419
479
|
# Handle values based on annotation type
|
|
420
480
|
if annotation_type == 'checkbox':
|
|
@@ -548,8 +608,8 @@ def evaluate_performance(merged_df, columns_to_compare, column_info, process_tex
|
|
|
548
608
|
|
|
549
609
|
# For Krippendorff's alpha
|
|
550
610
|
label_to_int = {label: i for i, label in enumerate(['missing'] + all_labels)}
|
|
551
|
-
y_true_encoded = np.array([label_to_int[
|
|
552
|
-
y_pred_encoded = np.array([label_to_int[
|
|
611
|
+
y_true_encoded = np.array([label_to_int[value] for value in y_true_clean.tolist()])
|
|
612
|
+
y_pred_encoded = np.array([label_to_int[value] for value in y_pred_clean.tolist()])
|
|
553
613
|
data = np.array([y_true_encoded, y_pred_encoded])
|
|
554
614
|
krippendorff_alpha_scores[column] = krippendorff.alpha(reliability_data=data)
|
|
555
615
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: codebook-lab
|
|
3
|
-
Version: 1.0.0
|
|
3
|
+
Version: 1.1.1
|
|
4
4
|
Summary: An LLM annotation experiment pipeline for computational social science.
|
|
5
5
|
Author: Lorcan McLaren
|
|
6
6
|
License-Expression: AGPL-3.0-only
|
|
@@ -45,7 +45,7 @@ Dynamic: license-file
|
|
|
45
45
|
|
|
46
46
|
# CodeBook Lab
|
|
47
47
|
|
|
48
|
-
[DOI](https://doi.org/10.5281/zenodo.19185921)
|
|
48
|
+
[DOI](https://doi.org/10.5281/zenodo.19185921) [PyPI](https://pypi.org/project/codebook-lab/) [PyPI](https://pypi.org/project/codebook-lab/) [PyPI](https://pypi.org/project/codebook-lab/)
|
|
49
49
|
|
|
50
50
|
CodeBook Lab is an LLM annotation experiment pipeline for computational social science. It takes a codebook and labelled dataset from [CodeBook Studio](https://codebook.streamlit.app/) ([source](https://github.com/LorcanMcLaren/codebook-studio)) and runs structured experiments across the dimensions that matter for text-as-data research: model choice, model size, prompt style, zero-shot versus few-shot learning, and sampling hyperparameters — all benchmarked against human labels.
|
|
51
51
|
|
|
@@ -297,7 +297,7 @@ This project is licensed under the [GNU Affero General Public License v3.0](http
|
|
|
297
297
|
If you use CodeBook Lab in research, please cite both:
|
|
298
298
|
|
|
299
299
|
- this software package
|
|
300
|
-
- the associated preprint
|
|
300
|
+
- the associated arXiv preprint
|
|
301
301
|
|
|
302
302
|
Citation metadata is also available in the project's [`CITATION.cff`](https://github.com/LorcanMcLaren/codebook-lab/blob/main/CITATION.cff).
|
|
303
303
|
|
|
@@ -324,7 +324,7 @@ BibTeX:
|
|
|
324
324
|
|
|
325
325
|
APSR style:
|
|
326
326
|
|
|
327
|
-
McLaren, Lorcan, James P. Cross, Zuzanna Krakowska, Robin Rauner, and Martijn Schoonvelde. 2026. *Magic Words or Methodical Work? Challenging Conventional Wisdom in LLM-Based Political Text Annotation*.
|
|
327
|
+
McLaren, Lorcan, James P. Cross, Zuzanna Krakowska, Robin Rauner, and Martijn Schoonvelde. 2026. *Magic Words or Methodical Work? Challenging Conventional Wisdom in LLM-Based Political Text Annotation*. arXiv preprint arXiv:2603.26898. [https://arxiv.org/abs/2603.26898](https://arxiv.org/abs/2603.26898).
|
|
328
328
|
|
|
329
329
|
BibTeX:
|
|
330
330
|
|
|
@@ -333,6 +333,10 @@ BibTeX:
|
|
|
333
333
|
author = {McLaren, Lorcan and Cross, James P. and Krakowska, Zuzanna and Rauner, Robin and Schoonvelde, Martijn},
|
|
334
334
|
title = {Magic Words or Methodical Work? Challenging Conventional Wisdom in LLM-Based Political Text Annotation},
|
|
335
335
|
year = {2026},
|
|
336
|
-
|
|
336
|
+
eprint = {2603.26898},
|
|
337
|
+
archivePrefix = {arXiv},
|
|
338
|
+
primaryClass = {cs.CL},
|
|
339
|
+
doi = {10.48550/arXiv.2603.26898},
|
|
340
|
+
url = {https://arxiv.org/abs/2603.26898}
|
|
337
341
|
}
|
|
338
342
|
```
|
|
@@ -3,6 +3,7 @@ README.md
|
|
|
3
3
|
pyproject.toml
|
|
4
4
|
codebook_lab/__init__.py
|
|
5
5
|
codebook_lab/annotate.py
|
|
6
|
+
codebook_lab/conditions.py
|
|
6
7
|
codebook_lab/examples.py
|
|
7
8
|
codebook_lab/experiments.py
|
|
8
9
|
codebook_lab/metrics.py
|
|
@@ -18,10 +19,7 @@ codebook_lab.egg-info/top_level.txt
|
|
|
18
19
|
codebook_lab/tasks/__init__.py
|
|
19
20
|
codebook_lab/tasks/policy-sentiment/codebook.json
|
|
20
21
|
codebook_lab/tasks/policy-sentiment/ground-truth.csv
|
|
21
|
-
|
|
22
|
-
scripts/single_run_example.py
|
|
23
|
-
tests/__init__.py
|
|
24
|
-
tests/conftest.py
|
|
22
|
+
tests/test_conditions.py
|
|
25
23
|
tests/test_examples.py
|
|
26
24
|
tests/test_experiments.py
|
|
27
25
|
tests/test_metrics_summary.py
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
from codebook_lab.annotate import classify_text, extract_json_response
|
|
8
|
+
from codebook_lab.metrics import evaluate_performance, extract_column_info_from_codebook
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _conditional_codebook() -> dict:
    """Build a two-section codebook whose ``stance`` annotation is conditional.

    Section 1 holds the ``is_relevant`` dropdown; section 2 holds the
    ``stance`` dropdown, whose condition points at section 1 / annotation 1
    with the expected value ``"Yes"``.
    """
    relevance_annotation = {
        "name": "is_relevant",
        "type": "dropdown",
        "tooltip": "",
        "options": ["Yes", "No"],
    }
    stance_annotation = {
        "name": "stance",
        "type": "dropdown",
        "tooltip": "",
        "options": ["Positive", "Negative"],
        "condition": {
            "section_key": "section_1",
            "annotation_key": "annotation_1",
            "value": "Yes",
        },
    }
    relevance_section = {
        "section_name": "1. Relevance",
        "section_instruction": "",
        "annotations": {"annotation_1": relevance_annotation},
    }
    stance_section = {
        "section_name": "2. Stance",
        "section_instruction": "",
        "annotations": {"annotation_1": stance_annotation},
    }
    return {
        "header_column": "id",
        "text_column": "text",
        "section_1": relevance_section,
        "section_2": stance_section,
    }
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def test_classify_text_skips_inactive_conditional_annotations(monkeypatch):
    """A ``No`` relevance answer must leave ``stance`` unasked and stored as ``None``."""
    requested_annotations: list[str] = []
    # Only one reply is queued, so a second model call would raise StopIteration.
    canned_replies = iter(['{"response": "No"}'])

    def fake_generate_response(*args, **kwargs):
        requested_annotations.append(kwargs.get("annotation_name", ""))
        return next(canned_replies)

    monkeypatch.setattr("codebook_lab.annotate.generate_response", fake_generate_response)

    annotations, _, _ = classify_text(
        chain=object(),
        text="Example text",
        codebook=_conditional_codebook(),
        prompt_type="standard",
        use_examples=False,
    )

    assert requested_annotations == ["1. Relevance_is_relevant"]
    assert annotations["1. Relevance_is_relevant"] == "No"
    assert "2. Stance_stance" in annotations
    assert annotations["2. Stance_stance"] is None
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def test_metrics_ignore_non_applicable_conditional_rows(tmp_path):
    """Scores for ``stance`` are 0.0 on a frame whose first row fails the condition."""
    codebook_file = tmp_path / "codebook.json"
    codebook_file.write_text(json.dumps(_conditional_codebook()))
    column_info = extract_column_info_from_codebook(codebook_file)

    # Row 0 is not relevant in the ground truth; row 1 is relevant, and the
    # model left its stance empty.
    frame = pd.DataFrame(
        {
            "1. Relevance_is_relevant_gt": ["No", "Yes"],
            "1. Relevance_is_relevant_llm": ["No", "No"],
            "2. Stance_stance_gt": [None, "Positive"],
            "2. Stance_stance_llm": [None, None],
        }
    )

    metrics = evaluate_performance(
        merged_df=frame,
        columns_to_compare=["2. Stance_stance"],
        column_info=column_info,
        process_textbox=False,
    )
    accuracy_scores, percentage_agreement_scores = metrics[0], metrics[6]

    assert accuracy_scores["2. Stance_stance"] == 0.0
    assert percentage_agreement_scores["2. Stance_stance"] == 0.0
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def test_extract_json_response_normalizes_dropdown_options():
    """Dropdown replies map onto a listed option, or ``None`` when none matches."""
    options = ["Yes", "No"]

    # Replies that should resolve to one of the listed options.
    json_reply = extract_json_response('{"response": " yes\\n"}', "dropdown", options=options)
    assert json_reply == "Yes"
    bare_reply = extract_json_response(" No\n", "dropdown", options=options)
    assert bare_reply == "No"

    # Replies that match no option, with and without a JSON wrapper.
    for unmatched in ('{"response": "JSON"}', "JSON\n"):
        assert extract_json_response(unmatched, "dropdown", options=options) is None
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def test_classify_text_stores_none_for_invalid_dropdown_outputs(monkeypatch):
    """An unusable dropdown reply is stored as ``None`` for both annotations."""
    # Only one reply is queued, so a second model call would raise StopIteration.
    canned_replies = iter(["JSON\n"])

    def fake_generate_response(*args, **kwargs):
        return next(canned_replies)

    monkeypatch.setattr("codebook_lab.annotate.generate_response", fake_generate_response)

    annotations, _, _ = classify_text(
        chain=object(),
        text="Example text",
        codebook=_conditional_codebook(),
        prompt_type="standard",
        use_examples=False,
    )

    assert annotations["1. Relevance_is_relevant"] is None
    assert annotations["2. Stance_stance"] is None
|
|
@@ -1,41 +0,0 @@
|
|
|
1
|
-
"""Run a small multi-experiment sweep with CodeBook Lab.
|
|
2
|
-
|
|
3
|
-
This script is intentionally small so users can test the package quickly.
|
|
4
|
-
Edit the grid below to explore more combinations once the basic workflow is
|
|
5
|
-
working in your environment. The package will try to start a local Ollama
|
|
6
|
-
server if needed and will pull any missing models automatically.
|
|
7
|
-
"""
|
|
8
|
-
|
|
9
|
-
from pathlib import Path
|
|
10
|
-
|
|
11
|
-
from codebook_lab import run_experiment_grid
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
OUTPUT_ROOT = Path("outputs")
|
|
15
|
-
|
|
16
|
-
PARAM_GRID = {
|
|
17
|
-
"country_iso_code": "IRL",
|
|
18
|
-
"tasks": ["policy-sentiment"],
|
|
19
|
-
"models": ["gemma3:270m"],
|
|
20
|
-
"use_examples": [False, True],
|
|
21
|
-
"prompt_types": ["standard"],
|
|
22
|
-
"temperatures": [None],
|
|
23
|
-
"top_ps": [None],
|
|
24
|
-
"process_textboxes": [True],
|
|
25
|
-
}
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
def main() -> None:
|
|
29
|
-
"""Run a small sweep and print a short summary of the completed runs."""
|
|
30
|
-
results = run_experiment_grid(
|
|
31
|
-
param_grid=PARAM_GRID,
|
|
32
|
-
output_root=OUTPUT_ROOT,
|
|
33
|
-
)
|
|
34
|
-
|
|
35
|
-
print(f"Completed {len(results)} experiment runs.")
|
|
36
|
-
for result in results:
|
|
37
|
-
print(f"- {result.model_id}: {result.experiment_directory}")
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
if __name__ == "__main__":
|
|
41
|
-
main()
|
|
@@ -1,48 +0,0 @@
|
|
|
1
|
-
"""Run one bundled-example experiment with CodeBook Lab.
|
|
2
|
-
|
|
3
|
-
Edit the constants below if you want to change the model, task, or output
|
|
4
|
-
location. This script assumes:
|
|
5
|
-
|
|
6
|
-
1. CodeBook Lab has been installed in the current environment, for example
|
|
7
|
-
with ``python -m pip install codebook-lab``.
|
|
8
|
-
2. Ollama is installed and available on PATH.
|
|
9
|
-
|
|
10
|
-
The package will try to start a local Ollama server if needed and will pull the
|
|
11
|
-
requested model automatically before running the experiment.
|
|
12
|
-
"""
|
|
13
|
-
|
|
14
|
-
from pathlib import Path
|
|
15
|
-
|
|
16
|
-
from codebook_lab import ExperimentSpec, run_experiment
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
TASK = "policy-sentiment"
|
|
20
|
-
MODEL = "gemma3:270m"
|
|
21
|
-
COUNTRY_ISO_CODE = "IRL"
|
|
22
|
-
OUTPUT_ROOT = Path("outputs")
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
def main() -> None:
|
|
26
|
-
"""Run a single experiment and print the key output locations."""
|
|
27
|
-
result = run_experiment(
|
|
28
|
-
ExperimentSpec(
|
|
29
|
-
task=TASK,
|
|
30
|
-
model=MODEL,
|
|
31
|
-
use_examples=False,
|
|
32
|
-
prompt_type="standard",
|
|
33
|
-
temperature=None,
|
|
34
|
-
top_p=None,
|
|
35
|
-
process_textbox=True,
|
|
36
|
-
country_iso_code=COUNTRY_ISO_CODE,
|
|
37
|
-
),
|
|
38
|
-
output_root=OUTPUT_ROOT,
|
|
39
|
-
)
|
|
40
|
-
|
|
41
|
-
print("Completed single experiment run.")
|
|
42
|
-
print(f"Experiment directory: {result.experiment_directory}")
|
|
43
|
-
print(f"Metrics CSV: {result.metrics.output_csv}")
|
|
44
|
-
print(f"Classification report: {result.metrics.report_file}")
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
if __name__ == "__main__":
|
|
48
|
-
main()
|
|
File without changes
|
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
|
|
5
|
-
import pytest
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
@pytest.fixture()
|
|
9
|
-
def bundled_task_dir() -> Path:
|
|
10
|
-
"""Return the path to the bundled policy-sentiment example task."""
|
|
11
|
-
task_dir = Path(__file__).resolve().parent.parent / "codebook_lab" / "tasks" / "policy-sentiment"
|
|
12
|
-
assert task_dir.exists(), f"Bundled task directory not found: {task_dir}"
|
|
13
|
-
return task_dir
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{codebook_lab-1.0.0 → codebook_lab-1.1.1}/codebook_lab/tasks/policy-sentiment/ground-truth.csv
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|