codebook_lab-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1422 @@
1
+ import contextlib
2
+ import importlib.util
3
+ import io
4
+ import logging
5
+ from functools import lru_cache
6
+ import warnings
7
+
8
+ import pandas as pd
9
+ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, cohen_kappa_score
10
+ import numpy as np
11
+ import os
12
+ from pathlib import Path
13
+ import json
14
+ from datetime import datetime
15
+ import krippendorff
16
+ from scipy.stats import spearmanr
17
+ from sklearn.metrics import confusion_matrix
18
+
19
+ from .types import MetricsRunResult
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
+ TEXTBOX_DEPENDENCY_GUIDANCE = (
25
+ "Install the optional textbox extras to enable the full textbox metric suite: "
26
+ "python -m pip install \"codebook-lab[textbox]\"."
27
+ )
28
+
29
+ BERT_SCORE_NUMPY_WARNING = (
30
+ "The given NumPy array is not writable, and PyTorch does not support non-writable tensors."
31
+ )
32
+
33
+
34
+ @contextlib.contextmanager
35
+ def _quiet_optional_nlp_output():
36
+ """Suppress noisy but non-actionable output from optional NLP metric libraries."""
37
+ with warnings.catch_warnings():
38
+ warnings.filterwarnings("ignore", message=BERT_SCORE_NUMPY_WARNING, category=UserWarning)
39
+ with contextlib.redirect_stdout(io.StringIO()), contextlib.redirect_stderr(io.StringIO()):
40
+ try:
41
+ from transformers.utils import logging as transformers_logging
42
+ except ImportError:
43
+ transformers_logging = None
44
+
45
+ if transformers_logging is None:
46
+ yield
47
+ return
48
+
49
+ previous_verbosity = transformers_logging.get_verbosity()
50
+ transformers_logging.set_verbosity_error()
51
+ try:
52
+ yield
53
+ finally:
54
+ transformers_logging.set_verbosity(previous_verbosity)
55
+
56
+
57
+ @lru_cache(maxsize=1)
58
+ def _load_sentence_transformer_model():
59
+ from sentence_transformers import SentenceTransformer
60
+
61
+ with _quiet_optional_nlp_output():
62
+ return SentenceTransformer("all-MiniLM-L6-v2")
63
+
64
+
65
+ @lru_cache(maxsize=1)
66
+ def _load_bert_score_module():
67
+ import bert_score
68
+
69
+ return bert_score
70
+
71
+
72
+ def extract_column_info_from_codebook(codebook_path):
73
+ """
74
+ Extract column names and their annotation types from the codebook file.
75
+
76
+ Args:
77
+ codebook_path: Path to a ``codebook.json`` file.
78
+
79
+ Returns:
80
+ Dictionary mapping ``<section_name>_<annotation_name>`` column names to
81
+ annotation metadata such as type, options, and Likert bounds.
82
+ """
83
+ with open(codebook_path, 'r') as file:
84
+ codebook = json.load(file)
85
+
86
+ column_info = {}
87
+
88
+ for key, section in codebook.items():
89
+ if key.startswith('section_'):
90
+ section_name = section['section_name']
91
+ annotations = section['annotations']
92
+
93
+ for annotation_key, annotation in annotations.items():
94
+ name = annotation['name']
95
+
96
+ column_name = f"{section_name}_{name}"
97
+
98
+ # Extract annotation type and relevant properties
99
+ annotation_type = annotation.get('type', 'dropdown') # Default to dropdown for backward compatibility
100
+
101
+ properties = {
102
+ 'type': annotation_type
103
+ }
104
+
105
+ # Add type-specific properties
106
+ if annotation_type == 'dropdown':
107
+ properties['options'] = annotation.get('options', [])
108
+ elif annotation_type == 'likert':
109
+ properties['min_value'] = annotation.get('min_value', 0)
110
+ properties['max_value'] = annotation.get('max_value', 5)
111
+
112
+ column_info[column_name] = properties
113
+
114
+ logger.debug("Extracted column info from codebook: %s", column_info)
115
+ return column_info
116
+
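+ # A minimal sketch of the codebook.json layout this parser expects; the section and
+ # annotation names below ("content", "stance", "tone") are hypothetical examples, but the
+ # keys ("section_*", "section_name", "annotations", "name", "type", "options",
+ # "min_value", "max_value") are the ones read above.
+ #
+ # {
+ #     "section_1": {
+ #         "section_name": "content",
+ #         "annotations": {
+ #             "annotation_1": {"name": "stance", "type": "dropdown", "options": ["pro", "con"]},
+ #             "annotation_2": {"name": "tone", "type": "likert", "min_value": 1, "max_value": 5}
+ #         }
+ #     }
+ # }
+ #
+ # would yield column_info entries keyed "content_stance" and "content_tone".
+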
117
+ def load_data(ground_truth_path, llm_output_path, columns_to_compare):
118
+ """Load and align ground-truth and model-output CSV files for evaluation.
119
+
120
+ Args:
121
+ ground_truth_path: Path to the human-labeled reference CSV.
122
+ llm_output_path: Path to the model-generated annotation CSV.
123
+ columns_to_compare: List of annotation column names to evaluate.
124
+
125
+ Returns:
126
+ Combined DataFrame containing suffixed ``_gt`` and ``_llm`` columns.
127
+ """
128
+ ground_truth_df = pd.read_csv(ground_truth_path)
129
+ llm_output_df = pd.read_csv(llm_output_path)
130
+
131
+ logger.debug("Ground truth dataframe shape: %s", ground_truth_df.shape)
132
+ logger.debug("LLM output dataframe shape: %s", llm_output_df.shape)
133
+
134
+ # Verify the row counts match
135
+ if len(ground_truth_df) != len(llm_output_df):
136
+ logger.warning("Dataframes have different numbers of rows!")
137
+ logger.warning("Ground truth rows: %s, LLM output rows: %s", len(ground_truth_df), len(llm_output_df))
138
+ logger.warning("Proceeding with merge based on row order, but results may be incorrect.")
139
+ else:
140
+ logger.info("Both dataframes have %s rows. Proceeding with row-based merge.", len(ground_truth_df))
141
+
142
+ logger.debug("Before merge:")
143
+ for column in columns_to_compare:
144
+ logger.debug("Column: %s", column)
145
+ if column in ground_truth_df.columns and column in llm_output_df.columns:
146
+ gt_values = ground_truth_df[column].fillna('').astype(str)
147
+ llm_values = llm_output_df[column].fillna('').astype(str)
148
+ gt_unique = sorted([x for x in gt_values.unique() if x != ''])
149
+ llm_unique = sorted([x for x in llm_values.unique() if x != ''])
150
+ logger.debug("Ground truth unique values: %s", gt_unique)
151
+ logger.debug("LLM output unique values: %s", llm_unique)
152
+ else:
153
+ if column not in ground_truth_df.columns:
154
+ logger.warning("Column '%s' not found in ground truth dataframe", column)
155
+ if column not in llm_output_df.columns:
156
+ logger.warning("Column '%s' not found in LLM output dataframe", column)
157
+
158
+ # Create new dataframe by adding suffix to column names
159
+ gt_columns = {col: f"{col}_gt" for col in ground_truth_df.columns if col in columns_to_compare}
160
+ llm_columns = {col: f"{col}_llm" for col in llm_output_df.columns if col in columns_to_compare}
161
+
162
+ # Create copies with renamed columns for the columns we want to compare
163
+ gt_renamed = ground_truth_df.rename(columns=gt_columns)
164
+ llm_renamed = llm_output_df.rename(columns=llm_columns)
165
+
166
+ # Concatenate horizontally (cbind/hstack) based on row position
167
+ merged_df = pd.concat([gt_renamed, llm_renamed], axis=1)
168
+
169
+ logger.debug("Merged dataframe has %s rows and %s columns", len(merged_df), merged_df.shape[1])
170
+
171
+ logger.debug("After merge:")
172
+ for column in columns_to_compare:
173
+ column_gt = f'{column}_gt'
174
+ column_llm = f'{column}_llm'
175
+
176
+ if column_gt in merged_df.columns and column_llm in merged_df.columns:
177
+ logger.debug("Column: %s", column)
178
+ gt_values = merged_df[column_gt].fillna('').astype(str)
179
+ llm_values = merged_df[column_llm].fillna('').astype(str)
180
+ gt_unique = sorted([x for x in gt_values.unique() if x != ''])
181
+ llm_unique = sorted([x for x in llm_values.unique() if x != ''])
182
+ logger.debug("Ground truth unique values: %s", gt_unique)
183
+ logger.debug("LLM output unique values: %s", llm_unique)
184
+
185
+ logger.debug("Value counts in ground truth:\n%s", merged_df[column_gt].fillna('').astype(str).value_counts().to_string())
186
+ logger.debug("Value counts in LLM output:\n%s", merged_df[column_llm].fillna('').astype(str).value_counts().to_string())
187
+
188
+ gt_col = merged_df[column_gt].fillna('').astype(str)
189
+ llm_col = merged_df[column_llm].fillna('').astype(str)
190
+ mismatches = merged_df[gt_col != llm_col]
191
+ if len(mismatches) > 0:
192
+ logger.debug("Found %s mismatches for %s. First few examples:", len(mismatches), column)
193
+ for idx, row in mismatches.head().iterrows():
194
+ logger.debug("Row index: %s", idx)
195
+ logger.debug("Ground truth: '%s'", row[column_gt])
196
+ logger.debug("LLM output: '%s'", row[column_llm])
197
+ logger.debug("---")
198
+ else:
199
+ if column_gt not in merged_df.columns:
200
+ logger.warning("Column '%s' not found in merged dataframe", column_gt)
201
+ if column_llm not in merged_df.columns:
202
+ logger.warning("Column '%s' not found in merged dataframe", column_llm)
203
+
204
+ # Convert all columns to string type before returning
205
+ for column in columns_to_compare:
206
+ column_gt = f'{column}_gt'
207
+ column_llm = f'{column}_llm'
208
+ if column_gt in merged_df.columns:
209
+ merged_df[column_gt] = merged_df[column_gt].fillna('').astype(str)
210
+ if column_llm in merged_df.columns:
211
+ merged_df[column_llm] = merged_df[column_llm].fillna('').astype(str)
212
+
213
+ return merged_df
214
+
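+ # Usage sketch for load_data (paths and the column name are hypothetical):
+ #
+ #     merged = load_data("gold.csv", "model_output.csv", ["content_stance"])
+ #     merged["content_stance_gt"]   # human labels as strings, NaN replaced by ""
+ #     merged["content_stance_llm"]  # model labels as strings, NaN replaced by ""
+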
215
+ def fill_specific_missing_values(df, columns, fill_value):
216
+ """Fill missing values in specific DataFrame columns.
217
+
218
+ Args:
219
+ df: Pandas DataFrame to mutate in place.
220
+ columns: Iterable of column names to fill if present.
221
+ fill_value: Replacement value for missing entries.
222
+ """
223
+ for column in columns:
224
+ if column in df.columns:
225
+ df[column] = df[column].fillna(fill_value)
226
+ else:
227
+ logger.warning("Column '%s' not found in DataFrame.", column)
228
+
229
+ def fill_missing_values(df, columns_to_compare, fill_value="unknown"):
230
+ """Fill missing values in a list of DataFrame columns if they exist.
231
+
232
+ Args:
233
+ df: Pandas DataFrame to mutate in place.
234
+ columns_to_compare: Iterable of column names to fill if present.
235
+ fill_value: Replacement value for missing entries.
236
+ """
237
+ for column in columns_to_compare:
238
+ if column in df.columns:
239
+ df[column] = df[column].fillna(fill_value)
240
+ else:
241
+ logger.warning("Column '%s' not found in DataFrame.", column)
242
+
243
+ def calculate_percentage_agreement(y_true, y_pred):
244
+ """Compute exact-match agreement between two aligned series."""
245
+ return sum(y_true == y_pred) / len(y_true)
246
+
247
+ def read_emissions_data(emissions_file):
248
+ """Read emissions and energy metadata from a CodeCarbon CSV file.
249
+
250
+ Args:
251
+ emissions_file: Path to ``emissions.csv`` produced during annotation.
252
+
253
+ Returns:
254
+ Tuple of ``(emissions, energy_consumed, cpu_model, gpu_model)``.
255
+ """
256
+ try:
257
+ emissions_df = pd.read_csv(emissions_file)
258
+ if len(emissions_df) > 0:
259
+ latest_row = emissions_df.iloc[-1]
260
+ return (latest_row['emissions'], latest_row['energy_consumed'],
261
+ latest_row['cpu_model'], latest_row['gpu_model'])
262
+ return None, None, None, None
263
+ except Exception as e:
264
+ logger.warning("Error reading emissions file: %s", e)
265
+ return None, None, None, None
266
+
267
+ def read_timing_data(timing_file):
268
+ """Read inference timing statistics from a JSON sidecar file.
269
+
270
+ Args:
271
+ timing_file: Path to ``timing_data.json``.
272
+
273
+ Returns:
274
+ Tuple of ``(total_inference_time, avg_inference_time)``.
275
+ """
276
+ try:
277
+ with open(timing_file, 'r') as file:
278
+ timing_data = json.load(file)
279
+ return (timing_data.get('total_inference_time', None),
280
+ timing_data.get('avg_inference_time', None))
281
+ except Exception as e:
282
+ logger.warning("Error reading timing file: %s", e)
283
+ return None, None
284
+
285
+ def read_char_counts(char_counts_file):
286
+ """Read prompt and response character counts from a JSON sidecar file.
287
+
288
+ Args:
289
+ char_counts_file: Path to ``char_counts.json``.
290
+
291
+ Returns:
292
+ Tuple of ``(input_chars, output_chars)``.
293
+ """
294
+ try:
295
+ with open(char_counts_file, 'r') as file:
296
+ char_counts = json.load(file)
297
+ return (char_counts.get('input_chars', None),
298
+ char_counts.get('output_chars', None))
299
+ except Exception as e:
300
+ logger.warning("Error reading character counts file: %s", e)
301
+ return None, None
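+ # The sidecar files read above are assumed to look roughly like this (field names are
+ # taken from the readers; the values are made up):
+ #
+ #     emissions.csv     -> CodeCarbon output with "emissions", "energy_consumed",
+ #                          "cpu_model" and "gpu_model" columns; the last row is used.
+ #     timing_data.json  -> {"total_inference_time": 812.4, "avg_inference_time": 8.1}
+ #     char_counts.json  -> {"input_chars": 182344, "output_chars": 9120}
+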
302
+
303
+ def quadratic_weighted_kappa(y_true, y_pred):
304
+ """
305
+ Calculate the quadratic weighted kappa.
306
+
307
+ Args:
308
+ y_true: Iterable of gold ordinal labels.
309
+ y_pred: Iterable of predicted ordinal labels.
310
+
311
+ Returns:
312
+ Quadratic weighted kappa as a float, or ``nan`` when it cannot be computed.
313
+ """
314
+ # Convert to numeric if not already
315
+ y_true = pd.to_numeric(y_true, errors='coerce')
316
+ y_pred = pd.to_numeric(y_pred, errors='coerce')
317
+
318
+ # Drop missing values
319
+ mask = ~(np.isnan(y_true) | np.isnan(y_pred))
320
+ y_true = y_true[mask]
321
+ y_pred = y_pred[mask]
322
+
323
+ if len(y_true) == 0:
324
+ return float('nan')
325
+
326
+ # Get the unique classes
327
+ labels = sorted(set(np.concatenate([y_true, y_pred])))
328
+ n_labels = len(labels)
329
+
330
+ if n_labels <= 1:
331
+ return float('nan') # Can't calculate with only one class
332
+
333
+ # Create a label mapping
334
+ label_map = {l: i for i, l in enumerate(labels)}
335
+
336
+ # Map the values to integers
337
+ y_true_mapped = np.array([label_map[v] for v in y_true])
338
+ y_pred_mapped = np.array([label_map[v] for v in y_pred])
339
+
340
+ # Calculate the confusion matrix
341
+ cm = confusion_matrix(y_true_mapped, y_pred_mapped, labels=range(n_labels))
342
+
343
+ # Calculate the weights matrix (quadratic weighting)
344
+ weights = np.zeros((n_labels, n_labels))
345
+ for i in range(n_labels):
346
+ for j in range(n_labels):
347
+ weights[i, j] = ((i - j) ** 2) / ((n_labels - 1) ** 2)
348
+
349
+ # Calculate expected matrix
350
+ row_sum = np.sum(cm, axis=1)
351
+ col_sum = np.sum(cm, axis=0)
352
+ expected = np.outer(row_sum, col_sum) / np.sum(cm)
353
+
354
+ # Calculate weighted kappa
355
+ k = 1.0 - np.sum(weights * cm) / np.sum(weights * expected)
356
+
357
+ return k
358
+
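+ # In the notation above, with observed matrix O (the confusion matrix) and expected
+ # matrix E (outer product of the row and column marginals), the statistic computed is
+ #
+ #     kappa_w = 1 - sum_ij(w_ij * O_ij) / sum_ij(w_ij * E_ij),   w_ij = (i - j)^2 / (N - 1)^2
+ #
+ # so perfect agreement (all mass on the diagonal) gives kappa_w = 1 and chance-level
+ # agreement (O equal to E) gives kappa_w = 0.
+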
359
+ def evaluate_performance(merged_df, columns_to_compare, column_info, process_textbox=False):
360
+ """Compute evaluation metrics for each requested annotation column.
361
+
362
+ Args:
363
+ merged_df: DataFrame returned by :func:`load_data`.
364
+ columns_to_compare: List of annotation columns to evaluate.
365
+ column_info: Metadata mapping produced by :func:`extract_column_info_from_codebook`.
366
+ process_textbox: Whether textbox similarity metrics should be calculated.
367
+
368
+ Returns:
369
+ Tuple of metric dictionaries and report text keyed by annotation column name.
370
+ """
371
+ # Initialize all metric dictionaries with default NaN values for all columns
372
+ accuracy_scores = {col: float('nan') for col in columns_to_compare}
373
+ precision_scores = {col: float('nan') for col in columns_to_compare}
374
+ recall_scores = {col: float('nan') for col in columns_to_compare}
375
+ f1_scores = {col: float('nan') for col in columns_to_compare}
376
+ cohen_kappa_scores = {col: float('nan') for col in columns_to_compare}
377
+ krippendorff_alpha_scores = {col: float('nan') for col in columns_to_compare}
378
+ percentage_agreement_scores = {col: float('nan') for col in columns_to_compare}
379
+ reports = {col: "Not processed" for col in columns_to_compare}
380
+
381
+ # Ordinal metrics for likert scales
382
+ spearman_corr_scores = {col: float('nan') for col in columns_to_compare}
383
+ quadratic_kappa_scores = {col: float('nan') for col in columns_to_compare}
384
+
385
+ # Textbox metrics
386
+ norm_levenshtein_scores = {col: float('nan') for col in columns_to_compare}
387
+ bleu_scores = {col: float('nan') for col in columns_to_compare}
388
+ rouge1_f_scores = {col: float('nan') for col in columns_to_compare}
389
+ rouge2_f_scores = {col: float('nan') for col in columns_to_compare}
390
+ rougeL_f_scores = {col: float('nan') for col in columns_to_compare}
391
+ cosine_scores = {col: float('nan') for col in columns_to_compare}
392
+ bertscore_p_scores = {col: float('nan') for col in columns_to_compare}
393
+ bertscore_r_scores = {col: float('nan') for col in columns_to_compare}
394
+ bertscore_f1_scores = {col: float('nan') for col in columns_to_compare}
395
+
396
+ for column in columns_to_compare:
397
+ column_gt = f'{column}_gt'
398
+ column_llm = f'{column}_llm'
399
+
400
+ # Check if both columns exist in the dataframe
401
+ if column_gt not in merged_df.columns or column_llm not in merged_df.columns:
402
+ logger.warning("Skipping column '%s' as one or both corresponding columns are missing in the merged dataframe", column)
403
+ reports[column] = "Column missing in dataframe."
404
+ continue
405
+
406
+ # Get annotation type
407
+ annotation_type = column_info.get(column, {}).get('type', 'dropdown')
408
+ logger.debug("Processing column: %s (Type: %s)", column, annotation_type)
409
+
410
+ # Skip textbox annotations if process_textbox is False
411
+ if annotation_type == 'textbox' and not process_textbox:
412
+ logger.info("Skipping textbox column '%s' as process_textbox is False", column)
413
+ reports[column] = "Textbox processing skipped."
414
+ continue
415
+
416
+ y_true = merged_df[column_gt]
417
+ y_pred = merged_df[column_llm]
418
+
419
+ # Handle values based on annotation type
420
+ if annotation_type == 'checkbox':
421
+ logger.debug("Handling checkbox column - converting to binary values")
422
+ y_true = y_true.fillna('0')
423
+ y_pred = y_pred.fillna('0')
424
+ # Ensure values are '0' or '1'
425
+ y_true = y_true.map(lambda x: '1' if str(x).lower() in ['1', 'true', 'yes', 'checked'] else '0')
426
+ y_pred = y_pred.map(lambda x: '1' if str(x).lower() in ['1', 'true', 'yes', 'checked'] else '0')
427
+ elif annotation_type == 'likert':
428
+ logger.debug("Handling likert column - calculating both ordinal and classification metrics")
429
+ # Get min and max values from codebook
430
+ min_value = column_info[column].get('min_value', 0)
431
+ max_value = column_info[column].get('max_value', 5)
432
+
433
+ # Convert to numeric values for ordinal metrics
434
+ y_true_numeric = pd.to_numeric(y_true, errors='coerce')
435
+ y_pred_numeric = pd.to_numeric(y_pred, errors='coerce')
436
+
437
+ # Calculate ordinal metrics
438
+ valid_indices = ~(y_true_numeric.isna() | y_pred_numeric.isna())
439
+
440
+ if valid_indices.any():
441
+ # Calculate Spearman's correlation
442
+ corr, _ = spearmanr(y_true_numeric[valid_indices], y_pred_numeric[valid_indices])
443
+ spearman_corr_scores[column] = corr
444
+
445
+ # Calculate quadratic weighted kappa
446
+ quadratic_kappa_scores[column] = quadratic_weighted_kappa(
447
+ y_true_numeric[valid_indices], y_pred_numeric[valid_indices])
448
+
449
+ # For classification metrics, convert to integers and then to string
450
+ # This ensures '1.0' and '1' are treated as the same value
451
+ # First cast to float, then to int, then to string to standardize
452
+ y_true = y_true_numeric.fillna(-9999).astype(int).astype(str)
453
+ y_pred = y_pred_numeric.fillna(-9999).astype(int).astype(str)
454
+
455
+ # Replace placeholder for NaN
456
+ y_true = y_true.replace('-9999', '')
457
+ y_pred = y_pred.replace('-9999', '')
458
+ elif annotation_type == 'dropdown':
459
+ valid_indices = y_true.notna() | y_pred.notna()
460
+ y_true = y_true[valid_indices]
461
+ y_pred = y_pred[valid_indices]
462
+ elif annotation_type == 'textbox':
463
+ logger.debug("Handling textbox column - calculating text similarity metrics")
464
+ # Process textbox with specialized metrics
465
+ try:
466
+ textbox_metrics = evaluate_textbox_performance(y_true, y_pred)
467
+ missing_dependencies = sorted(set(textbox_metrics.pop('_missing_dependencies', [])))
468
+
469
+ # Store results in corresponding dictionaries
470
+ norm_levenshtein_scores[column] = textbox_metrics['norm_levenshtein']
471
+ bleu_scores[column] = textbox_metrics['bleu']
472
+ rouge1_f_scores[column] = textbox_metrics['rouge1_f']
473
+ rouge2_f_scores[column] = textbox_metrics['rouge2_f']
474
+ rougeL_f_scores[column] = textbox_metrics['rougeL_f']
475
+ cosine_scores[column] = textbox_metrics['cosine_similarity']
476
+ bertscore_p_scores[column] = textbox_metrics['bertscore_precision']
477
+ bertscore_r_scores[column] = textbox_metrics['bertscore_recall']
478
+ bertscore_f1_scores[column] = textbox_metrics['bertscore_f1']
479
+
480
+ # For textbox, we only keep percentage agreement for the traditional metrics
481
+ y_true = y_true.fillna('')
482
+ y_pred = y_pred.fillna('')
483
+ if missing_dependencies:
484
+ reports[column] = (
485
+ "Textbox field, available metrics calculated. "
486
+ f"Missing packages for some metrics: {', '.join(missing_dependencies)}. "
487
+ f"{TEXTBOX_DEPENDENCY_GUIDANCE}"
488
+ )
489
+ else:
490
+ reports[column] = "Textbox field, specialized metrics calculated."
491
+ except Exception as e:
492
+ logger.warning("Error calculating textbox metrics for column '%s': %s", column, e)
493
+ reports[column] = f"Error in textbox metrics: {str(e)}"
494
+
495
+ logger.debug("Initial data shape - y_true: %s, y_pred: %s", y_true.shape, y_pred.shape)
496
+
497
+ true_unique = sorted([x for x in y_true.unique() if pd.notna(x)])
498
+ pred_unique = sorted([x for x in y_pred.unique() if pd.notna(x)])
499
+ logger.debug("Initial unique values in y_true: %s", true_unique)
500
+ logger.debug("Initial unique values in y_pred: %s", pred_unique)
501
+
502
+ logger.debug("Detailed value counts:")
503
+ logger.debug("Ground truth:\n%s", y_true.value_counts(dropna=False).to_string())
504
+ logger.debug("Predictions:\n%s", y_pred.value_counts(dropna=False).to_string())
505
+
506
+ if len(y_true) == 0:
507
+ logger.warning("No valid entries for column '%s'", column)
508
+ reports[column] = "No valid data for metrics calculation."
509
+ continue
510
+
511
+ try:
512
+ # Skip traditional classification metrics for textbox fields
513
+ if annotation_type == 'textbox':
514
+ # Just calculate percentage agreement
515
+ y_true_clean = y_true.fillna('missing')
516
+ y_pred_clean = y_pred.fillna('missing')
517
+ percentage_agreement_scores[column] = calculate_percentage_agreement(y_true_clean, y_pred_clean)
518
+ continue
519
+
520
+ # For metrics calculation, we need to handle NaN values
521
+ y_true_clean = y_true.fillna('missing')
522
+ y_pred_clean = y_pred.fillna('missing')
523
+
524
+ # Calculate agreement metrics
525
+ percentage_agreement_scores[column] = calculate_percentage_agreement(y_true_clean, y_pred_clean)
526
+
527
+ # Get unique valid labels (excluding NaN)
528
+ true_labels = set(x for x in y_true.unique() if pd.notna(x))
529
+ pred_labels = set(x for x in y_pred.unique() if pd.notna(x))
530
+ all_labels = sorted(true_labels | pred_labels)
531
+ logger.debug("All unique labels: %s", all_labels)
532
+
533
+ # Calculate metrics
534
+ accuracy_scores[column] = accuracy_score(y_true_clean, y_pred_clean)
535
+ precision_scores[column] = precision_score(y_true_clean, y_pred_clean,
536
+ average='macro',
537
+ zero_division=0,
538
+ labels=all_labels)
539
+ recall_scores[column] = recall_score(y_true_clean, y_pred_clean,
540
+ average='macro',
541
+ zero_division=0,
542
+ labels=all_labels)
543
+ f1_scores[column] = f1_score(y_true_clean, y_pred_clean,
544
+ average='macro',
545
+ zero_division=0,
546
+ labels=all_labels)
547
+ cohen_kappa_scores[column] = cohen_kappa_score(y_true_clean, y_pred_clean)
548
+
549
+ # For Krippendorff's alpha
550
+ label_to_int = {label: i for i, label in enumerate(['missing'] + all_labels)}
551
+ y_true_encoded = np.array([label_to_int[value] for value in y_true_clean])
552
+ y_pred_encoded = np.array([label_to_int[value] for value in y_pred_clean])
553
+ data = np.array([y_true_encoded, y_pred_encoded])
554
+ krippendorff_alpha_scores[column] = krippendorff.alpha(reliability_data=data)
555
+
556
+ # Generate classification report
557
+ reports[column] = classification_report(y_true_clean, y_pred_clean,
558
+ labels=all_labels,
559
+ zero_division=0)
560
+ except Exception as e:
561
+ logger.warning("Error processing column '%s': %s: %s", column, type(e).__name__, e)
562
+ logger.debug("Current state - y_true sample:\n%s", y_true.head().to_string())
563
+ logger.debug("Current state - y_pred sample:\n%s", y_pred.head().to_string())
564
+ reports[column] = f"Error: {str(e)}"
565
+
566
+ return (accuracy_scores, precision_scores, recall_scores, f1_scores,
567
+ cohen_kappa_scores, krippendorff_alpha_scores, percentage_agreement_scores,
568
+ spearman_corr_scores, quadratic_kappa_scores,
569
+ norm_levenshtein_scores, bleu_scores,
570
+ rouge1_f_scores, rouge2_f_scores, rougeL_f_scores,
571
+ cosine_scores,
572
+ bertscore_p_scores, bertscore_r_scores, bertscore_f1_scores,
573
+ reports)
574
+
575
+ def evaluate_textbox_performance(y_true, y_pred):
576
+ """
577
+ Calculate similarity metrics for textbox annotations.
578
+
579
+ Args:
580
+ y_true: Pandas Series of reference textbox annotations.
581
+ y_pred: Pandas Series of predicted textbox annotations.
582
+
583
+ Returns:
584
+ Dictionary of textbox similarity metrics plus an internal
585
+ ``_missing_dependencies`` list when optional packages are unavailable.
586
+ """
587
+ missing_dependencies = []
588
+
589
+ try:
590
+ import Levenshtein
591
+ except ImportError:
592
+ Levenshtein = None
593
+ missing_dependencies.append('python-Levenshtein')
594
+
595
+ try:
596
+ import nltk
597
+ from nltk.translate.bleu_score import sentence_bleu
598
+ except ImportError:
599
+ nltk = None
600
+ sentence_bleu = None
601
+ missing_dependencies.append('nltk')
602
+
603
+ try:
604
+ from rouge_score import rouge_scorer
605
+ except ImportError:
606
+ rouge_scorer = None
607
+ missing_dependencies.append('rouge-score')
608
+
609
+ if importlib.util.find_spec("sentence_transformers") is None:
610
+ missing_dependencies.append("sentence-transformers")
611
+
612
+ if importlib.util.find_spec("bert_score") is None:
613
+ missing_dependencies.append("bert-score")
614
+
615
+ try:
616
+ import torch
617
+ except ImportError:
618
+ torch = None
619
+ missing_dependencies.append('torch')
620
+
621
+ # Normalize text - strip leading/trailing whitespace and lowercase
622
+ y_true_norm = y_true.fillna('').str.lower().str.strip()
623
+ y_pred_norm = y_pred.fillna('').str.lower().str.strip()
624
+
625
+ logger.info("Starting textbox evaluation with %s entries", len(y_true))
626
+ logger.debug("Non-empty ground truth: %s", sum(y_true_norm != ''))
627
+ logger.debug("Non-empty predictions: %s", sum(y_pred_norm != ''))
628
+
629
+ # Initialize containers for individual scores
630
+ norm_levenshtein_scores = []
631
+ cosine_scores = []
632
+
633
+ # Lists to collect text pairs for batch metrics
634
+ valid_refs = []
635
+ valid_cands = []
636
+
637
+ # Process each text pair
638
+ for i in range(len(y_true_norm)):
639
+ true_text = y_true_norm.iloc[i]
640
+ pred_text = y_pred_norm.iloc[i]
641
+
642
+ # Add to valid pairs if both texts have content
643
+ if true_text != '' and pred_text != '':
644
+ valid_refs.append(true_text)
645
+ valid_cands.append(pred_text)
646
+
647
+ # 1. Normalized Levenshtein Distance
648
+ if Levenshtein is not None and (true_text != '' or pred_text != ''):
649
+ lev_dist = Levenshtein.distance(true_text, pred_text)
650
+ max_len = max(len(true_text), len(pred_text))
651
+ if max_len > 0:
652
+ norm_lev_sim = 1 - (lev_dist / max_len)
653
+ norm_levenshtein_scores.append(norm_lev_sim)
654
+
655
+ logger.debug("Found %s valid pairs for batch metrics", len(valid_refs))
656
+
657
+ # Skip batch metrics if no valid pairs
658
+ if not valid_refs:
659
+ return {
660
+ 'norm_levenshtein': np.mean(norm_levenshtein_scores) if norm_levenshtein_scores else float('nan'),
661
+ 'bleu': float('nan'),
662
+ 'rouge1_f': float('nan'),
663
+ 'rouge2_f': float('nan'),
664
+ 'rougeL_f': float('nan'),
665
+ 'cosine_similarity': float('nan'),
666
+ 'bertscore_precision': float('nan'),
667
+ 'bertscore_recall': float('nan'),
668
+ 'bertscore_f1': float('nan'),
669
+ '_missing_dependencies': missing_dependencies
670
+ }
671
+
672
+ # 2. BLEU Score calculation
673
+ bleu_score = float('nan')
674
+ if nltk is not None and sentence_bleu is not None:
675
+ try:
676
+ # Calculate BLEU score manually with NLTK
677
+ bleu_scores = []
678
+ for ref, cand in zip(valid_refs, valid_cands):
679
+ reference = [ref.split()]
680
+ candidate = cand.split()
681
+ if reference[0] and candidate: # Ensure non-empty
682
+ try:
683
+ # Use smoothing for short segments
684
+ smoothing_function = nltk.translate.bleu_score.SmoothingFunction().method1
685
+ score = sentence_bleu(reference, candidate,
686
+ weights=(0.25, 0.25, 0.25, 0.25),
687
+ smoothing_function=smoothing_function)
688
+ bleu_scores.append(score)
689
+ except Exception as e:
690
+ logger.debug("Individual BLEU error: %s", e)
691
+
692
+ bleu_score = np.mean(bleu_scores) if bleu_scores else float('nan')
693
+ logger.debug("Calculated %s BLEU scores, mean: %s", len(bleu_scores), bleu_score)
694
+ except Exception as e:
695
+ logger.warning("BLEU calculation error: %s", e)
696
+ else:
697
+ logger.info("Skipping BLEU calculation because nltk is not installed.")
698
+
699
+ # 3. ROUGE Score calculation
700
+ rouge1_f = rouge2_f = rougeL_f = float('nan')
701
+ if rouge_scorer is not None:
702
+ try:
703
+ # Calculate ROUGE directly
704
+ scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
705
+ rouge1_scores = []
706
+ rouge2_scores = []
707
+ rougeL_scores = []
708
+
709
+ for ref, cand in zip(valid_refs, valid_cands):
710
+ try:
711
+ score = scorer.score(ref, cand)
712
+ rouge1_scores.append(score['rouge1'].fmeasure)
713
+ rouge2_scores.append(score['rouge2'].fmeasure)
714
+ rougeL_scores.append(score['rougeL'].fmeasure)
715
+ except Exception as e:
716
+ logger.debug("Individual ROUGE error: %s", e)
717
+
718
+ rouge1_f = np.mean(rouge1_scores) if rouge1_scores else float('nan')
719
+ rouge2_f = np.mean(rouge2_scores) if rouge2_scores else float('nan')
720
+ rougeL_f = np.mean(rougeL_scores) if rougeL_scores else float('nan')
721
+ logger.debug("Calculated %s ROUGE scores", len(rouge1_scores))
722
+ except Exception as e:
723
+ logger.warning("ROUGE calculation error: %s", e)
724
+ else:
725
+ logger.info("Skipping ROUGE calculation because rouge-score is not installed.")
726
+
727
+ # 4. Cosine Similarity with Sentence Embeddings
728
+ if "sentence-transformers" not in missing_dependencies and torch is not None:
729
+ try:
730
+ model = _load_sentence_transformer_model()
731
+
732
+ true_embeddings = model.encode(valid_refs, convert_to_tensor=True)
733
+ pred_embeddings = model.encode(valid_cands, convert_to_tensor=True)
734
+
735
+ for i in range(len(valid_refs)):
736
+ cos_sim = torch.nn.functional.cosine_similarity(
737
+ true_embeddings[i].unsqueeze(0),
738
+ pred_embeddings[i].unsqueeze(0)
739
+ ).item()
740
+ cosine_scores.append(cos_sim)
741
+ logger.debug("Calculated %s cosine similarity scores", len(cosine_scores))
742
+ except Exception as e:
743
+ logger.warning("Embedding error: %s", e)
744
+ else:
745
+ logger.info("Skipping cosine similarity because sentence-transformers and/or torch are not installed.")
746
+
747
+ # 5. BERTScore calculation
748
+ bertscore_p = bertscore_r = bertscore_f1 = float('nan')
749
+ if "bert-score" not in missing_dependencies and torch is not None:
750
+ try:
751
+ bert_score = _load_bert_score_module()
752
+
753
+ # Calculate BERTScore directly
754
+ with _quiet_optional_nlp_output():
755
+ P, R, F1 = bert_score.score(
756
+ valid_cands,
757
+ valid_refs,
758
+ lang="en",
759
+ model_type="roberta-large",
760
+ rescale_with_baseline=True,
761
+ verbose=False
762
+ )
763
+
764
+ # Convert tensor outputs to Python floats
765
+ bertscore_p = torch.mean(P).item() if len(P) > 0 else float('nan')
766
+ bertscore_r = torch.mean(R).item() if len(R) > 0 else float('nan')
767
+ bertscore_f1 = torch.mean(F1).item() if len(F1) > 0 else float('nan')
768
+ logger.debug("Calculated BERTScore metrics")
769
+ except Exception as e:
770
+ logger.warning("BERTScore calculation error: %s", e)
771
+ else:
772
+ logger.info("Skipping BERTScore because bert-score and/or torch are not installed.")
773
+
774
+ # Gather all results
775
+ results = {
776
+ 'norm_levenshtein': np.mean(norm_levenshtein_scores) if norm_levenshtein_scores else float('nan'),
777
+ 'bleu': bleu_score,
778
+ 'rouge1_f': rouge1_f,
779
+ 'rouge2_f': rouge2_f,
780
+ 'rougeL_f': rougeL_f,
781
+ 'cosine_similarity': np.mean(cosine_scores) if cosine_scores else float('nan'),
782
+ 'bertscore_precision': bertscore_p,
783
+ 'bertscore_recall': bertscore_r,
784
+ 'bertscore_f1': bertscore_f1,
785
+ '_missing_dependencies': missing_dependencies
786
+ }
787
+
788
+ logger.info("Textbox metrics calculated successfully")
789
+ for metric, value in results.items():
790
+ if metric != '_missing_dependencies':
791
+ logger.debug(" %s: %s", metric, value)
792
+ if missing_dependencies:
793
+ logger.warning(
794
+ "Textbox metrics missing dependencies: %s. %s",
795
+ ", ".join(sorted(set(missing_dependencies))),
796
+ TEXTBOX_DEPENDENCY_GUIDANCE,
797
+ )
798
+
799
+ return results
800
+
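+ # Worked example for the normalized Levenshtein similarity used above (hypothetical strings):
+ # comparing "clear rationale" against "clear rational" gives an edit distance of 1 and a
+ # longest length of 15, so the per-pair similarity is 1 - 1/15 ≈ 0.933; the per-column score
+ # is the mean over pairs where at least one side is non-empty.
+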
801
+ def append_metrics_to_csv(output_csv, label, model_id, quantization_type, temperature, top_p, codebook_path,
802
+ columns_to_compare, accuracy_scores, precision_scores, recall_scores,
803
+ f1_scores, cohen_kappa_scores, krippendorff_alpha_scores,
804
+ percentage_agreement_scores, spearman_corr_scores, quadratic_kappa_scores,
805
+ norm_levenshtein_scores, bleu_scores,
806
+ rouge1_f_scores, rouge2_f_scores, rougeL_f_scores,
807
+ cosine_scores,
808
+ bertscore_p_scores, bertscore_r_scores, bertscore_f1_scores,
809
+ column_info,
810
+ prompt_type=None, use_examples=None, process_textbox=None,
811
+ emissions=None, energy_consumed=None, cpu_model=None, gpu_model=None,
812
+ total_inference_time=None, avg_inference_time=None,
813
+ input_chars=None, output_chars=None,
814
+ timestamp=None, experiment_directory=None):
815
+ """Append one experiment's metrics to the aggregate CSV log.
816
+
817
+ Args:
818
+ output_csv: Path to the aggregate metrics CSV to create or update.
819
+ label: Experiment label, usually the task name.
820
+ model_id: Stable identifier for the model and prompt configuration.
821
+ quantization_type: Optional quantization metadata string.
822
+ temperature: Optional temperature metadata value.
823
+ top_p: Optional top-p metadata value.
824
+ codebook_path: Path to the codebook used for the experiment.
825
+ columns_to_compare: Annotation columns included in the evaluation.
826
+ accuracy_scores: Per-column accuracy values.
827
+ precision_scores: Per-column precision values.
828
+ recall_scores: Per-column recall values.
829
+ f1_scores: Per-column F1 values.
830
+ cohen_kappa_scores: Per-column Cohen's kappa values.
831
+ krippendorff_alpha_scores: Per-column Krippendorff's alpha values.
832
+ percentage_agreement_scores: Per-column exact agreement values.
833
+ spearman_corr_scores: Per-column Spearman correlations for Likert fields.
834
+ quadratic_kappa_scores: Per-column quadratic weighted kappa values.
835
+ norm_levenshtein_scores: Per-column normalized Levenshtein scores.
836
+ bleu_scores: Per-column BLEU scores.
837
+ rouge1_f_scores: Per-column ROUGE-1 F values.
838
+ rouge2_f_scores: Per-column ROUGE-2 F values.
839
+ rougeL_f_scores: Per-column ROUGE-L F values.
840
+ cosine_scores: Per-column cosine similarity scores.
841
+ bertscore_p_scores: Per-column BERTScore precision values.
842
+ bertscore_r_scores: Per-column BERTScore recall values.
843
+ bertscore_f1_scores: Per-column BERTScore F1 values.
844
+ column_info: Annotation metadata mapping from the codebook.
845
+ prompt_type: Optional prompt wrapper name stored as metadata.
846
+ use_examples: Optional boolean-like metadata flag.
847
+ process_textbox: Optional boolean-like metadata flag.
848
+ emissions: Optional emissions estimate in kilograms of CO2 equivalent.
849
+ energy_consumed: Optional energy consumption in kilowatt-hours.
850
+ cpu_model: Optional CPU model string.
851
+ gpu_model: Optional GPU model string.
852
+ total_inference_time: Optional total inference time in seconds.
853
+ avg_inference_time: Optional average inference time in seconds.
854
+ input_chars: Optional total prompt character count.
855
+ output_chars: Optional total response character count.
856
+ timestamp: Optional timestamp string.
857
+ experiment_directory: Optional path to the per-run output directory.
858
+ """
859
+ if timestamp is None:
860
+ timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
861
+
862
+ file_exists = os.path.isfile(output_csv)
863
+
864
+ # Base header with run metadata, including timing and character-count columns
865
+ base_header = ['Timestamp', 'Label', 'Model ID', 'Quantization Type', 'Temperature', 'Top_P',
866
+ 'Prompt Type', 'Use Examples', 'Process Textbox', 'Codebook Path', 'Experiment Directory',
867
+ 'CPU Model', 'GPU Model', 'Emissions (kg CO₂eq)', 'Energy Consumed (kWh)',
868
+ 'Total Inference Time (s)', 'Avg Inference Time (s)',
869
+ 'Total Input Chars', 'Total Output Chars']
870
+
871
+ # Dynamically create header with only relevant metrics for each column
872
+ metric_header = []
873
+ for col in columns_to_compare:
874
+ annotation_type = column_info.get(col, {}).get('type', 'dropdown')
875
+
876
+ # Common metric for all types
877
+ metric_header.append(f'{col}_percentage_agreement')
878
+
879
+ # Classification metrics shared by dropdown, checkbox, and likert fields
880
+ if annotation_type in ['dropdown', 'checkbox', 'likert']:
881
+ metric_header.extend([
882
+ f'{col}_accuracy',
883
+ f'{col}_precision',
884
+ f'{col}_recall',
885
+ f'{col}_f1',
886
+ f'{col}_cohen_kappa',
887
+ f'{col}_krippendorff_alpha'
888
+ ])
889
+
890
+ # Metrics for likert fields
891
+ if annotation_type == 'likert':
892
+ metric_header.extend([
893
+ f'{col}_spearman_corr',
894
+ f'{col}_quadratic_kappa'
895
+ ])
896
+
897
+ # Metrics for textbox fields
898
+ if annotation_type == 'textbox':
899
+ metric_header.extend([
900
+ f'{col}_norm_levenshtein',
901
+ f'{col}_bleu',
902
+ f'{col}_rouge1_f',
903
+ f'{col}_rouge2_f',
904
+ f'{col}_rougeL_f',
905
+ f'{col}_cosine_similarity',
906
+ f'{col}_bertscore_precision',
907
+ f'{col}_bertscore_recall',
908
+ f'{col}_bertscore_f1'
909
+ ])
910
+
911
+ # Combine base and metric headers
912
+ new_header = base_header + metric_header
913
+
914
+ if file_exists:
915
+ try:
916
+ df = pd.read_csv(output_csv) # Use pandas for easier data manipulation
917
+ except pd.errors.EmptyDataError: # Handle empty file
918
+ df = pd.DataFrame(columns=[])
919
+
920
+ if set(new_header) != set(df.columns): # Compare header sets
921
+ # If new columns are added, add them to the existing dataframe
922
+ missing_cols = set(new_header) - set(df.columns)
923
+ for col in missing_cols:
924
+ df[col] = pd.NA
925
+
926
+ # Make sure every current header column exists; legacy columns are kept for
927
+ # backward compatibility, and the reorder below puts the current header first
928
+ for col in new_header:
929
+ if col not in df.columns:
930
+ df[col] = pd.NA
931
+
932
+ # Reorder columns to match new_header plus any existing columns
933
+ all_cols = new_header + [c for c in df.columns if c not in new_header]
934
+ df = df[all_cols]
935
+
936
+ df.to_csv(output_csv, index=False) # Write to CSV, overwriting
937
+
938
+ # Create new row with base metadata
939
+ new_row = {
940
+ 'Timestamp': timestamp,
941
+ 'Label': label,
942
+ 'Model ID': model_id,
943
+ 'Quantization Type': quantization_type,
944
+ 'Temperature': temperature,
945
+ 'Top_P': top_p,
946
+ 'Prompt Type': prompt_type,
947
+ 'Use Examples': use_examples,
948
+ 'Process Textbox': process_textbox,
949
+ 'Codebook Path': codebook_path,
950
+ 'Experiment Directory': experiment_directory,
951
+ 'CPU Model': cpu_model,
952
+ 'GPU Model': gpu_model,
953
+ 'Emissions (kg CO₂eq)': emissions,
954
+ 'Energy Consumed (kWh)': energy_consumed,
955
+ 'Total Inference Time (s)': total_inference_time,
956
+ 'Avg Inference Time (s)': avg_inference_time,
957
+ 'Total Input Chars': input_chars,
958
+ 'Total Output Chars': output_chars,
959
+ }
960
+
961
+ # Add metrics based on column type
962
+ for col in columns_to_compare:
963
+ annotation_type = column_info.get(col, {}).get('type', 'dropdown')
964
+
965
+ # Common metric for all types
966
+ new_row[f'{col}_percentage_agreement'] = percentage_agreement_scores[col]
967
+
968
+ # Classification metrics shared by dropdown, checkbox, and likert fields
969
+ if annotation_type in ['dropdown', 'checkbox', 'likert']:
970
+ new_row[f'{col}_accuracy'] = accuracy_scores[col]
971
+ new_row[f'{col}_precision'] = precision_scores[col]
972
+ new_row[f'{col}_recall'] = recall_scores[col]
973
+ new_row[f'{col}_f1'] = f1_scores[col]
974
+ new_row[f'{col}_cohen_kappa'] = cohen_kappa_scores[col]
975
+ new_row[f'{col}_krippendorff_alpha'] = krippendorff_alpha_scores[col]
976
+
977
+ # Metrics for likert fields
978
+ if annotation_type == 'likert':
979
+ new_row[f'{col}_spearman_corr'] = spearman_corr_scores[col]
980
+ new_row[f'{col}_quadratic_kappa'] = quadratic_kappa_scores[col]
981
+
982
+ # Metrics for textbox fields
983
+ if annotation_type == 'textbox':
984
+ new_row[f'{col}_norm_levenshtein'] = norm_levenshtein_scores[col]
985
+ new_row[f'{col}_bleu'] = bleu_scores[col]
986
+ new_row[f'{col}_rouge1_f'] = rouge1_f_scores[col]
987
+ new_row[f'{col}_rouge2_f'] = rouge2_f_scores[col]
988
+ new_row[f'{col}_rougeL_f'] = rougeL_f_scores[col]
989
+ new_row[f'{col}_cosine_similarity'] = cosine_scores[col]
990
+ new_row[f'{col}_bertscore_precision'] = bertscore_p_scores[col]
991
+ new_row[f'{col}_bertscore_recall'] = bertscore_r_scores[col]
992
+ new_row[f'{col}_bertscore_f1'] = bertscore_f1_scores[col]
993
+
994
+ # Add any missing columns with NaN values
995
+ for col in df.columns:
996
+ if col not in new_row:
997
+ new_row[col] = pd.NA
998
+
999
+ df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True) # Append using pandas
1000
+ df.to_csv(output_csv, index=False) # Write to CSV
1001
+ else: # File doesn't exist, create it
1002
+ # Create new row with base metadata
1003
+ new_row = {
1004
+ 'Timestamp': timestamp,
1005
+ 'Label': label,
1006
+ 'Model ID': model_id,
1007
+ 'Quantization Type': quantization_type,
1008
+ 'Temperature': temperature,
1009
+ 'Top_P': top_p,
1010
+ 'Prompt Type': prompt_type,
1011
+ 'Use Examples': use_examples,
1012
+ 'Process Textbox': process_textbox,
1013
+ 'Codebook Path': codebook_path,
1014
+ 'Experiment Directory': experiment_directory,
1015
+ 'CPU Model': cpu_model,
1016
+ 'GPU Model': gpu_model,
1017
+ 'Emissions (kg CO₂eq)': emissions,
1018
+ 'Energy Consumed (kWh)': energy_consumed,
1019
+ 'Total Inference Time (s)': total_inference_time,
1020
+ 'Avg Inference Time (s)': avg_inference_time,
1021
+ 'Total Input Chars': input_chars,
1022
+ 'Total Output Chars': output_chars,
1023
+ }
1024
+
1025
+ # Add metrics based on column type
1026
+ for col in columns_to_compare:
1027
+ annotation_type = column_info.get(col, {}).get('type', 'dropdown')
1028
+
1029
+ # Common metric for all types
1030
+ new_row[f'{col}_percentage_agreement'] = percentage_agreement_scores[col]
1031
+
1032
+ # Classification metrics shared by dropdown, checkbox, and likert fields
1033
+ if annotation_type in ['dropdown', 'checkbox', 'likert']:
1034
+ new_row[f'{col}_accuracy'] = accuracy_scores[col]
1035
+ new_row[f'{col}_precision'] = precision_scores[col]
1036
+ new_row[f'{col}_recall'] = recall_scores[col]
1037
+ new_row[f'{col}_f1'] = f1_scores[col]
1038
+ new_row[f'{col}_cohen_kappa'] = cohen_kappa_scores[col]
1039
+ new_row[f'{col}_krippendorff_alpha'] = krippendorff_alpha_scores[col]
1040
+
1041
+ # Metrics for likert fields
1042
+ if annotation_type == 'likert':
1043
+ new_row[f'{col}_spearman_corr'] = spearman_corr_scores[col]
1044
+ new_row[f'{col}_quadratic_kappa'] = quadratic_kappa_scores[col]
1045
+
1046
+ # Metrics for textbox fields
1047
+ if annotation_type == 'textbox':
1048
+ new_row[f'{col}_norm_levenshtein'] = norm_levenshtein_scores[col]
1049
+ new_row[f'{col}_bleu'] = bleu_scores[col]
1050
+ new_row[f'{col}_rouge1_f'] = rouge1_f_scores[col]
1051
+ new_row[f'{col}_rouge2_f'] = rouge2_f_scores[col]
1052
+ new_row[f'{col}_rougeL_f'] = rougeL_f_scores[col]
1053
+ new_row[f'{col}_cosine_similarity'] = cosine_scores[col]
1054
+ new_row[f'{col}_bertscore_precision'] = bertscore_p_scores[col]
1055
+ new_row[f'{col}_bertscore_recall'] = bertscore_r_scores[col]
1056
+ new_row[f'{col}_bertscore_f1'] = bertscore_f1_scores[col]
1057
+
1058
+ df = pd.DataFrame([new_row], columns=new_header)
1059
+ df.to_csv(output_csv, index=False)
1060
+
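+ # The aggregate CSV therefore holds one row per run: the fixed metadata columns listed in
+ # base_header, followed by per-column metric columns derived from each annotation column,
+ # e.g. for a hypothetical dropdown field "content_stance":
+ # "content_stance_percentage_agreement", "content_stance_accuracy", ...,
+ # "content_stance_krippendorff_alpha".
+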
1061
+ def write_classification_reports(output_report_file, columns_to_compare, reports):
1062
+ """Write human-readable per-column classification reports to a text file.
1063
+
1064
+ Args:
1065
+ output_report_file: Destination text file path.
1066
+ columns_to_compare: Ordered list of annotation columns to include.
1067
+ reports: Mapping of annotation column name to report text.
1068
+ """
1069
+ with open(output_report_file, mode='w') as file:
1070
+ for column in columns_to_compare:
1071
+ file.write(f"Classification Report for '{column}':\n")
1072
+ file.write(reports[column])
1073
+ file.write("\n" + ("-" * 50) + "\n")
1074
+
1075
+
1076
+ def _build_metrics_by_column(
1077
+ columns_to_compare,
1078
+ column_info,
1079
+ accuracy_scores,
1080
+ precision_scores,
1081
+ recall_scores,
1082
+ f1_scores,
1083
+ cohen_kappa_scores,
1084
+ krippendorff_alpha_scores,
1085
+ percentage_agreement_scores,
1086
+ spearman_corr_scores,
1087
+ quadratic_kappa_scores,
1088
+ norm_levenshtein_scores,
1089
+ bleu_scores,
1090
+ rouge1_f_scores,
1091
+ rouge2_f_scores,
1092
+ rougeL_f_scores,
1093
+ cosine_scores,
1094
+ bertscore_p_scores,
1095
+ bertscore_r_scores,
1096
+ bertscore_f1_scores,
1097
+ ):
1098
+ """Assemble a compact nested metrics dictionary keyed by column name.
1099
+
1100
+ Args:
1101
+ columns_to_compare: Ordered list of annotation columns to summarize.
1102
+ column_info: Annotation metadata mapping from the codebook.
1103
+ accuracy_scores: Per-column accuracy values.
1104
+ precision_scores: Per-column precision values.
1105
+ recall_scores: Per-column recall values.
1106
+ f1_scores: Per-column F1 values.
1107
+ cohen_kappa_scores: Per-column Cohen's kappa values.
1108
+ krippendorff_alpha_scores: Per-column Krippendorff's alpha values.
1109
+ percentage_agreement_scores: Per-column exact agreement values.
1110
+ spearman_corr_scores: Per-column Spearman correlations.
1111
+ quadratic_kappa_scores: Per-column quadratic weighted kappa values.
1112
+ norm_levenshtein_scores: Per-column normalized Levenshtein scores.
1113
+ bleu_scores: Per-column BLEU scores.
1114
+ rouge1_f_scores: Per-column ROUGE-1 F values.
1115
+ rouge2_f_scores: Per-column ROUGE-2 F values.
1116
+ rougeL_f_scores: Per-column ROUGE-L F values.
1117
+ cosine_scores: Per-column cosine similarity values.
1118
+ bertscore_p_scores: Per-column BERTScore precision values.
1119
+ bertscore_r_scores: Per-column BERTScore recall values.
1120
+ bertscore_f1_scores: Per-column BERTScore F1 values.
1121
+
1122
+ Returns:
1123
+ Nested metrics dictionary keyed by annotation column name.
1124
+ """
1125
+ metrics_by_column = {}
1126
+ for column in columns_to_compare:
1127
+ annotation_type = column_info.get(column, {}).get('type', 'dropdown')
1128
+ column_metrics = {
1129
+ "annotation_type": annotation_type,
1130
+ "percentage_agreement": percentage_agreement_scores[column],
1131
+ }
1132
+
1133
+ if annotation_type in ['dropdown', 'checkbox', 'likert']:
1134
+ column_metrics.update({
1135
+ "accuracy": accuracy_scores[column],
1136
+ "precision": precision_scores[column],
1137
+ "recall": recall_scores[column],
1138
+ "f1": f1_scores[column],
1139
+ "cohen_kappa": cohen_kappa_scores[column],
1140
+ "krippendorff_alpha": krippendorff_alpha_scores[column],
1141
+ })
1142
+
1143
+ if annotation_type == 'likert':
1144
+ column_metrics.update({
1145
+ "spearman_corr": spearman_corr_scores[column],
1146
+ "quadratic_kappa": quadratic_kappa_scores[column],
1147
+ })
1148
+
1149
+ if annotation_type == 'textbox':
1150
+ column_metrics.update({
1151
+ "norm_levenshtein": norm_levenshtein_scores[column],
1152
+ "bleu": bleu_scores[column],
1153
+ "rouge1_f": rouge1_f_scores[column],
1154
+ "rouge2_f": rouge2_f_scores[column],
1155
+ "rougeL_f": rougeL_f_scores[column],
1156
+ "cosine_similarity": cosine_scores[column],
1157
+ "bertscore_precision": bertscore_p_scores[column],
1158
+ "bertscore_recall": bertscore_r_scores[column],
1159
+ "bertscore_f1": bertscore_f1_scores[column],
1160
+ })
1161
+
1162
+ metrics_by_column[column] = column_metrics
1163
+
1164
+ return metrics_by_column
1165
+
1166
+
1167
+ def _format_metric_value(value) -> str:
1168
+ """Format a scalar metric for compact human-readable summaries."""
1169
+ if value is None:
1170
+ return "n/a"
1171
+ try:
1172
+ if pd.isna(value):
1173
+ return "n/a"
1174
+ except TypeError:
1175
+ pass
1176
+
1177
+ if isinstance(value, (int, float, np.floating)):
1178
+ return f"{float(value):.3f}"
1179
+ return str(value)
1180
+
1181
+
1182
+ def _format_count_value(value) -> str:
1183
+ """Format integer-like counters for human-readable summaries."""
1184
+ if value is None:
1185
+ return "n/a"
1186
+ try:
1187
+ if pd.isna(value):
1188
+ return "n/a"
1189
+ except TypeError:
1190
+ pass
1191
+ return f"{int(value):,}"
1192
+
1193
+
1194
+ def format_metrics_summary(
1195
+ metrics_by_column: dict[str, dict],
1196
+ *,
1197
+ total_inference_time=None,
1198
+ avg_inference_time=None,
1199
+ input_chars=None,
1200
+ output_chars=None,
1201
+ energy_consumed=None,
1202
+ emissions=None,
1203
+ cpu_model=None,
1204
+ gpu_model=None,
1205
+ ) -> str:
1206
+ """Render a compact plain-text summary of performance and efficiency metrics."""
1207
+ if not metrics_by_column:
1208
+ return "No metrics were produced."
1209
+
1210
+ lines = ["Run Summary", "", "Performance"]
1211
+ for column, metrics in metrics_by_column.items():
1212
+ annotation_type = metrics.get("annotation_type", "unknown")
1213
+ if annotation_type == "textbox":
1214
+ metric_order = [
1215
+ ("agreement", "percentage_agreement"),
1216
+ ("levenshtein", "norm_levenshtein"),
1217
+ ("bleu", "bleu"),
1218
+ ("rougeL", "rougeL_f"),
1219
+ ("cosine", "cosine_similarity"),
1220
+ ("bertscore_f1", "bertscore_f1"),
1221
+ ]
1222
+ elif annotation_type == "likert":
1223
+ metric_order = [
1224
+ ("agreement", "percentage_agreement"),
1225
+ ("accuracy", "accuracy"),
1226
+ ("f1", "f1"),
1227
+ ("spearman", "spearman_corr"),
1228
+ ("quadratic_kappa", "quadratic_kappa"),
1229
+ ]
1230
+ else:
1231
+ metric_order = [
1232
+ ("agreement", "percentage_agreement"),
1233
+ ("accuracy", "accuracy"),
1234
+ ("f1", "f1"),
1235
+ ("kappa", "cohen_kappa"),
1236
+ ("alpha", "krippendorff_alpha"),
1237
+ ]
1238
+
1239
+ rendered = ", ".join(
1240
+ f"{label}={_format_metric_value(metrics.get(key))}"
1241
+ for label, key in metric_order
1242
+ if key in metrics
1243
+ )
1244
+ lines.append(f"- {column} [{annotation_type}]: {rendered}")
1245
+
1246
+ lines.extend([
1247
+ "",
1248
+ "Efficiency",
1249
+ f"- Total inference time (s): {_format_metric_value(total_inference_time)}",
1250
+ f"- Average inference time per annotation (s): {_format_metric_value(avg_inference_time)}",
1251
+ f"- Prompt characters: {_format_count_value(input_chars)}",
1252
+ f"- Output characters: {_format_count_value(output_chars)}",
1253
+ f"- Energy consumed (kWh): {_format_metric_value(energy_consumed)}",
1254
+ f"- Emissions (kg CO2eq): {_format_metric_value(emissions)}",
1255
+ ])
1256
+
1257
+ if cpu_model:
1258
+ lines.append(f"- CPU: {cpu_model}")
1259
+ if gpu_model and str(gpu_model).lower() != "none":
1260
+ lines.append(f"- GPU: {gpu_model}")
1261
+
1262
+ return "\n".join(lines)
1263
+
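+ # format_metrics_summary produces plain text along these lines (column name and numbers
+ # are made up):
+ #
+ #     Run Summary
+ #
+ #     Performance
+ #     - content_stance [dropdown]: agreement=0.900, accuracy=0.900, f1=0.872, kappa=0.810, alpha=0.805
+ #
+ #     Efficiency
+ #     - Total inference time (s): 812.400
+ #     - Average inference time per annotation (s): 8.124
+ #     ...
+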
1264
+ def run_metrics(
1265
+ *,
1266
+ ground_truth_csv,
1267
+ llm_output_csv,
1268
+ label,
1269
+ output_csv,
1270
+ model_id,
1271
+ codebook_path,
1272
+ report_file,
1273
+ columns=None,
1274
+ quantization_type=None,
1275
+ temperature=None,
1276
+ top_p=None,
1277
+ prompt_type=None,
1278
+ use_examples=None,
1279
+ process_textbox=False,
1280
+ emissions_file=None,
1281
+ experiment_directory=None,
1282
+ timestamp=None,
1283
+ timing_file=None,
1284
+ char_counts_file=None,
1285
+ ):
1286
+ """Evaluate one model-output CSV against ground truth and persist the results.
1287
+
1288
+ Args:
1289
+ ground_truth_csv: Path to the human-labeled reference CSV.
1290
+ llm_output_csv: Path to the model-generated annotation CSV.
1291
+ label: Experiment label written to the metrics CSV, usually the task name.
1292
+ output_csv: Path to the aggregate metrics CSV to create or update.
1293
+ model_id: Stable identifier for the model and prompt configuration.
1294
+ codebook_path: Path to the codebook used for the run.
1295
+ report_file: Path to the per-column report text file.
1296
+ columns: Optional subset of annotation column names to evaluate.
1297
+ quantization_type: Optional quantization metadata string.
1298
+ temperature: Optional temperature metadata value.
1299
+ top_p: Optional top-p metadata value.
1300
+ prompt_type: Optional prompt wrapper name stored as metadata.
1301
+ use_examples: Optional boolean-like metadata flag.
1302
+ process_textbox: Whether textbox metrics should be computed.
1303
+ emissions_file: Optional path to ``emissions.csv``.
1304
+ experiment_directory: Optional path to the per-run output directory.
1305
+ timestamp: Optional timestamp string.
1306
+ timing_file: Optional path to ``timing_data.json``.
1307
+ char_counts_file: Optional path to ``char_counts.json``.
1308
+
1309
+ Returns:
1310
+ :class:`codebook_lab.types.MetricsRunResult` for the completed evaluation.
1311
+ """
1312
+ process_textbox = str(process_textbox).lower() in ('true', 'yes', '1', 't', 'y')
1313
+
1314
+ output_csv = Path(output_csv)
1315
+ report_file = Path(report_file)
1316
+ output_csv.parent.mkdir(parents=True, exist_ok=True)
1317
+ report_file.parent.mkdir(parents=True, exist_ok=True)
1318
+
1319
+ emissions = energy_consumed = cpu_model = gpu_model = None
1320
+ if emissions_file:
1321
+ emissions, energy_consumed, cpu_model, gpu_model = read_emissions_data(emissions_file)
1322
+
1323
+ total_inference_time = avg_inference_time = None
1324
+ if timing_file:
1325
+ total_inference_time, avg_inference_time = read_timing_data(timing_file)
1326
+
1327
+ input_chars = output_chars = None
1328
+ if char_counts_file:
1329
+ input_chars, output_chars = read_char_counts(char_counts_file)
1330
+
1331
+ column_info = extract_column_info_from_codebook(codebook_path)
1332
+ columns_to_compare = list(columns) if columns else list(column_info.keys())
1333
+ if not columns_to_compare:
1334
+ raise ValueError("No columns could be extracted from the codebook and none were provided.")
1335
+
1336
+ merged_df = load_data(ground_truth_csv, llm_output_csv, columns_to_compare)
1337
+ all_columns = [f'{col}_gt' for col in columns_to_compare] + [f'{col}_llm' for col in columns_to_compare]
1338
+ fill_missing_values(merged_df, all_columns, fill_value='')
1339
+
1340
+ results = evaluate_performance(merged_df, columns_to_compare, column_info, process_textbox)
1341
+ (
1342
+ accuracy_scores, precision_scores, recall_scores, f1_scores,
1343
+ cohen_kappa_scores, krippendorff_alpha_scores, percentage_agreement_scores,
1344
+ spearman_corr_scores, quadratic_kappa_scores,
1345
+ norm_levenshtein_scores, bleu_scores,
1346
+ rouge1_f_scores, rouge2_f_scores, rougeL_f_scores,
1347
+ cosine_scores,
1348
+ bertscore_p_scores, bertscore_r_scores, bertscore_f1_scores,
1349
+ reports
1350
+ ) = results
1351
+
1352
+ append_metrics_to_csv(
1353
+ str(output_csv), label, model_id, quantization_type,
1354
+ temperature, top_p, codebook_path, columns_to_compare,
1355
+ accuracy_scores, precision_scores, recall_scores, f1_scores,
1356
+ cohen_kappa_scores, krippendorff_alpha_scores, percentage_agreement_scores,
1357
+ spearman_corr_scores, quadratic_kappa_scores,
1358
+ norm_levenshtein_scores, bleu_scores,
1359
+ rouge1_f_scores, rouge2_f_scores, rougeL_f_scores,
1360
+ cosine_scores,
1361
+ bertscore_p_scores, bertscore_r_scores, bertscore_f1_scores,
1362
+ column_info,
1363
+ prompt_type=prompt_type, use_examples=use_examples,
1364
+ process_textbox=str(process_textbox).lower(),
1365
+ emissions=emissions, energy_consumed=energy_consumed,
1366
+ cpu_model=cpu_model, gpu_model=gpu_model,
1367
+ total_inference_time=total_inference_time, avg_inference_time=avg_inference_time,
1368
+ input_chars=input_chars, output_chars=output_chars,
1369
+ timestamp=timestamp, experiment_directory=experiment_directory
1370
+ )
1371
+
1372
+ write_classification_reports(str(report_file), columns_to_compare, reports)
1373
+ metrics_by_column = _build_metrics_by_column(
1374
+ columns_to_compare,
1375
+ column_info,
1376
+ accuracy_scores,
1377
+ precision_scores,
1378
+ recall_scores,
1379
+ f1_scores,
1380
+ cohen_kappa_scores,
1381
+ krippendorff_alpha_scores,
1382
+ percentage_agreement_scores,
1383
+ spearman_corr_scores,
1384
+ quadratic_kappa_scores,
1385
+ norm_levenshtein_scores,
1386
+ bleu_scores,
1387
+ rouge1_f_scores,
1388
+ rouge2_f_scores,
1389
+ rougeL_f_scores,
1390
+ cosine_scores,
1391
+ bertscore_p_scores,
1392
+ bertscore_r_scores,
1393
+ bertscore_f1_scores,
1394
+ )
1395
+
1396
+ logger.info("Results successfully written to %s and %s", output_csv, report_file)
1397
+ return MetricsRunResult(
1398
+ output_csv=output_csv,
1399
+ report_file=report_file,
1400
+ columns_to_compare=columns_to_compare,
1401
+ metrics_by_column=metrics_by_column,
1402
+ reports=reports,
1403
+ total_inference_time=total_inference_time,
1404
+ avg_inference_time=avg_inference_time,
1405
+ input_chars=input_chars,
1406
+ output_chars=output_chars,
1407
+ energy_consumed=energy_consumed,
1408
+ emissions=emissions,
1409
+ cpu_model=cpu_model,
1410
+ gpu_model=gpu_model,
1411
+ summary_text=format_metrics_summary(
1412
+ metrics_by_column,
1413
+ total_inference_time=total_inference_time,
1414
+ avg_inference_time=avg_inference_time,
1415
+ input_chars=input_chars,
1416
+ output_chars=output_chars,
1417
+ energy_consumed=energy_consumed,
1418
+ emissions=emissions,
1419
+ cpu_model=cpu_model,
1420
+ gpu_model=gpu_model,
1421
+ ),
1422
+ )
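+
+
+ # Example invocation (a sketch only; every path and identifier below is hypothetical):
+ #
+ #     result = run_metrics(
+ #         ground_truth_csv="data/gold.csv",
+ #         llm_output_csv="runs/llama/annotations.csv",
+ #         label="stance-task",
+ #         output_csv="results/metrics.csv",
+ #         model_id="llama-3-8b-instruct",
+ #         codebook_path="data/codebook.json",
+ #         report_file="results/reports.txt",
+ #         process_textbox=True,
+ #     )
+ #     print(result.summary_text)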