bead-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bead/__init__.py +11 -0
- bead/__main__.py +11 -0
- bead/active_learning/__init__.py +15 -0
- bead/active_learning/config.py +231 -0
- bead/active_learning/loop.py +566 -0
- bead/active_learning/models/__init__.py +24 -0
- bead/active_learning/models/base.py +852 -0
- bead/active_learning/models/binary.py +910 -0
- bead/active_learning/models/categorical.py +943 -0
- bead/active_learning/models/cloze.py +862 -0
- bead/active_learning/models/forced_choice.py +956 -0
- bead/active_learning/models/free_text.py +773 -0
- bead/active_learning/models/lora.py +365 -0
- bead/active_learning/models/magnitude.py +835 -0
- bead/active_learning/models/multi_select.py +795 -0
- bead/active_learning/models/ordinal_scale.py +811 -0
- bead/active_learning/models/peft_adapter.py +155 -0
- bead/active_learning/models/random_effects.py +639 -0
- bead/active_learning/selection.py +354 -0
- bead/active_learning/strategies.py +391 -0
- bead/active_learning/trainers/__init__.py +26 -0
- bead/active_learning/trainers/base.py +210 -0
- bead/active_learning/trainers/data_collator.py +172 -0
- bead/active_learning/trainers/dataset_utils.py +261 -0
- bead/active_learning/trainers/huggingface.py +304 -0
- bead/active_learning/trainers/lightning.py +324 -0
- bead/active_learning/trainers/metrics.py +424 -0
- bead/active_learning/trainers/mixed_effects.py +551 -0
- bead/active_learning/trainers/model_wrapper.py +509 -0
- bead/active_learning/trainers/registry.py +104 -0
- bead/adapters/__init__.py +11 -0
- bead/adapters/huggingface.py +61 -0
- bead/behavioral/__init__.py +116 -0
- bead/behavioral/analytics.py +646 -0
- bead/behavioral/extraction.py +343 -0
- bead/behavioral/merging.py +343 -0
- bead/cli/__init__.py +11 -0
- bead/cli/active_learning.py +513 -0
- bead/cli/active_learning_commands.py +779 -0
- bead/cli/completion.py +359 -0
- bead/cli/config.py +624 -0
- bead/cli/constraint_builders.py +286 -0
- bead/cli/deployment.py +859 -0
- bead/cli/deployment_trials.py +493 -0
- bead/cli/deployment_ui.py +332 -0
- bead/cli/display.py +378 -0
- bead/cli/items.py +960 -0
- bead/cli/items_factories.py +776 -0
- bead/cli/list_constraints.py +714 -0
- bead/cli/lists.py +490 -0
- bead/cli/main.py +430 -0
- bead/cli/models.py +877 -0
- bead/cli/resource_loaders.py +621 -0
- bead/cli/resources.py +1036 -0
- bead/cli/shell.py +356 -0
- bead/cli/simulate.py +840 -0
- bead/cli/templates.py +1158 -0
- bead/cli/training.py +1080 -0
- bead/cli/utils.py +614 -0
- bead/cli/workflow.py +1273 -0
- bead/config/__init__.py +68 -0
- bead/config/active_learning.py +1009 -0
- bead/config/config.py +192 -0
- bead/config/defaults.py +118 -0
- bead/config/deployment.py +217 -0
- bead/config/env.py +147 -0
- bead/config/item.py +45 -0
- bead/config/list.py +193 -0
- bead/config/loader.py +149 -0
- bead/config/logging.py +42 -0
- bead/config/model.py +49 -0
- bead/config/paths.py +46 -0
- bead/config/profiles.py +320 -0
- bead/config/resources.py +47 -0
- bead/config/serialization.py +210 -0
- bead/config/simulation.py +206 -0
- bead/config/template.py +238 -0
- bead/config/validation.py +267 -0
- bead/data/__init__.py +65 -0
- bead/data/base.py +87 -0
- bead/data/identifiers.py +97 -0
- bead/data/language_codes.py +61 -0
- bead/data/metadata.py +270 -0
- bead/data/range.py +123 -0
- bead/data/repository.py +358 -0
- bead/data/serialization.py +249 -0
- bead/data/timestamps.py +89 -0
- bead/data/validation.py +349 -0
- bead/data_collection/__init__.py +11 -0
- bead/data_collection/jatos.py +223 -0
- bead/data_collection/merger.py +154 -0
- bead/data_collection/prolific.py +198 -0
- bead/deployment/__init__.py +5 -0
- bead/deployment/distribution.py +402 -0
- bead/deployment/jatos/__init__.py +1 -0
- bead/deployment/jatos/api.py +200 -0
- bead/deployment/jatos/exporter.py +210 -0
- bead/deployment/jspsych/__init__.py +9 -0
- bead/deployment/jspsych/biome.json +44 -0
- bead/deployment/jspsych/config.py +411 -0
- bead/deployment/jspsych/generator.py +598 -0
- bead/deployment/jspsych/package.json +51 -0
- bead/deployment/jspsych/pnpm-lock.yaml +2141 -0
- bead/deployment/jspsych/randomizer.py +299 -0
- bead/deployment/jspsych/src/lib/list-distributor.test.ts +327 -0
- bead/deployment/jspsych/src/lib/list-distributor.ts +1282 -0
- bead/deployment/jspsych/src/lib/randomizer.test.ts +232 -0
- bead/deployment/jspsych/src/lib/randomizer.ts +367 -0
- bead/deployment/jspsych/src/plugins/cloze-dropdown.ts +252 -0
- bead/deployment/jspsych/src/plugins/forced-choice.ts +265 -0
- bead/deployment/jspsych/src/plugins/plugins.test.ts +141 -0
- bead/deployment/jspsych/src/plugins/rating.ts +248 -0
- bead/deployment/jspsych/src/slopit/index.ts +9 -0
- bead/deployment/jspsych/src/types/jatos.d.ts +256 -0
- bead/deployment/jspsych/src/types/jspsych.d.ts +228 -0
- bead/deployment/jspsych/templates/experiment.css +1 -0
- bead/deployment/jspsych/templates/experiment.js.template +289 -0
- bead/deployment/jspsych/templates/index.html +51 -0
- bead/deployment/jspsych/templates/randomizer.js +241 -0
- bead/deployment/jspsych/templates/randomizer.js.template +313 -0
- bead/deployment/jspsych/trials.py +723 -0
- bead/deployment/jspsych/tsconfig.json +23 -0
- bead/deployment/jspsych/tsup.config.ts +30 -0
- bead/deployment/jspsych/ui/__init__.py +1 -0
- bead/deployment/jspsych/ui/components.py +383 -0
- bead/deployment/jspsych/ui/styles.py +411 -0
- bead/dsl/__init__.py +80 -0
- bead/dsl/ast.py +168 -0
- bead/dsl/context.py +178 -0
- bead/dsl/errors.py +71 -0
- bead/dsl/evaluator.py +570 -0
- bead/dsl/grammar.lark +81 -0
- bead/dsl/parser.py +231 -0
- bead/dsl/stdlib.py +929 -0
- bead/evaluation/__init__.py +13 -0
- bead/evaluation/convergence.py +485 -0
- bead/evaluation/interannotator.py +398 -0
- bead/items/__init__.py +40 -0
- bead/items/adapters/__init__.py +70 -0
- bead/items/adapters/anthropic.py +224 -0
- bead/items/adapters/api_utils.py +167 -0
- bead/items/adapters/base.py +216 -0
- bead/items/adapters/google.py +259 -0
- bead/items/adapters/huggingface.py +1074 -0
- bead/items/adapters/openai.py +323 -0
- bead/items/adapters/registry.py +202 -0
- bead/items/adapters/sentence_transformers.py +224 -0
- bead/items/adapters/togetherai.py +309 -0
- bead/items/binary.py +515 -0
- bead/items/cache.py +558 -0
- bead/items/categorical.py +593 -0
- bead/items/cloze.py +757 -0
- bead/items/constructor.py +784 -0
- bead/items/forced_choice.py +413 -0
- bead/items/free_text.py +681 -0
- bead/items/generation.py +432 -0
- bead/items/item.py +396 -0
- bead/items/item_template.py +787 -0
- bead/items/magnitude.py +573 -0
- bead/items/multi_select.py +621 -0
- bead/items/ordinal_scale.py +569 -0
- bead/items/scoring.py +448 -0
- bead/items/validation.py +723 -0
- bead/lists/__init__.py +30 -0
- bead/lists/balancer.py +263 -0
- bead/lists/constraints.py +1067 -0
- bead/lists/experiment_list.py +286 -0
- bead/lists/list_collection.py +378 -0
- bead/lists/partitioner.py +1141 -0
- bead/lists/stratification.py +254 -0
- bead/participants/__init__.py +73 -0
- bead/participants/collection.py +699 -0
- bead/participants/merging.py +312 -0
- bead/participants/metadata_spec.py +491 -0
- bead/participants/models.py +276 -0
- bead/resources/__init__.py +29 -0
- bead/resources/adapters/__init__.py +19 -0
- bead/resources/adapters/base.py +104 -0
- bead/resources/adapters/cache.py +128 -0
- bead/resources/adapters/glazing.py +508 -0
- bead/resources/adapters/registry.py +117 -0
- bead/resources/adapters/unimorph.py +796 -0
- bead/resources/classification.py +856 -0
- bead/resources/constraint_builders.py +329 -0
- bead/resources/constraints.py +165 -0
- bead/resources/lexical_item.py +223 -0
- bead/resources/lexicon.py +744 -0
- bead/resources/loaders.py +209 -0
- bead/resources/template.py +441 -0
- bead/resources/template_collection.py +707 -0
- bead/resources/template_generation.py +349 -0
- bead/simulation/__init__.py +29 -0
- bead/simulation/annotators/__init__.py +15 -0
- bead/simulation/annotators/base.py +175 -0
- bead/simulation/annotators/distance_based.py +135 -0
- bead/simulation/annotators/lm_based.py +114 -0
- bead/simulation/annotators/oracle.py +182 -0
- bead/simulation/annotators/random.py +181 -0
- bead/simulation/dsl_extension/__init__.py +3 -0
- bead/simulation/noise_models/__init__.py +13 -0
- bead/simulation/noise_models/base.py +42 -0
- bead/simulation/noise_models/random_noise.py +82 -0
- bead/simulation/noise_models/systematic.py +132 -0
- bead/simulation/noise_models/temperature.py +86 -0
- bead/simulation/runner.py +144 -0
- bead/simulation/strategies/__init__.py +23 -0
- bead/simulation/strategies/base.py +123 -0
- bead/simulation/strategies/binary.py +103 -0
- bead/simulation/strategies/categorical.py +123 -0
- bead/simulation/strategies/cloze.py +224 -0
- bead/simulation/strategies/forced_choice.py +127 -0
- bead/simulation/strategies/free_text.py +105 -0
- bead/simulation/strategies/magnitude.py +116 -0
- bead/simulation/strategies/multi_select.py +129 -0
- bead/simulation/strategies/ordinal_scale.py +131 -0
- bead/templates/__init__.py +27 -0
- bead/templates/adapters/__init__.py +17 -0
- bead/templates/adapters/base.py +128 -0
- bead/templates/adapters/cache.py +178 -0
- bead/templates/adapters/huggingface.py +312 -0
- bead/templates/combinatorics.py +103 -0
- bead/templates/filler.py +605 -0
- bead/templates/renderers.py +177 -0
- bead/templates/resolver.py +178 -0
- bead/templates/strategies.py +1806 -0
- bead/templates/streaming.py +195 -0
- bead-0.1.0.dist-info/METADATA +212 -0
- bead-0.1.0.dist-info/RECORD +231 -0
- bead-0.1.0.dist-info/WHEEL +4 -0
- bead-0.1.0.dist-info/entry_points.txt +2 -0
- bead-0.1.0.dist-info/licenses/LICENSE +21 -0
bead/evaluation/interannotator.py
ADDED
@@ -0,0 +1,398 @@
"""Inter-annotator agreement metrics.

This module provides inter-annotator agreement metrics for assessing
reliability and consistency across multiple human annotators.
Uses sklearn.metrics for Cohen's kappa, statsmodels for Fleiss' kappa,
and krippendorff package for Krippendorff's alpha.
"""

from __future__ import annotations

from itertools import combinations
from typing import Literal

import numpy as np
from krippendorff import alpha as krippendorff_alpha
from sklearn.metrics import cohen_kappa_score
from statsmodels.stats.inter_rater import fleiss_kappa as statsmodels_fleiss_kappa

# Type alias for krippendorff metric levels
type KrippendorffMetric = Literal["nominal", "ordinal", "interval", "ratio"]

# Type alias for rating values (categorical, ordinal, interval, or ratio)
type Label = int | str | float


class InterAnnotatorMetrics:
    """Inter-annotator agreement metrics for reliability assessment.

    Provides static methods for computing various agreement metrics:
    - Percentage agreement (simple)
    - Cohen's kappa (2 raters, categorical)
    - Fleiss' kappa (multiple raters, categorical)
    - Krippendorff's alpha (general, multiple data types)
    - Pairwise agreement (all pairs of raters)

    Examples
    --------
    >>> # Cohen's kappa for 2 raters
    >>> rater1 = [0, 1, 0, 1, 1]
    >>> rater2 = [0, 1, 1, 1, 1]
    >>> InterAnnotatorMetrics.cohens_kappa(rater1, rater2)
    0.6
    >>> # Percentage agreement
    >>> InterAnnotatorMetrics.percentage_agreement(rater1, rater2)
    0.8
    """

    @staticmethod
    def percentage_agreement(rater1: list[Label], rater2: list[Label]) -> float:
        """Compute simple percentage agreement between two raters.

        Parameters
        ----------
        rater1 : list[Label]
            Ratings from first rater.
        rater2 : list[Label]
            Ratings from second rater.

        Returns
        -------
        float
            Percentage agreement (0.0 to 1.0).

        Raises
        ------
        ValueError
            If rater lists have different lengths.

        Examples
        --------
        >>> rater1 = [1, 2, 3, 1, 2]
        >>> rater2 = [1, 2, 2, 1, 2]
        >>> InterAnnotatorMetrics.percentage_agreement(rater1, rater2)
        0.8
        """
        if len(rater1) != len(rater2):
            raise ValueError(
                f"Rater lists must have same length: {len(rater1)} != {len(rater2)}"
            )

        if not rater1:
            return 1.0

        agreements = sum(r1 == r2 for r1, r2 in zip(rater1, rater2, strict=True))
        return agreements / len(rater1)

    @staticmethod
    def cohens_kappa(rater1: list[Label], rater2: list[Label]) -> float:
        """Compute Cohen's kappa for two raters.

        Cohen's kappa measures agreement between two raters beyond chance.
        Values range from -1 (complete disagreement) to 1 (perfect agreement),
        with 0 indicating chance-level agreement.

        Parameters
        ----------
        rater1 : list[Label]
            Ratings from first rater.
        rater2 : list[Label]
            Ratings from second rater.

        Returns
        -------
        float
            Cohen's kappa coefficient.

        Raises
        ------
        ValueError
            If rater lists have different lengths or are empty.

        Examples
        --------
        >>> # Perfect agreement
        >>> rater1 = [0, 1, 0, 1]
        >>> rater2 = [0, 1, 0, 1]
        >>> InterAnnotatorMetrics.cohens_kappa(rater1, rater2)
        1.0
        >>> # No agreement beyond chance
        >>> rater1 = [0, 0, 1, 1]
        >>> rater2 = [1, 1, 0, 0]
        >>> kappa = InterAnnotatorMetrics.cohens_kappa(rater1, rater2)
        >>> abs(kappa - (-1.0)) < 0.01
        True
        """
        if len(rater1) != len(rater2):
            raise ValueError(
                f"Rater lists must have same length: {len(rater1)} != {len(rater2)}"
            )

        if not rater1:
            raise ValueError("Rater lists cannot be empty")

        # Check for single category case (sklearn returns NaN)
        unique_values = set(rater1) | set(rater2)
        if len(unique_values) == 1:
            return 1.0  # Perfect agreement by definition

        result = cohen_kappa_score(rater1, rater2)
        # Handle NaN case (can happen with extreme distributions)
        if np.isnan(result):
            return 1.0
        return float(result)

    @staticmethod
    def fleiss_kappa(ratings_matrix: np.ndarray[int, np.dtype[np.int_]]) -> float:  # type: ignore[type-arg]
        """Compute Fleiss' kappa for multiple raters.

        Fleiss' kappa generalizes Cohen's kappa to multiple raters. It measures
        agreement beyond chance when multiple raters assign categorical ratings
        to a set of items.

        Parameters
        ----------
        ratings_matrix : np.ndarray
            Matrix of shape (n_items, n_categories) where element [i, j]
            contains the number of raters who assigned item i to category j.

        Returns
        -------
        float
            Fleiss' kappa coefficient.

        Raises
        ------
        ValueError
            If matrix is empty or has wrong shape.
        ImportError
            If statsmodels is not installed.

        Examples
        --------
        >>> # 4 items, 3 categories, 5 raters each
        >>> # Item 1: 3 raters chose cat 0, 2 chose cat 1, 0 chose cat 2
        >>> ratings = np.array([
        ...     [3, 2, 0],  # Item 1
        ...     [0, 0, 5],  # Item 2
        ...     [2, 3, 0],  # Item 3
        ...     [1, 1, 3],  # Item 4
        ... ])
        >>> kappa = InterAnnotatorMetrics.fleiss_kappa(ratings)
        >>> 0.0 <= kappa <= 1.0
        True
        """
        if statsmodels_fleiss_kappa is None:
            msg = "statsmodels required for Fleiss' kappa. pip install statsmodels"
            raise ImportError(msg)

        if ratings_matrix.size == 0:
            raise ValueError("Ratings matrix cannot be empty")

        n_items, n_categories = ratings_matrix.shape

        if n_items == 0 or n_categories == 0:
            raise ValueError(f"Invalid matrix shape: ({n_items}, {n_categories})")

        # Check that all items have the same number of raters
        rater_counts = ratings_matrix.sum(axis=1)
        if not np.allclose(rater_counts, rater_counts[0]):
            raise ValueError(
                "All items must have same number of raters. "
                f"Got counts: {rater_counts.tolist()}"
            )

        return float(statsmodels_fleiss_kappa(ratings_matrix))

    @staticmethod
    def krippendorff_alpha(
        reliability_data: dict[str, list[Label | None]],
        metric: str = "nominal",
    ) -> float:
        """Compute Krippendorff's alpha for multiple raters.

        Krippendorff's alpha is the most general inter-rater reliability
        measure. It handles:
        - Any number of raters
        - Missing data
        - Different data types (nominal, ordinal, interval, ratio)

        Parameters
        ----------
        reliability_data : dict[str, list[Label | None]]
            Dictionary mapping rater IDs to their ratings. Each rater's
            ratings list must have same length (use None for missing values).
        metric : str, default="nominal"
            Distance metric to use:
            - "nominal": for categorical data (default)
            - "ordinal": for ordered categories
            - "interval": for interval-scaled data
            - "ratio": for ratio-scaled data

        Returns
        -------
        float
            Krippendorff's alpha coefficient (1.0 = perfect agreement,
            0.0 = chance agreement, < 0.0 = systematic disagreement).

        Raises
        ------
        ValueError
            If reliability_data is empty or rater lists have different lengths.

        Examples
        --------
        >>> # 3 raters, 5 items (with one missing value)
        >>> data = {
        ...     'rater1': [1, 2, 3, 4, 5],
        ...     'rater2': [1, 2, 3, 4, 5],
        ...     'rater3': [1, 2, None, 4, 5]
        ... }
        >>> alpha = InterAnnotatorMetrics.krippendorff_alpha(data)
        >>> alpha > 0.8  # High agreement
        True
        """
        if not reliability_data:
            raise ValueError("reliability_data cannot be empty")

        # Convert to reliability matrix (items × raters)
        rater_ids = list(reliability_data.keys())
        n_items = len(reliability_data[rater_ids[0]])

        # Check all raters have same number of items
        for rater_id, ratings in reliability_data.items():
            if len(ratings) != n_items:
                raise ValueError(
                    f"All raters must rate same number of items: "
                    f"{rater_id} has {len(ratings)}, expected {n_items}"
                )

        # Convert to format expected by krippendorff package
        # Format: rows are coders/raters, columns are units/items
        # Missing values should be np.nan
        reliability_matrix: list[list[float]] = []
        all_values: list[Label] = []
        for rater_id in rater_ids:
            rater_ratings: list[float] = []
            for rating in reliability_data[rater_id]:
                if rating is None:
                    rater_ratings.append(np.nan)
                else:
                    is_numeric = isinstance(rating, int | float)
                    val = float(rating) if is_numeric else hash(rating)
                    rater_ratings.append(val)
                    all_values.append(rating)
            reliability_matrix.append(rater_ratings)

        # Handle edge cases that krippendorff package doesn't handle
        if len(all_values) == 0:
            # All missing data
            return 0.0

        # Check if there are any pairwise comparisons possible
        # (at least one item must have ratings from at least 2 raters)
        comparisons_possible = False
        for item_idx in range(n_items):
            n_raters_for_item = sum(
                1
                for rater_id in rater_ids
                if reliability_data[rater_id][item_idx] is not None
            )
            if n_raters_for_item >= 2:
                comparisons_possible = True
                break

        if not comparisons_possible:
            # No pairwise comparisons possible
            return 0.0

        unique_values = set(all_values)
        if len(unique_values) <= 1:
            # All same value - perfect agreement by definition
            return 1.0

        # Map metric names to krippendorff package names
        metric_map: dict[str, KrippendorffMetric] = {
            "nominal": "nominal",
            "ordinal": "ordinal",
            "interval": "interval",
            "ratio": "ratio",
        }

        if metric not in metric_map:
            raise ValueError(
                f"Unknown metric: {metric}. Must be one of: "
                "'nominal', 'ordinal', 'interval', 'ratio'"
            )

        return float(
            krippendorff_alpha(
                reliability_matrix,
                level_of_measurement=metric_map[metric],
            )
        )

    @staticmethod
    def pairwise_agreement(
        ratings: dict[str, list[Label]],
    ) -> dict[str, dict[str, float]]:
        """Compute pairwise agreement metrics for all rater pairs.

        Parameters
        ----------
        ratings : dict[str, list[Label]]
            Dictionary mapping rater IDs to their ratings.

        Returns
        -------
        dict[str, dict[str, float]]
            Nested dictionary with structure:
            {
                'percentage_agreement': {('rater1', 'rater2'): 0.85, ...},
                'cohens_kappa': {('rater1', 'rater2'): 0.75, ...}
            }

        Examples
        --------
        >>> ratings = {
        ...     'rater1': [1, 2, 3],
        ...     'rater2': [1, 2, 3],
        ...     'rater3': [1, 2, 2]
        ... }
        >>> result = InterAnnotatorMetrics.pairwise_agreement(ratings)
        >>> result['percentage_agreement'][('rater1', 'rater2')]
        1.0
        >>> result['cohens_kappa'][('rater1', 'rater2')]
        1.0
        """
        rater_ids = list(ratings.keys())

        if len(rater_ids) < 2:
            return {
                "percentage_agreement": {},
                "cohens_kappa": {},
            }

        percentage_agreements = {}
        kappas = {}

        # Compute for all pairs
        for rater1_id, rater2_id in combinations(rater_ids, 2):
            pair = (rater1_id, rater2_id)

            # Percentage agreement
            perc_agr = InterAnnotatorMetrics.percentage_agreement(
                ratings[rater1_id], ratings[rater2_id]
            )
            percentage_agreements[pair] = perc_agr

            # Cohen's kappa
            kappa = InterAnnotatorMetrics.cohens_kappa(
                ratings[rater1_id], ratings[rater2_id]
            )
            kappas[pair] = kappa

        return {
            "percentage_agreement": percentage_agreements,
            "cohens_kappa": kappas,
        }
bead/items/__init__.py
ADDED
@@ -0,0 +1,40 @@
"""Item models for experimental stimuli."""

from bead.items.item import Item, ItemCollection, ModelOutput, UnfilledSlot
from bead.items.item_template import (
    ChunkingSpec,
    ChunkingUnit,
    ElementRefType,
    ItemElement,
    ItemTemplate,
    ItemTemplateCollection,
    JudgmentType,
    ParseType,
    PresentationMode,
    PresentationSpec,
    TaskSpec,
    TaskType,
    TimingParams,
)

__all__ = [
    # Item template types
    "ChunkingSpec",
    "ChunkingUnit",
    "ElementRefType",
    "ItemElement",
    "ItemTemplate",
    "ItemTemplateCollection",
    "JudgmentType",
    "ParseType",
    "PresentationMode",
    "PresentationSpec",
    "TaskSpec",
    "TaskType",
    "TimingParams",
    # Item types
    "Item",
    "ItemCollection",
    "ModelOutput",
    "UnfilledSlot",
]
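For context, the re-export module above means downstream code can pull the item and template classes straight from the package root. A minimal, hedged sketch (class names taken from the __all__ list above; nothing else is assumed):

# Import from the package root rather than the submodules.
from bead.items import Item, ItemCollection, ItemTemplate, TaskSpec

# These are the same classes defined in bead.items.item and
# bead.items.item_template; the __init__ module only re-exports them.
print(Item.__module__, ItemTemplate.__module__)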
bead/items/adapters/__init__.py
ADDED
@@ -0,0 +1,70 @@
"""Model adapters for judgment prediction during item construction.

Integrates HuggingFace transformers, OpenAI, Anthropic, Google, and Together
AI models. Separate from template filling adapters (Stage 2).
"""

# API utilities - explicit re-exports for type checkers
from bead.items.adapters.api_utils import (
    RateLimiter,
    rate_limit,
    retry_with_backoff,
)
from bead.items.adapters.base import ModelAdapter
from bead.items.adapters.huggingface import (
    HuggingFaceLanguageModel,
    HuggingFaceMaskedLanguageModel,
    HuggingFaceNLI,
)

# Registry - explicit re-exports for type checkers
from bead.items.adapters.registry import (
    ModelAdapterRegistry,
    default_registry,
)
from bead.items.adapters.sentence_transformers import (
    HuggingFaceSentenceTransformer,
)

# API adapters (optional, may not be available if dependencies not installed)
try:
    from bead.items.adapters.openai import OpenAIAdapter
except ImportError:
    pass

try:
    from bead.items.adapters.anthropic import AnthropicAdapter
except ImportError:
    pass

try:
    from bead.items.adapters.google import GoogleAdapter
except ImportError:
    pass

try:
    from bead.items.adapters.togetherai import TogetherAIAdapter
except ImportError:
    pass

__all__ = [
    # Base
    "ModelAdapter",
    # HuggingFace adapters
    "HuggingFaceLanguageModel",
    "HuggingFaceMaskedLanguageModel",
    "HuggingFaceNLI",
    "HuggingFaceSentenceTransformer",
    # API utilities
    "RateLimiter",
    "rate_limit",
    "retry_with_backoff",
    # Registry
    "ModelAdapterRegistry",
    "default_registry",
    # API adapters (conditionally exported based on available dependencies)
    "OpenAIAdapter",
    "AnthropicAdapter",
    "GoogleAdapter",
    "TogetherAIAdapter",
]
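Because the API adapters above are imported inside try/except ImportError blocks, they appear in __all__ but may be absent at runtime when their optional SDK dependencies are not installed. A hedged sketch of how a caller might guard for that (the module path and class names are from the diff; adapter constructor arguments are not shown there and are therefore omitted):

import bead.items.adapters as adapters

# OpenAIAdapter is bound only if the optional OpenAI dependency imported cleanly.
if hasattr(adapters, "OpenAIAdapter"):
    print("OpenAI adapter available")
else:
    print("OpenAI adapter unavailable; install the optional dependency to enable it")

# The HuggingFace adapters and the registry are unconditional re-exports.
from bead.items.adapters import HuggingFaceLanguageModel, default_registry

print(HuggingFaceLanguageModel.__name__, type(default_registry).__name__)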