bead-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (231)
  1. bead/__init__.py +11 -0
  2. bead/__main__.py +11 -0
  3. bead/active_learning/__init__.py +15 -0
  4. bead/active_learning/config.py +231 -0
  5. bead/active_learning/loop.py +566 -0
  6. bead/active_learning/models/__init__.py +24 -0
  7. bead/active_learning/models/base.py +852 -0
  8. bead/active_learning/models/binary.py +910 -0
  9. bead/active_learning/models/categorical.py +943 -0
  10. bead/active_learning/models/cloze.py +862 -0
  11. bead/active_learning/models/forced_choice.py +956 -0
  12. bead/active_learning/models/free_text.py +773 -0
  13. bead/active_learning/models/lora.py +365 -0
  14. bead/active_learning/models/magnitude.py +835 -0
  15. bead/active_learning/models/multi_select.py +795 -0
  16. bead/active_learning/models/ordinal_scale.py +811 -0
  17. bead/active_learning/models/peft_adapter.py +155 -0
  18. bead/active_learning/models/random_effects.py +639 -0
  19. bead/active_learning/selection.py +354 -0
  20. bead/active_learning/strategies.py +391 -0
  21. bead/active_learning/trainers/__init__.py +26 -0
  22. bead/active_learning/trainers/base.py +210 -0
  23. bead/active_learning/trainers/data_collator.py +172 -0
  24. bead/active_learning/trainers/dataset_utils.py +261 -0
  25. bead/active_learning/trainers/huggingface.py +304 -0
  26. bead/active_learning/trainers/lightning.py +324 -0
  27. bead/active_learning/trainers/metrics.py +424 -0
  28. bead/active_learning/trainers/mixed_effects.py +551 -0
  29. bead/active_learning/trainers/model_wrapper.py +509 -0
  30. bead/active_learning/trainers/registry.py +104 -0
  31. bead/adapters/__init__.py +11 -0
  32. bead/adapters/huggingface.py +61 -0
  33. bead/behavioral/__init__.py +116 -0
  34. bead/behavioral/analytics.py +646 -0
  35. bead/behavioral/extraction.py +343 -0
  36. bead/behavioral/merging.py +343 -0
  37. bead/cli/__init__.py +11 -0
  38. bead/cli/active_learning.py +513 -0
  39. bead/cli/active_learning_commands.py +779 -0
  40. bead/cli/completion.py +359 -0
  41. bead/cli/config.py +624 -0
  42. bead/cli/constraint_builders.py +286 -0
  43. bead/cli/deployment.py +859 -0
  44. bead/cli/deployment_trials.py +493 -0
  45. bead/cli/deployment_ui.py +332 -0
  46. bead/cli/display.py +378 -0
  47. bead/cli/items.py +960 -0
  48. bead/cli/items_factories.py +776 -0
  49. bead/cli/list_constraints.py +714 -0
  50. bead/cli/lists.py +490 -0
  51. bead/cli/main.py +430 -0
  52. bead/cli/models.py +877 -0
  53. bead/cli/resource_loaders.py +621 -0
  54. bead/cli/resources.py +1036 -0
  55. bead/cli/shell.py +356 -0
  56. bead/cli/simulate.py +840 -0
  57. bead/cli/templates.py +1158 -0
  58. bead/cli/training.py +1080 -0
  59. bead/cli/utils.py +614 -0
  60. bead/cli/workflow.py +1273 -0
  61. bead/config/__init__.py +68 -0
  62. bead/config/active_learning.py +1009 -0
  63. bead/config/config.py +192 -0
  64. bead/config/defaults.py +118 -0
  65. bead/config/deployment.py +217 -0
  66. bead/config/env.py +147 -0
  67. bead/config/item.py +45 -0
  68. bead/config/list.py +193 -0
  69. bead/config/loader.py +149 -0
  70. bead/config/logging.py +42 -0
  71. bead/config/model.py +49 -0
  72. bead/config/paths.py +46 -0
  73. bead/config/profiles.py +320 -0
  74. bead/config/resources.py +47 -0
  75. bead/config/serialization.py +210 -0
  76. bead/config/simulation.py +206 -0
  77. bead/config/template.py +238 -0
  78. bead/config/validation.py +267 -0
  79. bead/data/__init__.py +65 -0
  80. bead/data/base.py +87 -0
  81. bead/data/identifiers.py +97 -0
  82. bead/data/language_codes.py +61 -0
  83. bead/data/metadata.py +270 -0
  84. bead/data/range.py +123 -0
  85. bead/data/repository.py +358 -0
  86. bead/data/serialization.py +249 -0
  87. bead/data/timestamps.py +89 -0
  88. bead/data/validation.py +349 -0
  89. bead/data_collection/__init__.py +11 -0
  90. bead/data_collection/jatos.py +223 -0
  91. bead/data_collection/merger.py +154 -0
  92. bead/data_collection/prolific.py +198 -0
  93. bead/deployment/__init__.py +5 -0
  94. bead/deployment/distribution.py +402 -0
  95. bead/deployment/jatos/__init__.py +1 -0
  96. bead/deployment/jatos/api.py +200 -0
  97. bead/deployment/jatos/exporter.py +210 -0
  98. bead/deployment/jspsych/__init__.py +9 -0
  99. bead/deployment/jspsych/biome.json +44 -0
  100. bead/deployment/jspsych/config.py +411 -0
  101. bead/deployment/jspsych/generator.py +598 -0
  102. bead/deployment/jspsych/package.json +51 -0
  103. bead/deployment/jspsych/pnpm-lock.yaml +2141 -0
  104. bead/deployment/jspsych/randomizer.py +299 -0
  105. bead/deployment/jspsych/src/lib/list-distributor.test.ts +327 -0
  106. bead/deployment/jspsych/src/lib/list-distributor.ts +1282 -0
  107. bead/deployment/jspsych/src/lib/randomizer.test.ts +232 -0
  108. bead/deployment/jspsych/src/lib/randomizer.ts +367 -0
  109. bead/deployment/jspsych/src/plugins/cloze-dropdown.ts +252 -0
  110. bead/deployment/jspsych/src/plugins/forced-choice.ts +265 -0
  111. bead/deployment/jspsych/src/plugins/plugins.test.ts +141 -0
  112. bead/deployment/jspsych/src/plugins/rating.ts +248 -0
  113. bead/deployment/jspsych/src/slopit/index.ts +9 -0
  114. bead/deployment/jspsych/src/types/jatos.d.ts +256 -0
  115. bead/deployment/jspsych/src/types/jspsych.d.ts +228 -0
  116. bead/deployment/jspsych/templates/experiment.css +1 -0
  117. bead/deployment/jspsych/templates/experiment.js.template +289 -0
  118. bead/deployment/jspsych/templates/index.html +51 -0
  119. bead/deployment/jspsych/templates/randomizer.js +241 -0
  120. bead/deployment/jspsych/templates/randomizer.js.template +313 -0
  121. bead/deployment/jspsych/trials.py +723 -0
  122. bead/deployment/jspsych/tsconfig.json +23 -0
  123. bead/deployment/jspsych/tsup.config.ts +30 -0
  124. bead/deployment/jspsych/ui/__init__.py +1 -0
  125. bead/deployment/jspsych/ui/components.py +383 -0
  126. bead/deployment/jspsych/ui/styles.py +411 -0
  127. bead/dsl/__init__.py +80 -0
  128. bead/dsl/ast.py +168 -0
  129. bead/dsl/context.py +178 -0
  130. bead/dsl/errors.py +71 -0
  131. bead/dsl/evaluator.py +570 -0
  132. bead/dsl/grammar.lark +81 -0
  133. bead/dsl/parser.py +231 -0
  134. bead/dsl/stdlib.py +929 -0
  135. bead/evaluation/__init__.py +13 -0
  136. bead/evaluation/convergence.py +485 -0
  137. bead/evaluation/interannotator.py +398 -0
  138. bead/items/__init__.py +40 -0
  139. bead/items/adapters/__init__.py +70 -0
  140. bead/items/adapters/anthropic.py +224 -0
  141. bead/items/adapters/api_utils.py +167 -0
  142. bead/items/adapters/base.py +216 -0
  143. bead/items/adapters/google.py +259 -0
  144. bead/items/adapters/huggingface.py +1074 -0
  145. bead/items/adapters/openai.py +323 -0
  146. bead/items/adapters/registry.py +202 -0
  147. bead/items/adapters/sentence_transformers.py +224 -0
  148. bead/items/adapters/togetherai.py +309 -0
  149. bead/items/binary.py +515 -0
  150. bead/items/cache.py +558 -0
  151. bead/items/categorical.py +593 -0
  152. bead/items/cloze.py +757 -0
  153. bead/items/constructor.py +784 -0
  154. bead/items/forced_choice.py +413 -0
  155. bead/items/free_text.py +681 -0
  156. bead/items/generation.py +432 -0
  157. bead/items/item.py +396 -0
  158. bead/items/item_template.py +787 -0
  159. bead/items/magnitude.py +573 -0
  160. bead/items/multi_select.py +621 -0
  161. bead/items/ordinal_scale.py +569 -0
  162. bead/items/scoring.py +448 -0
  163. bead/items/validation.py +723 -0
  164. bead/lists/__init__.py +30 -0
  165. bead/lists/balancer.py +263 -0
  166. bead/lists/constraints.py +1067 -0
  167. bead/lists/experiment_list.py +286 -0
  168. bead/lists/list_collection.py +378 -0
  169. bead/lists/partitioner.py +1141 -0
  170. bead/lists/stratification.py +254 -0
  171. bead/participants/__init__.py +73 -0
  172. bead/participants/collection.py +699 -0
  173. bead/participants/merging.py +312 -0
  174. bead/participants/metadata_spec.py +491 -0
  175. bead/participants/models.py +276 -0
  176. bead/resources/__init__.py +29 -0
  177. bead/resources/adapters/__init__.py +19 -0
  178. bead/resources/adapters/base.py +104 -0
  179. bead/resources/adapters/cache.py +128 -0
  180. bead/resources/adapters/glazing.py +508 -0
  181. bead/resources/adapters/registry.py +117 -0
  182. bead/resources/adapters/unimorph.py +796 -0
  183. bead/resources/classification.py +856 -0
  184. bead/resources/constraint_builders.py +329 -0
  185. bead/resources/constraints.py +165 -0
  186. bead/resources/lexical_item.py +223 -0
  187. bead/resources/lexicon.py +744 -0
  188. bead/resources/loaders.py +209 -0
  189. bead/resources/template.py +441 -0
  190. bead/resources/template_collection.py +707 -0
  191. bead/resources/template_generation.py +349 -0
  192. bead/simulation/__init__.py +29 -0
  193. bead/simulation/annotators/__init__.py +15 -0
  194. bead/simulation/annotators/base.py +175 -0
  195. bead/simulation/annotators/distance_based.py +135 -0
  196. bead/simulation/annotators/lm_based.py +114 -0
  197. bead/simulation/annotators/oracle.py +182 -0
  198. bead/simulation/annotators/random.py +181 -0
  199. bead/simulation/dsl_extension/__init__.py +3 -0
  200. bead/simulation/noise_models/__init__.py +13 -0
  201. bead/simulation/noise_models/base.py +42 -0
  202. bead/simulation/noise_models/random_noise.py +82 -0
  203. bead/simulation/noise_models/systematic.py +132 -0
  204. bead/simulation/noise_models/temperature.py +86 -0
  205. bead/simulation/runner.py +144 -0
  206. bead/simulation/strategies/__init__.py +23 -0
  207. bead/simulation/strategies/base.py +123 -0
  208. bead/simulation/strategies/binary.py +103 -0
  209. bead/simulation/strategies/categorical.py +123 -0
  210. bead/simulation/strategies/cloze.py +224 -0
  211. bead/simulation/strategies/forced_choice.py +127 -0
  212. bead/simulation/strategies/free_text.py +105 -0
  213. bead/simulation/strategies/magnitude.py +116 -0
  214. bead/simulation/strategies/multi_select.py +129 -0
  215. bead/simulation/strategies/ordinal_scale.py +131 -0
  216. bead/templates/__init__.py +27 -0
  217. bead/templates/adapters/__init__.py +17 -0
  218. bead/templates/adapters/base.py +128 -0
  219. bead/templates/adapters/cache.py +178 -0
  220. bead/templates/adapters/huggingface.py +312 -0
  221. bead/templates/combinatorics.py +103 -0
  222. bead/templates/filler.py +605 -0
  223. bead/templates/renderers.py +177 -0
  224. bead/templates/resolver.py +178 -0
  225. bead/templates/strategies.py +1806 -0
  226. bead/templates/streaming.py +195 -0
  227. bead-0.1.0.dist-info/METADATA +212 -0
  228. bead-0.1.0.dist-info/RECORD +231 -0
  229. bead-0.1.0.dist-info/WHEEL +4 -0
  230. bead-0.1.0.dist-info/entry_points.txt +2 -0
  231. bead-0.1.0.dist-info/licenses/LICENSE +21 -0
bead/evaluation/interannotator.py ADDED
@@ -0,0 +1,398 @@
+ """Inter-annotator agreement metrics.
+
+ This module provides inter-annotator agreement metrics for assessing
+ reliability and consistency across multiple human annotators.
+ Uses sklearn.metrics for Cohen's kappa, statsmodels for Fleiss' kappa,
+ and the krippendorff package for Krippendorff's alpha.
+ """
+
+ from __future__ import annotations
+
+ from itertools import combinations
+ from typing import Literal
+
+ import numpy as np
+ from krippendorff import alpha as krippendorff_alpha
+ from sklearn.metrics import cohen_kappa_score
+
+ # Optional dependency: guarded so fleiss_kappa can raise a clear ImportError
+ try:
+     from statsmodels.stats.inter_rater import fleiss_kappa as statsmodels_fleiss_kappa
+ except ImportError:
+     statsmodels_fleiss_kappa = None
+
+ # Type alias for krippendorff metric levels
+ type KrippendorffMetric = Literal["nominal", "ordinal", "interval", "ratio"]
+
+ # Type alias for rating values (categorical, ordinal, interval, or ratio)
+ type Label = int | str | float
+
+
+ class InterAnnotatorMetrics:
+     """Inter-annotator agreement metrics for reliability assessment.
+
+     Provides static methods for computing various agreement metrics:
+     - Percentage agreement (simple)
+     - Cohen's kappa (2 raters, categorical)
+     - Fleiss' kappa (multiple raters, categorical)
+     - Krippendorff's alpha (general, multiple data types)
+     - Pairwise agreement (all pairs of raters)
+
+     Examples
+     --------
+     >>> # Cohen's kappa for 2 raters
+     >>> rater1 = [0, 1, 0, 1, 1]
+     >>> rater2 = [0, 1, 1, 1, 1]
+     >>> round(InterAnnotatorMetrics.cohens_kappa(rater1, rater2), 2)
+     0.55
+     >>> # Percentage agreement
+     >>> InterAnnotatorMetrics.percentage_agreement(rater1, rater2)
+     0.8
+     """
+
+     @staticmethod
+     def percentage_agreement(rater1: list[Label], rater2: list[Label]) -> float:
+         """Compute simple percentage agreement between two raters.
+
+         Parameters
+         ----------
+         rater1 : list[Label]
+             Ratings from first rater.
+         rater2 : list[Label]
+             Ratings from second rater.
+
+         Returns
+         -------
+         float
+             Percentage agreement (0.0 to 1.0).
+
+         Raises
+         ------
+         ValueError
+             If rater lists have different lengths.
+
+         Examples
+         --------
+         >>> rater1 = [1, 2, 3, 1, 2]
+         >>> rater2 = [1, 2, 2, 1, 2]
+         >>> InterAnnotatorMetrics.percentage_agreement(rater1, rater2)
+         0.8
+         """
+         if len(rater1) != len(rater2):
+             raise ValueError(
+                 f"Rater lists must have same length: {len(rater1)} != {len(rater2)}"
+             )
+
+         if not rater1:
+             return 1.0
+
+         agreements = sum(r1 == r2 for r1, r2 in zip(rater1, rater2, strict=True))
+         return agreements / len(rater1)
+
+     @staticmethod
+     def cohens_kappa(rater1: list[Label], rater2: list[Label]) -> float:
+         """Compute Cohen's kappa for two raters.
+
+         Cohen's kappa measures agreement between two raters beyond chance.
+         Values range from -1 (complete disagreement) to 1 (perfect agreement),
+         with 0 indicating chance-level agreement.
+
+         Parameters
+         ----------
+         rater1 : list[Label]
+             Ratings from first rater.
+         rater2 : list[Label]
+             Ratings from second rater.
+
+         Returns
+         -------
+         float
+             Cohen's kappa coefficient.
+
+         Raises
+         ------
+         ValueError
+             If rater lists have different lengths or are empty.
+
+         Examples
+         --------
+         >>> # Perfect agreement
+         >>> rater1 = [0, 1, 0, 1]
+         >>> rater2 = [0, 1, 0, 1]
+         >>> InterAnnotatorMetrics.cohens_kappa(rater1, rater2)
+         1.0
+         >>> # Complete disagreement
+         >>> rater1 = [0, 0, 1, 1]
+         >>> rater2 = [1, 1, 0, 0]
+         >>> kappa = InterAnnotatorMetrics.cohens_kappa(rater1, rater2)
+         >>> abs(kappa - (-1.0)) < 0.01
+         True
+         """
+         if len(rater1) != len(rater2):
+             raise ValueError(
+                 f"Rater lists must have same length: {len(rater1)} != {len(rater2)}"
+             )
+
+         if not rater1:
+             raise ValueError("Rater lists cannot be empty")
+
+         # Check for single category case (sklearn returns NaN)
+         unique_values = set(rater1) | set(rater2)
+         if len(unique_values) == 1:
+             return 1.0  # Perfect agreement by definition
+
+         result = cohen_kappa_score(rater1, rater2)
+         # Handle NaN case (can happen with extreme distributions)
+         if np.isnan(result):
+             return 1.0
+         return float(result)
+
+     @staticmethod
+     def fleiss_kappa(ratings_matrix: np.ndarray[int, np.dtype[np.int_]]) -> float:  # type: ignore[type-arg]
+         """Compute Fleiss' kappa for multiple raters.
+
+         Fleiss' kappa generalizes Cohen's kappa to multiple raters. It measures
+         agreement beyond chance when multiple raters assign categorical ratings
+         to a set of items.
+
+         Parameters
+         ----------
+         ratings_matrix : np.ndarray
+             Matrix of shape (n_items, n_categories) where element [i, j]
+             contains the number of raters who assigned item i to category j.
+
+         Returns
+         -------
+         float
+             Fleiss' kappa coefficient.
+
+         Raises
+         ------
+         ValueError
+             If matrix is empty or has wrong shape.
+         ImportError
+             If statsmodels is not installed.
+
+         Examples
+         --------
+         >>> # 4 items, 3 categories, 5 raters each
+         >>> # Item 1: 3 raters chose cat 0, 2 chose cat 1, 0 chose cat 2
+         >>> ratings = np.array([
+         ...     [3, 2, 0],  # Item 1
+         ...     [0, 0, 5],  # Item 2
+         ...     [2, 3, 0],  # Item 3
+         ...     [1, 1, 3],  # Item 4
+         ... ])
+         >>> kappa = InterAnnotatorMetrics.fleiss_kappa(ratings)
+         >>> 0.0 <= kappa <= 1.0
+         True
+         """
+         if statsmodels_fleiss_kappa is None:
+             msg = "statsmodels required for Fleiss' kappa. pip install statsmodels"
+             raise ImportError(msg)
+
+         if ratings_matrix.size == 0:
+             raise ValueError("Ratings matrix cannot be empty")
+
+         n_items, n_categories = ratings_matrix.shape
+
+         if n_items == 0 or n_categories == 0:
+             raise ValueError(f"Invalid matrix shape: ({n_items}, {n_categories})")
+
+         # Check that all items have the same number of raters
+         rater_counts = ratings_matrix.sum(axis=1)
+         if not np.allclose(rater_counts, rater_counts[0]):
+             raise ValueError(
+                 "All items must have same number of raters. "
+                 f"Got counts: {rater_counts.tolist()}"
+             )
+
+         return float(statsmodels_fleiss_kappa(ratings_matrix))
+
+     @staticmethod
+     def krippendorff_alpha(
+         reliability_data: dict[str, list[Label | None]],
+         metric: str = "nominal",
+     ) -> float:
+         """Compute Krippendorff's alpha for multiple raters.
+
+         Krippendorff's alpha is the most general inter-rater reliability
+         measure. It handles:
+         - Any number of raters
+         - Missing data
+         - Different data types (nominal, ordinal, interval, ratio)
+
+         Parameters
+         ----------
+         reliability_data : dict[str, list[Label | None]]
+             Dictionary mapping rater IDs to their ratings. Each rater's
+             ratings list must have same length (use None for missing values).
+         metric : str, default="nominal"
+             Distance metric to use:
+             - "nominal": for categorical data (default)
+             - "ordinal": for ordered categories
+             - "interval": for interval-scaled data
+             - "ratio": for ratio-scaled data
+
+         Returns
+         -------
+         float
+             Krippendorff's alpha coefficient (1.0 = perfect agreement,
+             0.0 = chance agreement, < 0.0 = systematic disagreement).
+
+         Raises
+         ------
+         ValueError
+             If reliability_data is empty or rater lists have different lengths.
+
+         Examples
+         --------
+         >>> # 3 raters, 5 items (with one missing value)
+         >>> data = {
+         ...     'rater1': [1, 2, 3, 4, 5],
+         ...     'rater2': [1, 2, 3, 4, 5],
+         ...     'rater3': [1, 2, None, 4, 5]
+         ... }
+         >>> alpha = InterAnnotatorMetrics.krippendorff_alpha(data)
+         >>> alpha > 0.8  # High agreement
+         True
+         """
+         if not reliability_data:
+             raise ValueError("reliability_data cannot be empty")
+
+         # Convert to reliability matrix (raters × items)
+         rater_ids = list(reliability_data.keys())
+         n_items = len(reliability_data[rater_ids[0]])
+
+         # Check all raters have same number of items
+         for rater_id, ratings in reliability_data.items():
+             if len(ratings) != n_items:
+                 raise ValueError(
+                     f"All raters must rate same number of items: "
+                     f"{rater_id} has {len(ratings)}, expected {n_items}"
+                 )
+
+         # Convert to format expected by krippendorff package
+         # Format: rows are coders/raters, columns are units/items
+         # Missing values should be np.nan
+         reliability_matrix: list[list[float]] = []
+         all_values: list[Label] = []
+         for rater_id in rater_ids:
+             rater_ratings: list[float] = []
+             for rating in reliability_data[rater_id]:
+                 if rating is None:
+                     rater_ratings.append(np.nan)
+                 else:
+                     is_numeric = isinstance(rating, int | float)
+                     val = float(rating) if is_numeric else hash(rating)
+                     rater_ratings.append(val)
+                     all_values.append(rating)
+             reliability_matrix.append(rater_ratings)
+
+         # Handle edge cases that krippendorff package doesn't handle
+         if len(all_values) == 0:
+             # All missing data
+             return 0.0
+
+         # Check if there are any pairwise comparisons possible
+         # (at least one item must have ratings from at least 2 raters)
+         comparisons_possible = False
+         for item_idx in range(n_items):
+             n_raters_for_item = sum(
+                 1
+                 for rater_id in rater_ids
+                 if reliability_data[rater_id][item_idx] is not None
+             )
+             if n_raters_for_item >= 2:
+                 comparisons_possible = True
+                 break
+
+         if not comparisons_possible:
+             # No pairwise comparisons possible
+             return 0.0
+
+         unique_values = set(all_values)
+         if len(unique_values) <= 1:
+             # All same value - perfect agreement by definition
+             return 1.0
+
+         # Map metric names to krippendorff package names
+         metric_map: dict[str, KrippendorffMetric] = {
+             "nominal": "nominal",
+             "ordinal": "ordinal",
+             "interval": "interval",
+             "ratio": "ratio",
+         }
+
+         if metric not in metric_map:
+             raise ValueError(
+                 f"Unknown metric: {metric}. Must be one of: "
+                 "'nominal', 'ordinal', 'interval', 'ratio'"
+             )
+
+         return float(
+             krippendorff_alpha(
+                 reliability_matrix,
+                 level_of_measurement=metric_map[metric],
+             )
+         )
+
+     @staticmethod
+     def pairwise_agreement(
+         ratings: dict[str, list[Label]],
+     ) -> dict[str, dict[tuple[str, str], float]]:
+         """Compute pairwise agreement metrics for all rater pairs.
+
+         Parameters
+         ----------
+         ratings : dict[str, list[Label]]
+             Dictionary mapping rater IDs to their ratings.
+
+         Returns
+         -------
+         dict[str, dict[tuple[str, str], float]]
+             Nested dictionary with structure::
+
+                 {
+                     'percentage_agreement': {('rater1', 'rater2'): 0.85, ...},
+                     'cohens_kappa': {('rater1', 'rater2'): 0.75, ...}
+                 }
+
+         Examples
+         --------
+         >>> ratings = {
+         ...     'rater1': [1, 2, 3],
+         ...     'rater2': [1, 2, 3],
+         ...     'rater3': [1, 2, 2]
+         ... }
+         >>> result = InterAnnotatorMetrics.pairwise_agreement(ratings)
+         >>> result['percentage_agreement'][('rater1', 'rater2')]
+         1.0
+         >>> result['cohens_kappa'][('rater1', 'rater2')]
+         1.0
+         """
+         rater_ids = list(ratings.keys())
+
+         if len(rater_ids) < 2:
+             return {
+                 "percentage_agreement": {},
+                 "cohens_kappa": {},
+             }
+
+         percentage_agreements: dict[tuple[str, str], float] = {}
+         kappas: dict[tuple[str, str], float] = {}
+
+         # Compute for all pairs
+         for rater1_id, rater2_id in combinations(rater_ids, 2):
+             pair = (rater1_id, rater2_id)
+
+             # Percentage agreement
+             perc_agr = InterAnnotatorMetrics.percentage_agreement(
+                 ratings[rater1_id], ratings[rater2_id]
+             )
+             percentage_agreements[pair] = perc_agr
+
+             # Cohen's kappa
+             kappa = InterAnnotatorMetrics.cohens_kappa(
+                 ratings[rater1_id], ratings[rater2_id]
+             )
+             kappas[pair] = kappa
+
+         return {
+             "percentage_agreement": percentage_agreements,
+             "cohens_kappa": kappas,
+         }
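
Taken together, a minimal usage sketch of the metrics above (assuming the wheel plus its sklearn, statsmodels, and krippendorff dependencies are installed; the rating lists are illustrative, not from the package):

import numpy as np

from bead.evaluation.interannotator import InterAnnotatorMetrics

# Two raters over the same five items
r1 = [0, 1, 0, 1, 1]
r2 = [0, 1, 1, 1, 1]
print(InterAnnotatorMetrics.percentage_agreement(r1, r2))    # 0.8
print(round(InterAnnotatorMetrics.cohens_kappa(r1, r2), 2))  # 0.55

# Krippendorff's alpha tolerates missing ratings (None)
alpha = InterAnnotatorMetrics.krippendorff_alpha(
    {"r1": [1, 2, 3, 4, 5], "r2": [1, 2, 3, 4, 5], "r3": [1, 2, None, 4, 5]},
    metric="ordinal",
)

# Fleiss' kappa takes an (n_items, n_categories) matrix of per-item rater counts
counts = np.array([[3, 2, 0], [0, 0, 5], [2, 3, 0], [1, 1, 3]])
kappa = InterAnnotatorMetrics.fleiss_kappa(counts)

# All pairwise comparisons at once
pairs = InterAnnotatorMetrics.pairwise_agreement({"r1": r1, "r2": r2})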
bead/items/__init__.py ADDED
@@ -0,0 +1,40 @@
+ """Item models for experimental stimuli."""
+
+ from bead.items.item import Item, ItemCollection, ModelOutput, UnfilledSlot
+ from bead.items.item_template import (
+     ChunkingSpec,
+     ChunkingUnit,
+     ElementRefType,
+     ItemElement,
+     ItemTemplate,
+     ItemTemplateCollection,
+     JudgmentType,
+     ParseType,
+     PresentationMode,
+     PresentationSpec,
+     TaskSpec,
+     TaskType,
+     TimingParams,
+ )
+
+ __all__ = [
+     # Item template types
+     "ChunkingSpec",
+     "ChunkingUnit",
+     "ElementRefType",
+     "ItemElement",
+     "ItemTemplate",
+     "ItemTemplateCollection",
+     "JudgmentType",
+     "ParseType",
+     "PresentationMode",
+     "PresentationSpec",
+     "TaskSpec",
+     "TaskType",
+     "TimingParams",
+     # Item types
+     "Item",
+     "ItemCollection",
+     "ModelOutput",
+     "UnfilledSlot",
+ ]
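
With these re-exports, consumers import the item API from the subpackage root rather than its submodules; a minimal sketch, assuming the wheel is installed:

from bead.items import Item, ItemCollection, ItemTemplate, TaskSpec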
bead/items/adapters/__init__.py ADDED
@@ -0,0 +1,70 @@
+ """Model adapters for judgment prediction during item construction.
+
+ Integrates HuggingFace transformers, OpenAI, Anthropic, Google, and Together
+ AI models. Separate from template filling adapters (Stage 2).
+ """
+
+ # API utilities - explicit re-exports for type checkers
+ from bead.items.adapters.api_utils import (
+     RateLimiter,
+     rate_limit,
+     retry_with_backoff,
+ )
+ from bead.items.adapters.base import ModelAdapter
+ from bead.items.adapters.huggingface import (
+     HuggingFaceLanguageModel,
+     HuggingFaceMaskedLanguageModel,
+     HuggingFaceNLI,
+ )
+
+ # Registry - explicit re-exports for type checkers
+ from bead.items.adapters.registry import (
+     ModelAdapterRegistry,
+     default_registry,
+ )
+ from bead.items.adapters.sentence_transformers import (
+     HuggingFaceSentenceTransformer,
+ )
+
+ # API adapters (optional, may not be available if dependencies not installed)
+ try:
+     from bead.items.adapters.openai import OpenAIAdapter
+ except ImportError:
+     pass
+
+ try:
+     from bead.items.adapters.anthropic import AnthropicAdapter
+ except ImportError:
+     pass
+
+ try:
+     from bead.items.adapters.google import GoogleAdapter
+ except ImportError:
+     pass
+
+ try:
+     from bead.items.adapters.togetherai import TogetherAIAdapter
+ except ImportError:
+     pass
+
+ __all__ = [
+     # Base
+     "ModelAdapter",
+     # HuggingFace adapters
+     "HuggingFaceLanguageModel",
+     "HuggingFaceMaskedLanguageModel",
+     "HuggingFaceNLI",
+     "HuggingFaceSentenceTransformer",
+     # API utilities
+     "RateLimiter",
+     "rate_limit",
+     "retry_with_backoff",
+     # Registry
+     "ModelAdapterRegistry",
+     "default_registry",
+     # API adapters (conditionally exported based on available dependencies)
+     "OpenAIAdapter",
+     "AnthropicAdapter",
+     "GoogleAdapter",
+     "TogetherAIAdapter",
+ ]
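
Because the optional adapters are bound only when their SDKs import cleanly, while __all__ lists them unconditionally, a consumer can probe for them at import time; a minimal sketch (the fallback choice is illustrative):

# OpenAIAdapter is absent from bead.items.adapters when the openai
# dependency is not installed, so this import raises ImportError.
try:
    from bead.items.adapters import OpenAIAdapter
except ImportError:
    OpenAIAdapter = None  # e.g. fall back to a local HuggingFace adapter

if OpenAIAdapter is None:
    from bead.items.adapters import HuggingFaceLanguageModel

One consequence of this pattern is that `from bead.items.adapters import *` raises AttributeError when an optional dependency is missing, since star-import resolves every name listed in __all__.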