bead-0.1.0-py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (231)
  1. bead/__init__.py +11 -0
  2. bead/__main__.py +11 -0
  3. bead/active_learning/__init__.py +15 -0
  4. bead/active_learning/config.py +231 -0
  5. bead/active_learning/loop.py +566 -0
  6. bead/active_learning/models/__init__.py +24 -0
  7. bead/active_learning/models/base.py +852 -0
  8. bead/active_learning/models/binary.py +910 -0
  9. bead/active_learning/models/categorical.py +943 -0
  10. bead/active_learning/models/cloze.py +862 -0
  11. bead/active_learning/models/forced_choice.py +956 -0
  12. bead/active_learning/models/free_text.py +773 -0
  13. bead/active_learning/models/lora.py +365 -0
  14. bead/active_learning/models/magnitude.py +835 -0
  15. bead/active_learning/models/multi_select.py +795 -0
  16. bead/active_learning/models/ordinal_scale.py +811 -0
  17. bead/active_learning/models/peft_adapter.py +155 -0
  18. bead/active_learning/models/random_effects.py +639 -0
  19. bead/active_learning/selection.py +354 -0
  20. bead/active_learning/strategies.py +391 -0
  21. bead/active_learning/trainers/__init__.py +26 -0
  22. bead/active_learning/trainers/base.py +210 -0
  23. bead/active_learning/trainers/data_collator.py +172 -0
  24. bead/active_learning/trainers/dataset_utils.py +261 -0
  25. bead/active_learning/trainers/huggingface.py +304 -0
  26. bead/active_learning/trainers/lightning.py +324 -0
  27. bead/active_learning/trainers/metrics.py +424 -0
  28. bead/active_learning/trainers/mixed_effects.py +551 -0
  29. bead/active_learning/trainers/model_wrapper.py +509 -0
  30. bead/active_learning/trainers/registry.py +104 -0
  31. bead/adapters/__init__.py +11 -0
  32. bead/adapters/huggingface.py +61 -0
  33. bead/behavioral/__init__.py +116 -0
  34. bead/behavioral/analytics.py +646 -0
  35. bead/behavioral/extraction.py +343 -0
  36. bead/behavioral/merging.py +343 -0
  37. bead/cli/__init__.py +11 -0
  38. bead/cli/active_learning.py +513 -0
  39. bead/cli/active_learning_commands.py +779 -0
  40. bead/cli/completion.py +359 -0
  41. bead/cli/config.py +624 -0
  42. bead/cli/constraint_builders.py +286 -0
  43. bead/cli/deployment.py +859 -0
  44. bead/cli/deployment_trials.py +493 -0
  45. bead/cli/deployment_ui.py +332 -0
  46. bead/cli/display.py +378 -0
  47. bead/cli/items.py +960 -0
  48. bead/cli/items_factories.py +776 -0
  49. bead/cli/list_constraints.py +714 -0
  50. bead/cli/lists.py +490 -0
  51. bead/cli/main.py +430 -0
  52. bead/cli/models.py +877 -0
  53. bead/cli/resource_loaders.py +621 -0
  54. bead/cli/resources.py +1036 -0
  55. bead/cli/shell.py +356 -0
  56. bead/cli/simulate.py +840 -0
  57. bead/cli/templates.py +1158 -0
  58. bead/cli/training.py +1080 -0
  59. bead/cli/utils.py +614 -0
  60. bead/cli/workflow.py +1273 -0
  61. bead/config/__init__.py +68 -0
  62. bead/config/active_learning.py +1009 -0
  63. bead/config/config.py +192 -0
  64. bead/config/defaults.py +118 -0
  65. bead/config/deployment.py +217 -0
  66. bead/config/env.py +147 -0
  67. bead/config/item.py +45 -0
  68. bead/config/list.py +193 -0
  69. bead/config/loader.py +149 -0
  70. bead/config/logging.py +42 -0
  71. bead/config/model.py +49 -0
  72. bead/config/paths.py +46 -0
  73. bead/config/profiles.py +320 -0
  74. bead/config/resources.py +47 -0
  75. bead/config/serialization.py +210 -0
  76. bead/config/simulation.py +206 -0
  77. bead/config/template.py +238 -0
  78. bead/config/validation.py +267 -0
  79. bead/data/__init__.py +65 -0
  80. bead/data/base.py +87 -0
  81. bead/data/identifiers.py +97 -0
  82. bead/data/language_codes.py +61 -0
  83. bead/data/metadata.py +270 -0
  84. bead/data/range.py +123 -0
  85. bead/data/repository.py +358 -0
  86. bead/data/serialization.py +249 -0
  87. bead/data/timestamps.py +89 -0
  88. bead/data/validation.py +349 -0
  89. bead/data_collection/__init__.py +11 -0
  90. bead/data_collection/jatos.py +223 -0
  91. bead/data_collection/merger.py +154 -0
  92. bead/data_collection/prolific.py +198 -0
  93. bead/deployment/__init__.py +5 -0
  94. bead/deployment/distribution.py +402 -0
  95. bead/deployment/jatos/__init__.py +1 -0
  96. bead/deployment/jatos/api.py +200 -0
  97. bead/deployment/jatos/exporter.py +210 -0
  98. bead/deployment/jspsych/__init__.py +9 -0
  99. bead/deployment/jspsych/biome.json +44 -0
  100. bead/deployment/jspsych/config.py +411 -0
  101. bead/deployment/jspsych/generator.py +598 -0
  102. bead/deployment/jspsych/package.json +51 -0
  103. bead/deployment/jspsych/pnpm-lock.yaml +2141 -0
  104. bead/deployment/jspsych/randomizer.py +299 -0
  105. bead/deployment/jspsych/src/lib/list-distributor.test.ts +327 -0
  106. bead/deployment/jspsych/src/lib/list-distributor.ts +1282 -0
  107. bead/deployment/jspsych/src/lib/randomizer.test.ts +232 -0
  108. bead/deployment/jspsych/src/lib/randomizer.ts +367 -0
  109. bead/deployment/jspsych/src/plugins/cloze-dropdown.ts +252 -0
  110. bead/deployment/jspsych/src/plugins/forced-choice.ts +265 -0
  111. bead/deployment/jspsych/src/plugins/plugins.test.ts +141 -0
  112. bead/deployment/jspsych/src/plugins/rating.ts +248 -0
  113. bead/deployment/jspsych/src/slopit/index.ts +9 -0
  114. bead/deployment/jspsych/src/types/jatos.d.ts +256 -0
  115. bead/deployment/jspsych/src/types/jspsych.d.ts +228 -0
  116. bead/deployment/jspsych/templates/experiment.css +1 -0
  117. bead/deployment/jspsych/templates/experiment.js.template +289 -0
  118. bead/deployment/jspsych/templates/index.html +51 -0
  119. bead/deployment/jspsych/templates/randomizer.js +241 -0
  120. bead/deployment/jspsych/templates/randomizer.js.template +313 -0
  121. bead/deployment/jspsych/trials.py +723 -0
  122. bead/deployment/jspsych/tsconfig.json +23 -0
  123. bead/deployment/jspsych/tsup.config.ts +30 -0
  124. bead/deployment/jspsych/ui/__init__.py +1 -0
  125. bead/deployment/jspsych/ui/components.py +383 -0
  126. bead/deployment/jspsych/ui/styles.py +411 -0
  127. bead/dsl/__init__.py +80 -0
  128. bead/dsl/ast.py +168 -0
  129. bead/dsl/context.py +178 -0
  130. bead/dsl/errors.py +71 -0
  131. bead/dsl/evaluator.py +570 -0
  132. bead/dsl/grammar.lark +81 -0
  133. bead/dsl/parser.py +231 -0
  134. bead/dsl/stdlib.py +929 -0
  135. bead/evaluation/__init__.py +13 -0
  136. bead/evaluation/convergence.py +485 -0
  137. bead/evaluation/interannotator.py +398 -0
  138. bead/items/__init__.py +40 -0
  139. bead/items/adapters/__init__.py +70 -0
  140. bead/items/adapters/anthropic.py +224 -0
  141. bead/items/adapters/api_utils.py +167 -0
  142. bead/items/adapters/base.py +216 -0
  143. bead/items/adapters/google.py +259 -0
  144. bead/items/adapters/huggingface.py +1074 -0
  145. bead/items/adapters/openai.py +323 -0
  146. bead/items/adapters/registry.py +202 -0
  147. bead/items/adapters/sentence_transformers.py +224 -0
  148. bead/items/adapters/togetherai.py +309 -0
  149. bead/items/binary.py +515 -0
  150. bead/items/cache.py +558 -0
  151. bead/items/categorical.py +593 -0
  152. bead/items/cloze.py +757 -0
  153. bead/items/constructor.py +784 -0
  154. bead/items/forced_choice.py +413 -0
  155. bead/items/free_text.py +681 -0
  156. bead/items/generation.py +432 -0
  157. bead/items/item.py +396 -0
  158. bead/items/item_template.py +787 -0
  159. bead/items/magnitude.py +573 -0
  160. bead/items/multi_select.py +621 -0
  161. bead/items/ordinal_scale.py +569 -0
  162. bead/items/scoring.py +448 -0
  163. bead/items/validation.py +723 -0
  164. bead/lists/__init__.py +30 -0
  165. bead/lists/balancer.py +263 -0
  166. bead/lists/constraints.py +1067 -0
  167. bead/lists/experiment_list.py +286 -0
  168. bead/lists/list_collection.py +378 -0
  169. bead/lists/partitioner.py +1141 -0
  170. bead/lists/stratification.py +254 -0
  171. bead/participants/__init__.py +73 -0
  172. bead/participants/collection.py +699 -0
  173. bead/participants/merging.py +312 -0
  174. bead/participants/metadata_spec.py +491 -0
  175. bead/participants/models.py +276 -0
  176. bead/resources/__init__.py +29 -0
  177. bead/resources/adapters/__init__.py +19 -0
  178. bead/resources/adapters/base.py +104 -0
  179. bead/resources/adapters/cache.py +128 -0
  180. bead/resources/adapters/glazing.py +508 -0
  181. bead/resources/adapters/registry.py +117 -0
  182. bead/resources/adapters/unimorph.py +796 -0
  183. bead/resources/classification.py +856 -0
  184. bead/resources/constraint_builders.py +329 -0
  185. bead/resources/constraints.py +165 -0
  186. bead/resources/lexical_item.py +223 -0
  187. bead/resources/lexicon.py +744 -0
  188. bead/resources/loaders.py +209 -0
  189. bead/resources/template.py +441 -0
  190. bead/resources/template_collection.py +707 -0
  191. bead/resources/template_generation.py +349 -0
  192. bead/simulation/__init__.py +29 -0
  193. bead/simulation/annotators/__init__.py +15 -0
  194. bead/simulation/annotators/base.py +175 -0
  195. bead/simulation/annotators/distance_based.py +135 -0
  196. bead/simulation/annotators/lm_based.py +114 -0
  197. bead/simulation/annotators/oracle.py +182 -0
  198. bead/simulation/annotators/random.py +181 -0
  199. bead/simulation/dsl_extension/__init__.py +3 -0
  200. bead/simulation/noise_models/__init__.py +13 -0
  201. bead/simulation/noise_models/base.py +42 -0
  202. bead/simulation/noise_models/random_noise.py +82 -0
  203. bead/simulation/noise_models/systematic.py +132 -0
  204. bead/simulation/noise_models/temperature.py +86 -0
  205. bead/simulation/runner.py +144 -0
  206. bead/simulation/strategies/__init__.py +23 -0
  207. bead/simulation/strategies/base.py +123 -0
  208. bead/simulation/strategies/binary.py +103 -0
  209. bead/simulation/strategies/categorical.py +123 -0
  210. bead/simulation/strategies/cloze.py +224 -0
  211. bead/simulation/strategies/forced_choice.py +127 -0
  212. bead/simulation/strategies/free_text.py +105 -0
  213. bead/simulation/strategies/magnitude.py +116 -0
  214. bead/simulation/strategies/multi_select.py +129 -0
  215. bead/simulation/strategies/ordinal_scale.py +131 -0
  216. bead/templates/__init__.py +27 -0
  217. bead/templates/adapters/__init__.py +17 -0
  218. bead/templates/adapters/base.py +128 -0
  219. bead/templates/adapters/cache.py +178 -0
  220. bead/templates/adapters/huggingface.py +312 -0
  221. bead/templates/combinatorics.py +103 -0
  222. bead/templates/filler.py +605 -0
  223. bead/templates/renderers.py +177 -0
  224. bead/templates/resolver.py +178 -0
  225. bead/templates/strategies.py +1806 -0
  226. bead/templates/streaming.py +195 -0
  227. bead-0.1.0.dist-info/METADATA +212 -0
  228. bead-0.1.0.dist-info/RECORD +231 -0
  229. bead-0.1.0.dist-info/WHEEL +4 -0
  230. bead-0.1.0.dist-info/entry_points.txt +2 -0
  231. bead-0.1.0.dist-info/licenses/LICENSE +21 -0
bead/evaluation/__init__.py
@@ -0,0 +1,13 @@
+"""Evaluation module for model and human performance assessment.
+
+Provides cross-validation, inter-annotator agreement metrics, model
+performance metrics, and convergence detection for active learning.
+"""
+
+from bead.evaluation.convergence import ConvergenceDetector
+from bead.evaluation.interannotator import InterAnnotatorMetrics
+
+__all__ = [
+    "InterAnnotatorMetrics",
+    "ConvergenceDetector",
+]
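
For orientation, a minimal usage sketch of the re-exported ConvergenceDetector follows. It relies only on the API visible in the bead/evaluation/convergence.py hunk below; the rater IDs and rating values are invented for illustration.

    from bead.evaluation import ConvergenceDetector

    # Hypothetical ratings: three annotators labelling the same five items.
    ratings = {
        "rater_a": [1, 1, 0, 1, 0],
        "rater_b": [1, 1, 0, 0, 0],
        "rater_c": [1, 0, 0, 1, 0],
    }

    detector = ConvergenceDetector(
        human_agreement_metric="krippendorff_alpha",
        convergence_threshold=0.05,
        min_iterations=3,
    )
    # Krippendorff's alpha over the three raters becomes the human baseline.
    baseline = detector.compute_human_baseline(ratings)
    print(f"human baseline: {baseline:.3f}")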
bead/evaluation/convergence.py
@@ -0,0 +1,485 @@
+"""Convergence detection for active learning.
+
+This module provides tools for detecting when a model has converged to
+human-level performance, which serves as a stopping criterion for active
+learning loops.
+"""
+
+from __future__ import annotations
+
+from typing import TypedDict
+
+import numpy as np
+from scipy.stats import binomtest, ttest_rel  # type: ignore[import-untyped]
+
+from bead.evaluation.interannotator import InterAnnotatorMetrics
+
+# Type alias for classification labels (categorical, ordinal, or numeric)
+type Label = int | str | float
+
+
+class ConvergenceReport(TypedDict):
+    """Convergence report structure.
+
+    Attributes
+    ----------
+    converged : bool
+        Whether model has converged.
+    model_accuracy : float
+        Model's current accuracy.
+    human_agreement : float
+        Human agreement score.
+    gap : float
+        Difference between human agreement and model accuracy.
+    required_accuracy : float
+        Minimum accuracy required for convergence.
+    threshold : float
+        Convergence threshold.
+    iteration : int
+        Current iteration number.
+    meets_min_iterations : bool
+        Whether minimum iterations requirement is met.
+    min_iterations_required : int
+        Minimum iterations required before checking convergence.
+    """
+
+    converged: bool
+    model_accuracy: float
+    human_agreement: float
+    gap: float
+    required_accuracy: float
+    threshold: float
+    iteration: int
+    meets_min_iterations: bool
+    min_iterations_required: int
+
+
+class ConvergenceDetector:
+    """Detect convergence of model performance to human agreement.
+
+    This class monitors model performance and compares it to human
+    inter-annotator agreement to determine when active learning can stop.
+    Convergence is achieved when the model's accuracy matches or exceeds
+    human agreement within a specified threshold.
+
+    Parameters
+    ----------
+    human_agreement_metric : str, default="krippendorff_alpha"
+        Which inter-annotator agreement metric to use as baseline:
+        - "krippendorff_alpha": Most general (handles missing data, multiple raters)
+        - "fleiss_kappa": Multiple raters, no missing data
+        - "cohens_kappa": Two raters only
+        - "percentage_agreement": Simple agreement rate
+    convergence_threshold : float, default=0.05
+        Model must be within this threshold of human agreement to converge.
+        For example, 0.05 means model accuracy must be >= (human_agreement - 0.05).
+    min_iterations : int, default=3
+        Minimum number of iterations before checking convergence.
+        Prevents premature stopping.
+    statistical_test : bool, default=True
+        Whether to run statistical significance test comparing model to humans.
+    alpha : float, default=0.05
+        Significance level for statistical tests.
+
+    Attributes
+    ----------
+    human_agreement_metric : str
+        Agreement metric being used.
+    convergence_threshold : float
+        Threshold for convergence.
+    min_iterations : int
+        Minimum iterations required.
+    statistical_test : bool
+        Whether to run significance tests.
+    alpha : float
+        Significance level.
+    human_baseline : float | None
+        Computed human agreement baseline (set via compute_human_baseline).
+
+    Examples
+    --------
+    >>> detector = ConvergenceDetector(
+    ...     human_agreement_metric='krippendorff_alpha',
+    ...     convergence_threshold=0.05,
+    ...     min_iterations=3
+    ... )
+    >>> # Compute human baseline from ratings
+    >>> ratings = {
+    ...     'human1': [1, 1, 0, 1, 0],
+    ...     'human2': [1, 1, 0, 0, 0],
+    ...     'human3': [1, 0, 0, 1, 0]
+    ... }
+    >>> detector.compute_human_baseline(ratings)
+    >>> detector.human_baseline > 0.0
+    True
+    >>> # Check if model converged
+    >>> converged = detector.check_convergence(
+    ...     model_accuracy=0.75,
+    ...     iteration=5
+    ... )
+    >>> isinstance(converged, bool)
+    True
+    """
+
+    def __init__(
+        self,
+        human_agreement_metric: str = "krippendorff_alpha",
+        convergence_threshold: float = 0.05,
+        min_iterations: int = 3,
+        statistical_test: bool = True,
+        alpha: float = 0.05,
+    ) -> None:
+        """Initialize convergence detector.
+
+        Parameters
+        ----------
+        human_agreement_metric : str
+            Inter-annotator agreement metric to use.
+        convergence_threshold : float
+            Threshold for convergence (model must be within this of human).
+        min_iterations : int
+            Minimum iterations before checking convergence.
+        statistical_test : bool
+            Whether to run statistical tests.
+        alpha : float
+            Significance level for tests.
+
+        Raises
+        ------
+        ValueError
+            If parameters are invalid.
+        """
+        valid_metrics = {
+            "krippendorff_alpha",
+            "fleiss_kappa",
+            "cohens_kappa",
+            "percentage_agreement",
+        }
+
+        if human_agreement_metric not in valid_metrics:
+            raise ValueError(
+                f"human_agreement_metric must be one of {valid_metrics}, "
+                f"got '{human_agreement_metric}'"
+            )
+
+        if convergence_threshold < 0.0 or convergence_threshold > 1.0:
+            raise ValueError(
+                f"convergence_threshold must be in [0, 1], got {convergence_threshold}"
+            )
+
+        if min_iterations < 1:
+            raise ValueError(f"min_iterations must be >= 1, got {min_iterations}")
+
+        if alpha <= 0.0 or alpha >= 1.0:
+            raise ValueError(f"alpha must be in (0, 1), got {alpha}")
+
+        self.human_agreement_metric = human_agreement_metric
+        self.convergence_threshold = convergence_threshold
+        self.min_iterations = min_iterations
+        self.statistical_test = statistical_test
+        self.alpha = alpha
+        self.human_baseline: float | None = None
+
+    def compute_human_baseline(
+        self,
+        human_ratings: dict[str, list[Label | None]],
+        **kwargs: str | int | float | bool | None,
+    ) -> float:
+        """Compute human inter-rater agreement baseline.
+
+        Parameters
+        ----------
+        human_ratings : dict[str, list[Label | None]]
+            Dictionary mapping human rater IDs to their ratings.
+            For example: {'rater1': [1, 0, 1, ...], 'rater2': [1, 1, 1, ...]}.
+            Missing ratings can be represented as None.
+        **kwargs : str | int | float | bool | None
+            Additional arguments passed to agreement metric function.
+            For example, metric='nominal' for Krippendorff's alpha.
+
+        Returns
+        -------
+        float
+            Human agreement score.
+
+        Raises
+        ------
+        ValueError
+            If human_ratings is empty or has fewer than 2 raters.
+
+        Examples
+        --------
+        >>> detector = ConvergenceDetector()
+        >>> ratings = {
+        ...     'human1': [1, 1, 0, 1],
+        ...     'human2': [1, 1, 0, 0],
+        ...     'human3': [1, 0, 0, 1]
+        ... }
+        >>> baseline = detector.compute_human_baseline(ratings)
+        >>> 0.0 <= baseline <= 1.0
+        True
+        """
+        if not human_ratings:
+            raise ValueError("human_ratings cannot be empty")
+
+        if len(human_ratings) < 2:
+            raise ValueError("human_ratings must have at least 2 raters")
+
+        # Compute agreement using specified metric
+        if self.human_agreement_metric == "krippendorff_alpha":
+            # Extract metric parameter, defaulting to 'nominal'
+            metric = kwargs.get("metric", "nominal")
+            if not isinstance(metric, str):
+                metric = "nominal"
+            agreement = InterAnnotatorMetrics.krippendorff_alpha(
+                human_ratings, metric=metric
+            )
+        elif self.human_agreement_metric == "percentage_agreement":
+            # Use mean of pairwise percentage agreements
+            # Filter out None values for percentage agreement
+            filtered_ratings = {
+                rater_id: [r for r in ratings if r is not None]
+                for rater_id, ratings in human_ratings.items()
+            }
+            pairwise = InterAnnotatorMetrics.pairwise_agreement(filtered_ratings)
+            agreements = list(pairwise["percentage_agreement"].values())
+            agreement = float(np.mean(agreements)) if agreements else 0.0
+        elif self.human_agreement_metric == "cohens_kappa":
+            if len(human_ratings) != 2:
+                raise ValueError("cohens_kappa requires exactly 2 raters")
+            rater_ids = list(human_ratings.keys())
+            # Filter out None values for Cohen's kappa
+            ratings_1 = human_ratings[rater_ids[0]]
+            ratings_2 = human_ratings[rater_ids[1]]
+            filtered_ratings_1 = [r for r in ratings_1 if r is not None]
+            filtered_ratings_2 = [r for r in ratings_2 if r is not None]
+            agreement = InterAnnotatorMetrics.cohens_kappa(
+                filtered_ratings_1, filtered_ratings_2
+            )
+        elif self.human_agreement_metric == "fleiss_kappa":
+            # Convert ratings to Fleiss format (items × categories matrix)
+            # This requires categorical data
+            raise NotImplementedError(
+                "fleiss_kappa not yet implemented in compute_human_baseline. "
+                "Use krippendorff_alpha instead."
+            )
+        else:
+            raise ValueError(f"Unknown metric: {self.human_agreement_metric}")
+
+        self.human_baseline = agreement
+        return agreement
+
+    def check_convergence(
+        self,
+        model_accuracy: float,
+        iteration: int,
+        human_agreement: float | None = None,
+    ) -> bool:
+        """Check if model has converged to human performance.
+
+        Parameters
+        ----------
+        model_accuracy : float
+            Model's accuracy on the task.
+        iteration : int
+            Current iteration number (1-indexed).
+        human_agreement : float | None
+            Human agreement score. If None, uses self.human_baseline
+            (which must have been set via compute_human_baseline).
+
+        Returns
+        -------
+        bool
+            True if model has converged, False otherwise.
+
+        Raises
+        ------
+        ValueError
+            If human_agreement is None and human_baseline not set.
+
+        Examples
+        --------
+        >>> detector = ConvergenceDetector(min_iterations=2, convergence_threshold=0.05)
+        >>> detector.human_baseline = 0.80
+        >>> # Too early (iteration 1 < min_iterations 2)
+        >>> detector.check_convergence(0.79, iteration=1)
+        False
+        >>> # Still not converged (0.74 < 0.80 - 0.05)
+        >>> detector.check_convergence(0.74, iteration=3)
+        False
+        >>> # Converged (0.77 >= 0.80 - 0.05)
+        >>> detector.check_convergence(0.77, iteration=3)
+        True
+        """
+        # Check minimum iterations
+        if iteration < self.min_iterations:
+            return False
+
+        # Get human baseline
+        if human_agreement is None:
+            if self.human_baseline is None:
+                raise ValueError(
+                    "human_agreement is None and human_baseline not set. "
+                    "Call compute_human_baseline first or pass human_agreement."
+                )
+            human_agreement = self.human_baseline
+
+        # Check if model is within threshold of human performance
+        required_accuracy = human_agreement - self.convergence_threshold
+        return model_accuracy >= required_accuracy
+
+    def compute_statistical_test(
+        self,
+        model_predictions: list[Label],
+        human_consensus: list[Label],
+        test_type: str = "mcnemar",
+    ) -> dict[str, float]:
+        """Run statistical test comparing model to human performance.
+
+        Parameters
+        ----------
+        model_predictions : list[Label]
+            Model's predictions.
+        human_consensus : list[Label]
+            Human consensus labels (e.g., majority vote).
+        test_type : str, default="mcnemar"
+            Type of statistical test:
+            - "mcnemar": McNemar's test for paired nominal data
+            - "ttest": Paired t-test (requires multiple samples)
+
+        Returns
+        -------
+        dict[str, float]
+            Dictionary with keys 'statistic' and 'p_value'.
+
+        Raises
+        ------
+        ValueError
+            If predictions and consensus have different lengths.
+
+        Examples
+        --------
+        >>> detector = ConvergenceDetector()
+        >>> model_preds = [1, 1, 0, 1, 0]
+        >>> human_consensus = [1, 1, 0, 0, 0]
+        >>> result = detector.compute_statistical_test(model_preds, human_consensus)
+        >>> 'statistic' in result and 'p_value' in result
+        True
+        """
+        if len(model_predictions) != len(human_consensus):
+            raise ValueError(
+                f"model_predictions and human_consensus must have same length: "
+                f"{len(model_predictions)} != {len(human_consensus)}"
+            )
+
+        if test_type == "mcnemar":
+            # McNemar's test for paired predictions
+            # Contingency table: [correct_model, incorrect_model] ×
+            # [correct_human, incorrect_human]
+
+            # Actually, for McNemar we need a reference (ground truth)
+            # Instead, we'll use a binomial test to check if model accuracy
+            # differs significantly from human accuracy
+
+            model_correct = [
+                mp == hc
+                for mp, hc in zip(model_predictions, human_consensus, strict=True)
+            ]
+            model_accuracy = sum(model_correct) / len(model_correct)
+            human_accuracy = 1.0  # Assuming human_consensus is "correct"
+
+            # Binomial test: is model accuracy significantly different from human?
+            n = len(model_correct)
+            k = sum(model_correct)
+
+            # Two-tailed test
+            result = binomtest(k, n, human_accuracy, alternative="two-sided")
+            p_value = result.pvalue
+
+            return {
+                "statistic": float(model_accuracy),
+                "p_value": float(p_value),
+            }
+
+        elif test_type == "ttest":
+            # Paired t-test comparing model predictions to human consensus
+            # Convert predictions to correctness scores (1 if match, 0 if not)
+            model_scores = np.array(
+                [
+                    1.0 if mp == hc else 0.0
+                    for mp, hc in zip(model_predictions, human_consensus, strict=True)
+                ]
+            )
+            # Human consensus is always "correct" (1.0) by definition
+            human_scores = np.ones(len(human_consensus), dtype=float)
+
+            # Paired t-test: test if model scores differ from human scores
+            statistic, p_value = ttest_rel(model_scores, human_scores)
+
+            return {
+                "statistic": float(statistic),
+                "p_value": float(p_value),
+            }
+
+        else:
+            raise ValueError(
+                f"Unknown test_type: {test_type}. Must be 'mcnemar' or 'ttest'."
+            )
+
+    def get_convergence_report(
+        self,
+        model_accuracy: float,
+        iteration: int,
+        human_agreement: float | None = None,
+    ) -> ConvergenceReport:
+        """Generate convergence report with status and metrics.
+
+        Parameters
+        ----------
+        model_accuracy : float
+            Model's current accuracy.
+        iteration : int
+            Current iteration number.
+        human_agreement : float | None
+            Human agreement score (uses baseline if None).
+
+        Returns
+        -------
+        ConvergenceReport
+            Report with convergence status and metrics.
+
+        Examples
+        --------
+        >>> detector = ConvergenceDetector(convergence_threshold=0.05)
+        >>> detector.human_baseline = 0.80
+        >>> report = detector.get_convergence_report(0.77, iteration=5)
+        >>> report['converged']
+        True
+        >>> report['gap']
+        0.03
+        """
+        # Get human baseline
+        if human_agreement is None:
+            if self.human_baseline is None:
+                raise ValueError("human_agreement is None and human_baseline not set")
+            human_agreement = self.human_baseline
+
+        # Check convergence
+        converged = self.check_convergence(model_accuracy, iteration, human_agreement)
+
+        # Compute metrics
+        gap = human_agreement - model_accuracy
+        required_accuracy = human_agreement - self.convergence_threshold
+        meets_min_iterations = iteration >= self.min_iterations
+
+        return {
+            "converged": converged,
+            "model_accuracy": model_accuracy,
+            "human_agreement": human_agreement,
+            "gap": gap,
+            "required_accuracy": required_accuracy,
+            "threshold": self.convergence_threshold,
+            "iteration": iteration,
+            "meets_min_iterations": meets_min_iterations,
+            "min_iterations_required": self.min_iterations,
+        }
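
Taken together, the class above amounts to the stopping rule model_accuracy >= human_agreement - convergence_threshold, applied only once min_iterations have elapsed. The sketch below shows how that rule might drive the stopping decision of an active-learning loop; the evaluate_on_holdout helper and its accuracy values are hypothetical stand-ins for a real training and evaluation step, not part of the package.

    from bead.evaluation import ConvergenceDetector

    detector = ConvergenceDetector(convergence_threshold=0.05, min_iterations=3)
    # Invented ratings from two annotators; in practice these come from collected data.
    detector.compute_human_baseline({
        "rater_a": [1, 1, 0, 1, 0],
        "rater_b": [1, 1, 0, 0, 0],
    })

    def evaluate_on_holdout(iteration: int) -> float:
        """Hypothetical per-iteration model accuracies standing in for real evaluation."""
        return [0.55, 0.63, 0.70, 0.74, 0.78][iteration - 1]

    for iteration in range(1, 6):
        accuracy = evaluate_on_holdout(iteration)
        if detector.check_convergence(accuracy, iteration=iteration):
            report = detector.get_convergence_report(accuracy, iteration=iteration)
            print(f"stop at iteration {iteration}: gap={report['gap']:.3f}")
            break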