bead-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bead/__init__.py +11 -0
- bead/__main__.py +11 -0
- bead/active_learning/__init__.py +15 -0
- bead/active_learning/config.py +231 -0
- bead/active_learning/loop.py +566 -0
- bead/active_learning/models/__init__.py +24 -0
- bead/active_learning/models/base.py +852 -0
- bead/active_learning/models/binary.py +910 -0
- bead/active_learning/models/categorical.py +943 -0
- bead/active_learning/models/cloze.py +862 -0
- bead/active_learning/models/forced_choice.py +956 -0
- bead/active_learning/models/free_text.py +773 -0
- bead/active_learning/models/lora.py +365 -0
- bead/active_learning/models/magnitude.py +835 -0
- bead/active_learning/models/multi_select.py +795 -0
- bead/active_learning/models/ordinal_scale.py +811 -0
- bead/active_learning/models/peft_adapter.py +155 -0
- bead/active_learning/models/random_effects.py +639 -0
- bead/active_learning/selection.py +354 -0
- bead/active_learning/strategies.py +391 -0
- bead/active_learning/trainers/__init__.py +26 -0
- bead/active_learning/trainers/base.py +210 -0
- bead/active_learning/trainers/data_collator.py +172 -0
- bead/active_learning/trainers/dataset_utils.py +261 -0
- bead/active_learning/trainers/huggingface.py +304 -0
- bead/active_learning/trainers/lightning.py +324 -0
- bead/active_learning/trainers/metrics.py +424 -0
- bead/active_learning/trainers/mixed_effects.py +551 -0
- bead/active_learning/trainers/model_wrapper.py +509 -0
- bead/active_learning/trainers/registry.py +104 -0
- bead/adapters/__init__.py +11 -0
- bead/adapters/huggingface.py +61 -0
- bead/behavioral/__init__.py +116 -0
- bead/behavioral/analytics.py +646 -0
- bead/behavioral/extraction.py +343 -0
- bead/behavioral/merging.py +343 -0
- bead/cli/__init__.py +11 -0
- bead/cli/active_learning.py +513 -0
- bead/cli/active_learning_commands.py +779 -0
- bead/cli/completion.py +359 -0
- bead/cli/config.py +624 -0
- bead/cli/constraint_builders.py +286 -0
- bead/cli/deployment.py +859 -0
- bead/cli/deployment_trials.py +493 -0
- bead/cli/deployment_ui.py +332 -0
- bead/cli/display.py +378 -0
- bead/cli/items.py +960 -0
- bead/cli/items_factories.py +776 -0
- bead/cli/list_constraints.py +714 -0
- bead/cli/lists.py +490 -0
- bead/cli/main.py +430 -0
- bead/cli/models.py +877 -0
- bead/cli/resource_loaders.py +621 -0
- bead/cli/resources.py +1036 -0
- bead/cli/shell.py +356 -0
- bead/cli/simulate.py +840 -0
- bead/cli/templates.py +1158 -0
- bead/cli/training.py +1080 -0
- bead/cli/utils.py +614 -0
- bead/cli/workflow.py +1273 -0
- bead/config/__init__.py +68 -0
- bead/config/active_learning.py +1009 -0
- bead/config/config.py +192 -0
- bead/config/defaults.py +118 -0
- bead/config/deployment.py +217 -0
- bead/config/env.py +147 -0
- bead/config/item.py +45 -0
- bead/config/list.py +193 -0
- bead/config/loader.py +149 -0
- bead/config/logging.py +42 -0
- bead/config/model.py +49 -0
- bead/config/paths.py +46 -0
- bead/config/profiles.py +320 -0
- bead/config/resources.py +47 -0
- bead/config/serialization.py +210 -0
- bead/config/simulation.py +206 -0
- bead/config/template.py +238 -0
- bead/config/validation.py +267 -0
- bead/data/__init__.py +65 -0
- bead/data/base.py +87 -0
- bead/data/identifiers.py +97 -0
- bead/data/language_codes.py +61 -0
- bead/data/metadata.py +270 -0
- bead/data/range.py +123 -0
- bead/data/repository.py +358 -0
- bead/data/serialization.py +249 -0
- bead/data/timestamps.py +89 -0
- bead/data/validation.py +349 -0
- bead/data_collection/__init__.py +11 -0
- bead/data_collection/jatos.py +223 -0
- bead/data_collection/merger.py +154 -0
- bead/data_collection/prolific.py +198 -0
- bead/deployment/__init__.py +5 -0
- bead/deployment/distribution.py +402 -0
- bead/deployment/jatos/__init__.py +1 -0
- bead/deployment/jatos/api.py +200 -0
- bead/deployment/jatos/exporter.py +210 -0
- bead/deployment/jspsych/__init__.py +9 -0
- bead/deployment/jspsych/biome.json +44 -0
- bead/deployment/jspsych/config.py +411 -0
- bead/deployment/jspsych/generator.py +598 -0
- bead/deployment/jspsych/package.json +51 -0
- bead/deployment/jspsych/pnpm-lock.yaml +2141 -0
- bead/deployment/jspsych/randomizer.py +299 -0
- bead/deployment/jspsych/src/lib/list-distributor.test.ts +327 -0
- bead/deployment/jspsych/src/lib/list-distributor.ts +1282 -0
- bead/deployment/jspsych/src/lib/randomizer.test.ts +232 -0
- bead/deployment/jspsych/src/lib/randomizer.ts +367 -0
- bead/deployment/jspsych/src/plugins/cloze-dropdown.ts +252 -0
- bead/deployment/jspsych/src/plugins/forced-choice.ts +265 -0
- bead/deployment/jspsych/src/plugins/plugins.test.ts +141 -0
- bead/deployment/jspsych/src/plugins/rating.ts +248 -0
- bead/deployment/jspsych/src/slopit/index.ts +9 -0
- bead/deployment/jspsych/src/types/jatos.d.ts +256 -0
- bead/deployment/jspsych/src/types/jspsych.d.ts +228 -0
- bead/deployment/jspsych/templates/experiment.css +1 -0
- bead/deployment/jspsych/templates/experiment.js.template +289 -0
- bead/deployment/jspsych/templates/index.html +51 -0
- bead/deployment/jspsych/templates/randomizer.js +241 -0
- bead/deployment/jspsych/templates/randomizer.js.template +313 -0
- bead/deployment/jspsych/trials.py +723 -0
- bead/deployment/jspsych/tsconfig.json +23 -0
- bead/deployment/jspsych/tsup.config.ts +30 -0
- bead/deployment/jspsych/ui/__init__.py +1 -0
- bead/deployment/jspsych/ui/components.py +383 -0
- bead/deployment/jspsych/ui/styles.py +411 -0
- bead/dsl/__init__.py +80 -0
- bead/dsl/ast.py +168 -0
- bead/dsl/context.py +178 -0
- bead/dsl/errors.py +71 -0
- bead/dsl/evaluator.py +570 -0
- bead/dsl/grammar.lark +81 -0
- bead/dsl/parser.py +231 -0
- bead/dsl/stdlib.py +929 -0
- bead/evaluation/__init__.py +13 -0
- bead/evaluation/convergence.py +485 -0
- bead/evaluation/interannotator.py +398 -0
- bead/items/__init__.py +40 -0
- bead/items/adapters/__init__.py +70 -0
- bead/items/adapters/anthropic.py +224 -0
- bead/items/adapters/api_utils.py +167 -0
- bead/items/adapters/base.py +216 -0
- bead/items/adapters/google.py +259 -0
- bead/items/adapters/huggingface.py +1074 -0
- bead/items/adapters/openai.py +323 -0
- bead/items/adapters/registry.py +202 -0
- bead/items/adapters/sentence_transformers.py +224 -0
- bead/items/adapters/togetherai.py +309 -0
- bead/items/binary.py +515 -0
- bead/items/cache.py +558 -0
- bead/items/categorical.py +593 -0
- bead/items/cloze.py +757 -0
- bead/items/constructor.py +784 -0
- bead/items/forced_choice.py +413 -0
- bead/items/free_text.py +681 -0
- bead/items/generation.py +432 -0
- bead/items/item.py +396 -0
- bead/items/item_template.py +787 -0
- bead/items/magnitude.py +573 -0
- bead/items/multi_select.py +621 -0
- bead/items/ordinal_scale.py +569 -0
- bead/items/scoring.py +448 -0
- bead/items/validation.py +723 -0
- bead/lists/__init__.py +30 -0
- bead/lists/balancer.py +263 -0
- bead/lists/constraints.py +1067 -0
- bead/lists/experiment_list.py +286 -0
- bead/lists/list_collection.py +378 -0
- bead/lists/partitioner.py +1141 -0
- bead/lists/stratification.py +254 -0
- bead/participants/__init__.py +73 -0
- bead/participants/collection.py +699 -0
- bead/participants/merging.py +312 -0
- bead/participants/metadata_spec.py +491 -0
- bead/participants/models.py +276 -0
- bead/resources/__init__.py +29 -0
- bead/resources/adapters/__init__.py +19 -0
- bead/resources/adapters/base.py +104 -0
- bead/resources/adapters/cache.py +128 -0
- bead/resources/adapters/glazing.py +508 -0
- bead/resources/adapters/registry.py +117 -0
- bead/resources/adapters/unimorph.py +796 -0
- bead/resources/classification.py +856 -0
- bead/resources/constraint_builders.py +329 -0
- bead/resources/constraints.py +165 -0
- bead/resources/lexical_item.py +223 -0
- bead/resources/lexicon.py +744 -0
- bead/resources/loaders.py +209 -0
- bead/resources/template.py +441 -0
- bead/resources/template_collection.py +707 -0
- bead/resources/template_generation.py +349 -0
- bead/simulation/__init__.py +29 -0
- bead/simulation/annotators/__init__.py +15 -0
- bead/simulation/annotators/base.py +175 -0
- bead/simulation/annotators/distance_based.py +135 -0
- bead/simulation/annotators/lm_based.py +114 -0
- bead/simulation/annotators/oracle.py +182 -0
- bead/simulation/annotators/random.py +181 -0
- bead/simulation/dsl_extension/__init__.py +3 -0
- bead/simulation/noise_models/__init__.py +13 -0
- bead/simulation/noise_models/base.py +42 -0
- bead/simulation/noise_models/random_noise.py +82 -0
- bead/simulation/noise_models/systematic.py +132 -0
- bead/simulation/noise_models/temperature.py +86 -0
- bead/simulation/runner.py +144 -0
- bead/simulation/strategies/__init__.py +23 -0
- bead/simulation/strategies/base.py +123 -0
- bead/simulation/strategies/binary.py +103 -0
- bead/simulation/strategies/categorical.py +123 -0
- bead/simulation/strategies/cloze.py +224 -0
- bead/simulation/strategies/forced_choice.py +127 -0
- bead/simulation/strategies/free_text.py +105 -0
- bead/simulation/strategies/magnitude.py +116 -0
- bead/simulation/strategies/multi_select.py +129 -0
- bead/simulation/strategies/ordinal_scale.py +131 -0
- bead/templates/__init__.py +27 -0
- bead/templates/adapters/__init__.py +17 -0
- bead/templates/adapters/base.py +128 -0
- bead/templates/adapters/cache.py +178 -0
- bead/templates/adapters/huggingface.py +312 -0
- bead/templates/combinatorics.py +103 -0
- bead/templates/filler.py +605 -0
- bead/templates/renderers.py +177 -0
- bead/templates/resolver.py +178 -0
- bead/templates/strategies.py +1806 -0
- bead/templates/streaming.py +195 -0
- bead-0.1.0.dist-info/METADATA +212 -0
- bead-0.1.0.dist-info/RECORD +231 -0
- bead-0.1.0.dist-info/WHEEL +4 -0
- bead-0.1.0.dist-info/entry_points.txt +2 -0
- bead-0.1.0.dist-info/licenses/LICENSE +21 -0

bead/evaluation/__init__.py

@@ -0,0 +1,13 @@
+"""Evaluation module for model and human performance assessment.
+
+Provides cross-validation, inter-annotator agreement metrics, model
+performance metrics, and convergence detection for active learning.
+"""
+
+from bead.evaluation.convergence import ConvergenceDetector
+from bead.evaluation.interannotator import InterAnnotatorMetrics
+
+__all__ = [
+    "InterAnnotatorMetrics",
+    "ConvergenceDetector",
+]
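
As a quick, hypothetical orientation (not part of the package), the two re-exported classes can be used directly from bead.evaluation; the sketch below uses only call signatures that convergence.py (next hunk) itself relies on, so the interannotator module's full API is not assumed:

from bead.evaluation import ConvergenceDetector, InterAnnotatorMetrics

# Two annotators, same five items (signature as called in convergence.py).
kappa = InterAnnotatorMetrics.cohens_kappa([1, 0, 1, 1, 0], [1, 1, 1, 0, 0])

# Multiple annotators with a missing rating; Krippendorff's alpha tolerates None.
alpha = InterAnnotatorMetrics.krippendorff_alpha(
    {"r1": [1, 0, 1, None], "r2": [1, 0, 0, 1], "r3": [1, 1, 1, 1]},
    metric="nominal",
)

detector = ConvergenceDetector()  # defaults to the Krippendorff baseline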

bead/evaluation/convergence.py

@@ -0,0 +1,485 @@
+"""Convergence detection for active learning.
+
+This module provides tools for detecting when a model has converged to
+human-level performance, which serves as a stopping criterion for active
+learning loops.
+"""
+
+from __future__ import annotations
+
+from typing import TypedDict
+
+import numpy as np
+from scipy.stats import binomtest, ttest_rel  # type: ignore[import-untyped]
+
+from bead.evaluation.interannotator import InterAnnotatorMetrics
+
+# Type alias for classification labels (categorical, ordinal, or numeric)
+type Label = int | str | float
+
+
+class ConvergenceReport(TypedDict):
+    """Convergence report structure.
+
+    Attributes
+    ----------
+    converged : bool
+        Whether model has converged.
+    model_accuracy : float
+        Model's current accuracy.
+    human_agreement : float
+        Human agreement score.
+    gap : float
+        Difference between human agreement and model accuracy.
+    required_accuracy : float
+        Minimum accuracy required for convergence.
+    threshold : float
+        Convergence threshold.
+    iteration : int
+        Current iteration number.
+    meets_min_iterations : bool
+        Whether minimum iterations requirement is met.
+    min_iterations_required : int
+        Minimum iterations required before checking convergence.
+    """
+
+    converged: bool
+    model_accuracy: float
+    human_agreement: float
+    gap: float
+    required_accuracy: float
+    threshold: float
+    iteration: int
+    meets_min_iterations: bool
+    min_iterations_required: int
+
+
+class ConvergenceDetector:
+    """Detect convergence of model performance to human agreement.
+
+    This class monitors model performance and compares it to human
+    inter-annotator agreement to determine when active learning can stop.
+    Convergence is achieved when the model's accuracy matches or exceeds
+    human agreement within a specified threshold.
+
+    Parameters
+    ----------
+    human_agreement_metric : str, default="krippendorff_alpha"
+        Which inter-annotator agreement metric to use as baseline:
+        - "krippendorff_alpha": Most general (handles missing data, multiple raters)
+        - "fleiss_kappa": Multiple raters, no missing data
+        - "cohens_kappa": Two raters only
+        - "percentage_agreement": Simple agreement rate
+    convergence_threshold : float, default=0.05
+        Model must be within this threshold of human agreement to converge.
+        For example, 0.05 means model accuracy must be >= (human_agreement - 0.05).
+    min_iterations : int, default=3
+        Minimum number of iterations before checking convergence.
+        Prevents premature stopping.
+    statistical_test : bool, default=True
+        Whether to run statistical significance test comparing model to humans.
+    alpha : float, default=0.05
+        Significance level for statistical tests.
+
+    Attributes
+    ----------
+    human_agreement_metric : str
+        Agreement metric being used.
+    convergence_threshold : float
+        Threshold for convergence.
+    min_iterations : int
+        Minimum iterations required.
+    statistical_test : bool
+        Whether to run significance tests.
+    alpha : float
+        Significance level.
+    human_baseline : float | None
+        Computed human agreement baseline (set via compute_human_baseline).
+
+    Examples
+    --------
+    >>> detector = ConvergenceDetector(
+    ...     human_agreement_metric='krippendorff_alpha',
+    ...     convergence_threshold=0.05,
+    ...     min_iterations=3
+    ... )
+    >>> # Compute human baseline from ratings
+    >>> ratings = {
+    ...     'human1': [1, 1, 0, 1, 0],
+    ...     'human2': [1, 1, 0, 0, 0],
+    ...     'human3': [1, 0, 0, 1, 0]
+    ... }
+    >>> detector.compute_human_baseline(ratings)
+    >>> detector.human_baseline > 0.0
+    True
+    >>> # Check if model converged
+    >>> converged = detector.check_convergence(
+    ...     model_accuracy=0.75,
+    ...     iteration=5
+    ... )
+    >>> isinstance(converged, bool)
+    True
+    """
+
+    def __init__(
+        self,
+        human_agreement_metric: str = "krippendorff_alpha",
+        convergence_threshold: float = 0.05,
+        min_iterations: int = 3,
+        statistical_test: bool = True,
+        alpha: float = 0.05,
+    ) -> None:
+        """Initialize convergence detector.
+
+        Parameters
+        ----------
+        human_agreement_metric : str
+            Inter-annotator agreement metric to use.
+        convergence_threshold : float
+            Threshold for convergence (model must be within this of human).
+        min_iterations : int
+            Minimum iterations before checking convergence.
+        statistical_test : bool
+            Whether to run statistical tests.
+        alpha : float
+            Significance level for tests.
+
+        Raises
+        ------
+        ValueError
+            If parameters are invalid.
+        """
+        valid_metrics = {
+            "krippendorff_alpha",
+            "fleiss_kappa",
+            "cohens_kappa",
+            "percentage_agreement",
+        }
+
+        if human_agreement_metric not in valid_metrics:
+            raise ValueError(
+                f"human_agreement_metric must be one of {valid_metrics}, "
+                f"got '{human_agreement_metric}'"
+            )
+
+        if convergence_threshold < 0.0 or convergence_threshold > 1.0:
+            raise ValueError(
+                f"convergence_threshold must be in [0, 1], got {convergence_threshold}"
+            )
+
+        if min_iterations < 1:
+            raise ValueError(f"min_iterations must be >= 1, got {min_iterations}")
+
+        if alpha <= 0.0 or alpha >= 1.0:
+            raise ValueError(f"alpha must be in (0, 1), got {alpha}")
+
+        self.human_agreement_metric = human_agreement_metric
+        self.convergence_threshold = convergence_threshold
+        self.min_iterations = min_iterations
+        self.statistical_test = statistical_test
+        self.alpha = alpha
+        self.human_baseline: float | None = None
+
+    def compute_human_baseline(
+        self,
+        human_ratings: dict[str, list[Label | None]],
+        **kwargs: str | int | float | bool | None,
+    ) -> float:
+        """Compute human inter-rater agreement baseline.
+
+        Parameters
+        ----------
+        human_ratings : dict[str, list[Label | None]]
+            Dictionary mapping human rater IDs to their ratings.
+            For example: {'rater1': [1, 0, 1, ...], 'rater2': [1, 1, 1, ...]}.
+            Missing ratings can be represented as None.
+        **kwargs : str | int | float | bool | None
+            Additional arguments passed to agreement metric function.
+            For example, metric='nominal' for Krippendorff's alpha.
+
+        Returns
+        -------
+        float
+            Human agreement score.
+
+        Raises
+        ------
+        ValueError
+            If human_ratings is empty or has fewer than 2 raters.
+
+        Examples
+        --------
+        >>> detector = ConvergenceDetector()
+        >>> ratings = {
+        ...     'human1': [1, 1, 0, 1],
+        ...     'human2': [1, 1, 0, 0],
+        ...     'human3': [1, 0, 0, 1]
+        ... }
+        >>> baseline = detector.compute_human_baseline(ratings)
+        >>> 0.0 <= baseline <= 1.0
+        True
+        """
+        if not human_ratings:
+            raise ValueError("human_ratings cannot be empty")
+
+        if len(human_ratings) < 2:
+            raise ValueError("human_ratings must have at least 2 raters")
+
+        # Compute agreement using specified metric
+        if self.human_agreement_metric == "krippendorff_alpha":
+            # Extract metric parameter, defaulting to 'nominal'
+            metric = kwargs.get("metric", "nominal")
+            if not isinstance(metric, str):
+                metric = "nominal"
+            agreement = InterAnnotatorMetrics.krippendorff_alpha(
+                human_ratings, metric=metric
+            )
+        elif self.human_agreement_metric == "percentage_agreement":
+            # Use mean of pairwise percentage agreements
+            # Filter out None values for percentage agreement
+            filtered_ratings = {
+                rater_id: [r for r in ratings if r is not None]
+                for rater_id, ratings in human_ratings.items()
+            }
+            pairwise = InterAnnotatorMetrics.pairwise_agreement(filtered_ratings)
+            agreements = list(pairwise["percentage_agreement"].values())
+            agreement = float(np.mean(agreements)) if agreements else 0.0
+        elif self.human_agreement_metric == "cohens_kappa":
+            if len(human_ratings) != 2:
+                raise ValueError("cohens_kappa requires exactly 2 raters")
+            rater_ids = list(human_ratings.keys())
+            # Filter out None values for Cohen's kappa
+            ratings_1 = human_ratings[rater_ids[0]]
+            ratings_2 = human_ratings[rater_ids[1]]
+            filtered_ratings_1 = [r for r in ratings_1 if r is not None]
+            filtered_ratings_2 = [r for r in ratings_2 if r is not None]
+            agreement = InterAnnotatorMetrics.cohens_kappa(
+                filtered_ratings_1, filtered_ratings_2
+            )
+        elif self.human_agreement_metric == "fleiss_kappa":
+            # Convert ratings to Fleiss format (items × categories matrix)
+            # This requires categorical data
+            raise NotImplementedError(
+                "fleiss_kappa not yet implemented in compute_human_baseline. "
+                "Use krippendorff_alpha instead."
+            )
+        else:
+            raise ValueError(f"Unknown metric: {self.human_agreement_metric}")
+
+        self.human_baseline = agreement
+        return agreement
+
+    def check_convergence(
+        self,
+        model_accuracy: float,
+        iteration: int,
+        human_agreement: float | None = None,
+    ) -> bool:
+        """Check if model has converged to human performance.
+
+        Parameters
+        ----------
+        model_accuracy : float
+            Model's accuracy on the task.
+        iteration : int
+            Current iteration number (1-indexed).
+        human_agreement : float | None
+            Human agreement score. If None, uses self.human_baseline
+            (which must have been set via compute_human_baseline).
+
+        Returns
+        -------
+        bool
+            True if model has converged, False otherwise.
+
+        Raises
+        ------
+        ValueError
+            If human_agreement is None and human_baseline not set.
+
+        Examples
+        --------
+        >>> detector = ConvergenceDetector(min_iterations=2, convergence_threshold=0.05)
+        >>> detector.human_baseline = 0.80
+        >>> # Too early (iteration 1 < min_iterations 2)
+        >>> detector.check_convergence(0.79, iteration=1)
+        False
+        >>> # Still not converged (0.74 < 0.80 - 0.05)
+        >>> detector.check_convergence(0.74, iteration=3)
+        False
+        >>> # Converged (0.77 >= 0.80 - 0.05)
+        >>> detector.check_convergence(0.77, iteration=3)
+        True
+        """
+        # Check minimum iterations
+        if iteration < self.min_iterations:
+            return False
+
+        # Get human baseline
+        if human_agreement is None:
+            if self.human_baseline is None:
+                raise ValueError(
+                    "human_agreement is None and human_baseline not set. "
+                    "Call compute_human_baseline first or pass human_agreement."
+                )
+            human_agreement = self.human_baseline
+
+        # Check if model is within threshold of human performance
+        required_accuracy = human_agreement - self.convergence_threshold
+        return model_accuracy >= required_accuracy
+
+    def compute_statistical_test(
+        self,
+        model_predictions: list[Label],
+        human_consensus: list[Label],
+        test_type: str = "mcnemar",
+    ) -> dict[str, float]:
+        """Run statistical test comparing model to human performance.
+
+        Parameters
+        ----------
+        model_predictions : list[Label]
+            Model's predictions.
+        human_consensus : list[Label]
+            Human consensus labels (e.g., majority vote).
+        test_type : str, default="mcnemar"
+            Type of statistical test:
+            - "mcnemar": McNemar's test for paired nominal data
+            - "ttest": Paired t-test (requires multiple samples)
+
+        Returns
+        -------
+        dict[str, float]
+            Dictionary with keys 'statistic' and 'p_value'.
+
+        Raises
+        ------
+        ValueError
+            If predictions and consensus have different lengths.
+
+        Examples
+        --------
+        >>> detector = ConvergenceDetector()
+        >>> model_preds = [1, 1, 0, 1, 0]
+        >>> human_consensus = [1, 1, 0, 0, 0]
+        >>> result = detector.compute_statistical_test(model_preds, human_consensus)
+        >>> 'statistic' in result and 'p_value' in result
+        True
+        """
+        if len(model_predictions) != len(human_consensus):
+            raise ValueError(
+                f"model_predictions and human_consensus must have same length: "
+                f"{len(model_predictions)} != {len(human_consensus)}"
+            )
+
+        if test_type == "mcnemar":
+            # McNemar's test for paired predictions
+            # Contingency table: [correct_model, incorrect_model] ×
+            # [correct_human, incorrect_human]
+
+            # Actually, for McNemar we need a reference (ground truth)
+            # Instead, we'll use a binomial test to check if model accuracy
+            # differs significantly from human accuracy
+
+            model_correct = [
+                mp == hc
+                for mp, hc in zip(model_predictions, human_consensus, strict=True)
+            ]
+            model_accuracy = sum(model_correct) / len(model_correct)
+            human_accuracy = 1.0  # Assuming human_consensus is "correct"
+
+            # Binomial test: is model accuracy significantly different from human?
+            n = len(model_correct)
+            k = sum(model_correct)
+
+            # Two-tailed test
+            result = binomtest(k, n, human_accuracy, alternative="two-sided")
+            p_value = result.pvalue
+
+            return {
+                "statistic": float(model_accuracy),
+                "p_value": float(p_value),
+            }
+
+        elif test_type == "ttest":
+            # Paired t-test comparing model predictions to human consensus
+            # Convert predictions to correctness scores (1 if match, 0 if not)
+            model_scores = np.array(
+                [
+                    1.0 if mp == hc else 0.0
+                    for mp, hc in zip(model_predictions, human_consensus, strict=True)
+                ]
+            )
+            # Human consensus is always "correct" (1.0) by definition
+            human_scores = np.ones(len(human_consensus), dtype=float)
+
+            # Paired t-test: test if model scores differ from human scores
+            statistic, p_value = ttest_rel(model_scores, human_scores)
+
+            return {
+                "statistic": float(statistic),
+                "p_value": float(p_value),
+            }
+
+        else:
+            raise ValueError(
+                f"Unknown test_type: {test_type}. Must be 'mcnemar' or 'ttest'."
+            )
+
+    def get_convergence_report(
+        self,
+        model_accuracy: float,
+        iteration: int,
+        human_agreement: float | None = None,
+    ) -> ConvergenceReport:
+        """Generate convergence report with status and metrics.
+
+        Parameters
+        ----------
+        model_accuracy : float
+            Model's current accuracy.
+        iteration : int
+            Current iteration number.
+        human_agreement : float | None
+            Human agreement score (uses baseline if None).
+
+        Returns
+        -------
+        ConvergenceReport
+            Report with convergence status and metrics.
+
+        Examples
+        --------
+        >>> detector = ConvergenceDetector(convergence_threshold=0.05)
+        >>> detector.human_baseline = 0.80
+        >>> report = detector.get_convergence_report(0.77, iteration=5)
+        >>> report['converged']
+        True
+        >>> report['gap']
+        0.03
+        """
+        # Get human baseline
+        if human_agreement is None:
+            if self.human_baseline is None:
+                raise ValueError("human_agreement is None and human_baseline not set")
+            human_agreement = self.human_baseline
+
+        # Check convergence
+        converged = self.check_convergence(model_accuracy, iteration, human_agreement)
+
+        # Compute metrics
+        gap = human_agreement - model_accuracy
+        required_accuracy = human_agreement - self.convergence_threshold
+        meets_min_iterations = iteration >= self.min_iterations
+
+        return {
+            "converged": converged,
+            "model_accuracy": model_accuracy,
+            "human_agreement": human_agreement,
+            "gap": gap,
+            "required_accuracy": required_accuracy,
+            "threshold": self.convergence_threshold,
+            "iteration": iteration,
+            "meets_min_iterations": meets_min_iterations,
+            "min_iterations_required": self.min_iterations,
+        }
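
To illustrate how the pieces above fit together, the following is a minimal, hypothetical sketch (not taken from the package) of a stopping check inside an active-learning loop. The class, method names, and signatures come from convergence.py above; the train_and_evaluate helper and all numbers are invented for illustration.

from bead.evaluation import ConvergenceDetector


def train_and_evaluate(iteration: int) -> float:
    """Stand-in for the caller's own training step; returns a made-up accuracy."""
    return min(0.70 + 0.02 * iteration, 0.95)


detector = ConvergenceDetector(
    human_agreement_metric="krippendorff_alpha",
    convergence_threshold=0.05,
    min_iterations=3,
)

# Human baseline from annotator ratings (None marks a missing rating).
detector.compute_human_baseline(
    {
        "rater1": [1, 1, 0, 1, 0, None],
        "rater2": [1, 1, 0, 0, 0, 1],
        "rater3": [1, 0, 0, 1, 0, 1],
    }
)

for iteration in range(1, 11):
    accuracy = train_and_evaluate(iteration)
    report = detector.get_convergence_report(accuracy, iteration=iteration)
    if report["converged"]:
        break

# Optional significance check of model predictions against human consensus labels.
stats = detector.compute_statistical_test(
    model_predictions=[1, 1, 0, 1, 0],
    human_consensus=[1, 1, 0, 0, 0],
    test_type="ttest",
)

The "ttest" branch is chosen here only because the "mcnemar" branch, as the code's own comments note, falls back to a binomial test against an assumed perfect human accuracy.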