bead-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bead/__init__.py +11 -0
- bead/__main__.py +11 -0
- bead/active_learning/__init__.py +15 -0
- bead/active_learning/config.py +231 -0
- bead/active_learning/loop.py +566 -0
- bead/active_learning/models/__init__.py +24 -0
- bead/active_learning/models/base.py +852 -0
- bead/active_learning/models/binary.py +910 -0
- bead/active_learning/models/categorical.py +943 -0
- bead/active_learning/models/cloze.py +862 -0
- bead/active_learning/models/forced_choice.py +956 -0
- bead/active_learning/models/free_text.py +773 -0
- bead/active_learning/models/lora.py +365 -0
- bead/active_learning/models/magnitude.py +835 -0
- bead/active_learning/models/multi_select.py +795 -0
- bead/active_learning/models/ordinal_scale.py +811 -0
- bead/active_learning/models/peft_adapter.py +155 -0
- bead/active_learning/models/random_effects.py +639 -0
- bead/active_learning/selection.py +354 -0
- bead/active_learning/strategies.py +391 -0
- bead/active_learning/trainers/__init__.py +26 -0
- bead/active_learning/trainers/base.py +210 -0
- bead/active_learning/trainers/data_collator.py +172 -0
- bead/active_learning/trainers/dataset_utils.py +261 -0
- bead/active_learning/trainers/huggingface.py +304 -0
- bead/active_learning/trainers/lightning.py +324 -0
- bead/active_learning/trainers/metrics.py +424 -0
- bead/active_learning/trainers/mixed_effects.py +551 -0
- bead/active_learning/trainers/model_wrapper.py +509 -0
- bead/active_learning/trainers/registry.py +104 -0
- bead/adapters/__init__.py +11 -0
- bead/adapters/huggingface.py +61 -0
- bead/behavioral/__init__.py +116 -0
- bead/behavioral/analytics.py +646 -0
- bead/behavioral/extraction.py +343 -0
- bead/behavioral/merging.py +343 -0
- bead/cli/__init__.py +11 -0
- bead/cli/active_learning.py +513 -0
- bead/cli/active_learning_commands.py +779 -0
- bead/cli/completion.py +359 -0
- bead/cli/config.py +624 -0
- bead/cli/constraint_builders.py +286 -0
- bead/cli/deployment.py +859 -0
- bead/cli/deployment_trials.py +493 -0
- bead/cli/deployment_ui.py +332 -0
- bead/cli/display.py +378 -0
- bead/cli/items.py +960 -0
- bead/cli/items_factories.py +776 -0
- bead/cli/list_constraints.py +714 -0
- bead/cli/lists.py +490 -0
- bead/cli/main.py +430 -0
- bead/cli/models.py +877 -0
- bead/cli/resource_loaders.py +621 -0
- bead/cli/resources.py +1036 -0
- bead/cli/shell.py +356 -0
- bead/cli/simulate.py +840 -0
- bead/cli/templates.py +1158 -0
- bead/cli/training.py +1080 -0
- bead/cli/utils.py +614 -0
- bead/cli/workflow.py +1273 -0
- bead/config/__init__.py +68 -0
- bead/config/active_learning.py +1009 -0
- bead/config/config.py +192 -0
- bead/config/defaults.py +118 -0
- bead/config/deployment.py +217 -0
- bead/config/env.py +147 -0
- bead/config/item.py +45 -0
- bead/config/list.py +193 -0
- bead/config/loader.py +149 -0
- bead/config/logging.py +42 -0
- bead/config/model.py +49 -0
- bead/config/paths.py +46 -0
- bead/config/profiles.py +320 -0
- bead/config/resources.py +47 -0
- bead/config/serialization.py +210 -0
- bead/config/simulation.py +206 -0
- bead/config/template.py +238 -0
- bead/config/validation.py +267 -0
- bead/data/__init__.py +65 -0
- bead/data/base.py +87 -0
- bead/data/identifiers.py +97 -0
- bead/data/language_codes.py +61 -0
- bead/data/metadata.py +270 -0
- bead/data/range.py +123 -0
- bead/data/repository.py +358 -0
- bead/data/serialization.py +249 -0
- bead/data/timestamps.py +89 -0
- bead/data/validation.py +349 -0
- bead/data_collection/__init__.py +11 -0
- bead/data_collection/jatos.py +223 -0
- bead/data_collection/merger.py +154 -0
- bead/data_collection/prolific.py +198 -0
- bead/deployment/__init__.py +5 -0
- bead/deployment/distribution.py +402 -0
- bead/deployment/jatos/__init__.py +1 -0
- bead/deployment/jatos/api.py +200 -0
- bead/deployment/jatos/exporter.py +210 -0
- bead/deployment/jspsych/__init__.py +9 -0
- bead/deployment/jspsych/biome.json +44 -0
- bead/deployment/jspsych/config.py +411 -0
- bead/deployment/jspsych/generator.py +598 -0
- bead/deployment/jspsych/package.json +51 -0
- bead/deployment/jspsych/pnpm-lock.yaml +2141 -0
- bead/deployment/jspsych/randomizer.py +299 -0
- bead/deployment/jspsych/src/lib/list-distributor.test.ts +327 -0
- bead/deployment/jspsych/src/lib/list-distributor.ts +1282 -0
- bead/deployment/jspsych/src/lib/randomizer.test.ts +232 -0
- bead/deployment/jspsych/src/lib/randomizer.ts +367 -0
- bead/deployment/jspsych/src/plugins/cloze-dropdown.ts +252 -0
- bead/deployment/jspsych/src/plugins/forced-choice.ts +265 -0
- bead/deployment/jspsych/src/plugins/plugins.test.ts +141 -0
- bead/deployment/jspsych/src/plugins/rating.ts +248 -0
- bead/deployment/jspsych/src/slopit/index.ts +9 -0
- bead/deployment/jspsych/src/types/jatos.d.ts +256 -0
- bead/deployment/jspsych/src/types/jspsych.d.ts +228 -0
- bead/deployment/jspsych/templates/experiment.css +1 -0
- bead/deployment/jspsych/templates/experiment.js.template +289 -0
- bead/deployment/jspsych/templates/index.html +51 -0
- bead/deployment/jspsych/templates/randomizer.js +241 -0
- bead/deployment/jspsych/templates/randomizer.js.template +313 -0
- bead/deployment/jspsych/trials.py +723 -0
- bead/deployment/jspsych/tsconfig.json +23 -0
- bead/deployment/jspsych/tsup.config.ts +30 -0
- bead/deployment/jspsych/ui/__init__.py +1 -0
- bead/deployment/jspsych/ui/components.py +383 -0
- bead/deployment/jspsych/ui/styles.py +411 -0
- bead/dsl/__init__.py +80 -0
- bead/dsl/ast.py +168 -0
- bead/dsl/context.py +178 -0
- bead/dsl/errors.py +71 -0
- bead/dsl/evaluator.py +570 -0
- bead/dsl/grammar.lark +81 -0
- bead/dsl/parser.py +231 -0
- bead/dsl/stdlib.py +929 -0
- bead/evaluation/__init__.py +13 -0
- bead/evaluation/convergence.py +485 -0
- bead/evaluation/interannotator.py +398 -0
- bead/items/__init__.py +40 -0
- bead/items/adapters/__init__.py +70 -0
- bead/items/adapters/anthropic.py +224 -0
- bead/items/adapters/api_utils.py +167 -0
- bead/items/adapters/base.py +216 -0
- bead/items/adapters/google.py +259 -0
- bead/items/adapters/huggingface.py +1074 -0
- bead/items/adapters/openai.py +323 -0
- bead/items/adapters/registry.py +202 -0
- bead/items/adapters/sentence_transformers.py +224 -0
- bead/items/adapters/togetherai.py +309 -0
- bead/items/binary.py +515 -0
- bead/items/cache.py +558 -0
- bead/items/categorical.py +593 -0
- bead/items/cloze.py +757 -0
- bead/items/constructor.py +784 -0
- bead/items/forced_choice.py +413 -0
- bead/items/free_text.py +681 -0
- bead/items/generation.py +432 -0
- bead/items/item.py +396 -0
- bead/items/item_template.py +787 -0
- bead/items/magnitude.py +573 -0
- bead/items/multi_select.py +621 -0
- bead/items/ordinal_scale.py +569 -0
- bead/items/scoring.py +448 -0
- bead/items/validation.py +723 -0
- bead/lists/__init__.py +30 -0
- bead/lists/balancer.py +263 -0
- bead/lists/constraints.py +1067 -0
- bead/lists/experiment_list.py +286 -0
- bead/lists/list_collection.py +378 -0
- bead/lists/partitioner.py +1141 -0
- bead/lists/stratification.py +254 -0
- bead/participants/__init__.py +73 -0
- bead/participants/collection.py +699 -0
- bead/participants/merging.py +312 -0
- bead/participants/metadata_spec.py +491 -0
- bead/participants/models.py +276 -0
- bead/resources/__init__.py +29 -0
- bead/resources/adapters/__init__.py +19 -0
- bead/resources/adapters/base.py +104 -0
- bead/resources/adapters/cache.py +128 -0
- bead/resources/adapters/glazing.py +508 -0
- bead/resources/adapters/registry.py +117 -0
- bead/resources/adapters/unimorph.py +796 -0
- bead/resources/classification.py +856 -0
- bead/resources/constraint_builders.py +329 -0
- bead/resources/constraints.py +165 -0
- bead/resources/lexical_item.py +223 -0
- bead/resources/lexicon.py +744 -0
- bead/resources/loaders.py +209 -0
- bead/resources/template.py +441 -0
- bead/resources/template_collection.py +707 -0
- bead/resources/template_generation.py +349 -0
- bead/simulation/__init__.py +29 -0
- bead/simulation/annotators/__init__.py +15 -0
- bead/simulation/annotators/base.py +175 -0
- bead/simulation/annotators/distance_based.py +135 -0
- bead/simulation/annotators/lm_based.py +114 -0
- bead/simulation/annotators/oracle.py +182 -0
- bead/simulation/annotators/random.py +181 -0
- bead/simulation/dsl_extension/__init__.py +3 -0
- bead/simulation/noise_models/__init__.py +13 -0
- bead/simulation/noise_models/base.py +42 -0
- bead/simulation/noise_models/random_noise.py +82 -0
- bead/simulation/noise_models/systematic.py +132 -0
- bead/simulation/noise_models/temperature.py +86 -0
- bead/simulation/runner.py +144 -0
- bead/simulation/strategies/__init__.py +23 -0
- bead/simulation/strategies/base.py +123 -0
- bead/simulation/strategies/binary.py +103 -0
- bead/simulation/strategies/categorical.py +123 -0
- bead/simulation/strategies/cloze.py +224 -0
- bead/simulation/strategies/forced_choice.py +127 -0
- bead/simulation/strategies/free_text.py +105 -0
- bead/simulation/strategies/magnitude.py +116 -0
- bead/simulation/strategies/multi_select.py +129 -0
- bead/simulation/strategies/ordinal_scale.py +131 -0
- bead/templates/__init__.py +27 -0
- bead/templates/adapters/__init__.py +17 -0
- bead/templates/adapters/base.py +128 -0
- bead/templates/adapters/cache.py +178 -0
- bead/templates/adapters/huggingface.py +312 -0
- bead/templates/combinatorics.py +103 -0
- bead/templates/filler.py +605 -0
- bead/templates/renderers.py +177 -0
- bead/templates/resolver.py +178 -0
- bead/templates/strategies.py +1806 -0
- bead/templates/streaming.py +195 -0
- bead-0.1.0.dist-info/METADATA +212 -0
- bead-0.1.0.dist-info/RECORD +231 -0
- bead-0.1.0.dist-info/WHEEL +4 -0
- bead-0.1.0.dist-info/entry_points.txt +2 -0
- bead-0.1.0.dist-info/licenses/LICENSE +21 -0
bead/items/scoring.py
ADDED
@@ -0,0 +1,448 @@
+"""Abstract base classes for item scoring with language models.
+
+This module provides language-agnostic base classes for scoring items
+using various metrics (log probability, perplexity, embeddings).
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from bead.items.adapters.huggingface import HuggingFaceLanguageModel
+from collections.abc import Callable
+from uuid import UUID, uuid4
+
+import numpy as np
+
+from bead.items.cache import ModelOutputCache
+from bead.items.item import Item
+
+
+class ItemScorer(ABC):
+    """Abstract base class for item scoring.
+
+    ItemScorer provides a framework for assigning numeric scores to items
+    based on various criteria (language model probability, acceptability,
+    similarity, etc.).
+
+    Examples
+    --------
+    Implementing a custom scorer:
+    >>> class AcceptabilityScorer(ItemScorer):
+    ...     def score(self, item):
+    ...         # Score based on some acceptability metric
+    ...         text = item.rendered_elements.get("text", "")
+    ...         return self._compute_acceptability(text)
+    ...
+    ...     def score_batch(self, items):
+    ...         return [self.score(item) for item in items]
+    """
+
+    @abstractmethod
+    def score(self, item: Item) -> float:
+        """Compute score for a single item.
+
+        Parameters
+        ----------
+        item : Item
+            Item to score.
+
+        Returns
+        -------
+        float
+            Numeric score for the item.
+        """
+        ...
+
+    def score_batch(self, items: list[Item]) -> list[float]:
+        """Compute scores for multiple items.
+
+        Default implementation calls score() for each item sequentially.
+        Subclasses can override for batch processing optimization.
+
+        Parameters
+        ----------
+        items : list[Item]
+            Items to score.
+
+        Returns
+        -------
+        list[float]
+            Scores for each item.
+
+        Examples
+        --------
+        >>> scorer = ConcreteScorer()
+        >>> items = [item1, item2, item3]
+        >>> scores = scorer.score_batch(items)  # doctest: +SKIP
+        >>> len(scores) == len(items)
+        True
+        """
+        return [self.score(item) for item in items]
+
+    def score_with_metadata(
+        self, items: list[Item]
+    ) -> dict[UUID, dict[str, float | str]]:
+        """Score items and return results with metadata.
+
+        Parameters
+        ----------
+        items
+            Items to score.
+
+        Returns
+        -------
+        dict[UUID, dict[str, float | str]]
+            Dictionary mapping item UUIDs to score dictionaries.
+            Each score dict contains at least a "score" key.
+
+        Examples
+        --------
+        >>> scorer = ConcreteScorer()
+        >>> results = scorer.score_with_metadata([item1, item2])  # doctest: +SKIP
+        >>> results[item1.id]["score"]  # doctest: +SKIP
+        -42.5
+        """
+        scores = self.score_batch(items)
+
+        results: dict[UUID, dict[str, float | str]] = {}
+        for item, score in zip(items, scores, strict=True):
+            results[item.id] = {"score": score}
+
+        return results
+
+
+class LanguageModelScorer(ItemScorer):
+    """Scorer using language model log probabilities.
+
+    Scores items based on their log probability under a language model.
+    Uses HuggingFace adapters for model inference and supports caching.
+
+    Parameters
+    ----------
+    model_name : str
+        HuggingFace model identifier (e.g., "gpt2", "gpt2-medium").
+    cache_dir : Path | str | None
+        Directory for caching model outputs. If None, no caching.
+    device : str
+        Device to run model on ("cpu", "cuda", "mps").
+    text_key : str
+        Key in item.rendered_elements to use as text (default: "text").
+    model_version : str
+        Version string for cache tracking.
+
+    Examples
+    --------
+    >>> from pathlib import Path
+    >>> scorer = LanguageModelScorer(
+    ...     model_name="gpt2",
+    ...     cache_dir=Path(".cache"),
+    ...     device="cpu"
+    ... )  # doctest: +SKIP
+    >>> score = scorer.score(item)  # doctest: +SKIP
+    >>> score < 0  # Log probabilities are negative  # doctest: +SKIP
+    True
+    """
+
+    def __init__(
+        self,
+        model_name: str,
+        cache_dir: Path | str | None = None,
+        device: str = "cpu",
+        text_key: str = "text",
+        model_version: str = "unknown",
+    ) -> None:
+        self.model_name = model_name
+        self.cache_dir = Path(cache_dir) if cache_dir else None
+        self.device = device
+        self.text_key = text_key
+        self.model_version = model_version
+
+        # lazy loading of model and cache
+        self._model: HuggingFaceLanguageModel | None = None
+        self._cache: ModelOutputCache | None = None
+
+    @property
+    def model(self) -> HuggingFaceLanguageModel:
+        """Get the model, loading if necessary.
+
+        Returns
+        -------
+        HuggingFaceLanguageModel
+            The language model adapter.
+        """
+        if self._model is None:
+            # import here to avoid circular dependency
+            from bead.items.adapters.huggingface import (  # noqa: PLC0415
+                HuggingFaceLanguageModel,
+            )
+
+            # set up cache
+            if self.cache_dir:
+                self._cache = ModelOutputCache(cache_dir=self.cache_dir)
+            else:
+                # create a no-op cache
+                self._cache = ModelOutputCache(cache_dir=Path(".cache/temp"))
+
+            self._model = HuggingFaceLanguageModel(
+                model_name=self.model_name,
+                cache=self._cache,
+                device=self.device,  # type: ignore[arg-type]
+                model_version=self.model_version,
+            )
+
+        return self._model
+
+    def score(self, item: Item) -> float:
+        """Compute log probability score for an item.
+
+        Parameters
+        ----------
+        item : Item
+            Item to score.
+
+        Returns
+        -------
+        float
+            Log probability of the item's text under the language model.
+
+        Raises
+        ------
+        KeyError
+            If text_key not found in item.rendered_elements.
+        """
+        text = item.rendered_elements.get(self.text_key)
+        if text is None:
+            raise KeyError(f"Key '{self.text_key}' not found in item.rendered_elements")
+
+        return self.model.compute_log_probability(text)
+
+    def score_batch(
+        self, items: list[Item], batch_size: int | None = None
+    ) -> list[float]:
+        """Compute scores for multiple items efficiently using batched inference.
+
+        Parameters
+        ----------
+        items : list[Item]
+            Items to score.
+        batch_size : int | None, default=None
+            Number of items to process in each batch. If None, automatically
+            infers optimal batch size based on available resources.
+
+        Returns
+        -------
+        list[float]
+            Log probabilities for each item.
+        """
+        # Extract texts
+        texts: list[str] = []
+        for item in items:
+            text_val = item.rendered_elements.get(self.text_key)
+            if text_val is None:
+                msg = (
+                    f"Key '{self.text_key}' not found in "
+                    f"item {item.id}.rendered_elements"
+                )
+                raise KeyError(msg)
+            # Type narrowing - text_val is now known to be str after this check
+            assert isinstance(text_val, str), f"Expected str, got {type(text_val)}"
+            texts.append(text_val)
+
+        # Use batched scoring if available, otherwise fall back to sequential
+        if hasattr(self.model, "compute_log_probability_batch"):
+            scores = self.model.compute_log_probability_batch(
+                texts, batch_size=batch_size
+            )
+        else:
+            # Fallback for models without batch support
+            scores = [self.model.compute_log_probability(text) for text in texts]
+
+        return scores
+
+    def score_with_metadata(
+        self, items: list[Item]
+    ) -> dict[UUID, dict[str, float | str]]:
+        """Score items and return results with additional metrics.
+
+        Returns log probability and perplexity for each item.
+
+        Parameters
+        ----------
+        items
+            Items to score.
+
+        Returns
+        -------
+        dict[UUID, dict[str, float | str]]
+            Dictionary with "score" (log prob) and "perplexity" for each item.
+        """
+        scores = self.score_batch(items)
+
+        results: dict[UUID, dict[str, float | str]] = {}
+        for item, score in zip(items, scores, strict=True):
+            # compute perplexity from log probability
+            # perplexity = exp(-log_prob / num_tokens)
+            # for now, just include log_prob; perplexity computation
+            # requires token count which we'd need to get from the model
+            results[item.id] = {
+                "score": score,
+                "log_probability": score,
+                "model": self.model_name,
+            }
+
+        return results
+
+
+class ForcedChoiceScorer(ItemScorer):
+    """Scorer for N-AFC (forced-choice) items with multiple options.
+
+    Computes comparison scores for forced-choice items by scoring each
+    option and applying a comparison function (e.g., max difference,
+    variance, entropy).
+
+    Parameters
+    ----------
+    base_scorer : ItemScorer
+        Base scorer to use for individual options.
+    comparison_fn : callable | None
+        Function that takes list of scores and returns comparison metric.
+        Default is standard deviation (variance in scores).
+    option_prefix : str
+        Prefix for option names in rendered_elements (default: "option").
+
+    Examples
+    --------
+    >>> base = LanguageModelScorer("gpt2", device="cpu")  # doctest: +SKIP
+    >>> fc_scorer = ForcedChoiceScorer(
+    ...     base_scorer=base,
+    ...     comparison_fn=lambda scores: max(scores) - min(scores)  # Range
+    ... )  # doctest: +SKIP
+    >>> # Item with option_a, option_b, option_c, ...
+    >>> score = fc_scorer.score(forced_choice_item)  # doctest: +SKIP
+    """
+
+    def __init__(
+        self,
+        base_scorer: ItemScorer,
+        comparison_fn: Callable[[list[float]], float] | None = None,
+        option_prefix: str = "option",
+    ) -> None:
+        self.base_scorer = base_scorer
+        self.option_prefix = option_prefix
+
+        if comparison_fn is None:
+            # default: standard deviation of scores
+            self.comparison_fn: Callable[[list[float]], float] = (
+                self._default_comparison
+            )
+        else:
+            self.comparison_fn = comparison_fn
+
+    @staticmethod
+    def _default_comparison(scores: list[float]) -> float:
+        """Compute standard deviation of scores."""
+        return float(np.std(scores))
+
+    def score(self, item: Item) -> float:
+        """Score a forced-choice item.
+
+        Extracts all options from item.rendered_elements (option_a, option_b, ...),
+        scores each option, and applies comparison function.
+
+        Parameters
+        ----------
+        item : Item
+            Forced-choice item with multiple options.
+
+        Returns
+        -------
+        float
+            Comparison score across all options.
+
+        Raises
+        ------
+        ValueError
+            If item doesn't contain option elements or has precomputed scores.
+        """
+        # try to get precomputed scores from metadata first
+        # look for lm_score_0, lm_score_1, ... or lm_score_a, lm_score_b, ...
+        precomputed_scores = self._extract_precomputed_scores(item)
+        if precomputed_scores:
+            return self.comparison_fn(precomputed_scores)
+
+        # otherwise score each option element
+        option_scores: list[float] = []
+        letters = "abcdefghijklmnopqrstuvwxyz"
+
+        for letter in letters:
+            option_name = f"{self.option_prefix}_{letter}"
+            if option_name not in item.rendered_elements:
+                break  # no more options
+
+            # create temporary item for scoring this option
+            option_text = item.rendered_elements[option_name]
+            temp_item = Item(
+                item_template_id=uuid4(),
+                rendered_elements={"text": option_text},
+            )
+            score: float = self.base_scorer.score(temp_item)
+            option_scores.append(score)
+
+        if not option_scores:
+            raise ValueError(
+                f"Item has no options with prefix '{self.option_prefix}_' "
+                "in rendered_elements"
+            )
+
+        return self.comparison_fn(option_scores)
+
+    def _extract_precomputed_scores(self, item: Item) -> list[float] | None:
+        """Extract precomputed scores from item metadata if available.
+
+        Looks for keys like: lm_score_0, lm_score_1, ... or
+        lm_score_a, lm_score_b, ...
+
+        Parameters
+        ----------
+        item : Item
+            Item to extract scores from.
+
+        Returns
+        -------
+        list[float] | None
+            List of scores if found, None otherwise.
+        """
+        scores: list[float] = []
+        letters = "abcdefghijklmnopqrstuvwxyz"
+
+        # try numeric indices first (lm_score_0, lm_score_1, ...)
+        for i in range(26):  # max 26 options
+            key = f"lm_score_{i}"
+            if key in item.item_metadata:
+                metadata_val = item.item_metadata[key]
+                if not isinstance(metadata_val, int | float | str):
+                    raise TypeError(f"Expected numeric type, got {type(metadata_val)}")
+                scores.append(float(metadata_val))
+            else:
+                break
+
+        if scores:
+            return scores
+
+        # try letter indices (lm_score_a, lm_score_b, ...)
+        scores = []
+        for letter in letters:
+            key = f"lm_score_{letter}"
+            if key in item.item_metadata:
+                metadata_val = item.item_metadata[key]
+                if not isinstance(metadata_val, int | float | str):
+                    raise TypeError(f"Expected numeric type, got {type(metadata_val)}")
+                scores.append(float(metadata_val))
+            else:
+                break
+
+        return scores if scores else None
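
For orientation, the sketch below shows how the scorers added in this file might be composed. It is not part of the diff: it only uses constructors and methods visible above (LanguageModelScorer, ForcedChoiceScorer, and the same Item(item_template_id=..., rendered_elements=...) call that ForcedChoiceScorer.score uses internally), assumes the Item constructor accepts exactly those fields, and requires the package's optional HuggingFace dependencies to actually run.

from uuid import uuid4

from bead.items.item import Item
from bead.items.scoring import ForcedChoiceScorer, LanguageModelScorer

# A forced-choice item with two options, following the option_a/option_b
# naming that ForcedChoiceScorer.score iterates over.
item = Item(
    item_template_id=uuid4(),
    rendered_elements={
        "option_a": "The cat sat on the mat.",
        "option_b": "The cat sat in the mat.",
    },
)

# Score each option by its GPT-2 log probability, then compare options by range
# (max minus min) instead of the default standard deviation.
base = LanguageModelScorer(model_name="gpt2", device="cpu")
fc_scorer = ForcedChoiceScorer(
    base_scorer=base,
    comparison_fn=lambda scores: max(scores) - min(scores),
)
print(fc_scorer.score(item))  # larger values mean the options differ more under the LM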
|