bead 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bead/__init__.py +11 -0
- bead/__main__.py +11 -0
- bead/active_learning/__init__.py +15 -0
- bead/active_learning/config.py +231 -0
- bead/active_learning/loop.py +566 -0
- bead/active_learning/models/__init__.py +24 -0
- bead/active_learning/models/base.py +852 -0
- bead/active_learning/models/binary.py +910 -0
- bead/active_learning/models/categorical.py +943 -0
- bead/active_learning/models/cloze.py +862 -0
- bead/active_learning/models/forced_choice.py +956 -0
- bead/active_learning/models/free_text.py +773 -0
- bead/active_learning/models/lora.py +365 -0
- bead/active_learning/models/magnitude.py +835 -0
- bead/active_learning/models/multi_select.py +795 -0
- bead/active_learning/models/ordinal_scale.py +811 -0
- bead/active_learning/models/peft_adapter.py +155 -0
- bead/active_learning/models/random_effects.py +639 -0
- bead/active_learning/selection.py +354 -0
- bead/active_learning/strategies.py +391 -0
- bead/active_learning/trainers/__init__.py +26 -0
- bead/active_learning/trainers/base.py +210 -0
- bead/active_learning/trainers/data_collator.py +172 -0
- bead/active_learning/trainers/dataset_utils.py +261 -0
- bead/active_learning/trainers/huggingface.py +304 -0
- bead/active_learning/trainers/lightning.py +324 -0
- bead/active_learning/trainers/metrics.py +424 -0
- bead/active_learning/trainers/mixed_effects.py +551 -0
- bead/active_learning/trainers/model_wrapper.py +509 -0
- bead/active_learning/trainers/registry.py +104 -0
- bead/adapters/__init__.py +11 -0
- bead/adapters/huggingface.py +61 -0
- bead/behavioral/__init__.py +116 -0
- bead/behavioral/analytics.py +646 -0
- bead/behavioral/extraction.py +343 -0
- bead/behavioral/merging.py +343 -0
- bead/cli/__init__.py +11 -0
- bead/cli/active_learning.py +513 -0
- bead/cli/active_learning_commands.py +779 -0
- bead/cli/completion.py +359 -0
- bead/cli/config.py +624 -0
- bead/cli/constraint_builders.py +286 -0
- bead/cli/deployment.py +859 -0
- bead/cli/deployment_trials.py +493 -0
- bead/cli/deployment_ui.py +332 -0
- bead/cli/display.py +378 -0
- bead/cli/items.py +960 -0
- bead/cli/items_factories.py +776 -0
- bead/cli/list_constraints.py +714 -0
- bead/cli/lists.py +490 -0
- bead/cli/main.py +430 -0
- bead/cli/models.py +877 -0
- bead/cli/resource_loaders.py +621 -0
- bead/cli/resources.py +1036 -0
- bead/cli/shell.py +356 -0
- bead/cli/simulate.py +840 -0
- bead/cli/templates.py +1158 -0
- bead/cli/training.py +1080 -0
- bead/cli/utils.py +614 -0
- bead/cli/workflow.py +1273 -0
- bead/config/__init__.py +68 -0
- bead/config/active_learning.py +1009 -0
- bead/config/config.py +192 -0
- bead/config/defaults.py +118 -0
- bead/config/deployment.py +217 -0
- bead/config/env.py +147 -0
- bead/config/item.py +45 -0
- bead/config/list.py +193 -0
- bead/config/loader.py +149 -0
- bead/config/logging.py +42 -0
- bead/config/model.py +49 -0
- bead/config/paths.py +46 -0
- bead/config/profiles.py +320 -0
- bead/config/resources.py +47 -0
- bead/config/serialization.py +210 -0
- bead/config/simulation.py +206 -0
- bead/config/template.py +238 -0
- bead/config/validation.py +267 -0
- bead/data/__init__.py +65 -0
- bead/data/base.py +87 -0
- bead/data/identifiers.py +97 -0
- bead/data/language_codes.py +61 -0
- bead/data/metadata.py +270 -0
- bead/data/range.py +123 -0
- bead/data/repository.py +358 -0
- bead/data/serialization.py +249 -0
- bead/data/timestamps.py +89 -0
- bead/data/validation.py +349 -0
- bead/data_collection/__init__.py +11 -0
- bead/data_collection/jatos.py +223 -0
- bead/data_collection/merger.py +154 -0
- bead/data_collection/prolific.py +198 -0
- bead/deployment/__init__.py +5 -0
- bead/deployment/distribution.py +402 -0
- bead/deployment/jatos/__init__.py +1 -0
- bead/deployment/jatos/api.py +200 -0
- bead/deployment/jatos/exporter.py +210 -0
- bead/deployment/jspsych/__init__.py +9 -0
- bead/deployment/jspsych/biome.json +44 -0
- bead/deployment/jspsych/config.py +411 -0
- bead/deployment/jspsych/generator.py +598 -0
- bead/deployment/jspsych/package.json +51 -0
- bead/deployment/jspsych/pnpm-lock.yaml +2141 -0
- bead/deployment/jspsych/randomizer.py +299 -0
- bead/deployment/jspsych/src/lib/list-distributor.test.ts +327 -0
- bead/deployment/jspsych/src/lib/list-distributor.ts +1282 -0
- bead/deployment/jspsych/src/lib/randomizer.test.ts +232 -0
- bead/deployment/jspsych/src/lib/randomizer.ts +367 -0
- bead/deployment/jspsych/src/plugins/cloze-dropdown.ts +252 -0
- bead/deployment/jspsych/src/plugins/forced-choice.ts +265 -0
- bead/deployment/jspsych/src/plugins/plugins.test.ts +141 -0
- bead/deployment/jspsych/src/plugins/rating.ts +248 -0
- bead/deployment/jspsych/src/slopit/index.ts +9 -0
- bead/deployment/jspsych/src/types/jatos.d.ts +256 -0
- bead/deployment/jspsych/src/types/jspsych.d.ts +228 -0
- bead/deployment/jspsych/templates/experiment.css +1 -0
- bead/deployment/jspsych/templates/experiment.js.template +289 -0
- bead/deployment/jspsych/templates/index.html +51 -0
- bead/deployment/jspsych/templates/randomizer.js +241 -0
- bead/deployment/jspsych/templates/randomizer.js.template +313 -0
- bead/deployment/jspsych/trials.py +723 -0
- bead/deployment/jspsych/tsconfig.json +23 -0
- bead/deployment/jspsych/tsup.config.ts +30 -0
- bead/deployment/jspsych/ui/__init__.py +1 -0
- bead/deployment/jspsych/ui/components.py +383 -0
- bead/deployment/jspsych/ui/styles.py +411 -0
- bead/dsl/__init__.py +80 -0
- bead/dsl/ast.py +168 -0
- bead/dsl/context.py +178 -0
- bead/dsl/errors.py +71 -0
- bead/dsl/evaluator.py +570 -0
- bead/dsl/grammar.lark +81 -0
- bead/dsl/parser.py +231 -0
- bead/dsl/stdlib.py +929 -0
- bead/evaluation/__init__.py +13 -0
- bead/evaluation/convergence.py +485 -0
- bead/evaluation/interannotator.py +398 -0
- bead/items/__init__.py +40 -0
- bead/items/adapters/__init__.py +70 -0
- bead/items/adapters/anthropic.py +224 -0
- bead/items/adapters/api_utils.py +167 -0
- bead/items/adapters/base.py +216 -0
- bead/items/adapters/google.py +259 -0
- bead/items/adapters/huggingface.py +1074 -0
- bead/items/adapters/openai.py +323 -0
- bead/items/adapters/registry.py +202 -0
- bead/items/adapters/sentence_transformers.py +224 -0
- bead/items/adapters/togetherai.py +309 -0
- bead/items/binary.py +515 -0
- bead/items/cache.py +558 -0
- bead/items/categorical.py +593 -0
- bead/items/cloze.py +757 -0
- bead/items/constructor.py +784 -0
- bead/items/forced_choice.py +413 -0
- bead/items/free_text.py +681 -0
- bead/items/generation.py +432 -0
- bead/items/item.py +396 -0
- bead/items/item_template.py +787 -0
- bead/items/magnitude.py +573 -0
- bead/items/multi_select.py +621 -0
- bead/items/ordinal_scale.py +569 -0
- bead/items/scoring.py +448 -0
- bead/items/validation.py +723 -0
- bead/lists/__init__.py +30 -0
- bead/lists/balancer.py +263 -0
- bead/lists/constraints.py +1067 -0
- bead/lists/experiment_list.py +286 -0
- bead/lists/list_collection.py +378 -0
- bead/lists/partitioner.py +1141 -0
- bead/lists/stratification.py +254 -0
- bead/participants/__init__.py +73 -0
- bead/participants/collection.py +699 -0
- bead/participants/merging.py +312 -0
- bead/participants/metadata_spec.py +491 -0
- bead/participants/models.py +276 -0
- bead/resources/__init__.py +29 -0
- bead/resources/adapters/__init__.py +19 -0
- bead/resources/adapters/base.py +104 -0
- bead/resources/adapters/cache.py +128 -0
- bead/resources/adapters/glazing.py +508 -0
- bead/resources/adapters/registry.py +117 -0
- bead/resources/adapters/unimorph.py +796 -0
- bead/resources/classification.py +856 -0
- bead/resources/constraint_builders.py +329 -0
- bead/resources/constraints.py +165 -0
- bead/resources/lexical_item.py +223 -0
- bead/resources/lexicon.py +744 -0
- bead/resources/loaders.py +209 -0
- bead/resources/template.py +441 -0
- bead/resources/template_collection.py +707 -0
- bead/resources/template_generation.py +349 -0
- bead/simulation/__init__.py +29 -0
- bead/simulation/annotators/__init__.py +15 -0
- bead/simulation/annotators/base.py +175 -0
- bead/simulation/annotators/distance_based.py +135 -0
- bead/simulation/annotators/lm_based.py +114 -0
- bead/simulation/annotators/oracle.py +182 -0
- bead/simulation/annotators/random.py +181 -0
- bead/simulation/dsl_extension/__init__.py +3 -0
- bead/simulation/noise_models/__init__.py +13 -0
- bead/simulation/noise_models/base.py +42 -0
- bead/simulation/noise_models/random_noise.py +82 -0
- bead/simulation/noise_models/systematic.py +132 -0
- bead/simulation/noise_models/temperature.py +86 -0
- bead/simulation/runner.py +144 -0
- bead/simulation/strategies/__init__.py +23 -0
- bead/simulation/strategies/base.py +123 -0
- bead/simulation/strategies/binary.py +103 -0
- bead/simulation/strategies/categorical.py +123 -0
- bead/simulation/strategies/cloze.py +224 -0
- bead/simulation/strategies/forced_choice.py +127 -0
- bead/simulation/strategies/free_text.py +105 -0
- bead/simulation/strategies/magnitude.py +116 -0
- bead/simulation/strategies/multi_select.py +129 -0
- bead/simulation/strategies/ordinal_scale.py +131 -0
- bead/templates/__init__.py +27 -0
- bead/templates/adapters/__init__.py +17 -0
- bead/templates/adapters/base.py +128 -0
- bead/templates/adapters/cache.py +178 -0
- bead/templates/adapters/huggingface.py +312 -0
- bead/templates/combinatorics.py +103 -0
- bead/templates/filler.py +605 -0
- bead/templates/renderers.py +177 -0
- bead/templates/resolver.py +178 -0
- bead/templates/strategies.py +1806 -0
- bead/templates/streaming.py +195 -0
- bead-0.1.0.dist-info/METADATA +212 -0
- bead-0.1.0.dist-info/RECORD +231 -0
- bead-0.1.0.dist-info/WHEEL +4 -0
- bead-0.1.0.dist-info/entry_points.txt +2 -0
- bead-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,593 @@
|
|
|
1
|
+
"""Utilities for creating categorical experimental items.
|
|
2
|
+
|
|
3
|
+
This module provides language-agnostic utilities for creating categorical
|
|
4
|
+
items where participants select from N unordered categories (e.g., NLI labels,
|
|
5
|
+
POS tags, semantic relations).
|
|
6
|
+
|
|
7
|
+
Integration Points
|
|
8
|
+
------------------
|
|
9
|
+
- Active Learning: bead/active_learning/models/categorical.py
|
|
10
|
+
- Simulation: bead/simulation/strategies/categorical.py
|
|
11
|
+
- Deployment: bead/deployment/jspsych/ (dropdown or radio buttons)
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from collections import defaultdict
|
|
17
|
+
from collections.abc import Callable, Hashable
|
|
18
|
+
from itertools import product
|
|
19
|
+
from uuid import UUID, uuid4
|
|
20
|
+
|
|
21
|
+
from bead.items.item import Item, MetadataValue
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def create_categorical_item(
    text: str,
    categories: list[str],
    prompt: str | None = None,
    item_template_id: UUID | None = None,
    metadata: dict[str, MetadataValue] | None = None,
) -> Item:
    """Build a single categorical (unordered N-way choice) item.

    Parameters
    ----------
    text : str
        Stimulus text to be classified. Must contain non-whitespace characters.
    categories : list[str]
        Unordered category labels; at least two are required.
    prompt : str | None
        Question shown with the stimulus. Defaults to "Select a category:"
        when None.
    item_template_id : UUID | None
        Template ID stored on the item. A fresh UUID is generated when None.
    metadata : dict[str, MetadataValue] | None
        Extra entries merged into ``item_metadata``. They are applied after
        the ``"categories"`` entry, so a colliding key overrides it.

    Returns
    -------
    Item
        Item whose ``rendered_elements`` holds ``"text"`` and ``"prompt"`` and
        whose ``item_metadata`` holds a copy of ``categories`` plus any extra
        metadata.

    Raises
    ------
    ValueError
        If ``text`` is empty/whitespace-only or fewer than 2 categories are
        given.

    Examples
    --------
    >>> item = create_categorical_item(
    ...     text="Premise: All dogs bark. Hypothesis: Some dogs bark.",
    ...     categories=["entailment", "neutral", "contradiction"],
    ...     prompt="What is the relationship?",
    ...     metadata={"task": "nli"}
    ... )
    >>> item.rendered_elements["prompt"]
    'What is the relationship?'
    >>> item.item_metadata["categories"]
    ['entailment', 'neutral', 'contradiction']
    """
    # Validate inputs up front so nothing is allocated for a bad request.
    if not (text and text.strip()):
        raise ValueError("text cannot be empty")
    if len(categories) < 2:
        raise ValueError("At least 2 categories required for categorical item")

    template_id = uuid4() if item_template_id is None else item_template_id
    shown_prompt = "Select a category:" if prompt is None else prompt

    # Copy the category list so later mutation by the caller cannot leak in;
    # caller-supplied metadata is layered on top of the defaults.
    merged_metadata: dict[str, MetadataValue] = {"categories": list(categories)}
    if metadata:
        merged_metadata.update(metadata)

    return Item(
        item_template_id=template_id,
        rendered_elements={"text": text, "prompt": shown_prompt},
        item_metadata=merged_metadata,
    )
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def create_nli_item(
    premise: str,
    hypothesis: str,
    categories: list[str] | None = None,
    prompt: str | None = None,
    item_template_id: UUID | None = None,
    metadata: dict[str, MetadataValue] | None = None,
) -> Item:
    """Create a Natural Language Inference (NLI) item.

    Specialized helper for NLI tasks: formats the premise/hypothesis pair into
    a single stimulus text, supplies default categories and prompt, and
    records the pair in the item metadata.

    Parameters
    ----------
    premise : str
        The premise text. Must contain non-whitespace characters.
    hypothesis : str
        The hypothesis text. Must contain non-whitespace characters.
    categories : list[str] | None
        Category labels. If None, uses ["entailment", "neutral", "contradiction"].
    prompt : str | None
        Question/prompt. If None, uses "What is the relationship?".
    item_template_id : UUID | None
        Template ID for the item. If None, generates new UUID.
    metadata : dict[str, MetadataValue] | None
        Additional metadata for item_metadata field. Applied after the
        built-in "premise", "hypothesis" and "task" entries, so colliding
        keys override them.

    Returns
    -------
    Item
        NLI categorical item.

    Raises
    ------
    ValueError
        If premise or hypothesis is empty or whitespace-only.

    Examples
    --------
    >>> item = create_nli_item(
    ...     premise="All dogs bark.",
    ...     hypothesis="Some dogs bark."
    ... )
    >>> "Premise:" in item.rendered_elements["text"]
    True
    >>> "Hypothesis:" in item.rendered_elements["text"]
    True
    >>> item.item_metadata["categories"]
    ['entailment', 'neutral', 'contradiction']
    >>> item.item_metadata["premise"]
    'All dogs bark.'

    >>> # Custom categories
    >>> item = create_nli_item(
    ...     premise="The cat is on the mat.",
    ...     hypothesis="There is an animal on the mat.",
    ...     categories=["entails", "contradicts", "neither"]
    ... )
    >>> item.item_metadata["categories"]
    ['entails', 'contradicts', 'neither']
    """
    # The combined stimulus always contains the "Premise:"/"Hypothesis:"
    # labels, so the empty-text check in create_categorical_item can never
    # fire for a blank part. Validate each part here instead.
    if not premise or not premise.strip():
        raise ValueError("premise cannot be empty")
    if not hypothesis or not hypothesis.strip():
        raise ValueError("hypothesis cannot be empty")

    if categories is None:
        categories = ["entailment", "neutral", "contradiction"]

    if prompt is None:
        prompt = "What is the relationship?"

    # Format as premise-hypothesis pair
    combined_text = f"Premise: {premise}\nHypothesis: {hypothesis}"

    # Keep the raw parts in metadata so they can be recovered without parsing
    # the combined text; caller metadata is layered on top.
    nli_metadata: dict[str, MetadataValue] = {
        "premise": premise,
        "hypothesis": hypothesis,
        "task": "nli",
    }
    if metadata:
        nli_metadata.update(metadata)

    return create_categorical_item(
        text=combined_text,
        categories=categories,
        prompt=prompt,
        item_template_id=item_template_id,
        metadata=nli_metadata,
    )
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def create_categorical_items_from_texts(
    texts: list[str],
    categories: list[str],
    prompt: str | None = None,
    *,
    item_template_id: UUID | None = None,
    metadata_fn: Callable[[str], dict[str, MetadataValue]] | None = None,
) -> list[Item]:
    """Build one categorical item per text, all sharing the same categories.

    Parameters
    ----------
    texts : list[str]
        Stimulus texts, one item is produced for each in order.
    categories : list[str]
        Category labels used for every item.
    prompt : str | None
        Question/prompt used for every item.
    item_template_id : UUID | None
        Template ID passed to every item. If None, each item gets its own
        generated ID.
    metadata_fn : Callable[[str], dict[str, MetadataValue]] | None
        Called with each text to produce that item's extra metadata. When
        omitted, items receive no extra metadata.

    Returns
    -------
    list[Item]
        Categorical items in the same order as ``texts``.

    Examples
    --------
    >>> texts = ["The cat sat.", "The dog ran.", "The bird flew."]
    >>> categories = ["past", "present", "future"]
    >>> items = create_categorical_items_from_texts(
    ...     texts,
    ...     categories=categories,
    ...     prompt="What is the tense?"
    ... )
    >>> len(items)
    3
    >>> items[0].item_metadata["categories"]
    ['past', 'present', 'future']
    """
    return [
        create_categorical_item(
            text=stimulus,
            categories=categories,
            prompt=prompt,
            item_template_id=item_template_id,
            # A fresh dict per item when no generator function is supplied.
            metadata=metadata_fn(stimulus) if metadata_fn else {},
        )
        for stimulus in texts
    ]
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def create_categorical_items_from_pairs(
    pairs: list[tuple[str, str]],
    categories: list[str],
    prompt: str | None = None,
    *,
    pair_label1: str = "Text 1",
    pair_label2: str = "Text 2",
    item_template_id: UUID | None = None,
    metadata_fn: (Callable[[str, str], dict[str, MetadataValue]] | None) = None,
) -> list[Item]:
    """Build one categorical item per (text1, text2) pair.

    Each pair is rendered as two labelled lines in a single stimulus text.
    Useful for NLI, paraphrase detection, semantic similarity, etc.

    Parameters
    ----------
    pairs : list[tuple[str, str]]
        (text1, text2) pairs; one item is produced for each in order.
    categories : list[str]
        Category labels used for every item.
    prompt : str | None
        Question/prompt used for every item.
    pair_label1 : str
        Label prefixed to the first text (default: "Text 1").
    pair_label2 : str
        Label prefixed to the second text (default: "Text 2").
    item_template_id : UUID | None
        Template ID passed to every item. If None, each item gets its own
        generated ID.
    metadata_fn : Callable[[str, str], dict[str, MetadataValue]] | None
        Called with (text1, text2) to produce extra metadata, merged over the
        built-in "text1"/"text2" entries.

    Returns
    -------
    list[Item]
        Categorical items in the same order as ``pairs``.

    Examples
    --------
    >>> pairs = [
    ...     ("All dogs bark.", "Some dogs bark."),
    ...     ("The sky is blue.", "The sky is not blue.")
    ... ]
    >>> items = create_categorical_items_from_pairs(
    ...     pairs,
    ...     categories=["entailment", "neutral", "contradiction"],
    ...     prompt="What is the relationship?",
    ...     pair_label1="Premise",
    ...     pair_label2="Hypothesis"
    ... )
    >>> len(items)
    2
    >>> "Premise:" in items[0].rendered_elements["text"]
    True
    """

    def _item_for(first: str, second: str) -> Item:
        # Render the pair as two labelled lines of one stimulus.
        stimulus = f"{pair_label1}: {first}\n{pair_label2}: {second}"

        # Keep the raw halves recoverable without parsing the stimulus.
        pair_metadata: dict[str, MetadataValue] = {"text1": first, "text2": second}
        if metadata_fn:
            pair_metadata.update(metadata_fn(first, second))

        return create_categorical_item(
            text=stimulus,
            categories=categories,
            prompt=prompt,
            item_template_id=item_template_id,
            metadata=pair_metadata,
        )

    return [_item_for(first, second) for first, second in pairs]
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
def create_categorical_items_from_groups(
    items: list[Item],
    group_by: Callable[[Item], Hashable],
    categories: list[str],
    prompt: str | None = None,
    *,
    extract_text: Callable[[Item], str] | None = None,
    include_group_metadata: bool = True,
    item_template_id: UUID | None = None,
) -> list[Item]:
    """Build one categorical item per source item, ordered group by group.

    Source items are first bucketed by ``group_by``; the output then walks the
    buckets in order of first appearance, so items sharing a key end up
    adjacent. Each new item records its source item ID and, optionally, its
    group key in metadata.

    Parameters
    ----------
    items : list[Item]
        Source items to process.
    group_by : Callable[[Item], Hashable]
        Returns the grouping key for a source item.
    categories : list[str]
        Category labels used for every item.
    prompt : str | None
        Question/prompt used for every item.
    extract_text : Callable[[Item], str] | None
        Returns the stimulus text for a source item. If None, the module's
        default extractor is used.
    include_group_metadata : bool
        Whether to store ``str(group key)`` under "group_key" in metadata.
    item_template_id : UUID | None
        Template ID passed to every item. If None, each item gets its own
        generated ID.

    Returns
    -------
    list[Item]
        Categorical items, one per source item.

    Examples
    --------
    >>> source_items = [
    ...     Item(
    ...         uuid4(),
    ...         rendered_elements={"text": "The cat sat."},
    ...         item_metadata={"tense": "past"}
    ...     ),
    ...     Item(
    ...         uuid4(),
    ...         rendered_elements={"text": "The dog runs."},
    ...         item_metadata={"tense": "present"}
    ...     )
    ... ]
    >>> categorical_items = create_categorical_items_from_groups(
    ...     source_items,
    ...     group_by=lambda i: i.item_metadata["tense"],
    ...     categories=["past", "present", "future"],
    ...     prompt="What is the tense?"
    ... )
    >>> len(categorical_items)
    2
    """
    # Pass 1: bucket every source item by its key (insertion-ordered dict).
    buckets: dict[Hashable, list[Item]] = {}
    for source in items:
        buckets.setdefault(group_by(source), []).append(source)

    # Pass 2: emit items bucket by bucket.
    keyed_sources = [
        (key, source) for key, members in buckets.items() for source in members
    ]

    results: list[Item] = []
    for key, source in keyed_sources:
        stimulus: str = (
            extract_text(source) if extract_text else _extract_text_from_item(source)
        )

        provenance: dict[str, MetadataValue] = {"source_item_id": str(source.id)}
        if include_group_metadata:
            provenance["group_key"] = str(key)

        results.append(
            create_categorical_item(
                text=stimulus,
                categories=categories,
                prompt=prompt,
                item_template_id=item_template_id,
                metadata=provenance,
            )
        )

    return results
|
|
430
|
+
|
|
431
|
+
|
|
432
|
+
def create_categorical_items_cross_product(
    texts: list[str],
    prompts: list[str],
    categories: list[str],
    *,
    item_template_id: UUID | None = None,
    metadata_fn: (Callable[[str, str], dict[str, MetadataValue]] | None) = None,
) -> list[Item]:
    """Build a categorical item for every (text, prompt) combination.

    Useful when several prompts should each be applied to every text. Items
    are produced in ``itertools.product`` order: all prompts for the first
    text, then all prompts for the second, and so on.

    Parameters
    ----------
    texts : list[str]
        Stimulus texts.
    prompts : list[str]
        Prompts to apply to each text.
    categories : list[str]
        Category labels used for every item.
    item_template_id : UUID | None
        Template ID passed to every created item.
    metadata_fn : Callable[[str, str], dict[str, MetadataValue]] | None
        Called with (text, prompt) to produce that item's extra metadata.
        When omitted, items receive no extra metadata.

    Returns
    -------
    list[Item]
        ``len(texts) * len(prompts)`` categorical items.

    Examples
    --------
    >>> texts = ["The cat sat.", "The dog ran."]
    >>> prompts = ["What is the tense?", "What is the aspect?"]
    >>> categories = ["past", "present", "future"]
    >>> items = create_categorical_items_cross_product(
    ...     texts, prompts, categories
    ... )
    >>> len(items)
    4
    """
    return [
        create_categorical_item(
            text=stimulus,
            categories=categories,
            prompt=question,
            item_template_id=item_template_id,
            # A fresh dict per item when no generator function is supplied.
            metadata=metadata_fn(stimulus, question) if metadata_fn else {},
        )
        for stimulus, question in product(texts, prompts)
    ]
|
|
490
|
+
|
|
491
|
+
|
|
492
|
+
def create_filtered_categorical_items(
    items: list[Item],
    categories: list[str],
    prompt: str | None = None,
    *,
    item_filter: Callable[[Item], bool] | None = None,
    extract_text: Callable[[Item], str] | None = None,
    item_template_id: UUID | None = None,
) -> list[Item]:
    """Build categorical items from the source items that pass a filter.

    Parameters
    ----------
    items : list[Item]
        Source items.
    categories : list[str]
        Category labels used for every item.
    prompt : str | None
        Question/prompt used for every item.
    item_filter : Callable[[Item], bool] | None
        Predicate selecting which source items to keep. If None, every source
        item is kept.
    extract_text : Callable[[Item], str] | None
        Returns the stimulus text for a source item. If None, the module's
        default extractor is used.
    item_template_id : UUID | None
        Template ID passed to every created item.

    Returns
    -------
    list[Item]
        One categorical item per retained source item, in source order, each
        carrying the source item's ID under "source_item_id" in metadata.

    Examples
    --------
    >>> categorical_items = create_filtered_categorical_items(
    ...     items,
    ...     categories=["past", "present", "future"],
    ...     prompt="What is the tense?",
    ...     item_filter=lambda i: i.item_metadata.get("valid", True)
    ... )  # doctest: +SKIP
    """
    # Run the predicate over the whole input first, before any item is built.
    retained = list(filter(item_filter, items)) if item_filter else items

    results: list[Item] = []
    for source in retained:
        stimulus: str = (
            extract_text(source) if extract_text else _extract_text_from_item(source)
        )
        provenance: dict[str, MetadataValue] = {"source_item_id": str(source.id)}

        results.append(
            create_categorical_item(
                text=stimulus,
                categories=categories,
                prompt=prompt,
                item_template_id=item_template_id,
                metadata=provenance,
            )
        )

    return results
|
|
561
|
+
|
|
562
|
+
|
|
563
|
+
def _extract_text_from_item(item: Item) -> str:
|
|
564
|
+
"""Extract text from item's rendered_elements.
|
|
565
|
+
|
|
566
|
+
Tries common keys: "text", "sentence", "content".
|
|
567
|
+
Raises error if no suitable text found.
|
|
568
|
+
|
|
569
|
+
Parameters
|
|
570
|
+
----------
|
|
571
|
+
item : Item
|
|
572
|
+
Item to extract text from.
|
|
573
|
+
|
|
574
|
+
Returns
|
|
575
|
+
-------
|
|
576
|
+
str
|
|
577
|
+
Extracted text.
|
|
578
|
+
|
|
579
|
+
Raises
|
|
580
|
+
------
|
|
581
|
+
ValueError
|
|
582
|
+
If no suitable text key found in rendered_elements.
|
|
583
|
+
"""
|
|
584
|
+
for key in ["text", "sentence", "content"]:
|
|
585
|
+
if key in item.rendered_elements:
|
|
586
|
+
return item.rendered_elements[key]
|
|
587
|
+
|
|
588
|
+
raise ValueError(
|
|
589
|
+
f"Cannot extract text from item {item.id}. "
|
|
590
|
+
f"Expected one of ['text', 'sentence', 'content'] in rendered_elements, "
|
|
591
|
+
f"but found keys: {list(item.rendered_elements.keys())}. "
|
|
592
|
+
f"Use the extract_text parameter to provide a custom extraction function."
|
|
593
|
+
)
|