bead 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bead/__init__.py +11 -0
- bead/__main__.py +11 -0
- bead/active_learning/__init__.py +15 -0
- bead/active_learning/config.py +231 -0
- bead/active_learning/loop.py +566 -0
- bead/active_learning/models/__init__.py +24 -0
- bead/active_learning/models/base.py +852 -0
- bead/active_learning/models/binary.py +910 -0
- bead/active_learning/models/categorical.py +943 -0
- bead/active_learning/models/cloze.py +862 -0
- bead/active_learning/models/forced_choice.py +956 -0
- bead/active_learning/models/free_text.py +773 -0
- bead/active_learning/models/lora.py +365 -0
- bead/active_learning/models/magnitude.py +835 -0
- bead/active_learning/models/multi_select.py +795 -0
- bead/active_learning/models/ordinal_scale.py +811 -0
- bead/active_learning/models/peft_adapter.py +155 -0
- bead/active_learning/models/random_effects.py +639 -0
- bead/active_learning/selection.py +354 -0
- bead/active_learning/strategies.py +391 -0
- bead/active_learning/trainers/__init__.py +26 -0
- bead/active_learning/trainers/base.py +210 -0
- bead/active_learning/trainers/data_collator.py +172 -0
- bead/active_learning/trainers/dataset_utils.py +261 -0
- bead/active_learning/trainers/huggingface.py +304 -0
- bead/active_learning/trainers/lightning.py +324 -0
- bead/active_learning/trainers/metrics.py +424 -0
- bead/active_learning/trainers/mixed_effects.py +551 -0
- bead/active_learning/trainers/model_wrapper.py +509 -0
- bead/active_learning/trainers/registry.py +104 -0
- bead/adapters/__init__.py +11 -0
- bead/adapters/huggingface.py +61 -0
- bead/behavioral/__init__.py +116 -0
- bead/behavioral/analytics.py +646 -0
- bead/behavioral/extraction.py +343 -0
- bead/behavioral/merging.py +343 -0
- bead/cli/__init__.py +11 -0
- bead/cli/active_learning.py +513 -0
- bead/cli/active_learning_commands.py +779 -0
- bead/cli/completion.py +359 -0
- bead/cli/config.py +624 -0
- bead/cli/constraint_builders.py +286 -0
- bead/cli/deployment.py +859 -0
- bead/cli/deployment_trials.py +493 -0
- bead/cli/deployment_ui.py +332 -0
- bead/cli/display.py +378 -0
- bead/cli/items.py +960 -0
- bead/cli/items_factories.py +776 -0
- bead/cli/list_constraints.py +714 -0
- bead/cli/lists.py +490 -0
- bead/cli/main.py +430 -0
- bead/cli/models.py +877 -0
- bead/cli/resource_loaders.py +621 -0
- bead/cli/resources.py +1036 -0
- bead/cli/shell.py +356 -0
- bead/cli/simulate.py +840 -0
- bead/cli/templates.py +1158 -0
- bead/cli/training.py +1080 -0
- bead/cli/utils.py +614 -0
- bead/cli/workflow.py +1273 -0
- bead/config/__init__.py +68 -0
- bead/config/active_learning.py +1009 -0
- bead/config/config.py +192 -0
- bead/config/defaults.py +118 -0
- bead/config/deployment.py +217 -0
- bead/config/env.py +147 -0
- bead/config/item.py +45 -0
- bead/config/list.py +193 -0
- bead/config/loader.py +149 -0
- bead/config/logging.py +42 -0
- bead/config/model.py +49 -0
- bead/config/paths.py +46 -0
- bead/config/profiles.py +320 -0
- bead/config/resources.py +47 -0
- bead/config/serialization.py +210 -0
- bead/config/simulation.py +206 -0
- bead/config/template.py +238 -0
- bead/config/validation.py +267 -0
- bead/data/__init__.py +65 -0
- bead/data/base.py +87 -0
- bead/data/identifiers.py +97 -0
- bead/data/language_codes.py +61 -0
- bead/data/metadata.py +270 -0
- bead/data/range.py +123 -0
- bead/data/repository.py +358 -0
- bead/data/serialization.py +249 -0
- bead/data/timestamps.py +89 -0
- bead/data/validation.py +349 -0
- bead/data_collection/__init__.py +11 -0
- bead/data_collection/jatos.py +223 -0
- bead/data_collection/merger.py +154 -0
- bead/data_collection/prolific.py +198 -0
- bead/deployment/__init__.py +5 -0
- bead/deployment/distribution.py +402 -0
- bead/deployment/jatos/__init__.py +1 -0
- bead/deployment/jatos/api.py +200 -0
- bead/deployment/jatos/exporter.py +210 -0
- bead/deployment/jspsych/__init__.py +9 -0
- bead/deployment/jspsych/biome.json +44 -0
- bead/deployment/jspsych/config.py +411 -0
- bead/deployment/jspsych/generator.py +598 -0
- bead/deployment/jspsych/package.json +51 -0
- bead/deployment/jspsych/pnpm-lock.yaml +2141 -0
- bead/deployment/jspsych/randomizer.py +299 -0
- bead/deployment/jspsych/src/lib/list-distributor.test.ts +327 -0
- bead/deployment/jspsych/src/lib/list-distributor.ts +1282 -0
- bead/deployment/jspsych/src/lib/randomizer.test.ts +232 -0
- bead/deployment/jspsych/src/lib/randomizer.ts +367 -0
- bead/deployment/jspsych/src/plugins/cloze-dropdown.ts +252 -0
- bead/deployment/jspsych/src/plugins/forced-choice.ts +265 -0
- bead/deployment/jspsych/src/plugins/plugins.test.ts +141 -0
- bead/deployment/jspsych/src/plugins/rating.ts +248 -0
- bead/deployment/jspsych/src/slopit/index.ts +9 -0
- bead/deployment/jspsych/src/types/jatos.d.ts +256 -0
- bead/deployment/jspsych/src/types/jspsych.d.ts +228 -0
- bead/deployment/jspsych/templates/experiment.css +1 -0
- bead/deployment/jspsych/templates/experiment.js.template +289 -0
- bead/deployment/jspsych/templates/index.html +51 -0
- bead/deployment/jspsych/templates/randomizer.js +241 -0
- bead/deployment/jspsych/templates/randomizer.js.template +313 -0
- bead/deployment/jspsych/trials.py +723 -0
- bead/deployment/jspsych/tsconfig.json +23 -0
- bead/deployment/jspsych/tsup.config.ts +30 -0
- bead/deployment/jspsych/ui/__init__.py +1 -0
- bead/deployment/jspsych/ui/components.py +383 -0
- bead/deployment/jspsych/ui/styles.py +411 -0
- bead/dsl/__init__.py +80 -0
- bead/dsl/ast.py +168 -0
- bead/dsl/context.py +178 -0
- bead/dsl/errors.py +71 -0
- bead/dsl/evaluator.py +570 -0
- bead/dsl/grammar.lark +81 -0
- bead/dsl/parser.py +231 -0
- bead/dsl/stdlib.py +929 -0
- bead/evaluation/__init__.py +13 -0
- bead/evaluation/convergence.py +485 -0
- bead/evaluation/interannotator.py +398 -0
- bead/items/__init__.py +40 -0
- bead/items/adapters/__init__.py +70 -0
- bead/items/adapters/anthropic.py +224 -0
- bead/items/adapters/api_utils.py +167 -0
- bead/items/adapters/base.py +216 -0
- bead/items/adapters/google.py +259 -0
- bead/items/adapters/huggingface.py +1074 -0
- bead/items/adapters/openai.py +323 -0
- bead/items/adapters/registry.py +202 -0
- bead/items/adapters/sentence_transformers.py +224 -0
- bead/items/adapters/togetherai.py +309 -0
- bead/items/binary.py +515 -0
- bead/items/cache.py +558 -0
- bead/items/categorical.py +593 -0
- bead/items/cloze.py +757 -0
- bead/items/constructor.py +784 -0
- bead/items/forced_choice.py +413 -0
- bead/items/free_text.py +681 -0
- bead/items/generation.py +432 -0
- bead/items/item.py +396 -0
- bead/items/item_template.py +787 -0
- bead/items/magnitude.py +573 -0
- bead/items/multi_select.py +621 -0
- bead/items/ordinal_scale.py +569 -0
- bead/items/scoring.py +448 -0
- bead/items/validation.py +723 -0
- bead/lists/__init__.py +30 -0
- bead/lists/balancer.py +263 -0
- bead/lists/constraints.py +1067 -0
- bead/lists/experiment_list.py +286 -0
- bead/lists/list_collection.py +378 -0
- bead/lists/partitioner.py +1141 -0
- bead/lists/stratification.py +254 -0
- bead/participants/__init__.py +73 -0
- bead/participants/collection.py +699 -0
- bead/participants/merging.py +312 -0
- bead/participants/metadata_spec.py +491 -0
- bead/participants/models.py +276 -0
- bead/resources/__init__.py +29 -0
- bead/resources/adapters/__init__.py +19 -0
- bead/resources/adapters/base.py +104 -0
- bead/resources/adapters/cache.py +128 -0
- bead/resources/adapters/glazing.py +508 -0
- bead/resources/adapters/registry.py +117 -0
- bead/resources/adapters/unimorph.py +796 -0
- bead/resources/classification.py +856 -0
- bead/resources/constraint_builders.py +329 -0
- bead/resources/constraints.py +165 -0
- bead/resources/lexical_item.py +223 -0
- bead/resources/lexicon.py +744 -0
- bead/resources/loaders.py +209 -0
- bead/resources/template.py +441 -0
- bead/resources/template_collection.py +707 -0
- bead/resources/template_generation.py +349 -0
- bead/simulation/__init__.py +29 -0
- bead/simulation/annotators/__init__.py +15 -0
- bead/simulation/annotators/base.py +175 -0
- bead/simulation/annotators/distance_based.py +135 -0
- bead/simulation/annotators/lm_based.py +114 -0
- bead/simulation/annotators/oracle.py +182 -0
- bead/simulation/annotators/random.py +181 -0
- bead/simulation/dsl_extension/__init__.py +3 -0
- bead/simulation/noise_models/__init__.py +13 -0
- bead/simulation/noise_models/base.py +42 -0
- bead/simulation/noise_models/random_noise.py +82 -0
- bead/simulation/noise_models/systematic.py +132 -0
- bead/simulation/noise_models/temperature.py +86 -0
- bead/simulation/runner.py +144 -0
- bead/simulation/strategies/__init__.py +23 -0
- bead/simulation/strategies/base.py +123 -0
- bead/simulation/strategies/binary.py +103 -0
- bead/simulation/strategies/categorical.py +123 -0
- bead/simulation/strategies/cloze.py +224 -0
- bead/simulation/strategies/forced_choice.py +127 -0
- bead/simulation/strategies/free_text.py +105 -0
- bead/simulation/strategies/magnitude.py +116 -0
- bead/simulation/strategies/multi_select.py +129 -0
- bead/simulation/strategies/ordinal_scale.py +131 -0
- bead/templates/__init__.py +27 -0
- bead/templates/adapters/__init__.py +17 -0
- bead/templates/adapters/base.py +128 -0
- bead/templates/adapters/cache.py +178 -0
- bead/templates/adapters/huggingface.py +312 -0
- bead/templates/combinatorics.py +103 -0
- bead/templates/filler.py +605 -0
- bead/templates/renderers.py +177 -0
- bead/templates/resolver.py +178 -0
- bead/templates/strategies.py +1806 -0
- bead/templates/streaming.py +195 -0
- bead-0.1.0.dist-info/METADATA +212 -0
- bead-0.1.0.dist-info/RECORD +231 -0
- bead-0.1.0.dist-info/WHEEL +4 -0
- bead-0.1.0.dist-info/entry_points.txt +2 -0
- bead-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,699 @@
|
|
|
1
|
+
"""Participant collection with JSONL I/O and DataFrame support.
|
|
2
|
+
|
|
3
|
+
This module provides ParticipantCollection and IDMappingCollection for
|
|
4
|
+
managing multiple participants with JSONL serialization and pandas/polars
|
|
5
|
+
DataFrame conversion for analysis.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import TYPE_CHECKING, Literal
|
|
12
|
+
from uuid import UUID
|
|
13
|
+
|
|
14
|
+
import pandas as pd
|
|
15
|
+
import polars as pl
|
|
16
|
+
from pydantic import Field, field_validator
|
|
17
|
+
|
|
18
|
+
from bead.data.base import BeadBaseModel, JsonValue
|
|
19
|
+
from bead.data.serialization import read_jsonlines, write_jsonlines
|
|
20
|
+
from bead.participants.models import Participant, ParticipantIDMapping
|
|
21
|
+
|
|
22
|
+
if TYPE_CHECKING:
|
|
23
|
+
from bead.participants.metadata_spec import ParticipantMetadataSpec
|
|
24
|
+
|
|
25
|
+
# Type alias for supported DataFrame types (same pattern as bead/resources/lexicon.py)
|
|
26
|
+
DataFrame = pd.DataFrame | pl.DataFrame
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _empty_participant_list() -> list[Participant]:
|
|
30
|
+
"""Return empty participant list."""
|
|
31
|
+
return []
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _empty_mapping_list() -> list[ParticipantIDMapping]:
|
|
35
|
+
"""Return empty mapping list."""
|
|
36
|
+
return []
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class ParticipantCollection(BeadBaseModel):
    """Named collection of participants with JSONL I/O and DataFrame support.

    Holds a list of ``Participant`` objects and provides lookup helpers,
    validation against a metadata specification, JSONL persistence, and
    conversion to and from pandas/polars DataFrames for analysis.

    Attributes
    ----------
    name : str
        Name of this collection. The ``validate_name`` validator rejects
        empty/whitespace-only names and strips surrounding whitespace.
    participants : list[Participant]
        List of participants.
    metadata_spec_name : str | None
        Name of the metadata spec used (for documentation).

    Examples
    --------
    >>> collection = ParticipantCollection(name="study_001_participants")
    >>> participant = Participant(
    ...     participant_metadata={"age": 25, "education": "bachelors"}
    ... )
    >>> collection.add_participant(participant)
    >>> len(collection.participants)
    1
    >>> collection.to_jsonl("participants.jsonl")  # doctest: +SKIP
    """

    name: str = Field(..., description="Collection name")
    participants: list[Participant] = Field(
        default_factory=_empty_participant_list, description="Participants"
    )
    metadata_spec_name: str | None = Field(
        default=None, description="Metadata spec used"
    )

    @field_validator("name")
    @classmethod
    def validate_name(cls, v: str) -> str:
        """Reject blank collection names and strip surrounding whitespace.

        Parameters
        ----------
        v : str
            Collection name to validate.

        Returns
        -------
        str
            The name with leading/trailing whitespace removed.

        Raises
        ------
        ValueError
            If name is empty or whitespace only.
        """
        # Strip once and reuse the result for both the check and the return.
        stripped = v.strip() if v else ""
        if not stripped:
            raise ValueError("Collection name cannot be empty")
        return stripped

    def __len__(self) -> int:
        """Return the number of participants in the collection.

        Returns
        -------
        int
            Length of ``self.participants``.
        """
        return len(self.participants)

    def add_participant(self, participant: Participant) -> None:
        """Append one participant and refresh the modification time.

        Parameters
        ----------
        participant : Participant
            Participant to add.

        Examples
        --------
        >>> collection = ParticipantCollection(name="test")
        >>> p = Participant(participant_metadata={"age": 25})
        >>> collection.add_participant(p)
        >>> len(collection)
        1
        """
        self.participants.append(participant)
        # update_modified_time is not defined in this module; presumably
        # inherited from BeadBaseModel.
        self.update_modified_time()

    def add_participants(self, participants: list[Participant]) -> None:
        """Append several participants and refresh the modification time.

        Parameters
        ----------
        participants : list[Participant]
            Participants to add, kept in the given order.

        Examples
        --------
        >>> collection = ParticipantCollection(name="test")
        >>> ps = [Participant(), Participant()]
        >>> collection.add_participants(ps)
        >>> len(collection)
        2
        """
        self.participants.extend(participants)
        self.update_modified_time()

    def get_by_id(self, participant_id: UUID) -> Participant | None:
        """Find the first participant whose ``id`` equals the given UUID.

        Parameters
        ----------
        participant_id : UUID
            Participant UUID to find.

        Returns
        -------
        Participant | None
            Participant if found, None otherwise.

        Examples
        --------
        >>> collection = ParticipantCollection(name="test")
        >>> p = Participant()
        >>> collection.add_participant(p)
        >>> found = collection.get_by_id(p.id)
        >>> found is not None
        True
        """
        return next(
            (
                candidate
                for candidate in self.participants
                if candidate.id == participant_id
            ),
            None,
        )

    def get_by_attribute(self, key: str, value: JsonValue) -> list[Participant]:
        """Select participants whose metadata entry ``key`` equals ``value``.

        The comparison uses ``participant_metadata.get(key)``, so a
        participant that lacks ``key`` compares as ``None``.

        Parameters
        ----------
        key : str
            Attribute name.
        value : JsonValue
            Value to match.

        Returns
        -------
        list[Participant]
            Participants with matching attribute, in collection order.

        Examples
        --------
        >>> collection = ParticipantCollection(name="test")
        >>> p1 = Participant(participant_metadata={"age": 25})
        >>> p2 = Participant(participant_metadata={"age": 30})
        >>> collection.add_participants([p1, p2])
        >>> matches = collection.get_by_attribute("age", 25)
        >>> len(matches)
        1
        """
        matches: list[Participant] = []
        for candidate in self.participants:
            if candidate.participant_metadata.get(key) == value:
                matches.append(candidate)
        return matches

    def validate_all(self, spec: ParticipantMetadataSpec) -> dict[UUID, list[str]]:
        """Validate every participant against a metadata specification.

        Parameters
        ----------
        spec : ParticipantMetadataSpec
            Specification to validate against.

        Returns
        -------
        dict[UUID, list[str]]
            Mapping from participant ID to list of validation errors, with
            entries only for participants that failed validation.
            Empty dict if all valid.

        Examples
        --------
        >>> from bead.participants.metadata_spec import (
        ...     FieldSpec, ParticipantMetadataSpec
        ... )
        >>> spec = ParticipantMetadataSpec(
        ...     name="test",
        ...     fields=[FieldSpec(name="age", field_type="int", required=True)]
        ... )
        >>> collection = ParticipantCollection(name="test")
        >>> p = Participant(participant_metadata={"age": 25})
        >>> collection.add_participant(p)
        >>> errors = collection.validate_all(spec)
        >>> len(errors)
        0
        """
        failures: dict[UUID, list[str]] = {}
        for participant in self.participants:
            ok, problems = participant.validate_against_spec(spec)
            if ok:
                continue
            failures[participant.id] = problems
        return failures

    # JSONL I/O

    def to_jsonl(self, path: Path | str) -> None:
        """Write the participants to a JSONL file.

        Missing parent directories are created first. Only
        ``self.participants`` is passed to the writer; the collection's own
        fields (such as ``name``) are not.

        Parameters
        ----------
        path : Path | str
            Path to output file.

        Examples
        --------
        >>> collection = ParticipantCollection(name="test")
        >>> collection.add_participant(Participant())
        >>> collection.to_jsonl("/tmp/participants.jsonl")  # doctest: +SKIP
        """
        target = Path(path)
        target.parent.mkdir(parents=True, exist_ok=True)
        write_jsonlines(self.participants, target)

    @classmethod
    def from_jsonl(
        cls,
        path: Path | str,
        name: str = "loaded_participants",
    ) -> ParticipantCollection:
        """Load participants from a JSONL file into a new collection.

        Parameters
        ----------
        path : Path | str
            Path to JSONL file.
        name : str
            Name for the collection.

        Returns
        -------
        ParticipantCollection
            Collection with loaded participants.

        Examples
        --------
        >>> collection = ParticipantCollection.from_jsonl(
        ...     "participants.jsonl"
        ... )  # doctest: +SKIP
        """
        loaded = read_jsonlines(Path(path), Participant)
        return cls(name=name, participants=loaded)

    # DataFrame conversion

    def to_dataframe(
        self,
        backend: Literal["pandas", "polars"] = "pandas",
        include_fields: list[str] | None = None,
        exclude_fields: list[str] | None = None,
        flatten_metadata: bool = True,
    ) -> DataFrame:
        """Convert the collection to a pandas or polars DataFrame.

        Each participant becomes one row with the base columns
        ``participant_id``, ``created_at`` (ISO-8601 string) and
        ``study_id``. Metadata is either flattened into extra columns or
        kept as a single ``participant_metadata`` column.

        Parameters
        ----------
        backend : Literal["pandas", "polars"]
            DataFrame backend to use (default: "pandas"). Any value other
            than "pandas" selects polars.
        include_fields : list[str] | None
            If provided, only include these metadata fields.
            Applied only when ``flatten_metadata`` is True.
        exclude_fields : list[str] | None
            If provided, exclude these metadata fields.
            Applied only when ``flatten_metadata`` is True.
        flatten_metadata : bool
            If True, flatten participant_metadata into top-level columns.
            A metadata key that matches a base column name overwrites that
            column's value for the row.

        Returns
        -------
        DataFrame
            pandas or polars DataFrame with participant data.
            Includes a 'participant_id' column (as string).

        Examples
        --------
        >>> collection = ParticipantCollection(name="test")
        >>> p = Participant(participant_metadata={"age": 25})
        >>> collection.add_participant(p)
        >>> df = collection.to_dataframe()
        >>> "participant_id" in df.columns
        True
        >>> "age" in df.columns
        True
        """
        use_pandas = backend == "pandas"

        if not self.participants:
            # No rows: still return a frame exposing the base columns.
            columns = ["participant_id", "created_at", "study_id"]
            if use_pandas:
                return pd.DataFrame(columns=columns)
            schema: dict[str, type[pl.Utf8]] = dict.fromkeys(columns, pl.Utf8)
            return pl.DataFrame(schema=schema)

        records: list[dict[str, JsonValue]] = []
        for participant in self.participants:
            record: dict[str, JsonValue] = {
                "participant_id": str(participant.id),
                "created_at": participant.created_at.isoformat(),
                "study_id": participant.study_id,
            }

            if not flatten_metadata:
                record["participant_metadata"] = participant.participant_metadata
            else:
                for key, value in participant.participant_metadata.items():
                    # Apply include/exclude filters
                    if include_fields is not None and key not in include_fields:
                        continue
                    if exclude_fields is not None and key in exclude_fields:
                        continue
                    record[key] = value

            records.append(record)

        return pd.DataFrame(records) if use_pandas else pl.DataFrame(records)

    @classmethod
    def from_dataframe(
        cls,
        df: DataFrame,
        name: str,
        id_column: str = "participant_id",
        metadata_columns: list[str] | None = None,
    ) -> ParticipantCollection:
        """Create a collection from a pandas or polars DataFrame.

        Each row becomes one ``Participant``. ``None`` values are left out
        of the metadata; for pandas input, float NaN values are left out
        as well.

        Parameters
        ----------
        df : DataFrame
            pandas or polars DataFrame with participant data.
        name : str
            Name for the collection.
        id_column : str
            Column containing participant IDs (default: "participant_id").
            If the column exists and a row's value parses as a UUID, that
            UUID is used. Otherwise the participant is created without an
            explicit id, leaving ``Participant`` to supply one.
        metadata_columns : list[str] | None
            Columns to include in participant_metadata.
            If None (or an empty list), includes all columns except
            id_column.

        Returns
        -------
        ParticipantCollection
            Collection with participants from DataFrame.

        Examples
        --------
        >>> import pandas as pd
        >>> df = pd.DataFrame({
        ...     "age": [25, 30],
        ...     "education": ["bachelors", "masters"]
        ... })
        >>> collection = ParticipantCollection.from_dataframe(df, "test")
        >>> len(collection)
        2
        """
        # Resolve the backend once and normalise to a list of row dicts.
        rows: list[dict[str, JsonValue]]
        if isinstance(df, pl.DataFrame):
            is_polars = True
            columns_list: list[str] = df.columns
            rows = df.to_dicts()  # type: ignore[assignment]
        else:
            assert isinstance(df, pd.DataFrame)
            is_polars = False
            columns_list = list(df.columns)
            rows = df.to_dict("records")  # type: ignore[assignment]

        # Loop invariants: they depend only on the frame's columns and the
        # arguments, so compute them once rather than once per row.
        has_id_column = id_column in columns_list
        selected_columns = metadata_columns or [
            c for c in columns_list if c != id_column
        ]

        participants: list[Participant] = []

        for row in rows:
            # Handle participant ID
            pid: UUID | None = None
            if has_id_column:
                try:
                    pid = UUID(str(row[id_column]))
                except (ValueError, TypeError):
                    pid = None  # Will use auto-generated UUID

            # Build metadata dict
            metadata: dict[str, JsonValue] = {}
            for col in selected_columns:
                value = row.get(col)
                if value is None:
                    # Column missing from the row, or a null value.
                    continue
                # Pandas represents missing values as NaN, and NaN != NaN.
                # Polars uses None for nulls, which is handled above.
                if not is_polars and isinstance(value, float) and value != value:
                    continue
                metadata[col] = value

            # Create participant
            if pid is not None:
                participant = Participant(
                    id=pid,
                    participant_metadata=metadata,
                )
            else:
                participant = Participant(participant_metadata=metadata)

            participants.append(participant)

        return cls(name=name, participants=participants)
|
|
461
|
+
|
|
462
|
+
|
|
463
|
+
class IDMappingCollection(BeadBaseModel):
    """Collection of ID mappings (stored separately for privacy).

    This collection should be stored in a SEPARATE file from participant
    data for IRB/privacy compliance.

    Attributes
    ----------
    name : str
        Name of this mapping collection.
    mappings : list[ParticipantIDMapping]
        List of ID mappings.
    source : str
        Primary source of external IDs (e.g., "prolific").

    Examples
    --------
    >>> from uuid import uuid4
    >>> collection = IDMappingCollection(name="study_001", source="prolific")
    >>> mapping = collection.add_mapping("PROLIFIC_ABC123", uuid4())
    >>> collection.get_participant_id("PROLIFIC_ABC123") is not None
    True
    """

    name: str = Field(..., description="Collection name")
    mappings: list[ParticipantIDMapping] = Field(
        default_factory=_empty_mapping_list, description="ID mappings"
    )
    source: str = Field(..., description="Primary external ID source")

    @field_validator("name", "source")
    @classmethod
    def validate_non_empty(cls, v: str) -> str:
        """Reject blank ``name``/``source`` values and strip whitespace.

        Parameters
        ----------
        v : str
            String to validate.

        Returns
        -------
        str
            The string with leading/trailing whitespace removed.

        Raises
        ------
        ValueError
            If string is empty or whitespace only.
        """
        # Strip once and reuse the result for both the check and the return.
        stripped = v.strip() if v else ""
        if not stripped:
            raise ValueError("Field cannot be empty")
        return stripped

    def __len__(self) -> int:
        """Return the number of mappings in the collection.

        Returns
        -------
        int
            Length of ``self.mappings``.
        """
        return len(self.mappings)

    def add_mapping(
        self,
        external_id: str,
        participant_id: UUID,
        external_source: str | None = None,
    ) -> ParticipantIDMapping:
        """Create a new ID mapping, append it, and return it.

        Parameters
        ----------
        external_id : str
            External participant ID.
        participant_id : UUID
            Internal participant UUID.
        external_source : str | None
            Source of external ID. A falsy value (None or an empty string)
            falls back to the collection's ``source``.

        Returns
        -------
        ParticipantIDMapping
            The created mapping.

        Examples
        --------
        >>> from uuid import uuid4
        >>> collection = IDMappingCollection(name="test", source="prolific")
        >>> mapping = collection.add_mapping("ABC123", uuid4())
        >>> mapping.external_source
        'prolific'
        """
        created = ParticipantIDMapping(
            external_id=external_id,
            external_source=external_source or self.source,
            participant_id=participant_id,
        )
        self.mappings.append(created)
        # update_modified_time is not defined in this module; presumably
        # inherited from BeadBaseModel.
        self.update_modified_time()
        return created

    def get_participant_id(self, external_id: str) -> UUID | None:
        """Look up the internal participant ID for an external ID.

        Only active mappings are considered; the first match wins.

        Parameters
        ----------
        external_id : str
            External ID to look up.

        Returns
        -------
        UUID | None
            Internal participant ID if found, None otherwise.

        Examples
        --------
        >>> from uuid import uuid4
        >>> collection = IDMappingCollection(name="test", source="prolific")
        >>> pid = uuid4()
        >>> mapping = collection.add_mapping("ABC123", pid)
        >>> collection.get_participant_id("ABC123") == pid
        True
        >>> collection.get_participant_id("UNKNOWN") is None
        True
        """
        return next(
            (
                entry.participant_id
                for entry in self.mappings
                if entry.external_id == external_id and entry.is_active
            ),
            None,
        )

    def get_external_id(self, participant_id: UUID) -> str | None:
        """Look up the external ID for an internal participant ID.

        Only active mappings are considered; the first match wins.

        Parameters
        ----------
        participant_id : UUID
            Internal participant ID to look up.

        Returns
        -------
        str | None
            External ID if found, None otherwise.

        Examples
        --------
        >>> from uuid import uuid4
        >>> collection = IDMappingCollection(name="test", source="prolific")
        >>> pid = uuid4()
        >>> mapping = collection.add_mapping("ABC123", pid)
        >>> collection.get_external_id(pid)
        'ABC123'
        """
        return next(
            (
                entry.external_id
                for entry in self.mappings
                if entry.participant_id == participant_id and entry.is_active
            ),
            None,
        )

    def deactivate_all(self) -> int:
        """Deactivate all mappings (for bulk privacy removal).

        Calls ``deactivate()`` on every mapping that is currently active,
        then refreshes the collection's modification time. The time is
        refreshed even when no mapping was active.

        Returns
        -------
        int
            Number of mappings deactivated.

        Examples
        --------
        >>> from uuid import uuid4
        >>> collection = IDMappingCollection(name="test", source="prolific")
        >>> first = collection.add_mapping("ABC123", uuid4())
        >>> second = collection.add_mapping("DEF456", uuid4())
        >>> count = collection.deactivate_all()
        >>> count
        2
        """
        deactivated = 0
        for entry in self.mappings:
            if not entry.is_active:
                continue
            entry.deactivate()
            deactivated += 1
        self.update_modified_time()
        return deactivated

    # JSONL I/O

    def to_jsonl(self, path: Path | str) -> None:
        """Write the mappings to a JSONL file.

        Missing parent directories are created first. Only
        ``self.mappings`` is passed to the writer; the collection's own
        ``name`` and ``source`` are not.

        Parameters
        ----------
        path : Path | str
            Path to output file.

        Examples
        --------
        >>> from uuid import uuid4
        >>> collection = IDMappingCollection(name="test", source="prolific")
        >>> mapping = collection.add_mapping("ABC123", uuid4())
        >>> collection.to_jsonl("/tmp/mappings.jsonl")  # doctest: +SKIP
        """
        target = Path(path)
        target.parent.mkdir(parents=True, exist_ok=True)
        write_jsonlines(self.mappings, target)

    @classmethod
    def from_jsonl(
        cls,
        path: Path | str,
        name: str = "loaded_mappings",
        source: str = "unknown",
    ) -> IDMappingCollection:
        """Load mappings from a JSONL file into a new collection.

        Parameters
        ----------
        path : Path | str
            Path to JSONL file.
        name : str
            Name for the collection.
        source : str
            External ID source recorded on the collection
            (default: "unknown").

        Returns
        -------
        IDMappingCollection
            Collection with loaded mappings.

        Examples
        --------
        >>> collection = IDMappingCollection.from_jsonl(
        ...     "mappings.jsonl", source="prolific"
        ... )  # doctest: +SKIP
        """
        loaded = read_jsonlines(Path(path), ParticipantIDMapping)
        return cls(name=name, mappings=loaded, source=source)
|