bead 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (231) hide show
  1. bead/__init__.py +11 -0
  2. bead/__main__.py +11 -0
  3. bead/active_learning/__init__.py +15 -0
  4. bead/active_learning/config.py +231 -0
  5. bead/active_learning/loop.py +566 -0
  6. bead/active_learning/models/__init__.py +24 -0
  7. bead/active_learning/models/base.py +852 -0
  8. bead/active_learning/models/binary.py +910 -0
  9. bead/active_learning/models/categorical.py +943 -0
  10. bead/active_learning/models/cloze.py +862 -0
  11. bead/active_learning/models/forced_choice.py +956 -0
  12. bead/active_learning/models/free_text.py +773 -0
  13. bead/active_learning/models/lora.py +365 -0
  14. bead/active_learning/models/magnitude.py +835 -0
  15. bead/active_learning/models/multi_select.py +795 -0
  16. bead/active_learning/models/ordinal_scale.py +811 -0
  17. bead/active_learning/models/peft_adapter.py +155 -0
  18. bead/active_learning/models/random_effects.py +639 -0
  19. bead/active_learning/selection.py +354 -0
  20. bead/active_learning/strategies.py +391 -0
  21. bead/active_learning/trainers/__init__.py +26 -0
  22. bead/active_learning/trainers/base.py +210 -0
  23. bead/active_learning/trainers/data_collator.py +172 -0
  24. bead/active_learning/trainers/dataset_utils.py +261 -0
  25. bead/active_learning/trainers/huggingface.py +304 -0
  26. bead/active_learning/trainers/lightning.py +324 -0
  27. bead/active_learning/trainers/metrics.py +424 -0
  28. bead/active_learning/trainers/mixed_effects.py +551 -0
  29. bead/active_learning/trainers/model_wrapper.py +509 -0
  30. bead/active_learning/trainers/registry.py +104 -0
  31. bead/adapters/__init__.py +11 -0
  32. bead/adapters/huggingface.py +61 -0
  33. bead/behavioral/__init__.py +116 -0
  34. bead/behavioral/analytics.py +646 -0
  35. bead/behavioral/extraction.py +343 -0
  36. bead/behavioral/merging.py +343 -0
  37. bead/cli/__init__.py +11 -0
  38. bead/cli/active_learning.py +513 -0
  39. bead/cli/active_learning_commands.py +779 -0
  40. bead/cli/completion.py +359 -0
  41. bead/cli/config.py +624 -0
  42. bead/cli/constraint_builders.py +286 -0
  43. bead/cli/deployment.py +859 -0
  44. bead/cli/deployment_trials.py +493 -0
  45. bead/cli/deployment_ui.py +332 -0
  46. bead/cli/display.py +378 -0
  47. bead/cli/items.py +960 -0
  48. bead/cli/items_factories.py +776 -0
  49. bead/cli/list_constraints.py +714 -0
  50. bead/cli/lists.py +490 -0
  51. bead/cli/main.py +430 -0
  52. bead/cli/models.py +877 -0
  53. bead/cli/resource_loaders.py +621 -0
  54. bead/cli/resources.py +1036 -0
  55. bead/cli/shell.py +356 -0
  56. bead/cli/simulate.py +840 -0
  57. bead/cli/templates.py +1158 -0
  58. bead/cli/training.py +1080 -0
  59. bead/cli/utils.py +614 -0
  60. bead/cli/workflow.py +1273 -0
  61. bead/config/__init__.py +68 -0
  62. bead/config/active_learning.py +1009 -0
  63. bead/config/config.py +192 -0
  64. bead/config/defaults.py +118 -0
  65. bead/config/deployment.py +217 -0
  66. bead/config/env.py +147 -0
  67. bead/config/item.py +45 -0
  68. bead/config/list.py +193 -0
  69. bead/config/loader.py +149 -0
  70. bead/config/logging.py +42 -0
  71. bead/config/model.py +49 -0
  72. bead/config/paths.py +46 -0
  73. bead/config/profiles.py +320 -0
  74. bead/config/resources.py +47 -0
  75. bead/config/serialization.py +210 -0
  76. bead/config/simulation.py +206 -0
  77. bead/config/template.py +238 -0
  78. bead/config/validation.py +267 -0
  79. bead/data/__init__.py +65 -0
  80. bead/data/base.py +87 -0
  81. bead/data/identifiers.py +97 -0
  82. bead/data/language_codes.py +61 -0
  83. bead/data/metadata.py +270 -0
  84. bead/data/range.py +123 -0
  85. bead/data/repository.py +358 -0
  86. bead/data/serialization.py +249 -0
  87. bead/data/timestamps.py +89 -0
  88. bead/data/validation.py +349 -0
  89. bead/data_collection/__init__.py +11 -0
  90. bead/data_collection/jatos.py +223 -0
  91. bead/data_collection/merger.py +154 -0
  92. bead/data_collection/prolific.py +198 -0
  93. bead/deployment/__init__.py +5 -0
  94. bead/deployment/distribution.py +402 -0
  95. bead/deployment/jatos/__init__.py +1 -0
  96. bead/deployment/jatos/api.py +200 -0
  97. bead/deployment/jatos/exporter.py +210 -0
  98. bead/deployment/jspsych/__init__.py +9 -0
  99. bead/deployment/jspsych/biome.json +44 -0
  100. bead/deployment/jspsych/config.py +411 -0
  101. bead/deployment/jspsych/generator.py +598 -0
  102. bead/deployment/jspsych/package.json +51 -0
  103. bead/deployment/jspsych/pnpm-lock.yaml +2141 -0
  104. bead/deployment/jspsych/randomizer.py +299 -0
  105. bead/deployment/jspsych/src/lib/list-distributor.test.ts +327 -0
  106. bead/deployment/jspsych/src/lib/list-distributor.ts +1282 -0
  107. bead/deployment/jspsych/src/lib/randomizer.test.ts +232 -0
  108. bead/deployment/jspsych/src/lib/randomizer.ts +367 -0
  109. bead/deployment/jspsych/src/plugins/cloze-dropdown.ts +252 -0
  110. bead/deployment/jspsych/src/plugins/forced-choice.ts +265 -0
  111. bead/deployment/jspsych/src/plugins/plugins.test.ts +141 -0
  112. bead/deployment/jspsych/src/plugins/rating.ts +248 -0
  113. bead/deployment/jspsych/src/slopit/index.ts +9 -0
  114. bead/deployment/jspsych/src/types/jatos.d.ts +256 -0
  115. bead/deployment/jspsych/src/types/jspsych.d.ts +228 -0
  116. bead/deployment/jspsych/templates/experiment.css +1 -0
  117. bead/deployment/jspsych/templates/experiment.js.template +289 -0
  118. bead/deployment/jspsych/templates/index.html +51 -0
  119. bead/deployment/jspsych/templates/randomizer.js +241 -0
  120. bead/deployment/jspsych/templates/randomizer.js.template +313 -0
  121. bead/deployment/jspsych/trials.py +723 -0
  122. bead/deployment/jspsych/tsconfig.json +23 -0
  123. bead/deployment/jspsych/tsup.config.ts +30 -0
  124. bead/deployment/jspsych/ui/__init__.py +1 -0
  125. bead/deployment/jspsych/ui/components.py +383 -0
  126. bead/deployment/jspsych/ui/styles.py +411 -0
  127. bead/dsl/__init__.py +80 -0
  128. bead/dsl/ast.py +168 -0
  129. bead/dsl/context.py +178 -0
  130. bead/dsl/errors.py +71 -0
  131. bead/dsl/evaluator.py +570 -0
  132. bead/dsl/grammar.lark +81 -0
  133. bead/dsl/parser.py +231 -0
  134. bead/dsl/stdlib.py +929 -0
  135. bead/evaluation/__init__.py +13 -0
  136. bead/evaluation/convergence.py +485 -0
  137. bead/evaluation/interannotator.py +398 -0
  138. bead/items/__init__.py +40 -0
  139. bead/items/adapters/__init__.py +70 -0
  140. bead/items/adapters/anthropic.py +224 -0
  141. bead/items/adapters/api_utils.py +167 -0
  142. bead/items/adapters/base.py +216 -0
  143. bead/items/adapters/google.py +259 -0
  144. bead/items/adapters/huggingface.py +1074 -0
  145. bead/items/adapters/openai.py +323 -0
  146. bead/items/adapters/registry.py +202 -0
  147. bead/items/adapters/sentence_transformers.py +224 -0
  148. bead/items/adapters/togetherai.py +309 -0
  149. bead/items/binary.py +515 -0
  150. bead/items/cache.py +558 -0
  151. bead/items/categorical.py +593 -0
  152. bead/items/cloze.py +757 -0
  153. bead/items/constructor.py +784 -0
  154. bead/items/forced_choice.py +413 -0
  155. bead/items/free_text.py +681 -0
  156. bead/items/generation.py +432 -0
  157. bead/items/item.py +396 -0
  158. bead/items/item_template.py +787 -0
  159. bead/items/magnitude.py +573 -0
  160. bead/items/multi_select.py +621 -0
  161. bead/items/ordinal_scale.py +569 -0
  162. bead/items/scoring.py +448 -0
  163. bead/items/validation.py +723 -0
  164. bead/lists/__init__.py +30 -0
  165. bead/lists/balancer.py +263 -0
  166. bead/lists/constraints.py +1067 -0
  167. bead/lists/experiment_list.py +286 -0
  168. bead/lists/list_collection.py +378 -0
  169. bead/lists/partitioner.py +1141 -0
  170. bead/lists/stratification.py +254 -0
  171. bead/participants/__init__.py +73 -0
  172. bead/participants/collection.py +699 -0
  173. bead/participants/merging.py +312 -0
  174. bead/participants/metadata_spec.py +491 -0
  175. bead/participants/models.py +276 -0
  176. bead/resources/__init__.py +29 -0
  177. bead/resources/adapters/__init__.py +19 -0
  178. bead/resources/adapters/base.py +104 -0
  179. bead/resources/adapters/cache.py +128 -0
  180. bead/resources/adapters/glazing.py +508 -0
  181. bead/resources/adapters/registry.py +117 -0
  182. bead/resources/adapters/unimorph.py +796 -0
  183. bead/resources/classification.py +856 -0
  184. bead/resources/constraint_builders.py +329 -0
  185. bead/resources/constraints.py +165 -0
  186. bead/resources/lexical_item.py +223 -0
  187. bead/resources/lexicon.py +744 -0
  188. bead/resources/loaders.py +209 -0
  189. bead/resources/template.py +441 -0
  190. bead/resources/template_collection.py +707 -0
  191. bead/resources/template_generation.py +349 -0
  192. bead/simulation/__init__.py +29 -0
  193. bead/simulation/annotators/__init__.py +15 -0
  194. bead/simulation/annotators/base.py +175 -0
  195. bead/simulation/annotators/distance_based.py +135 -0
  196. bead/simulation/annotators/lm_based.py +114 -0
  197. bead/simulation/annotators/oracle.py +182 -0
  198. bead/simulation/annotators/random.py +181 -0
  199. bead/simulation/dsl_extension/__init__.py +3 -0
  200. bead/simulation/noise_models/__init__.py +13 -0
  201. bead/simulation/noise_models/base.py +42 -0
  202. bead/simulation/noise_models/random_noise.py +82 -0
  203. bead/simulation/noise_models/systematic.py +132 -0
  204. bead/simulation/noise_models/temperature.py +86 -0
  205. bead/simulation/runner.py +144 -0
  206. bead/simulation/strategies/__init__.py +23 -0
  207. bead/simulation/strategies/base.py +123 -0
  208. bead/simulation/strategies/binary.py +103 -0
  209. bead/simulation/strategies/categorical.py +123 -0
  210. bead/simulation/strategies/cloze.py +224 -0
  211. bead/simulation/strategies/forced_choice.py +127 -0
  212. bead/simulation/strategies/free_text.py +105 -0
  213. bead/simulation/strategies/magnitude.py +116 -0
  214. bead/simulation/strategies/multi_select.py +129 -0
  215. bead/simulation/strategies/ordinal_scale.py +131 -0
  216. bead/templates/__init__.py +27 -0
  217. bead/templates/adapters/__init__.py +17 -0
  218. bead/templates/adapters/base.py +128 -0
  219. bead/templates/adapters/cache.py +178 -0
  220. bead/templates/adapters/huggingface.py +312 -0
  221. bead/templates/combinatorics.py +103 -0
  222. bead/templates/filler.py +605 -0
  223. bead/templates/renderers.py +177 -0
  224. bead/templates/resolver.py +178 -0
  225. bead/templates/strategies.py +1806 -0
  226. bead/templates/streaming.py +195 -0
  227. bead-0.1.0.dist-info/METADATA +212 -0
  228. bead-0.1.0.dist-info/RECORD +231 -0
  229. bead-0.1.0.dist-info/WHEEL +4 -0
  230. bead-0.1.0.dist-info/entry_points.txt +2 -0
  231. bead-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,699 @@
1
+ """Participant collection with JSONL I/O and DataFrame support.
2
+
3
+ This module provides ParticipantCollection and IDMappingCollection for
4
+ managing multiple participants with JSONL serialization and pandas/polars
5
+ DataFrame conversion for analysis.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from pathlib import Path
11
+ from typing import TYPE_CHECKING, Literal
12
+ from uuid import UUID
13
+
14
+ import pandas as pd
15
+ import polars as pl
16
+ from pydantic import Field, field_validator
17
+
18
+ from bead.data.base import BeadBaseModel, JsonValue
19
+ from bead.data.serialization import read_jsonlines, write_jsonlines
20
+ from bead.participants.models import Participant, ParticipantIDMapping
21
+
22
+ if TYPE_CHECKING:
23
+ from bead.participants.metadata_spec import ParticipantMetadataSpec
24
+
25
+ # Type alias for supported DataFrame types (same pattern as bead/resources/lexicon.py)
26
+ DataFrame = pd.DataFrame | pl.DataFrame
27
+
28
+
29
+ def _empty_participant_list() -> list[Participant]:
30
+ """Return empty participant list."""
31
+ return []
32
+
33
+
34
+ def _empty_mapping_list() -> list[ParticipantIDMapping]:
35
+ """Return empty mapping list."""
36
+ return []
37
+
38
+
39
+ class ParticipantCollection(BeadBaseModel):
40
+ """Collection of participants with JSONL I/O and DataFrame support.
41
+
42
+ Provides methods for managing multiple participants, saving/loading
43
+ from JSONL files, and converting to pandas/polars DataFrames for analysis.
44
+
45
+ Attributes
46
+ ----------
47
+ name : str
48
+ Name of this collection.
49
+ participants : list[Participant]
50
+ List of participants.
51
+ metadata_spec_name : str | None
52
+ Name of the metadata spec used (for documentation).
53
+
54
+ Examples
55
+ --------
56
+ >>> collection = ParticipantCollection(name="study_001_participants")
57
+ >>> participant = Participant(
58
+ ... participant_metadata={"age": 25, "education": "bachelors"}
59
+ ... )
60
+ >>> collection.add_participant(participant)
61
+ >>> len(collection.participants)
62
+ 1
63
+ >>> collection.to_jsonl("participants.jsonl") # doctest: +SKIP
64
+ """
65
+
66
+ name: str = Field(..., description="Collection name")
67
+ participants: list[Participant] = Field(
68
+ default_factory=_empty_participant_list, description="Participants"
69
+ )
70
+ metadata_spec_name: str | None = Field(
71
+ default=None, description="Metadata spec used"
72
+ )
73
+
74
+ @field_validator("name")
75
+ @classmethod
76
+ def validate_name(cls, v: str) -> str:
77
+ """Validate name is non-empty.
78
+
79
+ Parameters
80
+ ----------
81
+ v : str
82
+ Collection name to validate.
83
+
84
+ Returns
85
+ -------
86
+ str
87
+ Validated collection name.
88
+
89
+ Raises
90
+ ------
91
+ ValueError
92
+ If name is empty or whitespace only.
93
+ """
94
+ if not v or not v.strip():
95
+ raise ValueError("Collection name cannot be empty")
96
+ return v.strip()
97
+
98
+ def __len__(self) -> int:
99
+ """Return number of participants.
100
+
101
+ Returns
102
+ -------
103
+ int
104
+ Number of participants in the collection.
105
+ """
106
+ return len(self.participants)
107
+
108
+ def add_participant(self, participant: Participant) -> None:
109
+ """Add a participant to the collection.
110
+
111
+ Parameters
112
+ ----------
113
+ participant : Participant
114
+ Participant to add.
115
+
116
+ Examples
117
+ --------
118
+ >>> collection = ParticipantCollection(name="test")
119
+ >>> p = Participant(participant_metadata={"age": 25})
120
+ >>> collection.add_participant(p)
121
+ >>> len(collection)
122
+ 1
123
+ """
124
+ self.participants.append(participant)
125
+ self.update_modified_time()
126
+
127
+ def add_participants(self, participants: list[Participant]) -> None:
128
+ """Add multiple participants to the collection.
129
+
130
+ Parameters
131
+ ----------
132
+ participants : list[Participant]
133
+ Participants to add.
134
+
135
+ Examples
136
+ --------
137
+ >>> collection = ParticipantCollection(name="test")
138
+ >>> ps = [Participant(), Participant()]
139
+ >>> collection.add_participants(ps)
140
+ >>> len(collection)
141
+ 2
142
+ """
143
+ self.participants.extend(participants)
144
+ self.update_modified_time()
145
+
146
+ def get_by_id(self, participant_id: UUID) -> Participant | None:
147
+ """Get participant by UUID.
148
+
149
+ Parameters
150
+ ----------
151
+ participant_id : UUID
152
+ Participant UUID to find.
153
+
154
+ Returns
155
+ -------
156
+ Participant | None
157
+ Participant if found, None otherwise.
158
+
159
+ Examples
160
+ --------
161
+ >>> collection = ParticipantCollection(name="test")
162
+ >>> p = Participant()
163
+ >>> collection.add_participant(p)
164
+ >>> found = collection.get_by_id(p.id)
165
+ >>> found is not None
166
+ True
167
+ """
168
+ for p in self.participants:
169
+ if p.id == participant_id:
170
+ return p
171
+ return None
172
+
173
+ def get_by_attribute(self, key: str, value: JsonValue) -> list[Participant]:
174
+ """Get participants by metadata attribute value.
175
+
176
+ Parameters
177
+ ----------
178
+ key : str
179
+ Attribute name.
180
+ value : JsonValue
181
+ Value to match.
182
+
183
+ Returns
184
+ -------
185
+ list[Participant]
186
+ Participants with matching attribute.
187
+
188
+ Examples
189
+ --------
190
+ >>> collection = ParticipantCollection(name="test")
191
+ >>> p1 = Participant(participant_metadata={"age": 25})
192
+ >>> p2 = Participant(participant_metadata={"age": 30})
193
+ >>> collection.add_participants([p1, p2])
194
+ >>> matches = collection.get_by_attribute("age", 25)
195
+ >>> len(matches)
196
+ 1
197
+ """
198
+ return [
199
+ p for p in self.participants if p.participant_metadata.get(key) == value
200
+ ]
201
+
202
+ def validate_all(self, spec: ParticipantMetadataSpec) -> dict[UUID, list[str]]:
203
+ """Validate all participants against a specification.
204
+
205
+ Parameters
206
+ ----------
207
+ spec : ParticipantMetadataSpec
208
+ Specification to validate against.
209
+
210
+ Returns
211
+ -------
212
+ dict[UUID, list[str]]
213
+ Mapping from participant ID to list of validation errors.
214
+ Empty dict if all valid.
215
+
216
+ Examples
217
+ --------
218
+ >>> from bead.participants.metadata_spec import (
219
+ ... FieldSpec, ParticipantMetadataSpec
220
+ ... )
221
+ >>> spec = ParticipantMetadataSpec(
222
+ ... name="test",
223
+ ... fields=[FieldSpec(name="age", field_type="int", required=True)]
224
+ ... )
225
+ >>> collection = ParticipantCollection(name="test")
226
+ >>> p = Participant(participant_metadata={"age": 25})
227
+ >>> collection.add_participant(p)
228
+ >>> errors = collection.validate_all(spec)
229
+ >>> len(errors)
230
+ 0
231
+ """
232
+ errors: dict[UUID, list[str]] = {}
233
+ for p in self.participants:
234
+ is_valid, error_list = p.validate_against_spec(spec)
235
+ if not is_valid:
236
+ errors[p.id] = error_list
237
+ return errors
238
+
239
+ # JSONL I/O
240
+
241
+ def to_jsonl(self, path: Path | str) -> None:
242
+ """Write participants to JSONL file.
243
+
244
+ Parameters
245
+ ----------
246
+ path : Path | str
247
+ Path to output file.
248
+
249
+ Examples
250
+ --------
251
+ >>> collection = ParticipantCollection(name="test")
252
+ >>> collection.add_participant(Participant())
253
+ >>> collection.to_jsonl("/tmp/participants.jsonl") # doctest: +SKIP
254
+ """
255
+ path = Path(path)
256
+ path.parent.mkdir(parents=True, exist_ok=True)
257
+ write_jsonlines(self.participants, path)
258
+
259
+ @classmethod
260
+ def from_jsonl(
261
+ cls,
262
+ path: Path | str,
263
+ name: str = "loaded_participants",
264
+ ) -> ParticipantCollection:
265
+ """Load participants from JSONL file.
266
+
267
+ Parameters
268
+ ----------
269
+ path : Path | str
270
+ Path to JSONL file.
271
+ name : str
272
+ Name for the collection.
273
+
274
+ Returns
275
+ -------
276
+ ParticipantCollection
277
+ Collection with loaded participants.
278
+
279
+ Examples
280
+ --------
281
+ >>> collection = ParticipantCollection.from_jsonl(
282
+ ... "participants.jsonl"
283
+ ... ) # doctest: +SKIP
284
+ """
285
+ participants = read_jsonlines(Path(path), Participant)
286
+ return cls(name=name, participants=participants)
287
+
288
+ # DataFrame conversion
289
+
290
+ def to_dataframe(
291
+ self,
292
+ backend: Literal["pandas", "polars"] = "pandas",
293
+ include_fields: list[str] | None = None,
294
+ exclude_fields: list[str] | None = None,
295
+ flatten_metadata: bool = True,
296
+ ) -> DataFrame:
297
+ """Convert to pandas or polars DataFrame.
298
+
299
+ Parameters
300
+ ----------
301
+ backend : Literal["pandas", "polars"]
302
+ DataFrame backend to use (default: "pandas").
303
+ include_fields : list[str] | None
304
+ If provided, only include these metadata fields.
305
+ exclude_fields : list[str] | None
306
+ If provided, exclude these metadata fields.
307
+ flatten_metadata : bool
308
+ If True, flatten participant_metadata into top-level columns.
309
+
310
+ Returns
311
+ -------
312
+ DataFrame
313
+ pandas or polars DataFrame with participant data.
314
+ Always includes 'participant_id' column (as string).
315
+
316
+ Examples
317
+ --------
318
+ >>> collection = ParticipantCollection(name="test")
319
+ >>> p = Participant(participant_metadata={"age": 25})
320
+ >>> collection.add_participant(p)
321
+ >>> df = collection.to_dataframe()
322
+ >>> "participant_id" in df.columns
323
+ True
324
+ >>> "age" in df.columns
325
+ True
326
+ """
327
+ if not self.participants:
328
+ # Return empty DataFrame with expected columns
329
+ columns = ["participant_id", "created_at", "study_id"]
330
+ if backend == "pandas":
331
+ return pd.DataFrame(columns=columns)
332
+ else:
333
+ schema: dict[str, type[pl.Utf8]] = dict.fromkeys(columns, pl.Utf8)
334
+ return pl.DataFrame(schema=schema)
335
+
336
+ records: list[dict[str, JsonValue]] = []
337
+
338
+ for p in self.participants:
339
+ record: dict[str, JsonValue] = {
340
+ "participant_id": str(p.id),
341
+ "created_at": p.created_at.isoformat(),
342
+ "study_id": p.study_id,
343
+ }
344
+
345
+ if flatten_metadata:
346
+ for key, value in p.participant_metadata.items():
347
+ # Apply include/exclude filters
348
+ if include_fields is not None and key not in include_fields:
349
+ continue
350
+ if exclude_fields is not None and key in exclude_fields:
351
+ continue
352
+ record[key] = value
353
+ else:
354
+ record["participant_metadata"] = p.participant_metadata
355
+
356
+ records.append(record)
357
+
358
+ if backend == "pandas":
359
+ return pd.DataFrame(records)
360
+ else:
361
+ return pl.DataFrame(records)
362
+
363
+ @classmethod
364
+ def from_dataframe(
365
+ cls,
366
+ df: DataFrame,
367
+ name: str,
368
+ id_column: str = "participant_id",
369
+ metadata_columns: list[str] | None = None,
370
+ ) -> ParticipantCollection:
371
+ """Create collection from pandas or polars DataFrame.
372
+
373
+ Parameters
374
+ ----------
375
+ df : DataFrame
376
+ pandas or polars DataFrame with participant data.
377
+ name : str
378
+ Name for the collection.
379
+ id_column : str
380
+ Column containing participant IDs (default: "participant_id").
381
+ If column exists, uses those UUIDs; otherwise generates new ones.
382
+ metadata_columns : list[str] | None
383
+ Columns to include in participant_metadata.
384
+ If None, includes all columns except id_column.
385
+
386
+ Returns
387
+ -------
388
+ ParticipantCollection
389
+ Collection with participants from DataFrame.
390
+
391
+ Examples
392
+ --------
393
+ >>> import pandas as pd
394
+ >>> df = pd.DataFrame({
395
+ ... "age": [25, 30],
396
+ ... "education": ["bachelors", "masters"]
397
+ ... })
398
+ >>> collection = ParticipantCollection.from_dataframe(df, "test")
399
+ >>> len(collection)
400
+ 2
401
+ """
402
+ # Check if it's a polars DataFrame
403
+ is_polars = isinstance(df, pl.DataFrame)
404
+
405
+ # Get columns, handling both pandas and polars
406
+ if is_polars:
407
+ assert isinstance(df, pl.DataFrame)
408
+ columns_list: list[str] = df.columns
409
+ else:
410
+ assert isinstance(df, pd.DataFrame)
411
+ columns_list = list(df.columns)
412
+
413
+ # Convert to dict format for iteration
414
+ rows: list[dict[str, JsonValue]]
415
+ if is_polars:
416
+ assert isinstance(df, pl.DataFrame)
417
+ rows = df.to_dicts() # type: ignore[assignment]
418
+ else:
419
+ assert isinstance(df, pd.DataFrame)
420
+ rows = df.to_dict("records") # type: ignore[assignment]
421
+
422
+ participants: list[Participant] = []
423
+
424
+ for row in rows:
425
+ # Handle participant ID
426
+ pid: UUID | None = None
427
+ if id_column in columns_list:
428
+ try:
429
+ pid = UUID(str(row[id_column]))
430
+ except (ValueError, TypeError):
431
+ pid = None # Will use auto-generated UUID
432
+
433
+ # Build metadata dict
434
+ metadata: dict[str, JsonValue] = {}
435
+ columns = metadata_columns or [c for c in columns_list if c != id_column]
436
+ for col in columns:
437
+ if col in row and row[col] is not None:
438
+ # Handle pandas NaN
439
+ value = row[col]
440
+ if is_polars:
441
+ # Polars uses None for nulls
442
+ metadata[col] = value
443
+ else:
444
+ # Pandas uses NaN - check for NaN (NaN != NaN)
445
+ is_nan = isinstance(value, float) and value != value
446
+ if not is_nan:
447
+ metadata[col] = value
448
+
449
+ # Create participant
450
+ if pid is not None:
451
+ participant = Participant(
452
+ id=pid,
453
+ participant_metadata=metadata,
454
+ )
455
+ else:
456
+ participant = Participant(participant_metadata=metadata)
457
+
458
+ participants.append(participant)
459
+
460
+ return cls(name=name, participants=participants)
461
+
462
+
463
+ class IDMappingCollection(BeadBaseModel):
464
+ """Collection of ID mappings (stored separately for privacy).
465
+
466
+ This collection should be stored in a SEPARATE file from participant
467
+ data for IRB/privacy compliance.
468
+
469
+ Attributes
470
+ ----------
471
+ name : str
472
+ Name of this mapping collection.
473
+ mappings : list[ParticipantIDMapping]
474
+ List of ID mappings.
475
+ source : str
476
+ Primary source of external IDs (e.g., "prolific").
477
+
478
+ Examples
479
+ --------
480
+ >>> from uuid import uuid4
481
+ >>> collection = IDMappingCollection(name="study_001", source="prolific")
482
+ >>> mapping = collection.add_mapping("PROLIFIC_ABC123", uuid4())
483
+ >>> collection.get_participant_id("PROLIFIC_ABC123") is not None
484
+ True
485
+ """
486
+
487
+ name: str = Field(..., description="Collection name")
488
+ mappings: list[ParticipantIDMapping] = Field(
489
+ default_factory=_empty_mapping_list, description="ID mappings"
490
+ )
491
+ source: str = Field(..., description="Primary external ID source")
492
+
493
+ @field_validator("name", "source")
494
+ @classmethod
495
+ def validate_non_empty(cls, v: str) -> str:
496
+ """Validate string fields are non-empty.
497
+
498
+ Parameters
499
+ ----------
500
+ v : str
501
+ String to validate.
502
+
503
+ Returns
504
+ -------
505
+ str
506
+ Validated string.
507
+
508
+ Raises
509
+ ------
510
+ ValueError
511
+ If string is empty or whitespace only.
512
+ """
513
+ if not v or not v.strip():
514
+ raise ValueError("Field cannot be empty")
515
+ return v.strip()
516
+
517
+ def __len__(self) -> int:
518
+ """Return number of mappings.
519
+
520
+ Returns
521
+ -------
522
+ int
523
+ Number of mappings in the collection.
524
+ """
525
+ return len(self.mappings)
526
+
527
+ def add_mapping(
528
+ self,
529
+ external_id: str,
530
+ participant_id: UUID,
531
+ external_source: str | None = None,
532
+ ) -> ParticipantIDMapping:
533
+ """Create and add a new ID mapping.
534
+
535
+ Parameters
536
+ ----------
537
+ external_id : str
538
+ External participant ID.
539
+ participant_id : UUID
540
+ Internal participant UUID.
541
+ external_source : str | None
542
+ Source of external ID (defaults to collection's source).
543
+
544
+ Returns
545
+ -------
546
+ ParticipantIDMapping
547
+ The created mapping.
548
+
549
+ Examples
550
+ --------
551
+ >>> from uuid import uuid4
552
+ >>> collection = IDMappingCollection(name="test", source="prolific")
553
+ >>> mapping = collection.add_mapping("ABC123", uuid4())
554
+ >>> mapping.external_source
555
+ 'prolific'
556
+ """
557
+ mapping = ParticipantIDMapping(
558
+ external_id=external_id,
559
+ external_source=external_source or self.source,
560
+ participant_id=participant_id,
561
+ )
562
+ self.mappings.append(mapping)
563
+ self.update_modified_time()
564
+ return mapping
565
+
566
+ def get_participant_id(self, external_id: str) -> UUID | None:
567
+ """Look up internal participant ID from external ID.
568
+
569
+ Parameters
570
+ ----------
571
+ external_id : str
572
+ External ID to look up.
573
+
574
+ Returns
575
+ -------
576
+ UUID | None
577
+ Internal participant ID if found, None otherwise.
578
+
579
+ Examples
580
+ --------
581
+ >>> from uuid import uuid4
582
+ >>> collection = IDMappingCollection(name="test", source="prolific")
583
+ >>> pid = uuid4()
584
+ >>> collection.add_mapping("ABC123", pid)
585
+ >>> collection.get_participant_id("ABC123") == pid
586
+ True
587
+ >>> collection.get_participant_id("UNKNOWN") is None
588
+ True
589
+ """
590
+ for m in self.mappings:
591
+ if m.external_id == external_id and m.is_active:
592
+ return m.participant_id
593
+ return None
594
+
595
+ def get_external_id(self, participant_id: UUID) -> str | None:
596
+ """Look up external ID from internal participant ID.
597
+
598
+ Parameters
599
+ ----------
600
+ participant_id : UUID
601
+ Internal participant ID to look up.
602
+
603
+ Returns
604
+ -------
605
+ str | None
606
+ External ID if found, None otherwise.
607
+
608
+ Examples
609
+ --------
610
+ >>> from uuid import uuid4
611
+ >>> collection = IDMappingCollection(name="test", source="prolific")
612
+ >>> pid = uuid4()
613
+ >>> collection.add_mapping("ABC123", pid)
614
+ >>> collection.get_external_id(pid)
615
+ 'ABC123'
616
+ """
617
+ for m in self.mappings:
618
+ if m.participant_id == participant_id and m.is_active:
619
+ return m.external_id
620
+ return None
621
+
622
+ def deactivate_all(self) -> int:
623
+ """Deactivate all mappings (for bulk privacy removal).
624
+
625
+ Returns
626
+ -------
627
+ int
628
+ Number of mappings deactivated.
629
+
630
+ Examples
631
+ --------
632
+ >>> from uuid import uuid4
633
+ >>> collection = IDMappingCollection(name="test", source="prolific")
634
+ >>> collection.add_mapping("ABC123", uuid4())
635
+ >>> collection.add_mapping("DEF456", uuid4())
636
+ >>> count = collection.deactivate_all()
637
+ >>> count
638
+ 2
639
+ """
640
+ count = 0
641
+ for m in self.mappings:
642
+ if m.is_active:
643
+ m.deactivate()
644
+ count += 1
645
+ self.update_modified_time()
646
+ return count
647
+
648
+ # JSONL I/O
649
+
650
+ def to_jsonl(self, path: Path | str) -> None:
651
+ """Write mappings to JSONL file.
652
+
653
+ Parameters
654
+ ----------
655
+ path : Path | str
656
+ Path to output file.
657
+
658
+ Examples
659
+ --------
660
+ >>> from uuid import uuid4
661
+ >>> collection = IDMappingCollection(name="test", source="prolific")
662
+ >>> collection.add_mapping("ABC123", uuid4())
663
+ >>> collection.to_jsonl("/tmp/mappings.jsonl") # doctest: +SKIP
664
+ """
665
+ path = Path(path)
666
+ path.parent.mkdir(parents=True, exist_ok=True)
667
+ write_jsonlines(self.mappings, path)
668
+
669
+ @classmethod
670
+ def from_jsonl(
671
+ cls,
672
+ path: Path | str,
673
+ name: str = "loaded_mappings",
674
+ source: str = "unknown",
675
+ ) -> IDMappingCollection:
676
+ """Load mappings from JSONL file.
677
+
678
+ Parameters
679
+ ----------
680
+ path : Path | str
681
+ Path to JSONL file.
682
+ name : str
683
+ Name for the collection.
684
+ source : str
685
+ External ID source.
686
+
687
+ Returns
688
+ -------
689
+ IDMappingCollection
690
+ Collection with loaded mappings.
691
+
692
+ Examples
693
+ --------
694
+ >>> collection = IDMappingCollection.from_jsonl(
695
+ ... "mappings.jsonl", source="prolific"
696
+ ... ) # doctest: +SKIP
697
+ """
698
+ mappings = read_jsonlines(Path(path), ParticipantIDMapping)
699
+ return cls(name=name, mappings=mappings, source=source)