bead 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (231)
  1. bead/__init__.py +11 -0
  2. bead/__main__.py +11 -0
  3. bead/active_learning/__init__.py +15 -0
  4. bead/active_learning/config.py +231 -0
  5. bead/active_learning/loop.py +566 -0
  6. bead/active_learning/models/__init__.py +24 -0
  7. bead/active_learning/models/base.py +852 -0
  8. bead/active_learning/models/binary.py +910 -0
  9. bead/active_learning/models/categorical.py +943 -0
  10. bead/active_learning/models/cloze.py +862 -0
  11. bead/active_learning/models/forced_choice.py +956 -0
  12. bead/active_learning/models/free_text.py +773 -0
  13. bead/active_learning/models/lora.py +365 -0
  14. bead/active_learning/models/magnitude.py +835 -0
  15. bead/active_learning/models/multi_select.py +795 -0
  16. bead/active_learning/models/ordinal_scale.py +811 -0
  17. bead/active_learning/models/peft_adapter.py +155 -0
  18. bead/active_learning/models/random_effects.py +639 -0
  19. bead/active_learning/selection.py +354 -0
  20. bead/active_learning/strategies.py +391 -0
  21. bead/active_learning/trainers/__init__.py +26 -0
  22. bead/active_learning/trainers/base.py +210 -0
  23. bead/active_learning/trainers/data_collator.py +172 -0
  24. bead/active_learning/trainers/dataset_utils.py +261 -0
  25. bead/active_learning/trainers/huggingface.py +304 -0
  26. bead/active_learning/trainers/lightning.py +324 -0
  27. bead/active_learning/trainers/metrics.py +424 -0
  28. bead/active_learning/trainers/mixed_effects.py +551 -0
  29. bead/active_learning/trainers/model_wrapper.py +509 -0
  30. bead/active_learning/trainers/registry.py +104 -0
  31. bead/adapters/__init__.py +11 -0
  32. bead/adapters/huggingface.py +61 -0
  33. bead/behavioral/__init__.py +116 -0
  34. bead/behavioral/analytics.py +646 -0
  35. bead/behavioral/extraction.py +343 -0
  36. bead/behavioral/merging.py +343 -0
  37. bead/cli/__init__.py +11 -0
  38. bead/cli/active_learning.py +513 -0
  39. bead/cli/active_learning_commands.py +779 -0
  40. bead/cli/completion.py +359 -0
  41. bead/cli/config.py +624 -0
  42. bead/cli/constraint_builders.py +286 -0
  43. bead/cli/deployment.py +859 -0
  44. bead/cli/deployment_trials.py +493 -0
  45. bead/cli/deployment_ui.py +332 -0
  46. bead/cli/display.py +378 -0
  47. bead/cli/items.py +960 -0
  48. bead/cli/items_factories.py +776 -0
  49. bead/cli/list_constraints.py +714 -0
  50. bead/cli/lists.py +490 -0
  51. bead/cli/main.py +430 -0
  52. bead/cli/models.py +877 -0
  53. bead/cli/resource_loaders.py +621 -0
  54. bead/cli/resources.py +1036 -0
  55. bead/cli/shell.py +356 -0
  56. bead/cli/simulate.py +840 -0
  57. bead/cli/templates.py +1158 -0
  58. bead/cli/training.py +1080 -0
  59. bead/cli/utils.py +614 -0
  60. bead/cli/workflow.py +1273 -0
  61. bead/config/__init__.py +68 -0
  62. bead/config/active_learning.py +1009 -0
  63. bead/config/config.py +192 -0
  64. bead/config/defaults.py +118 -0
  65. bead/config/deployment.py +217 -0
  66. bead/config/env.py +147 -0
  67. bead/config/item.py +45 -0
  68. bead/config/list.py +193 -0
  69. bead/config/loader.py +149 -0
  70. bead/config/logging.py +42 -0
  71. bead/config/model.py +49 -0
  72. bead/config/paths.py +46 -0
  73. bead/config/profiles.py +320 -0
  74. bead/config/resources.py +47 -0
  75. bead/config/serialization.py +210 -0
  76. bead/config/simulation.py +206 -0
  77. bead/config/template.py +238 -0
  78. bead/config/validation.py +267 -0
  79. bead/data/__init__.py +65 -0
  80. bead/data/base.py +87 -0
  81. bead/data/identifiers.py +97 -0
  82. bead/data/language_codes.py +61 -0
  83. bead/data/metadata.py +270 -0
  84. bead/data/range.py +123 -0
  85. bead/data/repository.py +358 -0
  86. bead/data/serialization.py +249 -0
  87. bead/data/timestamps.py +89 -0
  88. bead/data/validation.py +349 -0
  89. bead/data_collection/__init__.py +11 -0
  90. bead/data_collection/jatos.py +223 -0
  91. bead/data_collection/merger.py +154 -0
  92. bead/data_collection/prolific.py +198 -0
  93. bead/deployment/__init__.py +5 -0
  94. bead/deployment/distribution.py +402 -0
  95. bead/deployment/jatos/__init__.py +1 -0
  96. bead/deployment/jatos/api.py +200 -0
  97. bead/deployment/jatos/exporter.py +210 -0
  98. bead/deployment/jspsych/__init__.py +9 -0
  99. bead/deployment/jspsych/biome.json +44 -0
  100. bead/deployment/jspsych/config.py +411 -0
  101. bead/deployment/jspsych/generator.py +598 -0
  102. bead/deployment/jspsych/package.json +51 -0
  103. bead/deployment/jspsych/pnpm-lock.yaml +2141 -0
  104. bead/deployment/jspsych/randomizer.py +299 -0
  105. bead/deployment/jspsych/src/lib/list-distributor.test.ts +327 -0
  106. bead/deployment/jspsych/src/lib/list-distributor.ts +1282 -0
  107. bead/deployment/jspsych/src/lib/randomizer.test.ts +232 -0
  108. bead/deployment/jspsych/src/lib/randomizer.ts +367 -0
  109. bead/deployment/jspsych/src/plugins/cloze-dropdown.ts +252 -0
  110. bead/deployment/jspsych/src/plugins/forced-choice.ts +265 -0
  111. bead/deployment/jspsych/src/plugins/plugins.test.ts +141 -0
  112. bead/deployment/jspsych/src/plugins/rating.ts +248 -0
  113. bead/deployment/jspsych/src/slopit/index.ts +9 -0
  114. bead/deployment/jspsych/src/types/jatos.d.ts +256 -0
  115. bead/deployment/jspsych/src/types/jspsych.d.ts +228 -0
  116. bead/deployment/jspsych/templates/experiment.css +1 -0
  117. bead/deployment/jspsych/templates/experiment.js.template +289 -0
  118. bead/deployment/jspsych/templates/index.html +51 -0
  119. bead/deployment/jspsych/templates/randomizer.js +241 -0
  120. bead/deployment/jspsych/templates/randomizer.js.template +313 -0
  121. bead/deployment/jspsych/trials.py +723 -0
  122. bead/deployment/jspsych/tsconfig.json +23 -0
  123. bead/deployment/jspsych/tsup.config.ts +30 -0
  124. bead/deployment/jspsych/ui/__init__.py +1 -0
  125. bead/deployment/jspsych/ui/components.py +383 -0
  126. bead/deployment/jspsych/ui/styles.py +411 -0
  127. bead/dsl/__init__.py +80 -0
  128. bead/dsl/ast.py +168 -0
  129. bead/dsl/context.py +178 -0
  130. bead/dsl/errors.py +71 -0
  131. bead/dsl/evaluator.py +570 -0
  132. bead/dsl/grammar.lark +81 -0
  133. bead/dsl/parser.py +231 -0
  134. bead/dsl/stdlib.py +929 -0
  135. bead/evaluation/__init__.py +13 -0
  136. bead/evaluation/convergence.py +485 -0
  137. bead/evaluation/interannotator.py +398 -0
  138. bead/items/__init__.py +40 -0
  139. bead/items/adapters/__init__.py +70 -0
  140. bead/items/adapters/anthropic.py +224 -0
  141. bead/items/adapters/api_utils.py +167 -0
  142. bead/items/adapters/base.py +216 -0
  143. bead/items/adapters/google.py +259 -0
  144. bead/items/adapters/huggingface.py +1074 -0
  145. bead/items/adapters/openai.py +323 -0
  146. bead/items/adapters/registry.py +202 -0
  147. bead/items/adapters/sentence_transformers.py +224 -0
  148. bead/items/adapters/togetherai.py +309 -0
  149. bead/items/binary.py +515 -0
  150. bead/items/cache.py +558 -0
  151. bead/items/categorical.py +593 -0
  152. bead/items/cloze.py +757 -0
  153. bead/items/constructor.py +784 -0
  154. bead/items/forced_choice.py +413 -0
  155. bead/items/free_text.py +681 -0
  156. bead/items/generation.py +432 -0
  157. bead/items/item.py +396 -0
  158. bead/items/item_template.py +787 -0
  159. bead/items/magnitude.py +573 -0
  160. bead/items/multi_select.py +621 -0
  161. bead/items/ordinal_scale.py +569 -0
  162. bead/items/scoring.py +448 -0
  163. bead/items/validation.py +723 -0
  164. bead/lists/__init__.py +30 -0
  165. bead/lists/balancer.py +263 -0
  166. bead/lists/constraints.py +1067 -0
  167. bead/lists/experiment_list.py +286 -0
  168. bead/lists/list_collection.py +378 -0
  169. bead/lists/partitioner.py +1141 -0
  170. bead/lists/stratification.py +254 -0
  171. bead/participants/__init__.py +73 -0
  172. bead/participants/collection.py +699 -0
  173. bead/participants/merging.py +312 -0
  174. bead/participants/metadata_spec.py +491 -0
  175. bead/participants/models.py +276 -0
  176. bead/resources/__init__.py +29 -0
  177. bead/resources/adapters/__init__.py +19 -0
  178. bead/resources/adapters/base.py +104 -0
  179. bead/resources/adapters/cache.py +128 -0
  180. bead/resources/adapters/glazing.py +508 -0
  181. bead/resources/adapters/registry.py +117 -0
  182. bead/resources/adapters/unimorph.py +796 -0
  183. bead/resources/classification.py +856 -0
  184. bead/resources/constraint_builders.py +329 -0
  185. bead/resources/constraints.py +165 -0
  186. bead/resources/lexical_item.py +223 -0
  187. bead/resources/lexicon.py +744 -0
  188. bead/resources/loaders.py +209 -0
  189. bead/resources/template.py +441 -0
  190. bead/resources/template_collection.py +707 -0
  191. bead/resources/template_generation.py +349 -0
  192. bead/simulation/__init__.py +29 -0
  193. bead/simulation/annotators/__init__.py +15 -0
  194. bead/simulation/annotators/base.py +175 -0
  195. bead/simulation/annotators/distance_based.py +135 -0
  196. bead/simulation/annotators/lm_based.py +114 -0
  197. bead/simulation/annotators/oracle.py +182 -0
  198. bead/simulation/annotators/random.py +181 -0
  199. bead/simulation/dsl_extension/__init__.py +3 -0
  200. bead/simulation/noise_models/__init__.py +13 -0
  201. bead/simulation/noise_models/base.py +42 -0
  202. bead/simulation/noise_models/random_noise.py +82 -0
  203. bead/simulation/noise_models/systematic.py +132 -0
  204. bead/simulation/noise_models/temperature.py +86 -0
  205. bead/simulation/runner.py +144 -0
  206. bead/simulation/strategies/__init__.py +23 -0
  207. bead/simulation/strategies/base.py +123 -0
  208. bead/simulation/strategies/binary.py +103 -0
  209. bead/simulation/strategies/categorical.py +123 -0
  210. bead/simulation/strategies/cloze.py +224 -0
  211. bead/simulation/strategies/forced_choice.py +127 -0
  212. bead/simulation/strategies/free_text.py +105 -0
  213. bead/simulation/strategies/magnitude.py +116 -0
  214. bead/simulation/strategies/multi_select.py +129 -0
  215. bead/simulation/strategies/ordinal_scale.py +131 -0
  216. bead/templates/__init__.py +27 -0
  217. bead/templates/adapters/__init__.py +17 -0
  218. bead/templates/adapters/base.py +128 -0
  219. bead/templates/adapters/cache.py +178 -0
  220. bead/templates/adapters/huggingface.py +312 -0
  221. bead/templates/combinatorics.py +103 -0
  222. bead/templates/filler.py +605 -0
  223. bead/templates/renderers.py +177 -0
  224. bead/templates/resolver.py +178 -0
  225. bead/templates/strategies.py +1806 -0
  226. bead/templates/streaming.py +195 -0
  227. bead-0.1.0.dist-info/METADATA +212 -0
  228. bead-0.1.0.dist-info/RECORD +231 -0
  229. bead-0.1.0.dist-info/WHEEL +4 -0
  230. bead-0.1.0.dist-info/entry_points.txt +2 -0
  231. bead-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,358 @@
1
+ """Repository pattern for data access with optional caching.
2
+
3
+ This module provides a generic Repository class that implements CRUD operations
4
+ for Pydantic models, with optional in-memory caching for efficient access.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from pathlib import Path
10
+ from uuid import UUID
11
+
12
+ from pydantic import BaseModel
13
+
14
+ from bead.data.serialization import (
15
+ append_jsonlines,
16
+ read_jsonlines,
17
+ write_jsonlines,
18
+ )
19
+
20
+
21
+ class Repository[T: BaseModel]:
22
+ """Generic repository for CRUD operations on Pydantic models.
23
+
24
+ Provides create, read, update, delete operations with JSONLines file storage
25
+ and optional in-memory caching for efficient data access.
26
+
27
+ Type Parameters
28
+ ---------------
29
+ T : BaseModel
30
+ Pydantic model type this repository manages
31
+
32
+ Parameters
33
+ ----------
34
+ model_class : type[T]
35
+ The Pydantic model class this repository manages
36
+ storage_path : Path
37
+ Path to the JSONLines file for persistent storage
38
+ use_cache : bool, optional
39
+ Whether to use in-memory caching (default: True)
40
+
41
+ Attributes
42
+ ----------
43
+ model_class : type[T]
44
+ The Pydantic model class
45
+ storage_path : Path
46
+ Path to storage file
47
+ use_cache : bool
48
+ Whether caching is enabled
49
+ cache : dict[UUID, T]
50
+ In-memory cache of objects by ID
51
+
52
+ Examples
53
+ --------
54
+ >>> from pathlib import Path
55
+ >>> from bead.data.base import BeadBaseModel
56
+ >>> class MyModel(BeadBaseModel):
57
+ ... name: str
58
+ >>> repo = Repository[MyModel](
59
+ ... model_class=MyModel,
60
+ ... storage_path=Path("data/models.jsonl"),
61
+ ... use_cache=True
62
+ ... )
63
+ >>> obj = MyModel(name="test")
64
+ >>> repo.add(obj)
65
+ >>> loaded = repo.get(obj.id)
66
+ >>> loaded.name
67
+ 'test'
68
+ >>> repo.count()
69
+ 1
70
+ """
71
+
72
+ def __init__(
73
+ self, model_class: type[T], storage_path: Path, use_cache: bool = True
74
+ ) -> None:
75
+ self.model_class = model_class
76
+ self.storage_path = storage_path
77
+ self.use_cache = use_cache
78
+ self.cache: dict[UUID, T] = {}
79
+
80
+ # load cache on init if enabled and file exists
81
+ if self.use_cache and self.storage_path.exists():
82
+ self._load_cache()
83
+
84
+ def _load_cache(self) -> None:
85
+ """Load all objects from storage into cache.
86
+
87
+ Called during initialization if caching is enabled and the storage
88
+ file exists.
89
+ """
90
+ objects = read_jsonlines(self.storage_path, self.model_class)
91
+ self.cache = {obj.id: obj for obj in objects} # type: ignore[attr-defined]
92
+
93
+ def get(self, object_id: UUID) -> T | None:
94
+ """Get object by ID.
95
+
96
+ Parameters
97
+ ----------
98
+ object_id
99
+ ID of the object to retrieve.
100
+
101
+ Returns
102
+ -------
103
+ T | None
104
+ The object if found, None otherwise.
105
+
106
+ Examples
107
+ --------
108
+ >>> repo = Repository[MyModel](MyModel, Path("data.jsonl"))
109
+ >>> obj = MyModel(name="test")
110
+ >>> repo.add(obj)
111
+ >>> loaded = repo.get(obj.id)
112
+ >>> loaded is not None
113
+ True
114
+ """
115
+ if self.use_cache:
116
+ return self.cache.get(object_id)
117
+ else:
118
+ # scan file for object
119
+ if not self.storage_path.exists():
120
+ return None
121
+ objects = read_jsonlines(self.storage_path, self.model_class)
122
+ for obj in objects:
123
+ if obj.id == object_id: # type: ignore[attr-defined]
124
+ return obj
125
+ return None
126
+
127
+ def get_all(self) -> list[T]:
128
+ """Get all objects.
129
+
130
+ Returns
131
+ -------
132
+ list[T]
133
+ List of all objects in the repository
134
+
135
+ Examples
136
+ --------
137
+ >>> repo = Repository[MyModel](MyModel, Path("data.jsonl"))
138
+ >>> repo.add(MyModel(name="test1"))
139
+ >>> repo.add(MyModel(name="test2"))
140
+ >>> len(repo.get_all())
141
+ 2
142
+ """
143
+ if self.use_cache:
144
+ return list(self.cache.values())
145
+ else:
146
+ if not self.storage_path.exists():
147
+ return []
148
+ return read_jsonlines(self.storage_path, self.model_class)
149
+
150
+ def add(self, obj: T) -> None:
151
+ """Add single object to repository.
152
+
153
+ Appends the object to the storage file and updates cache if enabled.
154
+
155
+ Parameters
156
+ ----------
157
+ obj
158
+ Object to add.
159
+
160
+ Examples
161
+ --------
162
+ >>> repo = Repository[MyModel](MyModel, Path("data.jsonl"))
163
+ >>> obj = MyModel(name="test")
164
+ >>> repo.add(obj)
165
+ >>> repo.exists(obj.id)
166
+ True
167
+ """
168
+ # create parent directories if needed
169
+ self.storage_path.parent.mkdir(parents=True, exist_ok=True)
170
+
171
+ # append to file
172
+ append_jsonlines([obj], self.storage_path)
173
+
174
+ # update cache
175
+ if self.use_cache:
176
+ self.cache[obj.id] = obj # type: ignore[attr-defined]
177
+
178
+ def add_many(self, objects: list[T]) -> None:
179
+ """Add multiple objects to repository.
180
+
181
+ Appends all objects to the storage file and updates cache if enabled.
182
+
183
+ Parameters
184
+ ----------
185
+ objects
186
+ List of objects to add.
187
+
188
+ Examples
189
+ --------
190
+ >>> repo = Repository[MyModel](MyModel, Path("data.jsonl"))
191
+ >>> objs = [MyModel(name="test1"), MyModel(name="test2")]
192
+ >>> repo.add_many(objs)
193
+ >>> repo.count()
194
+ 2
195
+ """
196
+ if not objects:
197
+ return
198
+
199
+ # create parent directories if needed
200
+ self.storage_path.parent.mkdir(parents=True, exist_ok=True)
201
+
202
+ # append to file
203
+ append_jsonlines(objects, self.storage_path)
204
+
205
+ # update cache
206
+ if self.use_cache:
207
+ for obj in objects:
208
+ self.cache[obj.id] = obj # type: ignore[attr-defined]
209
+
210
+ def update(self, obj: T) -> None:
211
+ """Update existing object.
212
+
213
+ Rewrites the entire storage file with the updated object.
214
+
215
+ Parameters
216
+ ----------
217
+ obj
218
+ Object to update (must have existing ID).
219
+
220
+ Examples
221
+ --------
222
+ >>> repo = Repository[MyModel](MyModel, Path("data.jsonl"))
223
+ >>> obj = MyModel(name="test")
224
+ >>> repo.add(obj)
225
+ >>> obj.name = "updated"
226
+ >>> repo.update(obj)
227
+ >>> loaded = repo.get(obj.id)
228
+ >>> loaded.name
229
+ 'updated'
230
+ """
231
+ # update in cache
232
+ if self.use_cache:
233
+ self.cache[obj.id] = obj # type: ignore[attr-defined]
234
+
235
+ # rewrite file
236
+ objects = list(self.cache.values()) if self.use_cache else self.get_all()
237
+ # replace the object in the list
238
+ objects = [o if o.id != obj.id else obj for o in objects] # type: ignore[attr-defined]
239
+
240
+ # create parent directories if needed
241
+ self.storage_path.parent.mkdir(parents=True, exist_ok=True)
242
+
243
+ write_jsonlines(objects, self.storage_path)
244
+
245
+ def delete(self, object_id: UUID) -> None:
246
+ """Delete object by ID.
247
+
248
+ Rewrites the entire storage file without the deleted object.
249
+
250
+ Parameters
251
+ ----------
252
+ object_id
253
+ ID of object to delete.
254
+
255
+ Examples
256
+ --------
257
+ >>> repo = Repository[MyModel](MyModel, Path("data.jsonl"))
258
+ >>> obj = MyModel(name="test")
259
+ >>> repo.add(obj)
260
+ >>> repo.delete(obj.id)
261
+ >>> repo.exists(obj.id)
262
+ False
263
+ """
264
+ # remove from cache
265
+ if self.use_cache:
266
+ self.cache.pop(object_id, None)
267
+
268
+ # rewrite file without the object
269
+ objects = list(self.cache.values()) if self.use_cache else self.get_all()
270
+ objects = [o for o in objects if o.id != object_id] # type: ignore[attr-defined]
271
+
272
+ if objects:
273
+ self.storage_path.parent.mkdir(parents=True, exist_ok=True)
274
+ write_jsonlines(objects, self.storage_path)
275
+ elif self.storage_path.exists():
276
+ # if no objects left, delete the file
277
+ self.storage_path.unlink()
278
+
279
+ def exists(self, object_id: UUID) -> bool:
280
+ """Check if object exists.
281
+
282
+ Parameters
283
+ ----------
284
+ object_id
285
+ ID of object to check.
286
+
287
+ Returns
288
+ -------
289
+ bool
290
+ True if object exists, False otherwise.
291
+
292
+ Examples
293
+ --------
294
+ >>> repo = Repository[MyModel](MyModel, Path("data.jsonl"))
295
+ >>> obj = MyModel(name="test")
296
+ >>> repo.add(obj)
297
+ >>> repo.exists(obj.id)
298
+ True
299
+ """
300
+ return self.get(object_id) is not None
301
+
302
+ def count(self) -> int:
303
+ """Count objects in repository.
304
+
305
+ Returns
306
+ -------
307
+ int
308
+ Number of objects
309
+
310
+ Examples
311
+ --------
312
+ >>> repo = Repository[MyModel](MyModel, Path("data.jsonl"))
313
+ >>> repo.count()
314
+ 0
315
+ >>> repo.add(MyModel(name="test"))
316
+ >>> repo.count()
317
+ 1
318
+ """
319
+ if self.use_cache:
320
+ return len(self.cache)
321
+ else:
322
+ if not self.storage_path.exists():
323
+ return 0
324
+ return len(read_jsonlines(self.storage_path, self.model_class))
325
+
326
+ def clear(self) -> None:
327
+ """Clear all objects and delete storage file.
328
+
329
+ Examples
330
+ --------
331
+ >>> repo = Repository[MyModel](MyModel, Path("data.jsonl"))
332
+ >>> repo.add(MyModel(name="test"))
333
+ >>> repo.clear()
334
+ >>> repo.count()
335
+ 0
336
+ """
337
+ # clear cache
338
+ self.cache.clear()
339
+
340
+ # delete file
341
+ if self.storage_path.exists():
342
+ self.storage_path.unlink()
343
+
344
+ def rebuild_cache(self) -> None:
345
+ """Rebuild cache from storage.
346
+
347
+ Reloads all objects from storage into the cache. Useful if the storage
348
+ file was modified externally.
349
+
350
+ Examples
351
+ --------
352
+ >>> repo = Repository[MyModel](MyModel, Path("data.jsonl"), use_cache=True)
353
+ >>> repo.rebuild_cache()
354
+ """
355
+ if not self.storage_path.exists():
356
+ self.cache.clear()
357
+ else:
358
+ self._load_cache()
@@ -0,0 +1,249 @@
1
+ """JSONLines serialization utilities for bead package.
2
+
3
+ This module provides functions for reading, writing, streaming, and appending
4
+ Pydantic models to/from JSONLines format files. JSONLines is a convenient format
5
+ for storing multiple JSON objects, with one object per line.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from collections.abc import Iterator
11
+ from pathlib import Path
12
+ from typing import TYPE_CHECKING
13
+
14
+ from pydantic import BaseModel, ValidationError
15
+
16
+ if TYPE_CHECKING:
17
+ from collections.abc import Sequence
18
+
19
+
20
class SerializationError(Exception):
    """Signal that writing Pydantic objects to a JSONLines file failed.

    Raised by the JSONLines writers in this module when serialization or the
    underlying file I/O fails; the original exception is chained as the cause.
    """
28
+
29
+
30
class DeserializationError(Exception):
    """Signal that reading a JSONLines file into Pydantic objects failed.

    Raised by the JSONLines readers in this module when the file cannot be
    read or a line fails to parse into the requested model; the original
    exception is chained as the cause.
    """
38
+
39
+
40
+ def write_jsonlines[T: BaseModel](
41
+ objects: Sequence[T],
42
+ path: Path | str,
43
+ validate: bool = True,
44
+ append: bool = False,
45
+ ) -> None:
46
+ """Write Pydantic objects to JSONLines file.
47
+
48
+ Serializes a sequence of Pydantic model instances to a JSONLines file,
49
+ with one JSON object per line. Each object is validated before writing
50
+ if validate=True.
51
+
52
+ Parameters
53
+ ----------
54
+ objects
55
+ Sequence of Pydantic model instances to serialize.
56
+ path
57
+ Path to the output file.
58
+ validate
59
+ Whether to validate objects before writing (default: True).
60
+ append
61
+ Whether to append to existing file or overwrite (default: False).
62
+
63
+ Raises
64
+ ------
65
+ SerializationError
66
+ If writing fails due to I/O error or validation failure
67
+
68
+ Examples
69
+ --------
70
+ >>> from pathlib import Path
71
+ >>> from bead.data.base import BeadBaseModel
72
+ >>> class TestModel(BeadBaseModel):
73
+ ... name: str
74
+ >>> objects = [TestModel(name="test1"), TestModel(name="test2")]
75
+ >>> write_jsonlines(objects, Path("output.jsonl")) # doctest: +SKIP
76
+ """
77
+ path = Path(path)
78
+ mode = "a" if append else "w"
79
+
80
+ try:
81
+ with path.open(mode, encoding="utf-8") as f:
82
+ for obj in objects:
83
+ # model_dump_json() handles validation
84
+ json_str = obj.model_dump_json()
85
+ f.write(json_str + "\n")
86
+ except (OSError, ValidationError) as e:
87
+ raise SerializationError(f"Failed to write to {path}: {e}") from e
88
+
89
+
90
+ def read_jsonlines[T: BaseModel](
91
+ path: Path | str,
92
+ model_class: type[T],
93
+ validate: bool = True,
94
+ skip_errors: bool = False,
95
+ ) -> list[T]:
96
+ """Read JSONLines file into list of Pydantic objects.
97
+
98
+ Deserializes a JSONLines file into a list of Pydantic model instances.
99
+ Each line should contain a valid JSON object. Empty lines are skipped.
100
+
101
+ Parameters
102
+ ----------
103
+ path
104
+ Path to the input file.
105
+ model_class
106
+ Pydantic model class to deserialize into.
107
+ validate
108
+ Whether to validate objects during parsing (default: True).
109
+ skip_errors
110
+ Whether to skip invalid lines or raise error (default: False).
111
+
112
+ Returns
113
+ -------
114
+ list[T]
115
+ List of deserialized Pydantic objects
116
+
117
+ Raises
118
+ ------
119
+ DeserializationError
120
+ If reading fails due to file not found, invalid JSON, or validation failure
121
+ (unless skip_errors=True)
122
+
123
+ Examples
124
+ --------
125
+ >>> from pathlib import Path
126
+ >>> from bead.data.base import BeadBaseModel
127
+ >>> class TestModel(BeadBaseModel):
128
+ ... name: str
129
+ >>> objects = read_jsonlines(Path("input.jsonl"), TestModel) # doctest: +SKIP
130
+ """
131
+ path = Path(path)
132
+ objects: list[T] = []
133
+
134
+ try:
135
+ with path.open("r", encoding="utf-8") as f:
136
+ for line_num, line in enumerate(f, start=1):
137
+ line = line.strip()
138
+ if not line: # skip empty lines
139
+ continue
140
+
141
+ try:
142
+ obj = model_class.model_validate_json(line)
143
+ objects.append(obj)
144
+ except ValidationError as e:
145
+ if skip_errors:
146
+ continue
147
+ raise DeserializationError(
148
+ f"Failed to parse line {line_num} in {path}: {e}"
149
+ ) from e
150
+ except OSError as e:
151
+ raise DeserializationError(f"Failed to read from {path}: {e}") from e
152
+
153
+ return objects
154
+
155
+
156
+ def stream_jsonlines[T: BaseModel](
157
+ path: Path | str,
158
+ model_class: type[T],
159
+ validate: bool = True,
160
+ ) -> Iterator[T]:
161
+ """Stream JSONLines file as iterator of Pydantic objects.
162
+
163
+ Memory-efficient iterator that yields Pydantic model instances one at a time
164
+ from a JSONLines file. Useful for processing large files without loading
165
+ everything into memory.
166
+
167
+ Parameters
168
+ ----------
169
+ path
170
+ Path to the input file.
171
+ model_class
172
+ Pydantic model class to deserialize into.
173
+ validate
174
+ Whether to validate objects during parsing (default: True).
175
+
176
+ Yields
177
+ ------
178
+ T
179
+ Pydantic model instances one at a time.
180
+
181
+ Raises
182
+ ------
183
+ DeserializationError
184
+ If reading fails due to file not found, invalid JSON, or validation failure
185
+
186
+ Examples
187
+ --------
188
+ >>> from pathlib import Path
189
+ >>> from bead.data.base import BeadBaseModel
190
+ >>> class TestModel(BeadBaseModel):
191
+ ... name: str
192
+ >>> for obj in stream_jsonlines(Path("input.jsonl"), TestModel): # doctest: +SKIP
193
+ ... print(obj.name)
194
+ """
195
+ path = Path(path)
196
+
197
+ try:
198
+ with path.open("r", encoding="utf-8") as f:
199
+ for line_num, line in enumerate(f, start=1):
200
+ line = line.strip()
201
+ if not line: # skip empty lines
202
+ continue
203
+
204
+ try:
205
+ obj = model_class.model_validate_json(line)
206
+ yield obj
207
+ except ValidationError as e:
208
+ raise DeserializationError(
209
+ f"Failed to parse line {line_num} in {path}: {e}"
210
+ ) from e
211
+ except OSError as e:
212
+ raise DeserializationError(f"Failed to read from {path}: {e}") from e
213
+
214
+
215
+ def append_jsonlines[T: BaseModel](
216
+ objects: Sequence[T],
217
+ path: Path | str,
218
+ validate: bool = True,
219
+ ) -> None:
220
+ """Append Pydantic objects to existing JSONLines file.
221
+
222
+ Convenience wrapper around write_jsonlines with append=True. Adds objects
223
+ to the end of an existing JSONLines file, or creates a new file if it
224
+ doesn't exist.
225
+
226
+ Parameters
227
+ ----------
228
+ objects
229
+ Sequence of Pydantic model instances to serialize.
230
+ path
231
+ Path to the output file.
232
+ validate
233
+ Whether to validate objects before writing (default: True).
234
+
235
+ Raises
236
+ ------
237
+ SerializationError
238
+ If appending fails due to I/O error or validation failure
239
+
240
+ Examples
241
+ --------
242
+ >>> from pathlib import Path
243
+ >>> from bead.data.base import BeadBaseModel
244
+ >>> class TestModel(BeadBaseModel):
245
+ ... name: str
246
+ >>> objects = [TestModel(name="test3"), TestModel(name="test4")]
247
+ >>> append_jsonlines(objects, Path("output.jsonl")) # doctest: +SKIP
248
+ """
249
+ write_jsonlines(objects, path, validate=validate, append=True)
@@ -0,0 +1,89 @@
1
+ """ISO 8601 timestamp utilities for bead package.
2
+
3
+ This module provides functions for creating, parsing, and formatting ISO 8601
4
+ timestamps with timezone information. All timestamps use UTC timezone.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from datetime import UTC, datetime
10
+
11
+
12
+ def now_iso8601() -> datetime:
13
+ """Get current UTC datetime with timezone information.
14
+
15
+ Returns the current time in UTC with timezone info attached. This is
16
+ preferred over datetime.utcnow() which is deprecated and doesn't include
17
+ timezone information.
18
+
19
+ Returns
20
+ -------
21
+ datetime
22
+ Current UTC datetime with timezone information
23
+
24
+ Examples
25
+ --------
26
+ >>> dt = now_iso8601()
27
+ >>> dt.tzinfo is not None
28
+ True
29
+ >>> dt.tzinfo == UTC
30
+ True
31
+ """
32
+ return datetime.now(UTC)
33
+
34
+
35
+ def parse_iso8601(timestamp: str) -> datetime:
36
+ """Parse ISO 8601 timestamp string to datetime.
37
+
38
+ Parses an ISO 8601 formatted string into a datetime object. The string
39
+ should include timezone information.
40
+
41
+ Parameters
42
+ ----------
43
+ timestamp
44
+ ISO 8601 formatted timestamp string (e.g., "2025-10-17T14:23:45.123456+00:00").
45
+
46
+ Returns
47
+ -------
48
+ datetime
49
+ Parsed datetime with timezone information
50
+
51
+ Examples
52
+ --------
53
+ >>> dt_str = "2025-10-17T14:23:45.123456+00:00"
54
+ >>> dt = parse_iso8601(dt_str)
55
+ >>> dt.year
56
+ 2025
57
+ >>> dt.month
58
+ 10
59
+ """
60
+ return datetime.fromisoformat(timestamp)
61
+
62
+
63
+ def format_iso8601(dt: datetime) -> str:
64
+ """Format datetime as ISO 8601 string.
65
+
66
+ Converts a datetime object to an ISO 8601 formatted string. If the datetime
67
+ doesn't have timezone information, it will be assumed to be UTC.
68
+
69
+ Parameters
70
+ ----------
71
+ dt
72
+ Datetime to format.
73
+
74
+ Returns
75
+ -------
76
+ str
77
+ ISO 8601 formatted string
78
+
79
+ Examples
80
+ --------
81
+ >>> dt = now_iso8601()
82
+ >>> formatted = format_iso8601(dt)
83
+ >>> "+00:00" in formatted or "Z" in formatted
84
+ True
85
+ """
86
+ # if datetime is naive (no timezone), assume UTC
87
+ if dt.tzinfo is None:
88
+ dt = dt.replace(tzinfo=UTC)
89
+ return dt.isoformat()