bead-0.1.0-py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bead/__init__.py +11 -0
- bead/__main__.py +11 -0
- bead/active_learning/__init__.py +15 -0
- bead/active_learning/config.py +231 -0
- bead/active_learning/loop.py +566 -0
- bead/active_learning/models/__init__.py +24 -0
- bead/active_learning/models/base.py +852 -0
- bead/active_learning/models/binary.py +910 -0
- bead/active_learning/models/categorical.py +943 -0
- bead/active_learning/models/cloze.py +862 -0
- bead/active_learning/models/forced_choice.py +956 -0
- bead/active_learning/models/free_text.py +773 -0
- bead/active_learning/models/lora.py +365 -0
- bead/active_learning/models/magnitude.py +835 -0
- bead/active_learning/models/multi_select.py +795 -0
- bead/active_learning/models/ordinal_scale.py +811 -0
- bead/active_learning/models/peft_adapter.py +155 -0
- bead/active_learning/models/random_effects.py +639 -0
- bead/active_learning/selection.py +354 -0
- bead/active_learning/strategies.py +391 -0
- bead/active_learning/trainers/__init__.py +26 -0
- bead/active_learning/trainers/base.py +210 -0
- bead/active_learning/trainers/data_collator.py +172 -0
- bead/active_learning/trainers/dataset_utils.py +261 -0
- bead/active_learning/trainers/huggingface.py +304 -0
- bead/active_learning/trainers/lightning.py +324 -0
- bead/active_learning/trainers/metrics.py +424 -0
- bead/active_learning/trainers/mixed_effects.py +551 -0
- bead/active_learning/trainers/model_wrapper.py +509 -0
- bead/active_learning/trainers/registry.py +104 -0
- bead/adapters/__init__.py +11 -0
- bead/adapters/huggingface.py +61 -0
- bead/behavioral/__init__.py +116 -0
- bead/behavioral/analytics.py +646 -0
- bead/behavioral/extraction.py +343 -0
- bead/behavioral/merging.py +343 -0
- bead/cli/__init__.py +11 -0
- bead/cli/active_learning.py +513 -0
- bead/cli/active_learning_commands.py +779 -0
- bead/cli/completion.py +359 -0
- bead/cli/config.py +624 -0
- bead/cli/constraint_builders.py +286 -0
- bead/cli/deployment.py +859 -0
- bead/cli/deployment_trials.py +493 -0
- bead/cli/deployment_ui.py +332 -0
- bead/cli/display.py +378 -0
- bead/cli/items.py +960 -0
- bead/cli/items_factories.py +776 -0
- bead/cli/list_constraints.py +714 -0
- bead/cli/lists.py +490 -0
- bead/cli/main.py +430 -0
- bead/cli/models.py +877 -0
- bead/cli/resource_loaders.py +621 -0
- bead/cli/resources.py +1036 -0
- bead/cli/shell.py +356 -0
- bead/cli/simulate.py +840 -0
- bead/cli/templates.py +1158 -0
- bead/cli/training.py +1080 -0
- bead/cli/utils.py +614 -0
- bead/cli/workflow.py +1273 -0
- bead/config/__init__.py +68 -0
- bead/config/active_learning.py +1009 -0
- bead/config/config.py +192 -0
- bead/config/defaults.py +118 -0
- bead/config/deployment.py +217 -0
- bead/config/env.py +147 -0
- bead/config/item.py +45 -0
- bead/config/list.py +193 -0
- bead/config/loader.py +149 -0
- bead/config/logging.py +42 -0
- bead/config/model.py +49 -0
- bead/config/paths.py +46 -0
- bead/config/profiles.py +320 -0
- bead/config/resources.py +47 -0
- bead/config/serialization.py +210 -0
- bead/config/simulation.py +206 -0
- bead/config/template.py +238 -0
- bead/config/validation.py +267 -0
- bead/data/__init__.py +65 -0
- bead/data/base.py +87 -0
- bead/data/identifiers.py +97 -0
- bead/data/language_codes.py +61 -0
- bead/data/metadata.py +270 -0
- bead/data/range.py +123 -0
- bead/data/repository.py +358 -0
- bead/data/serialization.py +249 -0
- bead/data/timestamps.py +89 -0
- bead/data/validation.py +349 -0
- bead/data_collection/__init__.py +11 -0
- bead/data_collection/jatos.py +223 -0
- bead/data_collection/merger.py +154 -0
- bead/data_collection/prolific.py +198 -0
- bead/deployment/__init__.py +5 -0
- bead/deployment/distribution.py +402 -0
- bead/deployment/jatos/__init__.py +1 -0
- bead/deployment/jatos/api.py +200 -0
- bead/deployment/jatos/exporter.py +210 -0
- bead/deployment/jspsych/__init__.py +9 -0
- bead/deployment/jspsych/biome.json +44 -0
- bead/deployment/jspsych/config.py +411 -0
- bead/deployment/jspsych/generator.py +598 -0
- bead/deployment/jspsych/package.json +51 -0
- bead/deployment/jspsych/pnpm-lock.yaml +2141 -0
- bead/deployment/jspsych/randomizer.py +299 -0
- bead/deployment/jspsych/src/lib/list-distributor.test.ts +327 -0
- bead/deployment/jspsych/src/lib/list-distributor.ts +1282 -0
- bead/deployment/jspsych/src/lib/randomizer.test.ts +232 -0
- bead/deployment/jspsych/src/lib/randomizer.ts +367 -0
- bead/deployment/jspsych/src/plugins/cloze-dropdown.ts +252 -0
- bead/deployment/jspsych/src/plugins/forced-choice.ts +265 -0
- bead/deployment/jspsych/src/plugins/plugins.test.ts +141 -0
- bead/deployment/jspsych/src/plugins/rating.ts +248 -0
- bead/deployment/jspsych/src/slopit/index.ts +9 -0
- bead/deployment/jspsych/src/types/jatos.d.ts +256 -0
- bead/deployment/jspsych/src/types/jspsych.d.ts +228 -0
- bead/deployment/jspsych/templates/experiment.css +1 -0
- bead/deployment/jspsych/templates/experiment.js.template +289 -0
- bead/deployment/jspsych/templates/index.html +51 -0
- bead/deployment/jspsych/templates/randomizer.js +241 -0
- bead/deployment/jspsych/templates/randomizer.js.template +313 -0
- bead/deployment/jspsych/trials.py +723 -0
- bead/deployment/jspsych/tsconfig.json +23 -0
- bead/deployment/jspsych/tsup.config.ts +30 -0
- bead/deployment/jspsych/ui/__init__.py +1 -0
- bead/deployment/jspsych/ui/components.py +383 -0
- bead/deployment/jspsych/ui/styles.py +411 -0
- bead/dsl/__init__.py +80 -0
- bead/dsl/ast.py +168 -0
- bead/dsl/context.py +178 -0
- bead/dsl/errors.py +71 -0
- bead/dsl/evaluator.py +570 -0
- bead/dsl/grammar.lark +81 -0
- bead/dsl/parser.py +231 -0
- bead/dsl/stdlib.py +929 -0
- bead/evaluation/__init__.py +13 -0
- bead/evaluation/convergence.py +485 -0
- bead/evaluation/interannotator.py +398 -0
- bead/items/__init__.py +40 -0
- bead/items/adapters/__init__.py +70 -0
- bead/items/adapters/anthropic.py +224 -0
- bead/items/adapters/api_utils.py +167 -0
- bead/items/adapters/base.py +216 -0
- bead/items/adapters/google.py +259 -0
- bead/items/adapters/huggingface.py +1074 -0
- bead/items/adapters/openai.py +323 -0
- bead/items/adapters/registry.py +202 -0
- bead/items/adapters/sentence_transformers.py +224 -0
- bead/items/adapters/togetherai.py +309 -0
- bead/items/binary.py +515 -0
- bead/items/cache.py +558 -0
- bead/items/categorical.py +593 -0
- bead/items/cloze.py +757 -0
- bead/items/constructor.py +784 -0
- bead/items/forced_choice.py +413 -0
- bead/items/free_text.py +681 -0
- bead/items/generation.py +432 -0
- bead/items/item.py +396 -0
- bead/items/item_template.py +787 -0
- bead/items/magnitude.py +573 -0
- bead/items/multi_select.py +621 -0
- bead/items/ordinal_scale.py +569 -0
- bead/items/scoring.py +448 -0
- bead/items/validation.py +723 -0
- bead/lists/__init__.py +30 -0
- bead/lists/balancer.py +263 -0
- bead/lists/constraints.py +1067 -0
- bead/lists/experiment_list.py +286 -0
- bead/lists/list_collection.py +378 -0
- bead/lists/partitioner.py +1141 -0
- bead/lists/stratification.py +254 -0
- bead/participants/__init__.py +73 -0
- bead/participants/collection.py +699 -0
- bead/participants/merging.py +312 -0
- bead/participants/metadata_spec.py +491 -0
- bead/participants/models.py +276 -0
- bead/resources/__init__.py +29 -0
- bead/resources/adapters/__init__.py +19 -0
- bead/resources/adapters/base.py +104 -0
- bead/resources/adapters/cache.py +128 -0
- bead/resources/adapters/glazing.py +508 -0
- bead/resources/adapters/registry.py +117 -0
- bead/resources/adapters/unimorph.py +796 -0
- bead/resources/classification.py +856 -0
- bead/resources/constraint_builders.py +329 -0
- bead/resources/constraints.py +165 -0
- bead/resources/lexical_item.py +223 -0
- bead/resources/lexicon.py +744 -0
- bead/resources/loaders.py +209 -0
- bead/resources/template.py +441 -0
- bead/resources/template_collection.py +707 -0
- bead/resources/template_generation.py +349 -0
- bead/simulation/__init__.py +29 -0
- bead/simulation/annotators/__init__.py +15 -0
- bead/simulation/annotators/base.py +175 -0
- bead/simulation/annotators/distance_based.py +135 -0
- bead/simulation/annotators/lm_based.py +114 -0
- bead/simulation/annotators/oracle.py +182 -0
- bead/simulation/annotators/random.py +181 -0
- bead/simulation/dsl_extension/__init__.py +3 -0
- bead/simulation/noise_models/__init__.py +13 -0
- bead/simulation/noise_models/base.py +42 -0
- bead/simulation/noise_models/random_noise.py +82 -0
- bead/simulation/noise_models/systematic.py +132 -0
- bead/simulation/noise_models/temperature.py +86 -0
- bead/simulation/runner.py +144 -0
- bead/simulation/strategies/__init__.py +23 -0
- bead/simulation/strategies/base.py +123 -0
- bead/simulation/strategies/binary.py +103 -0
- bead/simulation/strategies/categorical.py +123 -0
- bead/simulation/strategies/cloze.py +224 -0
- bead/simulation/strategies/forced_choice.py +127 -0
- bead/simulation/strategies/free_text.py +105 -0
- bead/simulation/strategies/magnitude.py +116 -0
- bead/simulation/strategies/multi_select.py +129 -0
- bead/simulation/strategies/ordinal_scale.py +131 -0
- bead/templates/__init__.py +27 -0
- bead/templates/adapters/__init__.py +17 -0
- bead/templates/adapters/base.py +128 -0
- bead/templates/adapters/cache.py +178 -0
- bead/templates/adapters/huggingface.py +312 -0
- bead/templates/combinatorics.py +103 -0
- bead/templates/filler.py +605 -0
- bead/templates/renderers.py +177 -0
- bead/templates/resolver.py +178 -0
- bead/templates/strategies.py +1806 -0
- bead/templates/streaming.py +195 -0
- bead-0.1.0.dist-info/METADATA +212 -0
- bead-0.1.0.dist-info/RECORD +231 -0
- bead-0.1.0.dist-info/WHEEL +4 -0
- bead-0.1.0.dist-info/entry_points.txt +2 -0
- bead-0.1.0.dist-info/licenses/LICENSE +21 -0
bead/data/repository.py
ADDED
@@ -0,0 +1,358 @@
"""Repository pattern for data access with optional caching.

This module provides a generic Repository class that implements CRUD operations
for Pydantic models, with optional in-memory caching for efficient access.
"""

from __future__ import annotations

from pathlib import Path
from uuid import UUID

from pydantic import BaseModel

from bead.data.serialization import (
    append_jsonlines,
    read_jsonlines,
    write_jsonlines,
)


class Repository[T: BaseModel]:
    """Generic repository for CRUD operations on Pydantic models.

    Provides create, read, update, delete operations with JSONLines file storage
    and optional in-memory caching for efficient data access.

    Type Parameters
    ---------------
    T : BaseModel
        Pydantic model type this repository manages

    Parameters
    ----------
    model_class : type[T]
        The Pydantic model class this repository manages
    storage_path : Path
        Path to the JSONLines file for persistent storage
    use_cache : bool, optional
        Whether to use in-memory caching (default: True)

    Attributes
    ----------
    model_class : type[T]
        The Pydantic model class
    storage_path : Path
        Path to storage file
    use_cache : bool
        Whether caching is enabled
    cache : dict[UUID, T]
        In-memory cache of objects by ID

    Examples
    --------
    >>> from pathlib import Path
    >>> from bead.data.base import BeadBaseModel
    >>> class MyModel(BeadBaseModel):
    ...     name: str
    >>> repo = Repository[MyModel](
    ...     model_class=MyModel,
    ...     storage_path=Path("data/models.jsonl"),
    ...     use_cache=True
    ... )
    >>> obj = MyModel(name="test")
    >>> repo.add(obj)
    >>> loaded = repo.get(obj.id)
    >>> loaded.name
    'test'
    >>> repo.count()
    1
    """

    def __init__(
        self, model_class: type[T], storage_path: Path, use_cache: bool = True
    ) -> None:
        self.model_class = model_class
        self.storage_path = storage_path
        self.use_cache = use_cache
        self.cache: dict[UUID, T] = {}

        # load cache on init if enabled and file exists
        if self.use_cache and self.storage_path.exists():
            self._load_cache()

    def _load_cache(self) -> None:
        """Load all objects from storage into cache.

        Called during initialization if caching is enabled and the storage
        file exists.
        """
        objects = read_jsonlines(self.storage_path, self.model_class)
        self.cache = {obj.id: obj for obj in objects}  # type: ignore[attr-defined]

    def get(self, object_id: UUID) -> T | None:
        """Get object by ID.

        Parameters
        ----------
        object_id
            ID of the object to retrieve.

        Returns
        -------
        T | None
            The object if found, None otherwise.

        Examples
        --------
        >>> repo = Repository[MyModel](MyModel, Path("data.jsonl"))
        >>> obj = MyModel(name="test")
        >>> repo.add(obj)
        >>> loaded = repo.get(obj.id)
        >>> loaded is not None
        True
        """
        if self.use_cache:
            return self.cache.get(object_id)
        else:
            # scan file for object
            if not self.storage_path.exists():
                return None
            objects = read_jsonlines(self.storage_path, self.model_class)
            for obj in objects:
                if obj.id == object_id:  # type: ignore[attr-defined]
                    return obj
            return None

    def get_all(self) -> list[T]:
        """Get all objects.

        Returns
        -------
        list[T]
            List of all objects in the repository

        Examples
        --------
        >>> repo = Repository[MyModel](MyModel, Path("data.jsonl"))
        >>> repo.add(MyModel(name="test1"))
        >>> repo.add(MyModel(name="test2"))
        >>> len(repo.get_all())
        2
        """
        if self.use_cache:
            return list(self.cache.values())
        else:
            if not self.storage_path.exists():
                return []
            return read_jsonlines(self.storage_path, self.model_class)

    def add(self, obj: T) -> None:
        """Add single object to repository.

        Appends the object to the storage file and updates cache if enabled.

        Parameters
        ----------
        obj
            Object to add.

        Examples
        --------
        >>> repo = Repository[MyModel](MyModel, Path("data.jsonl"))
        >>> obj = MyModel(name="test")
        >>> repo.add(obj)
        >>> repo.exists(obj.id)
        True
        """
        # create parent directories if needed
        self.storage_path.parent.mkdir(parents=True, exist_ok=True)

        # append to file
        append_jsonlines([obj], self.storage_path)

        # update cache
        if self.use_cache:
            self.cache[obj.id] = obj  # type: ignore[attr-defined]

    def add_many(self, objects: list[T]) -> None:
        """Add multiple objects to repository.

        Appends all objects to the storage file and updates cache if enabled.

        Parameters
        ----------
        objects
            List of objects to add.

        Examples
        --------
        >>> repo = Repository[MyModel](MyModel, Path("data.jsonl"))
        >>> objs = [MyModel(name="test1"), MyModel(name="test2")]
        >>> repo.add_many(objs)
        >>> repo.count()
        2
        """
        if not objects:
            return

        # create parent directories if needed
        self.storage_path.parent.mkdir(parents=True, exist_ok=True)

        # append to file
        append_jsonlines(objects, self.storage_path)

        # update cache
        if self.use_cache:
            for obj in objects:
                self.cache[obj.id] = obj  # type: ignore[attr-defined]

    def update(self, obj: T) -> None:
        """Update existing object.

        Rewrites the entire storage file with the updated object.

        Parameters
        ----------
        obj
            Object to update (must have existing ID).

        Examples
        --------
        >>> repo = Repository[MyModel](MyModel, Path("data.jsonl"))
        >>> obj = MyModel(name="test")
        >>> repo.add(obj)
        >>> obj.name = "updated"
        >>> repo.update(obj)
        >>> loaded = repo.get(obj.id)
        >>> loaded.name
        'updated'
        """
        # update in cache
        if self.use_cache:
            self.cache[obj.id] = obj  # type: ignore[attr-defined]

        # rewrite file
        objects = list(self.cache.values()) if self.use_cache else self.get_all()
        # replace the object in the list
        objects = [o if o.id != obj.id else obj for o in objects]  # type: ignore[attr-defined]

        # create parent directories if needed
        self.storage_path.parent.mkdir(parents=True, exist_ok=True)

        write_jsonlines(objects, self.storage_path)

    def delete(self, object_id: UUID) -> None:
        """Delete object by ID.

        Rewrites the entire storage file without the deleted object.

        Parameters
        ----------
        object_id
            ID of object to delete.

        Examples
        --------
        >>> repo = Repository[MyModel](MyModel, Path("data.jsonl"))
        >>> obj = MyModel(name="test")
        >>> repo.add(obj)
        >>> repo.delete(obj.id)
        >>> repo.exists(obj.id)
        False
        """
        # remove from cache
        if self.use_cache:
            self.cache.pop(object_id, None)

        # rewrite file without the object
        objects = list(self.cache.values()) if self.use_cache else self.get_all()
        objects = [o for o in objects if o.id != object_id]  # type: ignore[attr-defined]

        if objects:
            self.storage_path.parent.mkdir(parents=True, exist_ok=True)
            write_jsonlines(objects, self.storage_path)
        elif self.storage_path.exists():
            # if no objects left, delete the file
            self.storage_path.unlink()

    def exists(self, object_id: UUID) -> bool:
        """Check if object exists.

        Parameters
        ----------
        object_id
            ID of object to check.

        Returns
        -------
        bool
            True if object exists, False otherwise.

        Examples
        --------
        >>> repo = Repository[MyModel](MyModel, Path("data.jsonl"))
        >>> obj = MyModel(name="test")
        >>> repo.add(obj)
        >>> repo.exists(obj.id)
        True
        """
        return self.get(object_id) is not None

    def count(self) -> int:
        """Count objects in repository.

        Returns
        -------
        int
            Number of objects

        Examples
        --------
        >>> repo = Repository[MyModel](MyModel, Path("data.jsonl"))
        >>> repo.count()
        0
        >>> repo.add(MyModel(name="test"))
        >>> repo.count()
        1
        """
        if self.use_cache:
            return len(self.cache)
        else:
            if not self.storage_path.exists():
                return 0
            return len(read_jsonlines(self.storage_path, self.model_class))

    def clear(self) -> None:
        """Clear all objects and delete storage file.

        Examples
        --------
        >>> repo = Repository[MyModel](MyModel, Path("data.jsonl"))
        >>> repo.add(MyModel(name="test"))
        >>> repo.clear()
        >>> repo.count()
        0
        """
        # clear cache
        self.cache.clear()

        # delete file
        if self.storage_path.exists():
            self.storage_path.unlink()

    def rebuild_cache(self) -> None:
        """Rebuild cache from storage.

        Reloads all objects from storage into the cache. Useful if the storage
        file was modified externally.

        Examples
        --------
        >>> repo = Repository[MyModel](MyModel, Path("data.jsonl"), use_cache=True)
        >>> repo.rebuild_cache()
        """
        if not self.storage_path.exists():
            self.cache.clear()
        else:
            self._load_cache()
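The repository keys every record on the model's `id` field and rewrites the entire JSONLines file on `update` and `delete`, so it suits modest collection sizes. The following is an illustrative sketch, not part of the diff: it assumes the package is installed, that `bead.data.base.BeadBaseModel` is a Pydantic base class supplying the `id` field (as the docstrings above suggest), and that `Participant` is a hypothetical model defined only for this example.

# Illustrative sketch only -- not part of the diff above.
from pathlib import Path

from bead.data.base import BeadBaseModel  # assumed Pydantic base with an `id` field
from bead.data.repository import Repository


class Participant(BeadBaseModel):  # hypothetical example model
    name: str


repo = Repository[Participant](
    model_class=Participant,
    storage_path=Path("data/participants.jsonl"),
    use_cache=True,
)

p = Participant(name="p01")
repo.add(p)                      # appends one JSONLines record
p.name = "p01-revised"
repo.update(p)                   # rewrites the whole file with the change
assert repo.get(p.id).name == "p01-revised"
repo.delete(p.id)                # file is unlinked once the last record is gone
assert repo.count() == 0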
bead/data/serialization.py
ADDED
@@ -0,0 +1,249 @@
"""JSONLines serialization utilities for bead package.

This module provides functions for reading, writing, streaming, and appending
Pydantic models to/from JSONLines format files. JSONLines is a convenient format
for storing multiple JSON objects, with one object per line.
"""

from __future__ import annotations

from collections.abc import Iterator
from pathlib import Path
from typing import TYPE_CHECKING

from pydantic import BaseModel, ValidationError

if TYPE_CHECKING:
    from collections.abc import Sequence


class SerializationError(Exception):
    """Exception raised when serialization to JSONLines fails.

    This exception is raised when writing Pydantic objects to JSONLines
    format encounters an error, such as file I/O issues or validation failures.
    """

    pass


class DeserializationError(Exception):
    """Exception raised when deserialization from JSONLines fails.

    This exception is raised when reading JSONLines format into Pydantic objects
    encounters an error, such as file not found, invalid JSON, or validation failures.
    """

    pass


def write_jsonlines[T: BaseModel](
    objects: Sequence[T],
    path: Path | str,
    validate: bool = True,
    append: bool = False,
) -> None:
    """Write Pydantic objects to JSONLines file.

    Serializes a sequence of Pydantic model instances to a JSONLines file,
    with one JSON object per line. Each object is validated before writing
    if validate=True.

    Parameters
    ----------
    objects
        Sequence of Pydantic model instances to serialize.
    path
        Path to the output file.
    validate
        Whether to validate objects before writing (default: True).
    append
        Whether to append to existing file or overwrite (default: False).

    Raises
    ------
    SerializationError
        If writing fails due to I/O error or validation failure

    Examples
    --------
    >>> from pathlib import Path
    >>> from bead.data.base import BeadBaseModel
    >>> class TestModel(BeadBaseModel):
    ...     name: str
    >>> objects = [TestModel(name="test1"), TestModel(name="test2")]
    >>> write_jsonlines(objects, Path("output.jsonl"))  # doctest: +SKIP
    """
    path = Path(path)
    mode = "a" if append else "w"

    try:
        with path.open(mode, encoding="utf-8") as f:
            for obj in objects:
                # model_dump_json() handles validation
                json_str = obj.model_dump_json()
                f.write(json_str + "\n")
    except (OSError, ValidationError) as e:
        raise SerializationError(f"Failed to write to {path}: {e}") from e


def read_jsonlines[T: BaseModel](
    path: Path | str,
    model_class: type[T],
    validate: bool = True,
    skip_errors: bool = False,
) -> list[T]:
    """Read JSONLines file into list of Pydantic objects.

    Deserializes a JSONLines file into a list of Pydantic model instances.
    Each line should contain a valid JSON object. Empty lines are skipped.

    Parameters
    ----------
    path
        Path to the input file.
    model_class
        Pydantic model class to deserialize into.
    validate
        Whether to validate objects during parsing (default: True).
    skip_errors
        Whether to skip invalid lines or raise error (default: False).

    Returns
    -------
    list[T]
        List of deserialized Pydantic objects

    Raises
    ------
    DeserializationError
        If reading fails due to file not found, invalid JSON, or validation failure
        (unless skip_errors=True)

    Examples
    --------
    >>> from pathlib import Path
    >>> from bead.data.base import BeadBaseModel
    >>> class TestModel(BeadBaseModel):
    ...     name: str
    >>> objects = read_jsonlines(Path("input.jsonl"), TestModel)  # doctest: +SKIP
    """
    path = Path(path)
    objects: list[T] = []

    try:
        with path.open("r", encoding="utf-8") as f:
            for line_num, line in enumerate(f, start=1):
                line = line.strip()
                if not line:  # skip empty lines
                    continue

                try:
                    obj = model_class.model_validate_json(line)
                    objects.append(obj)
                except ValidationError as e:
                    if skip_errors:
                        continue
                    raise DeserializationError(
                        f"Failed to parse line {line_num} in {path}: {e}"
                    ) from e
    except OSError as e:
        raise DeserializationError(f"Failed to read from {path}: {e}") from e

    return objects


def stream_jsonlines[T: BaseModel](
    path: Path | str,
    model_class: type[T],
    validate: bool = True,
) -> Iterator[T]:
    """Stream JSONLines file as iterator of Pydantic objects.

    Memory-efficient iterator that yields Pydantic model instances one at a time
    from a JSONLines file. Useful for processing large files without loading
    everything into memory.

    Parameters
    ----------
    path
        Path to the input file.
    model_class
        Pydantic model class to deserialize into.
    validate
        Whether to validate objects during parsing (default: True).

    Yields
    ------
    T
        Pydantic model instances one at a time.

    Raises
    ------
    DeserializationError
        If reading fails due to file not found, invalid JSON, or validation failure

    Examples
    --------
    >>> from pathlib import Path
    >>> from bead.data.base import BeadBaseModel
    >>> class TestModel(BeadBaseModel):
    ...     name: str
    >>> for obj in stream_jsonlines(Path("input.jsonl"), TestModel):  # doctest: +SKIP
    ...     print(obj.name)
    """
    path = Path(path)

    try:
        with path.open("r", encoding="utf-8") as f:
            for line_num, line in enumerate(f, start=1):
                line = line.strip()
                if not line:  # skip empty lines
                    continue

                try:
                    obj = model_class.model_validate_json(line)
                    yield obj
                except ValidationError as e:
                    raise DeserializationError(
                        f"Failed to parse line {line_num} in {path}: {e}"
                    ) from e
    except OSError as e:
        raise DeserializationError(f"Failed to read from {path}: {e}") from e


def append_jsonlines[T: BaseModel](
    objects: Sequence[T],
    path: Path | str,
    validate: bool = True,
) -> None:
    """Append Pydantic objects to existing JSONLines file.

    Convenience wrapper around write_jsonlines with append=True. Adds objects
    to the end of an existing JSONLines file, or creates a new file if it
    doesn't exist.

    Parameters
    ----------
    objects
        Sequence of Pydantic model instances to serialize.
    path
        Path to the output file.
    validate
        Whether to validate objects before writing (default: True).

    Raises
    ------
    SerializationError
        If appending fails due to I/O error or validation failure

    Examples
    --------
    >>> from pathlib import Path
    >>> from bead.data.base import BeadBaseModel
    >>> class TestModel(BeadBaseModel):
    ...     name: str
    >>> objects = [TestModel(name="test3"), TestModel(name="test4")]
    >>> append_jsonlines(objects, Path("output.jsonl"))  # doctest: +SKIP
    """
    write_jsonlines(objects, path, validate=validate, append=True)
bead/data/timestamps.py
ADDED
@@ -0,0 +1,89 @@
"""ISO 8601 timestamp utilities for bead package.

This module provides functions for creating, parsing, and formatting ISO 8601
timestamps with timezone information. All timestamps use UTC timezone.
"""

from __future__ import annotations

from datetime import UTC, datetime


def now_iso8601() -> datetime:
    """Get current UTC datetime with timezone information.

    Returns the current time in UTC with timezone info attached. This is
    preferred over datetime.utcnow() which is deprecated and doesn't include
    timezone information.

    Returns
    -------
    datetime
        Current UTC datetime with timezone information

    Examples
    --------
    >>> dt = now_iso8601()
    >>> dt.tzinfo is not None
    True
    >>> dt.tzinfo == UTC
    True
    """
    return datetime.now(UTC)


def parse_iso8601(timestamp: str) -> datetime:
    """Parse ISO 8601 timestamp string to datetime.

    Parses an ISO 8601 formatted string into a datetime object. The string
    should include timezone information.

    Parameters
    ----------
    timestamp
        ISO 8601 formatted timestamp string (e.g., "2025-10-17T14:23:45.123456+00:00").

    Returns
    -------
    datetime
        Parsed datetime with timezone information

    Examples
    --------
    >>> dt_str = "2025-10-17T14:23:45.123456+00:00"
    >>> dt = parse_iso8601(dt_str)
    >>> dt.year
    2025
    >>> dt.month
    10
    """
    return datetime.fromisoformat(timestamp)


def format_iso8601(dt: datetime) -> str:
    """Format datetime as ISO 8601 string.

    Converts a datetime object to an ISO 8601 formatted string. If the datetime
    doesn't have timezone information, it will be assumed to be UTC.

    Parameters
    ----------
    dt
        Datetime to format.

    Returns
    -------
    str
        ISO 8601 formatted string

    Examples
    --------
    >>> dt = now_iso8601()
    >>> formatted = format_iso8601(dt)
    >>> "+00:00" in formatted or "Z" in formatted
    True
    """
    # if datetime is naive (no timezone), assume UTC
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=UTC)
    return dt.isoformat()