bead 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bead/__init__.py +11 -0
- bead/__main__.py +11 -0
- bead/active_learning/__init__.py +15 -0
- bead/active_learning/config.py +231 -0
- bead/active_learning/loop.py +566 -0
- bead/active_learning/models/__init__.py +24 -0
- bead/active_learning/models/base.py +852 -0
- bead/active_learning/models/binary.py +910 -0
- bead/active_learning/models/categorical.py +943 -0
- bead/active_learning/models/cloze.py +862 -0
- bead/active_learning/models/forced_choice.py +956 -0
- bead/active_learning/models/free_text.py +773 -0
- bead/active_learning/models/lora.py +365 -0
- bead/active_learning/models/magnitude.py +835 -0
- bead/active_learning/models/multi_select.py +795 -0
- bead/active_learning/models/ordinal_scale.py +811 -0
- bead/active_learning/models/peft_adapter.py +155 -0
- bead/active_learning/models/random_effects.py +639 -0
- bead/active_learning/selection.py +354 -0
- bead/active_learning/strategies.py +391 -0
- bead/active_learning/trainers/__init__.py +26 -0
- bead/active_learning/trainers/base.py +210 -0
- bead/active_learning/trainers/data_collator.py +172 -0
- bead/active_learning/trainers/dataset_utils.py +261 -0
- bead/active_learning/trainers/huggingface.py +304 -0
- bead/active_learning/trainers/lightning.py +324 -0
- bead/active_learning/trainers/metrics.py +424 -0
- bead/active_learning/trainers/mixed_effects.py +551 -0
- bead/active_learning/trainers/model_wrapper.py +509 -0
- bead/active_learning/trainers/registry.py +104 -0
- bead/adapters/__init__.py +11 -0
- bead/adapters/huggingface.py +61 -0
- bead/behavioral/__init__.py +116 -0
- bead/behavioral/analytics.py +646 -0
- bead/behavioral/extraction.py +343 -0
- bead/behavioral/merging.py +343 -0
- bead/cli/__init__.py +11 -0
- bead/cli/active_learning.py +513 -0
- bead/cli/active_learning_commands.py +779 -0
- bead/cli/completion.py +359 -0
- bead/cli/config.py +624 -0
- bead/cli/constraint_builders.py +286 -0
- bead/cli/deployment.py +859 -0
- bead/cli/deployment_trials.py +493 -0
- bead/cli/deployment_ui.py +332 -0
- bead/cli/display.py +378 -0
- bead/cli/items.py +960 -0
- bead/cli/items_factories.py +776 -0
- bead/cli/list_constraints.py +714 -0
- bead/cli/lists.py +490 -0
- bead/cli/main.py +430 -0
- bead/cli/models.py +877 -0
- bead/cli/resource_loaders.py +621 -0
- bead/cli/resources.py +1036 -0
- bead/cli/shell.py +356 -0
- bead/cli/simulate.py +840 -0
- bead/cli/templates.py +1158 -0
- bead/cli/training.py +1080 -0
- bead/cli/utils.py +614 -0
- bead/cli/workflow.py +1273 -0
- bead/config/__init__.py +68 -0
- bead/config/active_learning.py +1009 -0
- bead/config/config.py +192 -0
- bead/config/defaults.py +118 -0
- bead/config/deployment.py +217 -0
- bead/config/env.py +147 -0
- bead/config/item.py +45 -0
- bead/config/list.py +193 -0
- bead/config/loader.py +149 -0
- bead/config/logging.py +42 -0
- bead/config/model.py +49 -0
- bead/config/paths.py +46 -0
- bead/config/profiles.py +320 -0
- bead/config/resources.py +47 -0
- bead/config/serialization.py +210 -0
- bead/config/simulation.py +206 -0
- bead/config/template.py +238 -0
- bead/config/validation.py +267 -0
- bead/data/__init__.py +65 -0
- bead/data/base.py +87 -0
- bead/data/identifiers.py +97 -0
- bead/data/language_codes.py +61 -0
- bead/data/metadata.py +270 -0
- bead/data/range.py +123 -0
- bead/data/repository.py +358 -0
- bead/data/serialization.py +249 -0
- bead/data/timestamps.py +89 -0
- bead/data/validation.py +349 -0
- bead/data_collection/__init__.py +11 -0
- bead/data_collection/jatos.py +223 -0
- bead/data_collection/merger.py +154 -0
- bead/data_collection/prolific.py +198 -0
- bead/deployment/__init__.py +5 -0
- bead/deployment/distribution.py +402 -0
- bead/deployment/jatos/__init__.py +1 -0
- bead/deployment/jatos/api.py +200 -0
- bead/deployment/jatos/exporter.py +210 -0
- bead/deployment/jspsych/__init__.py +9 -0
- bead/deployment/jspsych/biome.json +44 -0
- bead/deployment/jspsych/config.py +411 -0
- bead/deployment/jspsych/generator.py +598 -0
- bead/deployment/jspsych/package.json +51 -0
- bead/deployment/jspsych/pnpm-lock.yaml +2141 -0
- bead/deployment/jspsych/randomizer.py +299 -0
- bead/deployment/jspsych/src/lib/list-distributor.test.ts +327 -0
- bead/deployment/jspsych/src/lib/list-distributor.ts +1282 -0
- bead/deployment/jspsych/src/lib/randomizer.test.ts +232 -0
- bead/deployment/jspsych/src/lib/randomizer.ts +367 -0
- bead/deployment/jspsych/src/plugins/cloze-dropdown.ts +252 -0
- bead/deployment/jspsych/src/plugins/forced-choice.ts +265 -0
- bead/deployment/jspsych/src/plugins/plugins.test.ts +141 -0
- bead/deployment/jspsych/src/plugins/rating.ts +248 -0
- bead/deployment/jspsych/src/slopit/index.ts +9 -0
- bead/deployment/jspsych/src/types/jatos.d.ts +256 -0
- bead/deployment/jspsych/src/types/jspsych.d.ts +228 -0
- bead/deployment/jspsych/templates/experiment.css +1 -0
- bead/deployment/jspsych/templates/experiment.js.template +289 -0
- bead/deployment/jspsych/templates/index.html +51 -0
- bead/deployment/jspsych/templates/randomizer.js +241 -0
- bead/deployment/jspsych/templates/randomizer.js.template +313 -0
- bead/deployment/jspsych/trials.py +723 -0
- bead/deployment/jspsych/tsconfig.json +23 -0
- bead/deployment/jspsych/tsup.config.ts +30 -0
- bead/deployment/jspsych/ui/__init__.py +1 -0
- bead/deployment/jspsych/ui/components.py +383 -0
- bead/deployment/jspsych/ui/styles.py +411 -0
- bead/dsl/__init__.py +80 -0
- bead/dsl/ast.py +168 -0
- bead/dsl/context.py +178 -0
- bead/dsl/errors.py +71 -0
- bead/dsl/evaluator.py +570 -0
- bead/dsl/grammar.lark +81 -0
- bead/dsl/parser.py +231 -0
- bead/dsl/stdlib.py +929 -0
- bead/evaluation/__init__.py +13 -0
- bead/evaluation/convergence.py +485 -0
- bead/evaluation/interannotator.py +398 -0
- bead/items/__init__.py +40 -0
- bead/items/adapters/__init__.py +70 -0
- bead/items/adapters/anthropic.py +224 -0
- bead/items/adapters/api_utils.py +167 -0
- bead/items/adapters/base.py +216 -0
- bead/items/adapters/google.py +259 -0
- bead/items/adapters/huggingface.py +1074 -0
- bead/items/adapters/openai.py +323 -0
- bead/items/adapters/registry.py +202 -0
- bead/items/adapters/sentence_transformers.py +224 -0
- bead/items/adapters/togetherai.py +309 -0
- bead/items/binary.py +515 -0
- bead/items/cache.py +558 -0
- bead/items/categorical.py +593 -0
- bead/items/cloze.py +757 -0
- bead/items/constructor.py +784 -0
- bead/items/forced_choice.py +413 -0
- bead/items/free_text.py +681 -0
- bead/items/generation.py +432 -0
- bead/items/item.py +396 -0
- bead/items/item_template.py +787 -0
- bead/items/magnitude.py +573 -0
- bead/items/multi_select.py +621 -0
- bead/items/ordinal_scale.py +569 -0
- bead/items/scoring.py +448 -0
- bead/items/validation.py +723 -0
- bead/lists/__init__.py +30 -0
- bead/lists/balancer.py +263 -0
- bead/lists/constraints.py +1067 -0
- bead/lists/experiment_list.py +286 -0
- bead/lists/list_collection.py +378 -0
- bead/lists/partitioner.py +1141 -0
- bead/lists/stratification.py +254 -0
- bead/participants/__init__.py +73 -0
- bead/participants/collection.py +699 -0
- bead/participants/merging.py +312 -0
- bead/participants/metadata_spec.py +491 -0
- bead/participants/models.py +276 -0
- bead/resources/__init__.py +29 -0
- bead/resources/adapters/__init__.py +19 -0
- bead/resources/adapters/base.py +104 -0
- bead/resources/adapters/cache.py +128 -0
- bead/resources/adapters/glazing.py +508 -0
- bead/resources/adapters/registry.py +117 -0
- bead/resources/adapters/unimorph.py +796 -0
- bead/resources/classification.py +856 -0
- bead/resources/constraint_builders.py +329 -0
- bead/resources/constraints.py +165 -0
- bead/resources/lexical_item.py +223 -0
- bead/resources/lexicon.py +744 -0
- bead/resources/loaders.py +209 -0
- bead/resources/template.py +441 -0
- bead/resources/template_collection.py +707 -0
- bead/resources/template_generation.py +349 -0
- bead/simulation/__init__.py +29 -0
- bead/simulation/annotators/__init__.py +15 -0
- bead/simulation/annotators/base.py +175 -0
- bead/simulation/annotators/distance_based.py +135 -0
- bead/simulation/annotators/lm_based.py +114 -0
- bead/simulation/annotators/oracle.py +182 -0
- bead/simulation/annotators/random.py +181 -0
- bead/simulation/dsl_extension/__init__.py +3 -0
- bead/simulation/noise_models/__init__.py +13 -0
- bead/simulation/noise_models/base.py +42 -0
- bead/simulation/noise_models/random_noise.py +82 -0
- bead/simulation/noise_models/systematic.py +132 -0
- bead/simulation/noise_models/temperature.py +86 -0
- bead/simulation/runner.py +144 -0
- bead/simulation/strategies/__init__.py +23 -0
- bead/simulation/strategies/base.py +123 -0
- bead/simulation/strategies/binary.py +103 -0
- bead/simulation/strategies/categorical.py +123 -0
- bead/simulation/strategies/cloze.py +224 -0
- bead/simulation/strategies/forced_choice.py +127 -0
- bead/simulation/strategies/free_text.py +105 -0
- bead/simulation/strategies/magnitude.py +116 -0
- bead/simulation/strategies/multi_select.py +129 -0
- bead/simulation/strategies/ordinal_scale.py +131 -0
- bead/templates/__init__.py +27 -0
- bead/templates/adapters/__init__.py +17 -0
- bead/templates/adapters/base.py +128 -0
- bead/templates/adapters/cache.py +178 -0
- bead/templates/adapters/huggingface.py +312 -0
- bead/templates/combinatorics.py +103 -0
- bead/templates/filler.py +605 -0
- bead/templates/renderers.py +177 -0
- bead/templates/resolver.py +178 -0
- bead/templates/strategies.py +1806 -0
- bead/templates/streaming.py +195 -0
- bead-0.1.0.dist-info/METADATA +212 -0
- bead-0.1.0.dist-info/RECORD +231 -0
- bead-0.1.0.dist-info/WHEEL +4 -0
- bead-0.1.0.dist-info/entry_points.txt +2 -0
- bead-0.1.0.dist-info/licenses/LICENSE +21 -0
bead/data/__init__.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""Data infrastructure.
|
|
2
|
+
|
|
3
|
+
Provides core data models, identifiers, timestamps, serialization,
|
|
4
|
+
metadata tracking, repository pattern, and validation utilities.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from bead.data.base import BeadBaseModel
|
|
10
|
+
from bead.data.identifiers import extract_timestamp, generate_uuid, is_valid_uuid7
|
|
11
|
+
from bead.data.metadata import (
|
|
12
|
+
MetadataTracker,
|
|
13
|
+
ProcessingRecord,
|
|
14
|
+
ProvenanceRecord,
|
|
15
|
+
)
|
|
16
|
+
from bead.data.range import Range
|
|
17
|
+
from bead.data.repository import Repository
|
|
18
|
+
from bead.data.serialization import (
|
|
19
|
+
DeserializationError,
|
|
20
|
+
SerializationError,
|
|
21
|
+
append_jsonlines,
|
|
22
|
+
read_jsonlines,
|
|
23
|
+
stream_jsonlines,
|
|
24
|
+
write_jsonlines,
|
|
25
|
+
)
|
|
26
|
+
from bead.data.timestamps import format_iso8601, now_iso8601, parse_iso8601
|
|
27
|
+
from bead.data.validation import (
|
|
28
|
+
ValidationReport,
|
|
29
|
+
validate_jsonlines_file,
|
|
30
|
+
validate_provenance_chain,
|
|
31
|
+
validate_uuid_references,
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
# Public API of the bead.data package. Every name listed here is imported
# above from a bead.data submodule and re-exported at package level.
__all__ = [
    # Base model
    "BeadBaseModel",
    # Identifiers
    "generate_uuid",
    "extract_timestamp",
    "is_valid_uuid7",
    # Range
    "Range",
    # Timestamps
    "now_iso8601",
    "parse_iso8601",
    "format_iso8601",
    # Serialization
    "write_jsonlines",
    "read_jsonlines",
    "stream_jsonlines",
    "append_jsonlines",
    "SerializationError",
    "DeserializationError",
    # Metadata
    "MetadataTracker",
    "ProvenanceRecord",
    "ProcessingRecord",
    # Repository
    "Repository",
    # Validation
    "ValidationReport",
    "validate_jsonlines_file",
    "validate_uuid_references",
    "validate_provenance_chain",
]
|
bead/data/base.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""Base Pydantic model for all bead objects.
|
|
2
|
+
|
|
3
|
+
This module provides BeadBaseModel, the foundational Pydantic v2 model that all
|
|
4
|
+
bead data models should inherit from. It provides automatic ID generation,
|
|
5
|
+
timestamp tracking, and versioning.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from datetime import datetime
|
|
11
|
+
from uuid import UUID
|
|
12
|
+
|
|
13
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
14
|
+
|
|
15
|
+
from bead.data.identifiers import generate_uuid
|
|
16
|
+
from bead.data.timestamps import now_iso8601
|
|
17
|
+
|
|
18
|
+
# Type alias for JSON-serializable values (recursive type): a scalar
# (str, int, float, bool or None), a list of JsonValue, or a dict mapping
# str keys to JsonValue. Written with the `type` statement (Python 3.12+),
# which allows the alias to refer to itself.
type JsonValue = (
    str | int | float | bool | None | list[JsonValue] | dict[str, JsonValue]
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class BeadBaseModel(BaseModel):
    """Common Pydantic v2 base class for bead data objects.

    Subclasses inherit an identifier, creation and modification timestamps,
    a schema version string and a free-form metadata dictionary. Fields that
    the model does not declare are rejected, instances remain mutable, and
    values are re-validated whenever a field is assigned.

    Attributes
    ----------
    id : UUID
        Identifier; produced by ``generate_uuid`` when not supplied.
    created_at : datetime
        Creation timestamp; produced by ``now_iso8601`` when not supplied.
    modified_at : datetime
        Last-modification timestamp; produced by ``now_iso8601`` when not
        supplied and refreshed by ``update_modified_time``.
    version : str
        Version string for schema versioning (default: "1.0.0").
    metadata : dict[str, JsonValue]
        Arbitrary JSON-compatible key-value pairs (default: empty dict).

    Examples
    --------
    >>> class MyModel(BeadBaseModel):
    ...     name: str
    ...     value: int
    >>> obj = MyModel(name="test", value=42)
    >>> obj.id  # doctest: +SKIP
    UUID('...')
    >>> obj.version
    '1.0.0'
    >>> obj.update_modified_time()
    >>> obj.modified_at > obj.created_at
    True
    """

    model_config = ConfigDict(
        validate_assignment=True,  # re-validate values on attribute assignment
        frozen=False,  # instances may be mutated after construction
        extra="forbid",  # reject fields the model does not declare
    )

    id: UUID = Field(default_factory=generate_uuid)
    created_at: datetime = Field(default_factory=now_iso8601)
    modified_at: datetime = Field(default_factory=now_iso8601)
    version: str = "1.0.0"
    metadata: dict[str, JsonValue] = Field(default_factory=dict)

    def update_modified_time(self) -> None:
        """Set ``modified_at`` to a fresh ``now_iso8601()`` value.

        Nothing calls this automatically; code that mutates the object is
        expected to call it to keep modification tracking accurate.

        Examples
        --------
        >>> obj = BeadBaseModel()
        >>> original_time = obj.modified_at
        >>> import time
        >>> time.sleep(0.01)  # Small delay to ensure different timestamp
        >>> obj.update_modified_time()
        >>> obj.modified_at > original_time
        True
        """
        refreshed = now_iso8601()
        # Plain assignment; validate_assignment=True makes Pydantic validate it.
        self.modified_at = refreshed
|
bead/data/identifiers.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""UUIDv7 generation and utilities for bead package.
|
|
2
|
+
|
|
3
|
+
This module provides functions for generating time-ordered UUIDv7 identifiers,
|
|
4
|
+
extracting timestamps from them, and validating UUID versions.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from uuid import UUID
|
|
10
|
+
|
|
11
|
+
import uuid_utils
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def generate_uuid() -> UUID:
    """Create a new UUIDv7 and return it as a standard-library ``UUID``.

    UUIDv7 places a millisecond timestamp in its first 48 bits, so
    identifiers sort by creation time. That keeps records in chronological
    order when they are sorted by ID.

    Returns
    -------
    UUID
        A newly generated UUIDv7 with embedded timestamp

    Notes
    -----
    NOTE(review): the strict ordering shown in the example below for two
    IDs created back-to-back depends on how ``uuid_utils.uuid7`` orders
    values generated within the same millisecond; confirm against that
    library.

    Examples
    --------
    >>> uuid1 = generate_uuid()
    >>> uuid2 = generate_uuid()
    >>> uuid1 < uuid2  # uuids are time-ordered
    True
    """
    # uuid_utils returns its own UUID type; go through the string form to get
    # a standard Python UUID for Pydantic compatibility.
    return UUID(hex=str(uuid_utils.uuid7()))
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def extract_timestamp(uuid: UUID) -> int:
    """Return the millisecond timestamp embedded in a UUIDv7.

    A UUIDv7 keeps milliseconds since the Unix epoch
    (January 1, 1970 00:00:00 UTC) in its first 48 bits. The UUID version
    is not checked here: for any other UUID the result is simply the value
    of its 48 most significant bits.

    Parameters
    ----------
    uuid
        The UUIDv7 to extract timestamp from.

    Returns
    -------
    int
        Timestamp in milliseconds since Unix epoch

    Examples
    --------
    >>> import time
    >>> uuid = generate_uuid()
    >>> timestamp = extract_timestamp(uuid)
    >>> current_time = int(time.time() * 1000)
    >>> abs(timestamp - current_time) < 1000  # within 1 second
    True
    """
    # The timestamp is the top 48 bits of the 128-bit value; dropping the low
    # 80 bits equals reading the first 6 bytes as a big-endian integer.
    return uuid.int >> 80
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def is_valid_uuid7(uuid: UUID) -> bool:
    """Report whether a UUID is version 7.

    The decision is based on ``UUID.version``, i.e. on the version bits
    (bits 48-51), which are 0111 (7) for a UUIDv7.

    Parameters
    ----------
    uuid
        The UUID to validate.

    Returns
    -------
    bool
        True if the UUID is version 7, False otherwise

    Examples
    --------
    >>> uuid7 = generate_uuid()
    >>> is_valid_uuid7(uuid7)
    True
    >>> from uuid import uuid4
    >>> uuid4_val = uuid4()
    >>> is_valid_uuid7(uuid4_val)
    False
    """
    # ``version`` may be None (no version for this UUID); None == 7 is False.
    detected_version = uuid.version
    return detected_version == 7
|
bead/data/language_codes.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""ISO 639 language code validation and utilities."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Annotated
|
|
6
|
+
|
|
7
|
+
from langcodes import Language
|
|
8
|
+
from langcodes.tag_parser import LanguageTagError
|
|
9
|
+
from pydantic import AfterValidator, Field
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def validate_iso639_code(code: str | None) -> str | None:
    """Validate a language code and normalize it to its three-letter form.

    ``None`` is passed through unchanged. Any other value is parsed with
    ``langcodes.Language.get`` and converted with ``to_alpha3``.

    Parameters
    ----------
    code
        Language code to validate (e.g., "en", "eng", "ko", "kor").

    Returns
    -------
    str | None
        Normalized language code (converted to ISO 639-3 if valid), or
        None when ``code`` is None.

    Raises
    ------
    ValueError
        If code is not a valid ISO 639 language code. The underlying
        ``LanguageTagError`` or ``LookupError`` is chained as the cause.

    Examples
    --------
    >>> validate_iso639_code("en")
    'eng'
    >>> validate_iso639_code("eng")
    'eng'
    >>> validate_iso639_code("ko")
    'kor'
    >>> validate_iso639_code(None) is None
    True
    >>> validate_iso639_code("invalid")
    Traceback (most recent call last):
        ...
    ValueError: Invalid language code: 'invalid'
    """
    # Optional field: nothing to validate.
    if code is None:
        return None

    try:
        # Parsing and the alpha-3 conversion can each reject the code.
        alpha3 = Language.get(code).to_alpha3()
    except (LanguageTagError, LookupError) as exc:
        raise ValueError(f"Invalid language code: {code!r}") from exc
    return alpha3
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# Type alias for language codes: an optional string that Pydantic passes to
# validate_iso639_code after its own validation (AfterValidator), so a field
# annotated with it holds whatever that function returns for the input.
LanguageCode = Annotated[
    str | None,
    AfterValidator(validate_iso639_code),
    Field(description="ISO 639-1 or ISO 639-3 language code"),
]
|
bead/data/metadata.py
ADDED
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
"""Metadata tracking models for provenance and processing history.
|
|
2
|
+
|
|
3
|
+
This module provides models for tracking provenance chains and processing history
|
|
4
|
+
for all bead objects. This enables full traceability of data transformations.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from datetime import datetime
|
|
10
|
+
from uuid import UUID
|
|
11
|
+
|
|
12
|
+
from pydantic import Field
|
|
13
|
+
|
|
14
|
+
from bead.data.base import BeadBaseModel, JsonValue
|
|
15
|
+
from bead.data.timestamps import now_iso8601
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _empty_provenance_list() -> list[ProvenanceRecord]:
|
|
19
|
+
"""Create empty provenance list."""
|
|
20
|
+
return []
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _empty_processing_list() -> list[ProcessingRecord]:
|
|
24
|
+
"""Create empty processing list."""
|
|
25
|
+
return []
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class ProvenanceRecord(BeadBaseModel):
    """Record of a provenance relationship between objects.

    Tracks a single parent-child relationship in the provenance chain, including
    what the parent was, its type, and the nature of the relationship. As a
    BeadBaseModel subclass it also carries that base class's fields.

    Attributes
    ----------
    parent_id : UUID
        UUID of the parent object in the provenance chain
    parent_type : str
        Type name of the parent object (e.g., "LexicalItem", "Template")
    relationship : str
        Type of relationship (e.g., "derived_from", "filled_from", "generated_from")
    timestamp : datetime
        When this relationship was established; defaults to the value of
        ``now_iso8601()`` at construction time (presumably UTC with
        timezone -- confirm in bead.data.timestamps)

    Examples
    --------
    >>> from uuid import uuid4
    >>> parent_id = uuid4()
    >>> record = ProvenanceRecord(
    ...     parent_id=parent_id,
    ...     parent_type="Template",
    ...     relationship="filled_from"
    ... )
    >>> record.parent_type
    'Template'
    >>> record.timestamp is not None
    True
    """

    # Required: the caller must identify the parent and the relationship.
    parent_id: UUID
    parent_type: str
    relationship: str
    # Optional: filled in by now_iso8601() when not supplied.
    timestamp: datetime = Field(default_factory=now_iso8601)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class ProcessingRecord(BeadBaseModel):
    """Record of a processing operation applied to an object.

    Tracks a single operation in the processing history, including the operation
    name, parameters used, when it was performed, and who/what performed it.
    As a BeadBaseModel subclass it also carries that base class's fields.

    Attributes
    ----------
    operation : str
        Name of the operation (e.g., "fill_template", "apply_constraint", "filter")
    parameters : dict[str, JsonValue]
        Parameters passed to the operation (default: empty dict)
    timestamp : datetime
        When the operation was performed; defaults to the value of
        ``now_iso8601()`` at construction time (presumably UTC with
        timezone -- confirm in bead.data.timestamps)
    operator : str | None
        Who/what performed the operation (e.g., "TemplateFiller-v1.0", user ID)
        (default: None)

    Examples
    --------
    >>> record = ProcessingRecord(
    ...     operation="fill_template",
    ...     parameters={"strategy": "exhaustive", "max_items": 100},
    ...     operator="TemplateFiller-v1.0"
    ... )
    >>> record.operation
    'fill_template'
    >>> record.parameters["strategy"]
    'exhaustive'
    >>> record.timestamp is not None
    True
    """

    # Required: the only field without a default.
    operation: str
    # default_factory gives each record its own dict rather than a shared one.
    parameters: dict[str, JsonValue] = Field(default_factory=dict)
    # Filled in by now_iso8601() when not supplied.
    timestamp: datetime = Field(default_factory=now_iso8601)
    operator: str | None = None
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
class MetadataTracker(BeadBaseModel):
    """Metadata tracking for provenance and processing history.

    Tracks both provenance (where data came from) and processing history
    (what operations were applied) for complete data lineage.

    Attributes
    ----------
    provenance : list[ProvenanceRecord]
        Chain of provenance relationships (default: empty list)
    processing_history : list[ProcessingRecord]
        History of processing operations (default: empty list)
    custom_metadata : dict[str, JsonValue]
        Custom metadata fields (default: empty dict)

    Examples
    --------
    >>> from uuid import uuid4
    >>> tracker = MetadataTracker()
    >>> parent_id = uuid4()
    >>> tracker.add_provenance(parent_id, "Template", "filled_from")
    >>> tracker.add_processing("fill_template", {"strategy": "exhaustive"})
    >>> len(tracker.provenance)
    1
    >>> len(tracker.processing_history)
    1
    >>> chain = tracker.get_provenance_chain()
    >>> len(chain)
    1
    """

    provenance: list[ProvenanceRecord] = Field(default_factory=_empty_provenance_list)
    processing_history: list[ProcessingRecord] = Field(
        default_factory=_empty_processing_list
    )
    custom_metadata: dict[str, JsonValue] = Field(default_factory=dict)

    def add_provenance(
        self, parent_id: UUID, parent_type: str, relationship: str
    ) -> None:
        """Add a provenance record to the chain.

        Creates a new provenance record and appends it to the provenance
        list. The record's timestamp is filled in by its default factory.

        Parameters
        ----------
        parent_id : UUID
            UUID of the parent object
        parent_type : str
            Type name of the parent object (e.g., "Template", "LexicalItem")
        relationship : str
            Type of relationship (e.g., "derived_from", "filled_from")

        Examples
        --------
        >>> from uuid import uuid4
        >>> tracker = MetadataTracker()
        >>> parent_id = uuid4()
        >>> tracker.add_provenance(parent_id, "Template", "filled_from")
        >>> len(tracker.provenance)
        1
        >>> tracker.provenance[0].parent_type
        'Template'
        """
        record = ProvenanceRecord(
            parent_id=parent_id, parent_type=parent_type, relationship=relationship
        )
        self.provenance.append(record)

    def add_processing(
        self,
        operation: str,
        parameters: dict[str, JsonValue] | None = None,
        operator: str | None = None,
    ) -> None:
        """Add a processing record to the history.

        Creates a new processing record and appends it to the processing
        history. The record's timestamp is filled in by its default factory.

        Parameters
        ----------
        operation : str
            Name of the operation performed
        parameters : dict[str, JsonValue] | None, optional
            Parameters passed to the operation (default: None, which creates empty dict)
        operator : str | None, optional
            Who/what performed the operation (default: None)

        Examples
        --------
        >>> tracker = MetadataTracker()
        >>> tracker.add_processing("fill_template", {"strategy": "exhaustive"})
        >>> len(tracker.processing_history)
        1
        >>> tracker.processing_history[0].operation
        'fill_template'
        >>> tracker.add_processing("filter", operator="FilterSystem-v2.0")
        >>> tracker.processing_history[1].operator
        'FilterSystem-v2.0'
        """
        # None stands in for "no parameters" so no mutable default is shared.
        if parameters is None:
            parameters = {}
        record = ProcessingRecord(
            operation=operation, parameters=parameters, operator=operator
        )
        self.processing_history.append(record)

    def get_provenance_chain(self) -> list[UUID]:
        """Get the full provenance chain as a list of parent UUIDs.

        Returns the parent UUIDs in the order they were added to the provenance list.

        Returns
        -------
        list[UUID]
            List of parent UUIDs, oldest addition first

        Examples
        --------
        >>> from uuid import uuid4
        >>> tracker = MetadataTracker()
        >>> parent1 = uuid4()
        >>> parent2 = uuid4()
        >>> tracker.add_provenance(parent1, "Template", "filled_from")
        >>> tracker.add_provenance(parent2, "LexicalItem", "derived_from")
        >>> chain = tracker.get_provenance_chain()
        >>> len(chain)
        2
        >>> chain[0] == parent1
        True
        """
        return [record.parent_id for record in self.provenance]

    def get_recent_processing(self, n: int = 5) -> list[ProcessingRecord]:
        """Get the N most recent processing records.

        Returns the most recent processing records, up to N records. If there
        are fewer than N records, returns all available records. "Most
        recent" means last appended to ``processing_history``; timestamps
        are not compared.

        Parameters
        ----------
        n : int, optional
            Number of recent records to return (default: 5). Zero or a
            negative number yields an empty list.

        Returns
        -------
        list[ProcessingRecord]
            List of up to N most recent processing records, newest first

        Examples
        --------
        >>> tracker = MetadataTracker()
        >>> tracker.add_processing("operation1")
        >>> tracker.add_processing("operation2")
        >>> tracker.add_processing("operation3")
        >>> recent = tracker.get_recent_processing(n=2)
        >>> len(recent)
        2
        >>> recent[0].operation
        'operation3'
        >>> recent[1].operation
        'operation2'
        >>> tracker.get_recent_processing(n=0)
        []
        """
        if n <= 0:
            # history[-0:] is the whole list and a negative n would slice from
            # the front, so a non-positive request must be handled explicitly.
            return []
        return list(reversed(self.processing_history[-n:]))
|
bead/data/range.py
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
"""Generic numeric range model with validation.
|
|
2
|
+
|
|
3
|
+
Provides a reusable Range[T] model for representing validated numeric ranges
|
|
4
|
+
with bounds checking, containment testing, and value clamping.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from typing import Generic, TypeVar
|
|
10
|
+
|
|
11
|
+
from pydantic import BaseModel, ConfigDict, model_validator
|
|
12
|
+
|
|
13
|
+
# Type variable for Range bounds, constrained to exactly int or float.
T = TypeVar("T", int, float)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class Range(BaseModel, Generic[T]):  # noqa: UP046 - Pydantic requires Generic[T]
    """Immutable numeric range whose two bounds are both inclusive.

    Construction fails unless ``min`` is strictly less than ``max``. The
    model is frozen and rejects undeclared fields. It offers a containment
    test and clamping of values to the bounds.

    Attributes
    ----------
    min
        Minimum value (inclusive).
    max
        Maximum value (inclusive).

    Examples
    --------
    >>> scale = Range[int](min=1, max=7)
    >>> scale.contains(4)
    True
    >>> scale.contains(0)
    False
    >>> scale.clamp(10)
    7

    >>> probability = Range[float](min=0.0, max=1.0)
    >>> probability.contains(0.5)
    True
    >>> probability.clamp(-0.1)
    0.0
    """

    model_config = ConfigDict(
        frozen=True,
        extra="forbid",
    )

    min: T
    max: T

    @model_validator(mode="after")
    def validate_order(self) -> Range[T]:
        """Reject ranges whose lower bound is not below the upper bound.

        Returns
        -------
        Range[T]
            The validated range instance.

        Raises
        ------
        ValueError
            If min is greater than or equal to max.
        """
        lower, upper = self.min, self.max
        if lower >= upper:
            raise ValueError(f"min ({lower}) must be less than max ({upper})")
        return self

    def contains(self, value: T) -> bool:
        """Tell whether ``value`` lies inside the range, bounds included.

        Parameters
        ----------
        value
            The value to check.

        Returns
        -------
        bool
            True if min <= value <= max, False otherwise.

        Examples
        --------
        >>> r = Range[int](min=1, max=5)
        >>> r.contains(3)
        True
        >>> r.contains(1)
        True
        >>> r.contains(5)
        True
        >>> r.contains(6)
        False
        """
        return self.min <= value and value <= self.max

    def clamp(self, value: T) -> T:
        """Limit ``value`` to the range bounds.

        Parameters
        ----------
        value
            The value to clamp.

        Returns
        -------
        T
            The clamped value (min if value < min, max if value > max,
            otherwise the original value).

        Examples
        --------
        >>> r = Range[int](min=1, max=5)
        >>> r.clamp(3)
        3
        >>> r.clamp(0)
        1
        >>> r.clamp(10)
        5
        """
        # ``min``/``max`` below are the builtins: the fields of the same name
        # are only reachable through ``self``.
        capped = min(self.max, value)
        return max(self.min, capped)
|