kitefs 0.2.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kitefs/__init__.py +31 -0
- kitefs/__main__.py +5 -0
- kitefs/cli.py +253 -0
- kitefs/config.py +166 -0
- kitefs/definitions.py +167 -0
- kitefs/exceptions.py +121 -0
- kitefs/feature_store.py +110 -0
- kitefs/providers/__init__.py +11 -0
- kitefs/providers/base.py +78 -0
- kitefs/providers/factory.py +19 -0
- kitefs/providers/local.py +184 -0
- kitefs/py.typed +0 -0
- kitefs/registry/__init__.py +12 -0
- kitefs/registry/_discovery.py +67 -0
- kitefs/registry/_manager.py +198 -0
- kitefs/registry/_serialization.py +149 -0
- kitefs/registry/_validation.py +127 -0
- kitefs-0.2.0a1.dist-info/METADATA +108 -0
- kitefs-0.2.0a1.dist-info/RECORD +21 -0
- kitefs-0.2.0a1.dist-info/WHEEL +4 -0
- kitefs-0.2.0a1.dist-info/entry_points.txt +3 -0
kitefs/__init__.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""KiteFS — a Python feature store for offline/online feature storage and serving."""
|
|
2
|
+
|
|
3
|
+
from kitefs.definitions import (
|
|
4
|
+
EntityKey,
|
|
5
|
+
EventTimestamp,
|
|
6
|
+
Expect,
|
|
7
|
+
Feature,
|
|
8
|
+
FeatureGroup,
|
|
9
|
+
FeatureType,
|
|
10
|
+
JoinKey,
|
|
11
|
+
Metadata,
|
|
12
|
+
StorageTarget,
|
|
13
|
+
ValidationMode,
|
|
14
|
+
)
|
|
15
|
+
from kitefs.feature_store import FeatureStore
|
|
16
|
+
from kitefs.registry import ApplyResult
|
|
17
|
+
|
|
18
|
+
__all__ = [
|
|
19
|
+
"ApplyResult",
|
|
20
|
+
"EntityKey",
|
|
21
|
+
"EventTimestamp",
|
|
22
|
+
"Expect",
|
|
23
|
+
"Feature",
|
|
24
|
+
"FeatureGroup",
|
|
25
|
+
"FeatureStore",
|
|
26
|
+
"FeatureType",
|
|
27
|
+
"JoinKey",
|
|
28
|
+
"Metadata",
|
|
29
|
+
"StorageTarget",
|
|
30
|
+
"ValidationMode",
|
|
31
|
+
]
|
kitefs/__main__.py
ADDED
kitefs/cli.py
ADDED
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
"""CLI entry point for KiteFS — thin delegation layer over the SDK.
|
|
2
|
+
|
|
3
|
+
``kitefs init`` is the only self-contained command because the project
|
|
4
|
+
scaffold (including ``kitefs.yaml``) does not exist yet when it runs.
|
|
5
|
+
All other commands delegate to :class:`kitefs.FeatureStore`.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import json
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
import click
|
|
12
|
+
|
|
13
|
+
_GITIGNORE_ENTRY = "feature_store/data/"
|
|
14
|
+
|
|
15
|
+
_DEFAULT_CONFIG = """\
|
|
16
|
+
provider: local
|
|
17
|
+
storage_root: ./feature_store/
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
_EXAMPLE_FEATURES = '''\
|
|
21
|
+
"""Example feature group definitions for KiteFS.
|
|
22
|
+
|
|
23
|
+
Uncomment and modify the example below, then run ``kitefs apply``
|
|
24
|
+
to register your feature groups.
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
# from kitefs import (
|
|
28
|
+
# EntityKey,
|
|
29
|
+
# EventTimestamp,
|
|
30
|
+
# Expect,
|
|
31
|
+
# Feature,
|
|
32
|
+
# FeatureGroup,
|
|
33
|
+
# FeatureType,
|
|
34
|
+
# Metadata,
|
|
35
|
+
# StorageTarget,
|
|
36
|
+
# ValidationMode,
|
|
37
|
+
# )
|
|
38
|
+
#
|
|
39
|
+
# example_features = FeatureGroup(
|
|
40
|
+
# name="example_features",
|
|
41
|
+
# storage_target=StorageTarget.OFFLINE,
|
|
42
|
+
# entity_key=EntityKey(name="entity_id", dtype=FeatureType.INTEGER),
|
|
43
|
+
# event_timestamp=EventTimestamp(name="event_timestamp", dtype=FeatureType.DATETIME),
|
|
44
|
+
# features=[
|
|
45
|
+
# Feature(name="feature_one", dtype=FeatureType.FLOAT, expect=Expect().not_null()),
|
|
46
|
+
# Feature(name="feature_two", dtype=FeatureType.STRING),
|
|
47
|
+
# ],
|
|
48
|
+
# ingestion_validation=ValidationMode.ERROR,
|
|
49
|
+
# metadata=Metadata(owner="your-team", description="An example feature group."),
|
|
50
|
+
# )
|
|
51
|
+
'''
|
|
52
|
+
|
|
53
|
+
_SEED_REGISTRY = {"version": "1.0", "feature_groups": {}}
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# Root command group; the subcommands in this module register themselves on it
# through ``@cli.command()``. The function body is intentionally just the
# docstring (click surfaces a command's docstring as its help text).
@click.group()
def cli() -> None:
    """KiteFS — a Python feature store for offline/online feature storage and serving."""
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@cli.command()
@click.argument("path", required=False, default=None, type=click.Path(file_okay=False))
def init(path: str | None) -> None:
    """Create a new KiteFS project at PATH (default: current directory)."""
    base_dir = Path(path) if path else Path.cwd()
    project_root = base_dir.resolve()
    config_path = project_root / "kitefs.yaml"

    # kitefs.yaml is the sentinel: if it is already there, refuse to re-initialise.
    if config_path.exists():
        click.echo("Error: KiteFS project already initialized at this location.", err=True)
        raise SystemExit(1)

    storage_root = project_root / "feature_store"
    definitions_dir = storage_root / "definitions"
    data_dir = storage_root / "data"

    try:
        # Directory skeleton (created in the same order as before).
        for directory in (definitions_dir, data_dir / "offline_store", data_dir / "online_store"):
            directory.mkdir(parents=True, exist_ok=True)

        # Seed the definitions package with an empty __init__ and a commented example.
        (definitions_dir / "__init__.py").write_text("", encoding="utf-8")
        (definitions_dir / "example_features.py").write_text(_EXAMPLE_FEATURES, encoding="utf-8")

        # Seed registry.json; sorted keys + fixed indent keep the file stable for Git diffs.
        seed_json = json.dumps(_SEED_REGISTRY, sort_keys=True, indent=2)
        (storage_root / "registry.json").write_text(seed_json + "\n", encoding="utf-8")

        # .gitignore: create it, or append the entry unless some line already equals
        # it exactly (whole-line comparison, so comments or negated rules that merely
        # contain the entry do not count).
        gitignore_path = project_root / ".gitignore"
        if not gitignore_path.exists():
            gitignore_path.write_text(_GITIGNORE_ENTRY + "\n", encoding="utf-8")
        else:
            current = gitignore_path.read_text(encoding="utf-8")
            already_listed = any(line.strip() == _GITIGNORE_ENTRY for line in current.splitlines())
            if not already_listed:
                with gitignore_path.open("a", encoding="utf-8") as handle:
                    # Make sure the new entry starts on its own line.
                    if current and not current.endswith("\n"):
                        handle.write("\n")
                    handle.write(_GITIGNORE_ENTRY + "\n")

        # The sentinel goes last: if anything above failed, no kitefs.yaml exists
        # and `kitefs init` can simply be run again without manual cleanup.
        config_path.write_text(_DEFAULT_CONFIG, encoding="utf-8")
    except OSError as exc:
        click.echo(f"Error: {exc}", err=True)
        raise SystemExit(1) from None

    for message in (
        f"Project initialized at {project_root}",
        " Provider: local",
        f" Config: {config_path}",
    ):
        click.echo(message)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
@cli.command()
def apply() -> None:
    """Register feature group definitions into the registry."""
    # SDK imports happen inside the command body, not at module import time.
    from kitefs.exceptions import KiteFSError
    from kitefs.feature_store import FeatureStore

    try:
        result = FeatureStore().apply()
    except KiteFSError as exc:
        # Report SDK errors on stderr and exit non-zero, without a traceback.
        click.echo(f"Error: {exc}", err=True)
        raise SystemExit(1) from None

    click.echo(f"Applied {result.group_count} feature group(s) — registered successfully.")
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
@cli.command(name="list")
@click.option("--format", "fmt", default=None, type=click.Choice(["json"], case_sensitive=False), help="Output format.")
@click.option("--target", default=None, type=click.Path(), help="File path to write output to.")
def list_cmd(fmt: str | None, target: str | None) -> None:
    """List all registered feature groups with summary information."""
    from kitefs.exceptions import KiteFSError
    from kitefs.feature_store import FeatureStore

    try:
        result = FeatureStore().list_feature_groups(format=fmt, target=target)
    except KiteFSError as exc:
        click.echo(f"Error: {exc}", err=True)
        raise SystemExit(1) from None

    if target is not None:
        # The target path was handed to the SDK call above; only confirm it here.
        click.echo(f"Output written to {target}")
    elif fmt == "json":
        click.echo(result)
    else:
        # Default: human-readable table.
        assert isinstance(result, list)  # target/format cases are handled above
        if result:
            _render_list_table(result)
        else:
            click.echo("No feature groups registered. Run `kitefs apply` first.")
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def _format_list_table(summaries: list[dict]) -> list[str]:
    """Build the text lines of the feature-group summary table.

    Returns the header line, a dashed rule, and then one line per summary.
    Every column is left-justified to the width of its longest cell
    (header included).

    Missing or ``None`` values render as an empty cell. Other falsy values
    such as a feature count of ``0`` are rendered as-is instead of being
    blanked out.
    """
    headers = ["Name", "Owner", "Entity Key", "Storage Target", "Features"]
    keys = ["name", "owner", "entity_key", "storage_target", "feature_count"]

    def _cell(value: object) -> str:
        # Only None means "no value"; 0 is legitimate data and must stay visible.
        return "" if value is None else str(value)

    rows = [[_cell(summary.get(key)) for key in keys] for summary in summaries]

    # zip(headers, *rows) walks the table column by column, header first.
    widths = [max(len(cell) for cell in column) for column in zip(headers, *rows)]

    def _fmt_row(cells: list[str]) -> str:
        return " ".join(cell.ljust(width) for cell, width in zip(cells, widths))

    rule = " ".join("-" * width for width in widths)
    return [_fmt_row(headers), rule, *(_fmt_row(row) for row in rows)]


def _render_list_table(summaries: list[dict]) -> None:
    """Render feature group summaries as a human-readable table."""
    for line in _format_list_table(summaries):
        click.echo(line)
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
@cli.command()
@click.argument("feature_group_name")
@click.option("--format", "fmt", default=None, type=click.Choice(["json"], case_sensitive=False), help="Output format.")
@click.option("--target", default=None, type=click.Path(), help="File path to write output to.")
def describe(feature_group_name: str, fmt: str | None, target: str | None) -> None:
    """Display the full definition of a specific feature group."""
    from kitefs.exceptions import KiteFSError
    from kitefs.feature_store import FeatureStore

    try:
        result = FeatureStore().describe_feature_group(feature_group_name, format=fmt, target=target)
    except KiteFSError as exc:
        click.echo(f"Error: {exc}", err=True)
        raise SystemExit(1) from None

    if target is not None:
        # The target path was handed to the SDK call above; only confirm it here.
        click.echo(f"Output written to {target}")
    elif fmt == "json":
        click.echo(result)
    else:
        # Default: human-readable key-value layout.
        assert isinstance(result, dict)  # target/format cases are handled above
        _render_describe(result)
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def _render_describe(entry: dict) -> None:
    """Print one feature group definition as indented ``label: value`` lines.

    Scalar fields missing from *entry* are shown as ``?`` (``None`` for the
    last-materialized timestamp). Metadata lines appear only for truthy
    values, and the join-key section only when at least one join key exists.
    """
    echo = click.echo

    echo(f"Feature Group: {entry.get('name', '?')}")
    echo(f" Storage Target: {entry.get('storage_target', '?')}")

    # The two structural columns are each shown as "name (dtype)".
    entity_key = entry.get("entity_key", {})
    key_name = entity_key.get("name", "?")
    key_dtype = entity_key.get("dtype", "?")
    echo(f" Entity Key: {key_name} ({key_dtype})")

    event_ts = entry.get("event_timestamp", {})
    ts_name = event_ts.get("name", "?")
    ts_dtype = event_ts.get("dtype", "?")
    echo(f" Event Timestamp: {ts_name} ({ts_dtype})")

    echo(f" Ingestion Validation: {entry.get('ingestion_validation', '?')}")
    echo(f" Offline Retrieval Validation: {entry.get('offline_retrieval_validation', '?')}")

    # Optional metadata; an explicit null section is treated like an empty one.
    meta = entry.get("metadata") or {}
    owner = meta.get("owner")
    if owner:
        echo(f" Owner: {owner}")
    description = meta.get("description")
    if description:
        echo(f" Description: {description}")
    tags = meta.get("tags")
    if tags:
        echo(f" Tags: {tags}")

    echo(f" Applied At: {entry.get('applied_at', '?')}")
    echo(f" Last Materialized At: {entry.get('last_materialized_at', 'None')}")

    features = entry.get("features", [])
    echo(f" Features ({len(features)}):")
    for feature in features:
        expectation = feature.get("expect")
        suffix = f" expect={expectation}" if expectation else ""
        echo(f" - {feature['name']} ({feature.get('dtype', '?')}){suffix}")

    join_keys = entry.get("join_keys", [])
    if not join_keys:
        return
    echo(f" Join Keys ({len(join_keys)}):")
    for join_key in join_keys:
        echo(f" - {join_key['field_name']} -> {join_key['referenced_group']}")
|
kitefs/config.py
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
"""Configuration manager — loads, validates, and exposes kitefs.yaml settings."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import yaml
|
|
8
|
+
|
|
9
|
+
from kitefs.exceptions import ConfigurationError
|
|
10
|
+
|
|
11
|
+
_SUPPORTED_PROVIDERS = ("local", "aws")
|
|
12
|
+
|
|
13
|
+
_ENV_OVERRIDES: dict[str, str] = {
|
|
14
|
+
"KITEFS_PROVIDER": "provider",
|
|
15
|
+
"KITEFS_STORAGE_ROOT": "storage_root",
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
_ENV_AWS_OVERRIDES: dict[str, str] = {
|
|
19
|
+
"KITEFS_AWS_S3_BUCKET": "s3_bucket",
|
|
20
|
+
"KITEFS_AWS_S3_PREFIX": "s3_prefix",
|
|
21
|
+
"KITEFS_AWS_DYNAMODB_TABLE_PREFIX": "dynamodb_table_prefix",
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
_REQUIRED_AWS_FIELDS = ("s3_bucket", "s3_prefix", "dynamodb_table_prefix")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass(frozen=True)
class AWSConfig:
    """Immutable holder for the AWS settings (S3 bucket/prefix, DynamoDB table prefix)."""

    # Field order defines the positional constructor signature.
    s3_bucket: str
    s3_prefix: str
    dynamodb_table_prefix: str
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass(frozen=True)
class Config:
    """Immutable, already-validated view of a project's kitefs.yaml settings."""

    provider: str
    # Filesystem locations of the project.
    project_root: Path
    storage_root: Path
    definitions_path: Path
    # AWS settings, or None when there are none to carry.
    aws: AWSConfig | None
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def load_config(project_root: Path) -> Config:
    """Build a validated :class:`Config` from ``<project_root>/kitefs.yaml``.

    The file is parsed, environment-variable overrides are layered on top
    (they win over file values), and the merged settings are validated as a
    whole, so every problem is reported in a single error.
    """
    root = project_root.resolve()
    config_path = root / "kitefs.yaml"

    settings = _read_yaml(config_path)
    env_origins = _apply_env_overrides(settings)
    _validate(settings, config_path, env_origins)

    provider = settings["provider"]
    # Joining with the resolved root means a relative storage_root is taken
    # relative to the project, not the current working directory.
    storage_root = (root / settings["storage_root"]).resolve()

    if provider == "aws":
        aws_values = settings.get("aws", {}) or {}
        aws_config: AWSConfig | None = AWSConfig(
            s3_bucket=aws_values["s3_bucket"],
            s3_prefix=aws_values["s3_prefix"],
            dynamodb_table_prefix=aws_values["dynamodb_table_prefix"],
        )
    else:
        aws_config = None

    return Config(
        provider=provider,
        project_root=root,
        storage_root=storage_root,
        definitions_path=storage_root / "definitions",
        aws=aws_config,
    )
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _read_yaml(config_path: Path) -> dict:
    """Read and parse kitefs.yaml, raising ConfigurationError on failure.

    Every failure mode is reported as :class:`ConfigurationError`: a missing
    file, a file that cannot be read (e.g. the path is a directory or access
    is denied), content that is not valid UTF-8, malformed YAML, and YAML
    whose top level is not a mapping.

    Returns the parsed top-level mapping.
    """
    if not config_path.exists():
        raise ConfigurationError(
            f"No configuration file found at '{config_path}'. Run `kitefs init` to create a project."
        )

    # Reading is kept apart from parsing so each failure gets a precise message.
    # Without this handler, OSError/UnicodeDecodeError would escape as raw
    # tracebacks, contradicting the contract stated above.
    try:
        text = config_path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as exc:
        raise ConfigurationError(f"Could not read '{config_path}': {exc}") from exc

    try:
        data = yaml.safe_load(text)
    except yaml.YAMLError as exc:
        raise ConfigurationError(f"Malformed YAML in '{config_path}': {exc}") from exc

    # An empty file parses to None and a bare scalar/list to str/list; neither is usable.
    if not isinstance(data, dict):
        raise ConfigurationError(
            f"Expected a YAML mapping in '{config_path}', got {type(data).__name__}. "
            "The file must contain key-value pairs (e.g., 'provider: local')."
        )
    return data
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _apply_env_overrides(raw: dict) -> dict[str, str]:
    """Overlay environment-variable values onto *raw*, mutating it in place.

    Top-level overrides replace ``raw[field]``; AWS overrides are written
    into the nested ``raw["aws"]`` mapping.

    Returns a dict mapping each overridden field path (``'provider'``,
    ``'aws.s3_bucket'``, ...) to the name of the environment variable that
    supplied it, so error messages can point at the right source.
    """
    origins: dict[str, str] = {}

    for env_var, field in _ENV_OVERRIDES.items():
        override = os.environ.get(env_var)
        if override is None:
            continue
        raw[field] = override
        origins[field] = env_var

    for env_var, field in _ENV_AWS_OVERRIDES.items():
        override = os.environ.get(env_var)
        if override is None:
            continue
        # A missing ``aws`` key and an explicit null both become a fresh dict.
        if raw.get("aws") is None:
            raw["aws"] = {}
        raw["aws"][field] = override
        origins[f"aws.{field}"] = env_var

    return origins
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def _validate(raw: dict, config_path: Path, env_origins: dict[str, str]) -> None:
    """Validate all config fields, collecting all errors before raising.

    Args:
        raw: Merged settings (file values with environment overrides applied).
        config_path: Path of the configuration file, used in messages only.
        env_origins: Field path -> environment variable that supplied the
            value; used to tell the user when a bad value came from the
            environment rather than the file.

    Raises:
        ConfigurationError: If any check fails. The message lists every
            problem found, separated by ``"; "``.
    """
    errors: list[str] = []

    def _source(field: str) -> str:
        # Suffix naming the environment variable a value came from, if any.
        env_var = env_origins.get(field)
        return f" (set by environment variable {env_var})" if env_var else ""

    # --- provider ---
    provider = raw.get("provider")
    if provider is None:
        errors.append(f"Missing required field: 'provider' in {config_path}")
    elif provider not in _SUPPORTED_PROVIDERS:
        errors.append(
            f"Unsupported provider: '{provider}'{_source('provider')}. "
            f"Supported: {', '.join(_SUPPORTED_PROVIDERS)}"
        )

    # --- storage_root ---
    storage_root = raw.get("storage_root")
    if storage_root is None:
        errors.append(f"Missing required field: 'storage_root' in {config_path}")
    elif not isinstance(storage_root, str):
        errors.append(
            f"Invalid 'storage_root' value{_source('storage_root')}: "
            f"expected a string path, got {type(storage_root).__name__}"
        )
    elif not storage_root.strip():
        errors.append(
            f"Invalid 'storage_root' value{_source('storage_root')}: path must not be empty. "
            "Set a relative path such as './feature_store/'."
        )

    # --- AWS fields (only when provider is aws) ---
    if provider == "aws":
        aws_section = raw.get("aws", {}) or {}
        if not isinstance(aws_section, dict):
            # e.g. ``aws: some-string`` in the YAML. Report it like any other
            # problem instead of crashing on ``.get`` with an AttributeError.
            errors.append(f"Invalid 'aws' section: expected a mapping, got {type(aws_section).__name__}")
        else:
            # Empty values count as missing, not just absent keys.
            missing_aws = [f for f in _REQUIRED_AWS_FIELDS if not aws_section.get(f)]
            if missing_aws:
                formatted = ", ".join(f"'aws.{f}'" for f in missing_aws)
                errors.append(f"Missing required AWS fields when provider is 'aws': {formatted}")

    if errors:
        joined = "; ".join(errors)
        raise ConfigurationError(f"Invalid configuration in '{config_path}': {joined}")
|
kitefs/definitions.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
"""Definition types for KiteFS feature groups — the foundational data model."""
|
|
2
|
+
|
|
3
|
+
from collections.abc import Sequence
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from enum import Enum
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class FeatureType(Enum):
    """Column data types accepted for entity keys, event timestamps, and features."""

    # Each member's value is simply its own name as a string.
    STRING = "STRING"
    INTEGER = "INTEGER"
    FLOAT = "FLOAT"
    DATETIME = "DATETIME"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class StorageTarget(Enum):
    """Storage location(s) a feature group's data is kept in and served from."""

    # Each member's value is simply its own name as a string.
    OFFLINE = "OFFLINE"
    OFFLINE_AND_ONLINE = "OFFLINE_AND_ONLINE"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class ValidationMode(Enum):
    """Policy applied to validation failures at a gate (ingestion or retrieval)."""

    # Each member's value is simply its own name as a string.
    # NOTE(review): the handling behind each mode lives outside this module.
    ERROR = "ERROR"
    FILTER = "FILTER"
    NONE = "NONE"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass(frozen=True)
|
|
33
|
+
class Expect:
|
|
34
|
+
"""Fluent builder for feature-level data expectations.
|
|
35
|
+
|
|
36
|
+
Each method returns a new Expect instance with the constraint appended,
|
|
37
|
+
preserving immutability. Constraints are stored as a tuple of dicts,
|
|
38
|
+
serializable via dataclasses.asdict().
|
|
39
|
+
|
|
40
|
+
Example::
|
|
41
|
+
|
|
42
|
+
Expect().not_null().gt(0)
|
|
43
|
+
Expect().gte(1900).lte(2030)
|
|
44
|
+
Expect().one_of(["apartment", "house", "land"])
|
|
45
|
+
"""
|
|
46
|
+
|
|
47
|
+
_constraints: tuple[dict, ...] = ()
|
|
48
|
+
|
|
49
|
+
def not_null(self) -> "Expect":
|
|
50
|
+
"""Require non-null values."""
|
|
51
|
+
return Expect(_constraints=(*self._constraints, {"type": "not_null"}))
|
|
52
|
+
|
|
53
|
+
def gt(self, value: int | float) -> "Expect":
|
|
54
|
+
"""Require values strictly greater than *value*."""
|
|
55
|
+
return Expect(_constraints=(*self._constraints, {"type": "gt", "value": value}))
|
|
56
|
+
|
|
57
|
+
def gte(self, value: int | float) -> "Expect":
|
|
58
|
+
"""Require values greater than or equal to *value*."""
|
|
59
|
+
return Expect(_constraints=(*self._constraints, {"type": "gte", "value": value}))
|
|
60
|
+
|
|
61
|
+
def lt(self, value: int | float) -> "Expect":
|
|
62
|
+
"""Require values strictly less than *value*."""
|
|
63
|
+
return Expect(_constraints=(*self._constraints, {"type": "lt", "value": value}))
|
|
64
|
+
|
|
65
|
+
def lte(self, value: int | float) -> "Expect":
|
|
66
|
+
"""Require values less than or equal to *value*."""
|
|
67
|
+
return Expect(_constraints=(*self._constraints, {"type": "lte", "value": value}))
|
|
68
|
+
|
|
69
|
+
def one_of(self, values: Sequence[str | int | float]) -> "Expect":
|
|
70
|
+
"""Require values to be one of the given *values*. Stores a defensive copy as a tuple."""
|
|
71
|
+
return Expect(_constraints=(*self._constraints, {"type": "one_of", "values": tuple(values)}))
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
@dataclass(frozen=True)
class EntityKey:
    """Declaration of a feature group's single entity identifier column.

    This is a structural column: always part of query results and implicitly
    non-null. There is no ``expect`` field, so expectations cannot be
    attached to it.
    """

    name: str
    dtype: FeatureType
    description: str | None = None  # optional free-text description
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
@dataclass(frozen=True)
class EventTimestamp:
    """Declaration of a feature group's single event-timestamp column.

    This is a structural column: always part of query results and implicitly
    non-null. ``dtype`` is required to be ``FeatureType.DATETIME``; the class
    itself does not check this, the rule is enforced at apply() time.
    """

    name: str
    dtype: FeatureType
    description: str | None = None  # optional free-text description
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
@dataclass(frozen=True)
class Feature:
    """Declaration of one feature (data column) of a feature group.

    An ``Expect`` builder may be attached through ``expect`` to describe
    data-level validation for the ingestion and retrieval gates.
    """

    name: str
    dtype: FeatureType
    description: str | None = None  # optional free-text description
    expect: Expect | None = None  # None means no expectations are declared
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
@dataclass(frozen=True)
class JoinKey:
    """Declaration of a join from this group to another group's entity key.

    ``field_name`` has to be both a feature name of the declaring group and
    the entity key name of ``referenced_group``. Nothing is checked here;
    matching names and types are validated at apply() time.
    """

    field_name: str
    referenced_group: str
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
@dataclass(frozen=True)
|
|
127
|
+
class Metadata:
|
|
128
|
+
"""Optional metadata attached to a feature group.
|
|
129
|
+
|
|
130
|
+
All fields are optional and default to None.
|
|
131
|
+
"""
|
|
132
|
+
|
|
133
|
+
description: str | None = None
|
|
134
|
+
owner: str | None = None
|
|
135
|
+
tags: dict[str, str] | None = None
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
@dataclass(frozen=True)
class FeatureGroup:
    """Top-level definition for a feature group.

    The foundational type that users create in their definitions/ directory.
    FeatureGroup instances are discovered at apply() time via importlib.

    ``features`` and ``join_keys`` accept any sequence (a list is fine) but
    are always *stored* as tuples: ``features`` sorted alphabetically by
    ``Feature.name`` and ``join_keys`` in the order given. Sorting makes
    equality and serialisation independent of the order the user wrote the
    features in. The fields are annotated ``Sequence[...]`` because that is
    accurate for both the accepted argument and the stored tuple.
    """

    name: str
    storage_target: StorageTarget
    entity_key: EntityKey
    event_timestamp: EventTimestamp
    features: Sequence[Feature]
    # An immutable empty default; __post_init__ stores a tuple either way.
    join_keys: Sequence[JoinKey] = ()
    ingestion_validation: ValidationMode = ValidationMode.ERROR
    offline_retrieval_validation: ValidationMode = ValidationMode.NONE
    metadata: Metadata = field(default_factory=Metadata)

    def __post_init__(self) -> None:
        """Normalise features to a name-sorted tuple and join_keys to a tuple."""
        # The dataclass is frozen, so ordinary attribute assignment would raise;
        # object.__setattr__ is the way to set fields during initialisation.
        ordered_features = tuple(sorted(self.features, key=lambda feature: feature.name))
        object.__setattr__(self, "features", ordered_features)
        object.__setattr__(self, "join_keys", tuple(self.join_keys))
|