bagelquant-data 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bagelquant_data/__init__.py +32 -0
- bagelquant_data/cli/main.py +30 -0
- bagelquant_data/core/__init__.py +37 -0
- bagelquant_data/core/dataset.py +170 -0
- bagelquant_data/core/deduplication.py +40 -0
- bagelquant_data/core/exceptions.py +39 -0
- bagelquant_data/core/hashing.py +32 -0
- bagelquant_data/core/normalization.py +67 -0
- bagelquant_data/core/partitioning.py +76 -0
- bagelquant_data/core/registry.py +67 -0
- bagelquant_data/core/request.py +21 -0
- bagelquant_data/core/source.py +38 -0
- bagelquant_data/core/types.py +10 -0
- bagelquant_data/core/validation.py +32 -0
- bagelquant_data/finance/__init__.py +80 -0
- bagelquant_data/finance/align.py +5 -0
- bagelquant_data/finance/fields.py +30 -0
- bagelquant_data/finance/flows.py +24 -0
- bagelquant_data/finance/periods.py +11 -0
- bagelquant_data/finance/point_in_time.py +27 -0
- bagelquant_data/finance/ratios.py +31 -0
- bagelquant_data/finance/rolling.py +34 -0
- bagelquant_data/finance/shares.py +28 -0
- bagelquant_data/finance/stocks.py +27 -0
- bagelquant_data/management/__init__.py +5 -0
- bagelquant_data/management/datasets.py +63 -0
- bagelquant_data/management/lake.py +190 -0
- bagelquant_data/management/sources.py +57 -0
- bagelquant_data/management/status.py +55 -0
- bagelquant_data/pipeline/__init__.py +6 -0
- bagelquant_data/pipeline/commit.py +175 -0
- bagelquant_data/pipeline/ingest.py +142 -0
- bagelquant_data/pipeline/update.py +351 -0
- bagelquant_data/query/__init__.py +65 -0
- bagelquant_data/query/field.py +96 -0
- bagelquant_data/query/filters.py +18 -0
- bagelquant_data/query/observations.py +45 -0
- bagelquant_data/query/raw.py +46 -0
- bagelquant_data/query/records.py +17 -0
- bagelquant_data/query/reference.py +18 -0
- bagelquant_data/query/scanner.py +13 -0
- bagelquant_data/sources/__init__.py +1 -0
- bagelquant_data/sources/tushare/__init__.py +5 -0
- bagelquant_data/sources/tushare/authentication.py +16 -0
- bagelquant_data/sources/tushare/client.py +20 -0
- bagelquant_data/sources/tushare/source.py +121 -0
- bagelquant_data/storage/__init__.py +7 -0
- bagelquant_data/storage/atomic.py +29 -0
- bagelquant_data/storage/metadata.py +410 -0
- bagelquant_data/storage/parquet.py +68 -0
- bagelquant_data/storage/paths.py +48 -0
- bagelquant_data/storage/rejected.py +30 -0
- bagelquant_data/storage/staging.py +28 -0
- bagelquant_data-0.1.0.dist-info/METADATA +74 -0
- bagelquant_data-0.1.0.dist-info/RECORD +57 -0
- bagelquant_data-0.1.0.dist-info/WHEEL +4 -0
- bagelquant_data-0.1.0.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""Source-agnostic data lake framework for BagelQuant research."""
|
|
2
|
+
|
|
3
|
+
from bagelquant_data.core import (
|
|
4
|
+
BagelQuantDataError,
|
|
5
|
+
DataSource,
|
|
6
|
+
DatasetNotFoundError,
|
|
7
|
+
DatasetSpec,
|
|
8
|
+
DatasetSpecError,
|
|
9
|
+
DuplicateResolutionError,
|
|
10
|
+
SourceNotFoundError,
|
|
11
|
+
ValidationError,
|
|
12
|
+
stable_bucket,
|
|
13
|
+
)
|
|
14
|
+
from bagelquant_data.finance import FinancialFieldKind, FinancialFieldSpec
|
|
15
|
+
from bagelquant_data.management import DataLake
|
|
16
|
+
from bagelquant_data.sources.tushare import TushareSource
|
|
17
|
+
|
|
18
|
+
__all__ = [
|
|
19
|
+
"BagelQuantDataError",
|
|
20
|
+
"DataLake",
|
|
21
|
+
"DataSource",
|
|
22
|
+
"DatasetNotFoundError",
|
|
23
|
+
"DatasetSpec",
|
|
24
|
+
"DatasetSpecError",
|
|
25
|
+
"DuplicateResolutionError",
|
|
26
|
+
"FinancialFieldKind",
|
|
27
|
+
"FinancialFieldSpec",
|
|
28
|
+
"SourceNotFoundError",
|
|
29
|
+
"TushareSource",
|
|
30
|
+
"ValidationError",
|
|
31
|
+
"stable_bucket",
|
|
32
|
+
]
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""Thin CLI for the Python-first data lake API."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
|
|
7
|
+
from bagelquant_data import DataLake
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def main(argv: list[str] | None = None) -> int:
    """Run the ``bagelquant-data`` command-line interface.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read the
            process arguments.

    Returns:
        ``0`` once the selected sub-command has printed its result.
    """
    cli = argparse.ArgumentParser(prog="bagelquant-data")
    cli.add_argument("--root", default="data")
    commands = cli.add_subparsers(dest="command", required=True)
    commands.add_parser("status")
    dataset_list = commands.add_parser("dataset-list")
    dataset_list.add_argument("--source")
    commands.add_parser("source-list")
    options = cli.parse_args(argv)

    lake = DataLake.open(options.root)
    # Handlers are lazy so that only the selected sub-command touches the lake.
    handlers = {
        "status": lambda: lake.status.summary(),
        "dataset-list": lambda: lake.datasets.list(options.source),
        "source-list": lambda: lake.sources.list(),
    }
    handler = handlers.get(options.command)
    if handler is not None:
        print(handler())
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""Core source-agnostic framework primitives."""
|
|
2
|
+
|
|
3
|
+
from bagelquant_data.core.dataset import DatasetSpec
|
|
4
|
+
from bagelquant_data.core.exceptions import (
|
|
5
|
+
BagelQuantDataError,
|
|
6
|
+
ConfigurationError,
|
|
7
|
+
DatasetNotFoundError,
|
|
8
|
+
DatasetSpecError,
|
|
9
|
+
DestructiveOperationError,
|
|
10
|
+
DuplicateResolutionError,
|
|
11
|
+
SourceNotFoundError,
|
|
12
|
+
ValidationError,
|
|
13
|
+
)
|
|
14
|
+
from bagelquant_data.core.hashing import frame_content_hash, stable_bucket, stable_record_hash
|
|
15
|
+
from bagelquant_data.core.registry import FrameworkRegistries, Registry, default_registries
|
|
16
|
+
from bagelquant_data.core.request import RequestContext
|
|
17
|
+
from bagelquant_data.core.source import DataSource
|
|
18
|
+
|
|
19
|
+
__all__ = [
|
|
20
|
+
"BagelQuantDataError",
|
|
21
|
+
"ConfigurationError",
|
|
22
|
+
"DataSource",
|
|
23
|
+
"DatasetNotFoundError",
|
|
24
|
+
"DatasetSpec",
|
|
25
|
+
"DatasetSpecError",
|
|
26
|
+
"DestructiveOperationError",
|
|
27
|
+
"DuplicateResolutionError",
|
|
28
|
+
"FrameworkRegistries",
|
|
29
|
+
"Registry",
|
|
30
|
+
"RequestContext",
|
|
31
|
+
"SourceNotFoundError",
|
|
32
|
+
"ValidationError",
|
|
33
|
+
"default_registries",
|
|
34
|
+
"frame_content_hash",
|
|
35
|
+
"stable_bucket",
|
|
36
|
+
"stable_record_hash",
|
|
37
|
+
]
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
"""Dataset specification model."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from bagelquant_data.core.exceptions import DatasetSpecError
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass(frozen=True, slots=True)
|
|
13
|
+
class DatasetSpec:
|
|
14
|
+
"""Declarative canonical dataset behavior."""
|
|
15
|
+
|
|
16
|
+
name: str
|
|
17
|
+
source: str
|
|
18
|
+
source_dataset: str
|
|
19
|
+
category: str
|
|
20
|
+
field_mapping: dict[str, str]
|
|
21
|
+
required_columns: tuple[str, ...]
|
|
22
|
+
primary_key: tuple[str, ...] | None = None
|
|
23
|
+
business_key: tuple[str, ...] | None = None
|
|
24
|
+
asset_column: str | None = None
|
|
25
|
+
time_column: str | None = None
|
|
26
|
+
period_column: str | None = None
|
|
27
|
+
request_planner: str = "snapshot"
|
|
28
|
+
request_options: dict[str, Any] = field(default_factory=dict)
|
|
29
|
+
normalizer: str = "standard"
|
|
30
|
+
deduplication: str = "exact_record_hash"
|
|
31
|
+
partition_strategy: str = "single_file"
|
|
32
|
+
partition_options: dict[str, Any] = field(default_factory=dict)
|
|
33
|
+
update_mode: str = "upsert"
|
|
34
|
+
sort_columns: tuple[str, ...] = ()
|
|
35
|
+
point_in_time: bool = False
|
|
36
|
+
reference: bool = False
|
|
37
|
+
enabled: bool = True
|
|
38
|
+
|
|
39
|
+
@classmethod
|
|
40
|
+
def from_mapping(cls, value: dict[str, Any]) -> "DatasetSpec":
|
|
41
|
+
"""Build a specification from parsed config data."""
|
|
42
|
+
|
|
43
|
+
required = ("name", "source", "source_dataset", "category")
|
|
44
|
+
missing = [key for key in required if key not in value]
|
|
45
|
+
if missing:
|
|
46
|
+
raise DatasetSpecError(f"Dataset spec missing required keys: {missing}")
|
|
47
|
+
return cls(
|
|
48
|
+
name=str(value["name"]),
|
|
49
|
+
source=str(value["source"]),
|
|
50
|
+
source_dataset=str(value["source_dataset"]),
|
|
51
|
+
category=str(value["category"]),
|
|
52
|
+
field_mapping=dict(value.get("field_mapping") or {}),
|
|
53
|
+
required_columns=_tuple(value.get("required_columns")),
|
|
54
|
+
primary_key=_optional_tuple(value.get("primary_key")),
|
|
55
|
+
business_key=_optional_tuple(value.get("business_key")),
|
|
56
|
+
asset_column=_optional_str(value.get("asset_column")),
|
|
57
|
+
time_column=_optional_str(value.get("time_column")),
|
|
58
|
+
period_column=_optional_str(value.get("period_column")),
|
|
59
|
+
request_planner=str(value.get("request_planner") or "snapshot"),
|
|
60
|
+
request_options=dict(value.get("request_options") or {}),
|
|
61
|
+
normalizer=str(value.get("normalizer") or "standard"),
|
|
62
|
+
deduplication=str(value.get("deduplication") or "exact_record_hash"),
|
|
63
|
+
partition_strategy=str(value.get("partition_strategy") or "single_file"),
|
|
64
|
+
partition_options=dict(value.get("partition_options") or {}),
|
|
65
|
+
update_mode=str(value.get("update_mode") or "upsert"),
|
|
66
|
+
sort_columns=_tuple(value.get("sort_columns")),
|
|
67
|
+
point_in_time=bool(value.get("point_in_time", False)),
|
|
68
|
+
reference=bool(value.get("reference", False)),
|
|
69
|
+
enabled=bool(value.get("enabled", True)),
|
|
70
|
+
)
|
|
71
|
+
|
|
72
|
+
@classmethod
|
|
73
|
+
def from_yaml(cls, path: str | Path) -> "DatasetSpec":
|
|
74
|
+
"""Load a dataset spec from a small YAML file.
|
|
75
|
+
|
|
76
|
+
The project intentionally avoids a YAML runtime dependency. This parser
|
|
77
|
+
supports the simple mappings and lists used by bundled dataset specs.
|
|
78
|
+
"""
|
|
79
|
+
|
|
80
|
+
return cls.from_mapping(_parse_simple_yaml(Path(path).read_text()))
|
|
81
|
+
|
|
82
|
+
@property
|
|
83
|
+
def key(self) -> tuple[str, str]:
|
|
84
|
+
"""Return the `(source, name)` lookup key."""
|
|
85
|
+
|
|
86
|
+
return (self.source, self.name)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _tuple(value: Any) -> tuple[str, ...]:
|
|
90
|
+
if value is None:
|
|
91
|
+
return ()
|
|
92
|
+
if isinstance(value, str):
|
|
93
|
+
return (value,)
|
|
94
|
+
return tuple(str(item) for item in value)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _optional_tuple(value: Any) -> tuple[str, ...] | None:
    """Coerce like ``_tuple`` but report an empty result as ``None``."""
    items = _tuple(value)
    if not items:
        return None
    return items
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _optional_str(value: Any) -> str | None:
|
|
103
|
+
return None if value is None else str(value)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _parse_simple_yaml(text: str) -> dict[str, Any]:
|
|
107
|
+
rows = [line.rstrip() for line in text.splitlines()]
|
|
108
|
+
root: dict[str, Any] = {}
|
|
109
|
+
stack: list[tuple[int, dict[str, Any] | list[Any]]] = [(-1, root)]
|
|
110
|
+
for raw in rows:
|
|
111
|
+
if not raw.strip() or raw.lstrip().startswith("#"):
|
|
112
|
+
continue
|
|
113
|
+
indent = len(raw) - len(raw.lstrip(" "))
|
|
114
|
+
line = raw.strip()
|
|
115
|
+
while stack and indent <= stack[-1][0]:
|
|
116
|
+
stack.pop()
|
|
117
|
+
parent = stack[-1][1]
|
|
118
|
+
if line.startswith("- "):
|
|
119
|
+
if not isinstance(parent, list):
|
|
120
|
+
raise DatasetSpecError(f"Unsupported YAML list location: {line}")
|
|
121
|
+
parent.append(_yaml_scalar(line[2:]))
|
|
122
|
+
continue
|
|
123
|
+
key, sep, value = line.partition(":")
|
|
124
|
+
if not sep:
|
|
125
|
+
raise DatasetSpecError(f"Unsupported YAML line: {line}")
|
|
126
|
+
key = key.strip()
|
|
127
|
+
value = value.strip()
|
|
128
|
+
if value == "":
|
|
129
|
+
container: dict[str, Any] | list[Any]
|
|
130
|
+
next_list = _next_content_is_list(rows, raw)
|
|
131
|
+
container = [] if next_list else {}
|
|
132
|
+
if isinstance(parent, dict):
|
|
133
|
+
parent[key] = container
|
|
134
|
+
else:
|
|
135
|
+
raise DatasetSpecError(f"Unsupported nested YAML key: {key}")
|
|
136
|
+
stack.append((indent, container))
|
|
137
|
+
elif isinstance(parent, dict):
|
|
138
|
+
parent[key] = _yaml_scalar(value)
|
|
139
|
+
return root
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def _next_content_is_list(rows: list[str], current: str) -> bool:
|
|
143
|
+
index = rows.index(current)
|
|
144
|
+
current_indent = len(current) - len(current.lstrip(" "))
|
|
145
|
+
for row in rows[index + 1 :]:
|
|
146
|
+
if not row.strip() or row.lstrip().startswith("#"):
|
|
147
|
+
continue
|
|
148
|
+
indent = len(row) - len(row.lstrip(" "))
|
|
149
|
+
return indent > current_indent and row.strip().startswith("- ")
|
|
150
|
+
return False
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def _yaml_scalar(value: str) -> Any:
|
|
154
|
+
if value in {"true", "True"}:
|
|
155
|
+
return True
|
|
156
|
+
if value in {"false", "False"}:
|
|
157
|
+
return False
|
|
158
|
+
if value in {"null", "None", "~"}:
|
|
159
|
+
return None
|
|
160
|
+
if value.startswith("[") and value.endswith("]"):
|
|
161
|
+
inner = value[1:-1].strip()
|
|
162
|
+
return [] if not inner else [_yaml_scalar(part.strip()) for part in inner.split(",")]
|
|
163
|
+
if (value.startswith('"') and value.endswith('"')) or (
|
|
164
|
+
value.startswith("'") and value.endswith("'")
|
|
165
|
+
):
|
|
166
|
+
return value[1:-1]
|
|
167
|
+
try:
|
|
168
|
+
return int(value)
|
|
169
|
+
except ValueError:
|
|
170
|
+
return value
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""Deduplication strategies."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Protocol
|
|
6
|
+
|
|
7
|
+
import polars as pl
|
|
8
|
+
|
|
9
|
+
from bagelquant_data.core.dataset import DatasetSpec
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class DeduplicationStrategy(Protocol):
    """Structural interface implemented by dataset deduplication plugins."""

    def apply(self, frame: pl.LazyFrame, spec: DatasetSpec) -> pl.LazyFrame:
        """Return ``frame`` with duplicates removed as configured by ``spec``."""
        ...
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class NoDeduplication:
    """Pass-through strategy: records are never dropped."""

    def apply(self, frame: pl.LazyFrame, spec: DatasetSpec) -> pl.LazyFrame:
        """Return ``frame`` untouched; ``spec`` is not consulted."""
        return frame
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class ExactRecordHashDeduplication:
    """Remove rows that are identical across every column."""

    def apply(self, frame: pl.LazyFrame, spec: DatasetSpec) -> pl.LazyFrame:
        """Return the distinct rows of ``frame`` in their original order."""
        deduplicated = frame.unique(maintain_order=True)
        return deduplicated
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class PrimaryKeyLastDeduplication:
    """Retain only the final row seen for every primary key."""

    def apply(self, frame: pl.LazyFrame, spec: DatasetSpec) -> pl.LazyFrame:
        """Deduplicate on ``spec.primary_key``, keeping the last occurrence.

        When the spec declares no primary key, fall back to dropping rows
        that are identical across all columns. Row order is preserved.
        """
        key_columns = list(spec.primary_key or ())
        if key_columns:
            return frame.unique(subset=key_columns, keep="last", maintain_order=True)
        return frame.unique(maintain_order=True)
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Framework exceptions."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class BagelQuantDataError(Exception):
    """Root of every exception raised by this package."""


class ConfigurationError(BagelQuantDataError):
    """Raised when configuration is invalid or incomplete."""


class DatasetSpecError(ConfigurationError):
    """Raised when a dataset specification is invalid."""


class DatasetNotFoundError(BagelQuantDataError):
    """Raised when a dataset is not registered or has no canonical data."""


class SourceNotFoundError(BagelQuantDataError):
    """Raised when the requested source is not registered."""


class DataSourceError(BagelQuantDataError):
    """Raised when a source adapter fails."""


class ValidationError(BagelQuantDataError):
    """Raised when data fails validation."""


class DuplicateResolutionError(BagelQuantDataError):
    """Raised when a single-value panel would need unresolved duplicates resolved."""


class DestructiveOperationError(BagelQuantDataError):
    """Raised when a destructive operation lacks explicit confirmation."""
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""Stable hashing helpers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
import json
|
|
7
|
+
from collections.abc import Iterable
|
|
8
|
+
|
|
9
|
+
import polars as pl
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def stable_bucket(asset_id: str, bucket_count: int) -> int:
    """Return a deterministic asset bucket.

    The bucket is the 8-byte BLAKE2b digest of the UTF-8 encoded
    ``asset_id``, read as a big-endian integer, modulo ``bucket_count``.

    Args:
        asset_id: Identifier to place into a bucket.
        bucket_count: Number of buckets; must be positive.

    Returns:
        An integer in ``range(bucket_count)``.

    Raises:
        ValueError: If ``bucket_count`` is zero or negative.
    """

    # Zero would surface as an opaque ZeroDivisionError and a negative count
    # would yield non-positive bucket numbers, so reject both up front.
    if bucket_count <= 0:
        raise ValueError(f"bucket_count must be a positive integer, got {bucket_count!r}")
    digest = hashlib.blake2b(asset_id.encode("utf-8"), digest_size=8).digest()
    return int.from_bytes(digest, byteorder="big") % bucket_count
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def stable_record_hash(values: dict[str, object]) -> str:
    """Return a hex digest of ``values`` that ignores key order.

    The record is serialized as compact JSON with sorted keys (values JSON
    cannot encode are converted with ``str``) and hashed with 16-byte BLAKE2b.
    """

    encoded = json.dumps(
        values,
        separators=(",", ":"),
        default=str,
        sort_keys=True,
    ).encode("utf-8")
    hasher = hashlib.blake2b(digest_size=16)
    hasher.update(encoded)
    return hasher.hexdigest()
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def frame_content_hash(frame: pl.DataFrame, columns: Iterable[str] | None = None) -> str:
    """Return a hex digest of a dataframe's rows that ignores row order.

    Args:
        frame: Data to hash.
        columns: Columns to include; when omitted or empty, every column of
            ``frame`` is used.

    Returns:
        The 16-byte BLAKE2b hex digest of the selected columns, sorted by
        those same columns and serialized as compact, key-sorted JSON.
    """

    chosen = list(columns or frame.columns)
    ordered = frame.select(chosen).sort(chosen)
    encoded = json.dumps(
        ordered.to_dicts(),
        sort_keys=True,
        default=str,
        separators=(",", ":"),
    ).encode("utf-8")
    return hashlib.blake2b(encoded, digest_size=16).hexdigest()
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""Canonical normalization."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Protocol
|
|
7
|
+
|
|
8
|
+
import polars as pl
|
|
9
|
+
|
|
10
|
+
from bagelquant_data.core.dataset import DatasetSpec
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass(frozen=True, slots=True)
|
|
14
|
+
class NormalizeContext:
|
|
15
|
+
"""Normalization context."""
|
|
16
|
+
|
|
17
|
+
source: str
|
|
18
|
+
dataset: str
|
|
19
|
+
run_id: str | None = None
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass(frozen=True, slots=True)
|
|
23
|
+
class NormalizeResult:
|
|
24
|
+
"""Accepted and rejected normalized records."""
|
|
25
|
+
|
|
26
|
+
accepted: pl.LazyFrame
|
|
27
|
+
rejected: pl.LazyFrame
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class Normalizer(Protocol):
    """Structural interface implemented by dataset normalizers."""

    def normalize(
        self,
        frame: pl.LazyFrame,
        spec: DatasetSpec,
        context: NormalizeContext,
    ) -> NormalizeResult:
        """Turn source rows into accepted and rejected normalized rows."""
        ...
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class StandardNormalizer:
    """Map configured source fields into canonical columns."""

    def normalize(
        self, frame: pl.LazyFrame, spec: DatasetSpec, context: NormalizeContext
    ) -> NormalizeResult:
        """Rename source columns and add the canonical helper columns.

        Columns are renamed with ``spec.field_mapping``. ``source`` and
        ``source_dataset`` are always added as literals. ``asset_id``,
        ``time`` and ``period`` are derived from the columns named by
        ``spec.asset_column``, ``spec.time_column`` and ``spec.period_column``
        when such a column exists after renaming.

        Args:
            frame: Lazily evaluated source rows.
            spec: Dataset specification driving the mapping.
            context: Supplies the literal ``source`` value.

        Returns:
            A result whose ``accepted`` frame holds every row and whose
            ``rejected`` frame has the same columns but no rows.
        """
        lf = frame.rename(spec.field_mapping)
        # Resolve the lazy schema once and reuse it for all three lookups
        # instead of resolving it again for every optional column.
        available = set(lf.collect_schema().names())
        expressions: list[pl.Expr] = [
            pl.lit(context.source).alias("source"),
            pl.lit(spec.source_dataset).alias("source_dataset"),
        ]
        if spec.asset_column and spec.asset_column in available:
            expressions.append(pl.col(spec.asset_column).cast(pl.String).alias("asset_id"))
        if spec.time_column and spec.time_column in available:
            expressions.append(_date_expr(spec.time_column).alias("time"))
        if spec.period_column and spec.period_column in available:
            expressions.append(_date_expr(spec.period_column).alias("period"))
        accepted = lf.with_columns(expressions)
        # This normalizer rejects nothing; an always-false filter yields an
        # empty frame that still carries the accepted columns.
        rejected = accepted.filter(pl.lit(False))
        return NormalizeResult(accepted=accepted, rejected=rejected)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _date_expr(column: str) -> pl.Expr:
    """Build an expression converting ``column`` to dates.

    Values whose string form is exactly eight characters are parsed as
    ``%Y%m%d``; everything else goes through a non-strict cast to ``pl.Date``.
    """
    source = pl.col(column)
    as_text = source.cast(pl.String)
    is_compact = as_text.str.len_chars() == 8
    parsed_compact = as_text.str.strptime(pl.Date, "%Y%m%d", strict=False)
    fallback = source.cast(pl.Date, strict=False)
    return pl.when(is_compact).then(parsed_compact).otherwise(fallback)
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""Partition strategies."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Protocol
|
|
7
|
+
|
|
8
|
+
import polars as pl
|
|
9
|
+
|
|
10
|
+
from bagelquant_data.core.dataset import DatasetSpec
|
|
11
|
+
from bagelquant_data.core.hashing import stable_bucket
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class PartitionStrategy(Protocol):
    """Structural interface for deriving partition values and file paths."""

    def derive_columns(self, frame: pl.LazyFrame, spec: DatasetSpec) -> pl.LazyFrame:
        """Return ``frame`` extended with the partition columns."""
        ...

    def paths_for_query(self, spec: DatasetSpec, query: object) -> list[Path]:
        """Return the partition paths that may hold rows for ``query``."""
        ...

    def path_for_values(self, spec: DatasetSpec, values: dict[str, object]) -> Path:
        """Return the partition path matching the given partition values."""
        ...
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class SingleFilePartition:
    """Unpartitioned layout: the whole dataset lives in ``data.parquet``."""

    def derive_columns(self, frame: pl.LazyFrame, spec: DatasetSpec) -> pl.LazyFrame:
        """Return ``frame`` unchanged; no partition columns are needed."""
        return frame

    def paths_for_query(self, spec: DatasetSpec, query: object) -> list[Path]:
        """Return the single data file regardless of ``query``."""
        only_file = Path("data.parquet")
        return [only_file]

    def path_for_values(self, spec: DatasetSpec, values: dict[str, object]) -> Path:
        """Return the single data file regardless of ``values``."""
        only_file = Path("data.parquet")
        return only_file
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class YearMonthPartition:
    """Layout partitioned by the year and month of the ``time`` column."""

    def derive_columns(self, frame: pl.LazyFrame, spec: DatasetSpec) -> pl.LazyFrame:
        """Add ``year`` (Int16) and ``month`` (Int8) derived from ``time``."""
        timestamp = pl.col("time")
        year = timestamp.dt.year().cast(pl.Int16).alias("year")
        month = timestamp.dt.month().cast(pl.Int8).alias("month")
        return frame.with_columns(year, month)

    def paths_for_query(self, spec: DatasetSpec, query: object) -> list[Path]:
        """Return no candidate paths (always an empty list)."""
        return []

    def path_for_values(self, spec: DatasetSpec, values: dict[str, object]) -> Path:
        """Return ``year=<year>/month=<MM>/data.parquet`` for ``values``.

        The month is zero-padded to two digits.
        """
        month = int(str(values["month"]))
        return Path(f"year={values['year']}", f"month={month:02d}", "data.parquet")
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class YearBucketPartition:
    """Layout partitioned by the year of ``time`` and a stable asset bucket."""

    def derive_columns(self, frame: pl.LazyFrame, spec: DatasetSpec) -> pl.LazyFrame:
        """Add ``year`` (Int16) and ``bucket`` (Int16) columns.

        The bucket comes from ``stable_bucket`` applied to ``asset_id``; the
        number of buckets is ``spec.partition_options["bucket_count"]``,
        defaulting to 32.
        """
        bucket_count = int(spec.partition_options.get("bucket_count", 32))

        def bucket_of(asset_id: str) -> int:
            return stable_bucket(asset_id, bucket_count)

        year = pl.col("time").dt.year().cast(pl.Int16).alias("year")
        bucket = (
            pl.col("asset_id")
            .cast(pl.String)
            .map_elements(bucket_of, return_dtype=pl.Int16)
            .alias("bucket")
        )
        return frame.with_columns(year, bucket)

    def paths_for_query(self, spec: DatasetSpec, query: object) -> list[Path]:
        """Return no candidate paths (always an empty list)."""
        return []

    def path_for_values(self, spec: DatasetSpec, values: dict[str, object]) -> Path:
        """Return ``year=<year>/bucket=<BB>/data.parquet`` for ``values``.

        The bucket is zero-padded to two digits.
        """
        bucket = int(str(values["bucket"]))
        return Path(f"year={values['year']}", f"bucket={bucket:02d}", "data.parquet")
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""Plugin registries."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from typing import Generic, TypeVar
|
|
7
|
+
|
|
8
|
+
from bagelquant_data.core.deduplication import (
|
|
9
|
+
ExactRecordHashDeduplication,
|
|
10
|
+
NoDeduplication,
|
|
11
|
+
PrimaryKeyLastDeduplication,
|
|
12
|
+
)
|
|
13
|
+
from bagelquant_data.core.normalization import StandardNormalizer
|
|
14
|
+
from bagelquant_data.core.partitioning import (
|
|
15
|
+
SingleFilePartition,
|
|
16
|
+
YearBucketPartition,
|
|
17
|
+
YearMonthPartition,
|
|
18
|
+
)
|
|
19
|
+
from bagelquant_data.core.validation import FrameworkValidator
|
|
20
|
+
|
|
21
|
+
T = TypeVar("T")


@dataclass
class Registry(Generic[T]):
    """Mapping from plugin names to registered objects."""

    # Backing store; later registrations under the same name overwrite.
    _items: dict[str, T] = field(default_factory=dict)

    def register(self, name: str, value: T) -> None:
        """Store ``value`` under ``name``, replacing any previous entry."""
        self._items[name] = value

    def get(self, name: str) -> T:
        """Return the object registered under ``name``.

        Raises:
            KeyError: If nothing is registered under ``name``.
        """
        try:
            item = self._items[name]
        except KeyError as exc:
            raise KeyError(f"Unknown registry item: {name}") from exc
        return item

    def list(self) -> tuple[str, ...]:
        """Return every registered name in sorted order."""
        names = sorted(self._items.keys())
        return tuple(names)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _object_registry() -> Registry[object]:
    """Create one empty registry; used as the default for every field below."""
    return Registry[object]()


@dataclass
class FrameworkRegistries:
    """Bundle of every extension registry the framework consults."""

    # Each field starts as its own empty registry.
    sources: Registry[object] = field(default_factory=_object_registry)
    normalizers: Registry[object] = field(default_factory=_object_registry)
    validators: Registry[object] = field(default_factory=_object_registry)
    partition_strategies: Registry[object] = field(default_factory=_object_registry)
    deduplication_strategies: Registry[object] = field(default_factory=_object_registry)
    financial_fields: Registry[object] = field(default_factory=_object_registry)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def default_registries() -> FrameworkRegistries:
    """Build a fresh set of registries preloaded with the built-in plugins.

    Registers the ``standard`` normalizer, the ``framework`` validator, the
    ``single_file`` / ``year_month`` / ``year_bucket`` partition strategies
    and the ``none`` / ``exact_record_hash`` / ``primary_key_last``
    deduplication strategies. Sources and financial fields start empty.
    """

    registries = FrameworkRegistries()
    registries.normalizers.register("standard", StandardNormalizer())
    registries.validators.register("framework", FrameworkValidator())

    builtin_partitions = (
        ("single_file", SingleFilePartition()),
        ("year_month", YearMonthPartition()),
        ("year_bucket", YearBucketPartition()),
    )
    for name, strategy in builtin_partitions:
        registries.partition_strategies.register(name, strategy)

    builtin_deduplication = (
        ("none", NoDeduplication()),
        ("exact_record_hash", ExactRecordHashDeduplication()),
        ("primary_key_last", PrimaryKeyLastDeduplication()),
    )
    for name, strategy in builtin_deduplication:
        registries.deduplication_strategies.register(name, strategy)

    return registries
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Request planning models."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections.abc import Mapping, Sequence
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from bagelquant_data.core.types import DateLike
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass(frozen=True, slots=True)
|
|
13
|
+
class RequestContext:
|
|
14
|
+
"""Context passed to request planners and source adapters."""
|
|
15
|
+
|
|
16
|
+
source: str
|
|
17
|
+
dataset: str
|
|
18
|
+
start: DateLike | None = None
|
|
19
|
+
end: DateLike | None = None
|
|
20
|
+
assets: Sequence[str] | None = None
|
|
21
|
+
options: Mapping[str, Any] = field(default_factory=dict)
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""Source adapter protocol."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections.abc import Iterable, Mapping
|
|
6
|
+
from typing import Any, Protocol
|
|
7
|
+
|
|
8
|
+
import polars as pl
|
|
9
|
+
|
|
10
|
+
from bagelquant_data.core.dataset import DatasetSpec
|
|
11
|
+
from bagelquant_data.core.request import RequestContext
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class DataSource(Protocol):
    """Structural interface every external source adapter satisfies."""

    @property
    def name(self) -> str:
        """Name identifying the source."""
        ...

    def configure(self, **options: Any) -> None:
        """Apply credentials and runtime options."""
        ...

    def test_connection(self) -> None:
        """Raise if the source cannot be reached."""
        ...

    def fetch(self, source_dataset: str, request: Mapping[str, Any]) -> pl.DataFrame:
        """Fetch a single source response for ``request``."""
        ...

    def plan_requests(
        self,
        dataset: DatasetSpec,
        context: RequestContext,
    ) -> Iterable[Mapping[str, Any]]:
        """Produce the source requests needed for ``dataset``."""
        ...
|