bagelquant-data 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bagelquant_data-0.1.0/PKG-INFO +74 -0
- bagelquant_data-0.1.0/README.md +59 -0
- bagelquant_data-0.1.0/pyproject.toml +38 -0
- bagelquant_data-0.1.0/src/bagelquant_data/__init__.py +32 -0
- bagelquant_data-0.1.0/src/bagelquant_data/cli/main.py +30 -0
- bagelquant_data-0.1.0/src/bagelquant_data/core/__init__.py +37 -0
- bagelquant_data-0.1.0/src/bagelquant_data/core/dataset.py +170 -0
- bagelquant_data-0.1.0/src/bagelquant_data/core/deduplication.py +40 -0
- bagelquant_data-0.1.0/src/bagelquant_data/core/exceptions.py +39 -0
- bagelquant_data-0.1.0/src/bagelquant_data/core/hashing.py +32 -0
- bagelquant_data-0.1.0/src/bagelquant_data/core/normalization.py +67 -0
- bagelquant_data-0.1.0/src/bagelquant_data/core/partitioning.py +76 -0
- bagelquant_data-0.1.0/src/bagelquant_data/core/registry.py +67 -0
- bagelquant_data-0.1.0/src/bagelquant_data/core/request.py +21 -0
- bagelquant_data-0.1.0/src/bagelquant_data/core/source.py +38 -0
- bagelquant_data-0.1.0/src/bagelquant_data/core/types.py +10 -0
- bagelquant_data-0.1.0/src/bagelquant_data/core/validation.py +32 -0
- bagelquant_data-0.1.0/src/bagelquant_data/finance/__init__.py +80 -0
- bagelquant_data-0.1.0/src/bagelquant_data/finance/align.py +5 -0
- bagelquant_data-0.1.0/src/bagelquant_data/finance/fields.py +30 -0
- bagelquant_data-0.1.0/src/bagelquant_data/finance/flows.py +24 -0
- bagelquant_data-0.1.0/src/bagelquant_data/finance/periods.py +11 -0
- bagelquant_data-0.1.0/src/bagelquant_data/finance/point_in_time.py +27 -0
- bagelquant_data-0.1.0/src/bagelquant_data/finance/ratios.py +31 -0
- bagelquant_data-0.1.0/src/bagelquant_data/finance/rolling.py +34 -0
- bagelquant_data-0.1.0/src/bagelquant_data/finance/shares.py +28 -0
- bagelquant_data-0.1.0/src/bagelquant_data/finance/stocks.py +27 -0
- bagelquant_data-0.1.0/src/bagelquant_data/management/__init__.py +5 -0
- bagelquant_data-0.1.0/src/bagelquant_data/management/datasets.py +63 -0
- bagelquant_data-0.1.0/src/bagelquant_data/management/lake.py +190 -0
- bagelquant_data-0.1.0/src/bagelquant_data/management/sources.py +57 -0
- bagelquant_data-0.1.0/src/bagelquant_data/management/status.py +55 -0
- bagelquant_data-0.1.0/src/bagelquant_data/pipeline/__init__.py +6 -0
- bagelquant_data-0.1.0/src/bagelquant_data/pipeline/commit.py +175 -0
- bagelquant_data-0.1.0/src/bagelquant_data/pipeline/ingest.py +142 -0
- bagelquant_data-0.1.0/src/bagelquant_data/pipeline/update.py +351 -0
- bagelquant_data-0.1.0/src/bagelquant_data/query/__init__.py +65 -0
- bagelquant_data-0.1.0/src/bagelquant_data/query/field.py +96 -0
- bagelquant_data-0.1.0/src/bagelquant_data/query/filters.py +18 -0
- bagelquant_data-0.1.0/src/bagelquant_data/query/observations.py +45 -0
- bagelquant_data-0.1.0/src/bagelquant_data/query/raw.py +46 -0
- bagelquant_data-0.1.0/src/bagelquant_data/query/records.py +17 -0
- bagelquant_data-0.1.0/src/bagelquant_data/query/reference.py +18 -0
- bagelquant_data-0.1.0/src/bagelquant_data/query/scanner.py +13 -0
- bagelquant_data-0.1.0/src/bagelquant_data/sources/__init__.py +1 -0
- bagelquant_data-0.1.0/src/bagelquant_data/sources/tushare/__init__.py +5 -0
- bagelquant_data-0.1.0/src/bagelquant_data/sources/tushare/authentication.py +16 -0
- bagelquant_data-0.1.0/src/bagelquant_data/sources/tushare/client.py +20 -0
- bagelquant_data-0.1.0/src/bagelquant_data/sources/tushare/source.py +121 -0
- bagelquant_data-0.1.0/src/bagelquant_data/storage/__init__.py +7 -0
- bagelquant_data-0.1.0/src/bagelquant_data/storage/atomic.py +29 -0
- bagelquant_data-0.1.0/src/bagelquant_data/storage/metadata.py +410 -0
- bagelquant_data-0.1.0/src/bagelquant_data/storage/parquet.py +68 -0
- bagelquant_data-0.1.0/src/bagelquant_data/storage/paths.py +48 -0
- bagelquant_data-0.1.0/src/bagelquant_data/storage/rejected.py +30 -0
- bagelquant_data-0.1.0/src/bagelquant_data/storage/staging.py +28 -0
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: bagelquant-data
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Unified data layer for the BagelQuant ecosystem
|
|
5
|
+
Author: BagelQuant
|
|
6
|
+
License: Apache-2.0
|
|
7
|
+
Requires-Dist: polars>=1.35.0
|
|
8
|
+
Requires-Dist: pyarrow>=22.0.0
|
|
9
|
+
Requires-Dist: tqdm>=4.67.0
|
|
10
|
+
Requires-Dist: pandas>=3.0.0 ; extra == 'tushare'
|
|
11
|
+
Requires-Dist: tushare>=1.4.21 ; extra == 'tushare'
|
|
12
|
+
Requires-Python: >=3.13
|
|
13
|
+
Provides-Extra: tushare
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
|
|
16
|
+
# BagelQuant Data
|
|
17
|
+
|
|
18
|
+
`bagelquant-data` is a Polars-native, source-agnostic data lake framework for
|
|
19
|
+
quantitative research.
|
|
20
|
+
|
|
21
|
+
- Polars is the dataframe engine.
|
|
22
|
+
- Parquet is the canonical analytical storage format.
|
|
23
|
+
- SQLite stores mutable metadata, manifests, run state, and source/dataset
|
|
24
|
+
registration.
|
|
25
|
+
- Tushare is implemented as the first source adapter under
|
|
26
|
+
`bagelquant_data.sources.tushare`.
|
|
27
|
+
- Non-reference research extraction returns one field at a time as
|
|
28
|
+
`time | asset_id | value`.
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
import polars as pl
|
|
32
|
+
|
|
33
|
+
from bagelquant_data import DataLake, DatasetSpec
|
|
34
|
+
|
|
35
|
+
lake = DataLake.open("data")
|
|
36
|
+
spec = DatasetSpec(
|
|
37
|
+
name="daily",
|
|
38
|
+
source="custom",
|
|
39
|
+
source_dataset="daily",
|
|
40
|
+
category="market",
|
|
41
|
+
field_mapping={"ts_code": "ts_code", "trade_date": "trade_date"},
|
|
42
|
+
required_columns=("asset_id", "time"),
|
|
43
|
+
primary_key=("asset_id", "time"),
|
|
44
|
+
asset_column="ts_code",
|
|
45
|
+
time_column="trade_date",
|
|
46
|
+
partition_strategy="year_month",
|
|
47
|
+
deduplication="primary_key_last",
|
|
48
|
+
sort_columns=("time", "asset_id"),
|
|
49
|
+
)
|
|
50
|
+
lake.ingest_frame(
|
|
51
|
+
spec,
|
|
52
|
+
pl.DataFrame(
|
|
53
|
+
{
|
|
54
|
+
"trade_date": ["2024-01-02"],
|
|
55
|
+
"ts_code": ["000001.SZ"],
|
|
56
|
+
"close": [100.0],
|
|
57
|
+
}
|
|
58
|
+
),
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
close = lake.query.field("daily", "close", source="custom", collect=True)
|
|
62
|
+
print(close) # time, asset_id, close
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
Documentation is available in two languages:
|
|
66
|
+
|
|
67
|
+
- English: `docs/en/index.md`
|
|
68
|
+
- Chinese: `docs/cn/index.md`
|
|
69
|
+
|
|
70
|
+
## Development
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
uv run pytest
|
|
74
|
+
```
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# BagelQuant Data
|
|
2
|
+
|
|
3
|
+
`bagelquant-data` is a Polars-native, source-agnostic data lake framework for
|
|
4
|
+
quantitative research.
|
|
5
|
+
|
|
6
|
+
- Polars is the dataframe engine.
|
|
7
|
+
- Parquet is the canonical analytical storage format.
|
|
8
|
+
- SQLite stores mutable metadata, manifests, run state, and source/dataset
|
|
9
|
+
registration.
|
|
10
|
+
- Tushare is implemented as the first source adapter under
|
|
11
|
+
`bagelquant_data.sources.tushare`.
|
|
12
|
+
- Non-reference research extraction returns one field at a time as
|
|
13
|
+
`time | asset_id | value`.
|
|
14
|
+
|
|
15
|
+
```python
|
|
16
|
+
import polars as pl
|
|
17
|
+
|
|
18
|
+
from bagelquant_data import DataLake, DatasetSpec
|
|
19
|
+
|
|
20
|
+
lake = DataLake.open("data")
|
|
21
|
+
spec = DatasetSpec(
|
|
22
|
+
name="daily",
|
|
23
|
+
source="custom",
|
|
24
|
+
source_dataset="daily",
|
|
25
|
+
category="market",
|
|
26
|
+
field_mapping={"ts_code": "ts_code", "trade_date": "trade_date"},
|
|
27
|
+
required_columns=("asset_id", "time"),
|
|
28
|
+
primary_key=("asset_id", "time"),
|
|
29
|
+
asset_column="ts_code",
|
|
30
|
+
time_column="trade_date",
|
|
31
|
+
partition_strategy="year_month",
|
|
32
|
+
deduplication="primary_key_last",
|
|
33
|
+
sort_columns=("time", "asset_id"),
|
|
34
|
+
)
|
|
35
|
+
lake.ingest_frame(
|
|
36
|
+
spec,
|
|
37
|
+
pl.DataFrame(
|
|
38
|
+
{
|
|
39
|
+
"trade_date": ["2024-01-02"],
|
|
40
|
+
"ts_code": ["000001.SZ"],
|
|
41
|
+
"close": [100.0],
|
|
42
|
+
}
|
|
43
|
+
),
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
close = lake.query.field("daily", "close", source="custom", collect=True)
|
|
47
|
+
print(close) # time, asset_id, close
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
Documentation is available in two languages:
|
|
51
|
+
|
|
52
|
+
- English: `docs/en/index.md`
|
|
53
|
+
- Chinese: `docs/cn/index.md`
|
|
54
|
+
|
|
55
|
+
## Development
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
uv run pytest
|
|
59
|
+
```
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "bagelquant-data"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Unified data layer for the BagelQuant ecosystem"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.13"
|
|
7
|
+
license = { text = "Apache-2.0" }
|
|
8
|
+
authors = [{ name = "BagelQuant" }]
|
|
9
|
+
dependencies = [
|
|
10
|
+
"polars>=1.35.0",
|
|
11
|
+
"pyarrow>=22.0.0",
|
|
12
|
+
"tqdm>=4.67.0",
|
|
13
|
+
]
|
|
14
|
+
|
|
15
|
+
[project.scripts]
|
|
16
|
+
bagelquant-data = "bagelquant_data.cli.main:main"
|
|
17
|
+
|
|
18
|
+
[project.optional-dependencies]
|
|
19
|
+
tushare = [
|
|
20
|
+
"pandas>=3.0.0",
|
|
21
|
+
"tushare>=1.4.21",
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
[dependency-groups]
|
|
25
|
+
dev = [
|
|
26
|
+
"pandas>=3.0.0",
|
|
27
|
+
"pre-commit>=4.5.0",
|
|
28
|
+
"pyright>=1.1.410",
|
|
29
|
+
"pytest>=9.0.0",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
[build-system]
|
|
33
|
+
requires = ["uv_build>=0.8.0,<0.9.0"]
|
|
34
|
+
build-backend = "uv_build"
|
|
35
|
+
|
|
36
|
+
[tool.pytest.ini_options]
|
|
37
|
+
pythonpath = [".", "src"]
|
|
38
|
+
testpaths = ["tests"]
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""Source-agnostic data lake framework for BagelQuant research."""
|
|
2
|
+
|
|
3
|
+
from bagelquant_data.core import (
|
|
4
|
+
BagelQuantDataError,
|
|
5
|
+
DataSource,
|
|
6
|
+
DatasetNotFoundError,
|
|
7
|
+
DatasetSpec,
|
|
8
|
+
DatasetSpecError,
|
|
9
|
+
DuplicateResolutionError,
|
|
10
|
+
SourceNotFoundError,
|
|
11
|
+
ValidationError,
|
|
12
|
+
stable_bucket,
|
|
13
|
+
)
|
|
14
|
+
from bagelquant_data.finance import FinancialFieldKind, FinancialFieldSpec
|
|
15
|
+
from bagelquant_data.management import DataLake
|
|
16
|
+
from bagelquant_data.sources.tushare import TushareSource
|
|
17
|
+
|
|
18
|
+
__all__ = [
|
|
19
|
+
"BagelQuantDataError",
|
|
20
|
+
"DataLake",
|
|
21
|
+
"DataSource",
|
|
22
|
+
"DatasetNotFoundError",
|
|
23
|
+
"DatasetSpec",
|
|
24
|
+
"DatasetSpecError",
|
|
25
|
+
"DuplicateResolutionError",
|
|
26
|
+
"FinancialFieldKind",
|
|
27
|
+
"FinancialFieldSpec",
|
|
28
|
+
"SourceNotFoundError",
|
|
29
|
+
"TushareSource",
|
|
30
|
+
"ValidationError",
|
|
31
|
+
"stable_bucket",
|
|
32
|
+
]
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""Thin CLI for the Python-first data lake API."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
|
|
7
|
+
from bagelquant_data import DataLake
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def main(argv: list[str] | None = None) -> int:
|
|
11
|
+
parser = argparse.ArgumentParser(prog="bagelquant-data")
|
|
12
|
+
parser.add_argument("--root", default="data")
|
|
13
|
+
sub = parser.add_subparsers(dest="command", required=True)
|
|
14
|
+
sub.add_parser("status")
|
|
15
|
+
datasets = sub.add_parser("dataset-list")
|
|
16
|
+
datasets.add_argument("--source")
|
|
17
|
+
sub.add_parser("source-list")
|
|
18
|
+
args = parser.parse_args(argv)
|
|
19
|
+
lake = DataLake.open(args.root)
|
|
20
|
+
if args.command == "status":
|
|
21
|
+
print(lake.status.summary())
|
|
22
|
+
elif args.command == "dataset-list":
|
|
23
|
+
print(lake.datasets.list(args.source))
|
|
24
|
+
elif args.command == "source-list":
|
|
25
|
+
print(lake.sources.list())
|
|
26
|
+
return 0
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
if __name__ == "__main__":
|
|
30
|
+
raise SystemExit(main())
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""Core source-agnostic framework primitives."""
|
|
2
|
+
|
|
3
|
+
from bagelquant_data.core.dataset import DatasetSpec
|
|
4
|
+
from bagelquant_data.core.exceptions import (
|
|
5
|
+
BagelQuantDataError,
|
|
6
|
+
ConfigurationError,
|
|
7
|
+
DatasetNotFoundError,
|
|
8
|
+
DatasetSpecError,
|
|
9
|
+
DestructiveOperationError,
|
|
10
|
+
DuplicateResolutionError,
|
|
11
|
+
SourceNotFoundError,
|
|
12
|
+
ValidationError,
|
|
13
|
+
)
|
|
14
|
+
from bagelquant_data.core.hashing import frame_content_hash, stable_bucket, stable_record_hash
|
|
15
|
+
from bagelquant_data.core.registry import FrameworkRegistries, Registry, default_registries
|
|
16
|
+
from bagelquant_data.core.request import RequestContext
|
|
17
|
+
from bagelquant_data.core.source import DataSource
|
|
18
|
+
|
|
19
|
+
__all__ = [
|
|
20
|
+
"BagelQuantDataError",
|
|
21
|
+
"ConfigurationError",
|
|
22
|
+
"DataSource",
|
|
23
|
+
"DatasetNotFoundError",
|
|
24
|
+
"DatasetSpec",
|
|
25
|
+
"DatasetSpecError",
|
|
26
|
+
"DestructiveOperationError",
|
|
27
|
+
"DuplicateResolutionError",
|
|
28
|
+
"FrameworkRegistries",
|
|
29
|
+
"Registry",
|
|
30
|
+
"RequestContext",
|
|
31
|
+
"SourceNotFoundError",
|
|
32
|
+
"ValidationError",
|
|
33
|
+
"default_registries",
|
|
34
|
+
"frame_content_hash",
|
|
35
|
+
"stable_bucket",
|
|
36
|
+
"stable_record_hash",
|
|
37
|
+
]
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
"""Dataset specification model."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from bagelquant_data.core.exceptions import DatasetSpecError
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass(frozen=True, slots=True)
|
|
13
|
+
class DatasetSpec:
|
|
14
|
+
"""Declarative canonical dataset behavior."""
|
|
15
|
+
|
|
16
|
+
name: str
|
|
17
|
+
source: str
|
|
18
|
+
source_dataset: str
|
|
19
|
+
category: str
|
|
20
|
+
field_mapping: dict[str, str]
|
|
21
|
+
required_columns: tuple[str, ...]
|
|
22
|
+
primary_key: tuple[str, ...] | None = None
|
|
23
|
+
business_key: tuple[str, ...] | None = None
|
|
24
|
+
asset_column: str | None = None
|
|
25
|
+
time_column: str | None = None
|
|
26
|
+
period_column: str | None = None
|
|
27
|
+
request_planner: str = "snapshot"
|
|
28
|
+
request_options: dict[str, Any] = field(default_factory=dict)
|
|
29
|
+
normalizer: str = "standard"
|
|
30
|
+
deduplication: str = "exact_record_hash"
|
|
31
|
+
partition_strategy: str = "single_file"
|
|
32
|
+
partition_options: dict[str, Any] = field(default_factory=dict)
|
|
33
|
+
update_mode: str = "upsert"
|
|
34
|
+
sort_columns: tuple[str, ...] = ()
|
|
35
|
+
point_in_time: bool = False
|
|
36
|
+
reference: bool = False
|
|
37
|
+
enabled: bool = True
|
|
38
|
+
|
|
39
|
+
@classmethod
|
|
40
|
+
def from_mapping(cls, value: dict[str, Any]) -> "DatasetSpec":
|
|
41
|
+
"""Build a specification from parsed config data."""
|
|
42
|
+
|
|
43
|
+
required = ("name", "source", "source_dataset", "category")
|
|
44
|
+
missing = [key for key in required if key not in value]
|
|
45
|
+
if missing:
|
|
46
|
+
raise DatasetSpecError(f"Dataset spec missing required keys: {missing}")
|
|
47
|
+
return cls(
|
|
48
|
+
name=str(value["name"]),
|
|
49
|
+
source=str(value["source"]),
|
|
50
|
+
source_dataset=str(value["source_dataset"]),
|
|
51
|
+
category=str(value["category"]),
|
|
52
|
+
field_mapping=dict(value.get("field_mapping") or {}),
|
|
53
|
+
required_columns=_tuple(value.get("required_columns")),
|
|
54
|
+
primary_key=_optional_tuple(value.get("primary_key")),
|
|
55
|
+
business_key=_optional_tuple(value.get("business_key")),
|
|
56
|
+
asset_column=_optional_str(value.get("asset_column")),
|
|
57
|
+
time_column=_optional_str(value.get("time_column")),
|
|
58
|
+
period_column=_optional_str(value.get("period_column")),
|
|
59
|
+
request_planner=str(value.get("request_planner") or "snapshot"),
|
|
60
|
+
request_options=dict(value.get("request_options") or {}),
|
|
61
|
+
normalizer=str(value.get("normalizer") or "standard"),
|
|
62
|
+
deduplication=str(value.get("deduplication") or "exact_record_hash"),
|
|
63
|
+
partition_strategy=str(value.get("partition_strategy") or "single_file"),
|
|
64
|
+
partition_options=dict(value.get("partition_options") or {}),
|
|
65
|
+
update_mode=str(value.get("update_mode") or "upsert"),
|
|
66
|
+
sort_columns=_tuple(value.get("sort_columns")),
|
|
67
|
+
point_in_time=bool(value.get("point_in_time", False)),
|
|
68
|
+
reference=bool(value.get("reference", False)),
|
|
69
|
+
enabled=bool(value.get("enabled", True)),
|
|
70
|
+
)
|
|
71
|
+
|
|
72
|
+
@classmethod
|
|
73
|
+
def from_yaml(cls, path: str | Path) -> "DatasetSpec":
|
|
74
|
+
"""Load a dataset spec from a small YAML file.
|
|
75
|
+
|
|
76
|
+
The project intentionally avoids a YAML runtime dependency. This parser
|
|
77
|
+
supports the simple mappings and lists used by bundled dataset specs.
|
|
78
|
+
"""
|
|
79
|
+
|
|
80
|
+
return cls.from_mapping(_parse_simple_yaml(Path(path).read_text()))
|
|
81
|
+
|
|
82
|
+
@property
|
|
83
|
+
def key(self) -> tuple[str, str]:
|
|
84
|
+
"""Return the `(source, name)` lookup key."""
|
|
85
|
+
|
|
86
|
+
return (self.source, self.name)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _tuple(value: Any) -> tuple[str, ...]:
|
|
90
|
+
if value is None:
|
|
91
|
+
return ()
|
|
92
|
+
if isinstance(value, str):
|
|
93
|
+
return (value,)
|
|
94
|
+
return tuple(str(item) for item in value)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _optional_tuple(value: Any) -> tuple[str, ...] | None:
    """Coerce like ``_tuple`` but report an empty result as ``None``."""
    items = _tuple(value)
    return items if items else None
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _optional_str(value: Any) -> str | None:
|
|
103
|
+
return None if value is None else str(value)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _parse_simple_yaml(text: str) -> dict[str, Any]:
    """Parse the small YAML subset used by dataset spec files.

    Supported constructs:

    * ``key: scalar`` pairs (scalars are converted by ``_yaml_scalar``);
    * ``key:`` followed by a more-indented block of nested keys or
      ``- item`` entries;
    * blank lines and full-line ``#`` comments, which are skipped.

    Args:
        text: Raw file contents.

    Returns:
        The parsed top-level mapping.

    Raises:
        DatasetSpecError: For a list item outside a list, a line without
            ``:``, or a ``key:`` container opened inside a list.
    """
    rows = [line.rstrip() for line in text.splitlines()]
    root: dict[str, Any] = {}
    # Each entry pairs the indent of the key that opened a container with the
    # container itself. The root uses indent -1 so it is never popped.
    stack: list[tuple[int, dict[str, Any] | list[Any]]] = [(-1, root)]
    for position, raw in enumerate(rows):
        line = raw.strip()
        if not line or line.startswith("#"):
            continue
        indent = len(raw) - len(raw.lstrip(" "))
        # Close every container opened at this indent or deeper.
        while stack and indent <= stack[-1][0]:
            stack.pop()
        parent = stack[-1][1]
        if line.startswith("- "):
            if not isinstance(parent, list):
                raise DatasetSpecError(f"Unsupported YAML list location: {line}")
            parent.append(_yaml_scalar(line[2:]))
            continue
        key, sep, value = line.partition(":")
        if not sep:
            raise DatasetSpecError(f"Unsupported YAML line: {line}")
        key = key.strip()
        value = value.strip()
        if value == "":
            if not isinstance(parent, dict):
                raise DatasetSpecError(f"Unsupported nested YAML key: {key}")
            # Look ahead from this row's own position. The helper finds the
            # current row with rows.index(), which returns the FIRST identical
            # line; slicing makes `raw` element 0, so a repeated `key:` line
            # can no longer be confused with an earlier one.
            next_list = _next_content_is_list(rows[position:], raw)
            container: dict[str, Any] | list[Any] = [] if next_list else {}
            parent[key] = container
            stack.append((indent, container))
        elif isinstance(parent, dict):
            parent[key] = _yaml_scalar(value)
        # NOTE(review): a `key: value` line whose parent is a list is dropped
        # silently here, as before — confirm whether it should raise instead.
    return root
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def _next_content_is_list(rows: list[str], current: str) -> bool:
    """Tell whether the block opened by ``current`` starts with a list item.

    The first non-blank, non-comment row after ``current`` decides: it must
    be indented deeper than ``current`` and begin with ``"- "``. ``current``
    is located with ``rows.index``, i.e. at its first occurrence in ``rows``.
    """

    def _indent_of(text: str) -> int:
        return len(text) - len(text.lstrip(" "))

    position = rows.index(current)
    base_indent = _indent_of(current)
    meaningful = (
        row
        for row in rows[position + 1 :]
        if row.strip() and not row.lstrip().startswith("#")
    )
    candidate = next(meaningful, None)
    if candidate is None:
        return False
    return _indent_of(candidate) > base_indent and candidate.strip().startswith("- ")
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def _yaml_scalar(value: str) -> Any:
    """Convert one already-stripped YAML scalar token to a Python value.

    Recognised forms, checked in this order: boolean and null keywords,
    a ``[a, b]`` inline list (split on commas, items converted recursively),
    a single- or double-quoted string (quotes removed), and an integer.
    Anything else is returned unchanged as a string.
    """
    keywords: dict[str, Any] = {
        "true": True,
        "True": True,
        "false": False,
        "False": False,
        "null": None,
        "None": None,
        "~": None,
    }
    if value in keywords:
        return keywords[value]

    if value[:1] == "[" and value[-1:] == "]":
        inner = value[1:-1].strip()
        if not inner:
            return []
        return [_yaml_scalar(part.strip()) for part in inner.split(",")]

    for quote in ('"', "'"):
        if value.startswith(quote) and value.endswith(quote):
            return value[1:-1]

    try:
        return int(value)
    except ValueError:
        return value
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""Deduplication strategies."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Protocol
|
|
6
|
+
|
|
7
|
+
import polars as pl
|
|
8
|
+
|
|
9
|
+
from bagelquant_data.core.dataset import DatasetSpec
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class DeduplicationStrategy(Protocol):
    """Structural interface for objects that deduplicate dataset records."""

    def apply(self, frame: pl.LazyFrame, spec: DatasetSpec) -> pl.LazyFrame:
        """Return ``frame`` with duplicates resolved for ``spec``."""
        ...
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class NoDeduplication:
    """Pass-through strategy: records are returned untouched."""

    def apply(self, frame: pl.LazyFrame, spec: DatasetSpec) -> pl.LazyFrame:
        """Return the very same ``frame`` object; ``spec`` is not consulted."""
        untouched = frame
        return untouched
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class ExactRecordHashDeduplication:
    """Remove rows that are exact duplicates across every column."""

    def apply(self, frame: pl.LazyFrame, spec: DatasetSpec) -> pl.LazyFrame:
        """Return ``frame`` without fully identical rows, keeping row order.

        ``spec`` is not consulted.
        """
        deduplicated = frame.unique(maintain_order=True)
        return deduplicated
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class PrimaryKeyLastDeduplication:
    """Keep only the final row seen for each primary-key value."""

    def apply(self, frame: pl.LazyFrame, spec: DatasetSpec) -> pl.LazyFrame:
        """Deduplicate on ``spec.primary_key``, keeping the last occurrence.

        When the spec declares no primary key, fall back to dropping exact
        duplicate rows. Row order is maintained in both cases.
        """
        key_columns = spec.primary_key
        if key_columns:
            return frame.unique(subset=list(key_columns), keep="last", maintain_order=True)
        return frame.unique(maintain_order=True)
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Framework exceptions."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class BagelQuantDataError(Exception):
    """Root exception type for errors raised by this package."""
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ConfigurationError(BagelQuantDataError):
    """Raised when configuration is invalid or incomplete."""
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class DatasetSpecError(ConfigurationError):
    """Raised when a dataset specification is invalid."""
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class DatasetNotFoundError(BagelQuantDataError):
    """Raised when a requested dataset is unregistered or has no canonical data."""
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class SourceNotFoundError(BagelQuantDataError):
    """Raised when a requested source is not registered."""
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class DataSourceError(BagelQuantDataError):
    """Raised when a source adapter fails."""
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class ValidationError(BagelQuantDataError):
    """Raised when data does not pass validation."""
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class DuplicateResolutionError(BagelQuantDataError):
    """Raised when duplicates must be resolved before a single-value panel can be built."""
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class DestructiveOperationError(BagelQuantDataError):
    """Raised when a destructive operation is requested without explicit confirmation."""
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""Stable hashing helpers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
import json
|
|
7
|
+
from collections.abc import Iterable
|
|
8
|
+
|
|
9
|
+
import polars as pl
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def stable_bucket(asset_id: str, bucket_count: int) -> int:
    """Return a deterministic asset bucket.

    The bucket is derived from an 8-byte BLAKE2b digest of the UTF-8 encoded
    ``asset_id``, so it is stable across processes and runs (unlike the
    built-in ``hash``).

    Args:
        asset_id: Identifier to place into a bucket.
        bucket_count: Number of buckets; must be at least 1.

    Returns:
        An integer in ``range(bucket_count)``.

    Raises:
        ValueError: If ``bucket_count`` is smaller than 1.
    """

    # Without this guard, 0 fails with a bare ZeroDivisionError and negative
    # counts silently produce non-positive bucket ids.
    if bucket_count < 1:
        raise ValueError(f"bucket_count must be a positive integer, got {bucket_count!r}")
    digest = hashlib.blake2b(asset_id.encode("utf-8"), digest_size=8).digest()
    return int.from_bytes(digest, byteorder="big") % bucket_count
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def stable_record_hash(values: dict[str, object]) -> str:
    """Hash a record via a canonical JSON encoding.

    Keys are sorted and compact separators are used, so key order does not
    affect the result. Values JSON cannot encode natively are converted with
    ``str``. Returns a 32-character hex digest (16-byte BLAKE2b).
    """

    encoded = json.dumps(
        values,
        sort_keys=True,
        default=str,
        separators=(",", ":"),
    ).encode("utf-8")
    hasher = hashlib.blake2b(digest_size=16)
    hasher.update(encoded)
    return hasher.hexdigest()
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def frame_content_hash(frame: pl.DataFrame, columns: Iterable[str] | None = None) -> str:
    """Hash a dataframe's content independently of its row order.

    The selected columns (all columns when ``columns`` is ``None`` or empty)
    are projected, the rows are sorted by those same columns, and the rows
    are serialised to compact, key-sorted JSON before hashing with a 16-byte
    BLAKE2b digest. Values JSON cannot encode are converted with ``str``.
    """

    names = list(columns if columns else frame.columns)
    ordered = frame.select(names).sort(names)
    records = ordered.to_dicts()
    encoded = json.dumps(records, sort_keys=True, default=str, separators=(",", ":"))
    return hashlib.blake2b(encoded.encode("utf-8"), digest_size=16).hexdigest()
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""Canonical normalization."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Protocol
|
|
7
|
+
|
|
8
|
+
import polars as pl
|
|
9
|
+
|
|
10
|
+
from bagelquant_data.core.dataset import DatasetSpec
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass(frozen=True, slots=True)
|
|
14
|
+
class NormalizeContext:
|
|
15
|
+
"""Normalization context."""
|
|
16
|
+
|
|
17
|
+
source: str
|
|
18
|
+
dataset: str
|
|
19
|
+
run_id: str | None = None
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass(frozen=True, slots=True)
|
|
23
|
+
class NormalizeResult:
|
|
24
|
+
"""Accepted and rejected normalized records."""
|
|
25
|
+
|
|
26
|
+
accepted: pl.LazyFrame
|
|
27
|
+
rejected: pl.LazyFrame
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class Normalizer(Protocol):
    """Structural interface every dataset normalizer must satisfy."""

    def normalize(
        self,
        frame: pl.LazyFrame,
        spec: DatasetSpec,
        context: NormalizeContext,
    ) -> NormalizeResult:
        """Turn source rows into a ``NormalizeResult`` for ``spec``."""
        ...
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class StandardNormalizer:
    """Map configured source fields into canonical columns."""

    def normalize(
        self, frame: pl.LazyFrame, spec: DatasetSpec, context: NormalizeContext
    ) -> NormalizeResult:
        """Rename source fields and add the canonical columns.

        Steps:

        1. Rename columns according to ``spec.field_mapping``.
        2. Add literal ``source`` (from ``context.source``) and
           ``source_dataset`` (from ``spec.source_dataset``) columns.
        3. For each configured role column present after the rename, add
           ``asset_id`` (cast to string), ``time`` and ``period`` (both
           parsed as dates).

        Args:
            frame: Lazy frame of raw source rows.
            spec: Dataset specification that drives the mapping.
            context: Supplies the ``source`` value written to each row.

        Returns:
            A result whose ``accepted`` frame holds all rows. ``rejected``
            has the same schema but is always empty, because this
            normalizer never rejects rows.
        """
        lf = frame.rename(spec.field_mapping)
        # Resolve the lazy schema once and reuse it for all three membership
        # checks instead of resolving it per check.
        available = set(lf.collect_schema().names())
        expressions: list[pl.Expr] = [
            pl.lit(context.source).alias("source"),
            pl.lit(spec.source_dataset).alias("source_dataset"),
        ]
        if spec.asset_column and spec.asset_column in available:
            expressions.append(pl.col(spec.asset_column).cast(pl.String).alias("asset_id"))
        if spec.time_column and spec.time_column in available:
            expressions.append(_date_expr(spec.time_column).alias("time"))
        if spec.period_column and spec.period_column in available:
            expressions.append(_date_expr(spec.period_column).alias("period"))
        accepted = lf.with_columns(expressions)
        # A constant-false filter keeps the schema and drops every row.
        rejected = accepted.filter(pl.lit(False))
        return NormalizeResult(accepted=accepted, rejected=rejected)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _date_expr(column: str) -> pl.Expr:
    """Build an expression that converts ``column`` to ``pl.Date``.

    Values whose string form is exactly eight characters are parsed with the
    ``%Y%m%d`` format; all other values are cast to ``pl.Date`` directly.
    Both conversions are non-strict.
    """
    source = pl.col(column)
    as_text = source.cast(pl.String)
    is_compact = as_text.str.len_chars() == 8
    parsed = as_text.str.strptime(pl.Date, "%Y%m%d", strict=False)
    fallback = source.cast(pl.Date, strict=False)
    return pl.when(is_compact).then(parsed).otherwise(fallback)
|