bagelquant-data 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. bagelquant_data-0.1.0/PKG-INFO +74 -0
  2. bagelquant_data-0.1.0/README.md +59 -0
  3. bagelquant_data-0.1.0/pyproject.toml +38 -0
  4. bagelquant_data-0.1.0/src/bagelquant_data/__init__.py +32 -0
  5. bagelquant_data-0.1.0/src/bagelquant_data/cli/main.py +30 -0
  6. bagelquant_data-0.1.0/src/bagelquant_data/core/__init__.py +37 -0
  7. bagelquant_data-0.1.0/src/bagelquant_data/core/dataset.py +170 -0
  8. bagelquant_data-0.1.0/src/bagelquant_data/core/deduplication.py +40 -0
  9. bagelquant_data-0.1.0/src/bagelquant_data/core/exceptions.py +39 -0
  10. bagelquant_data-0.1.0/src/bagelquant_data/core/hashing.py +32 -0
  11. bagelquant_data-0.1.0/src/bagelquant_data/core/normalization.py +67 -0
  12. bagelquant_data-0.1.0/src/bagelquant_data/core/partitioning.py +76 -0
  13. bagelquant_data-0.1.0/src/bagelquant_data/core/registry.py +67 -0
  14. bagelquant_data-0.1.0/src/bagelquant_data/core/request.py +21 -0
  15. bagelquant_data-0.1.0/src/bagelquant_data/core/source.py +38 -0
  16. bagelquant_data-0.1.0/src/bagelquant_data/core/types.py +10 -0
  17. bagelquant_data-0.1.0/src/bagelquant_data/core/validation.py +32 -0
  18. bagelquant_data-0.1.0/src/bagelquant_data/finance/__init__.py +80 -0
  19. bagelquant_data-0.1.0/src/bagelquant_data/finance/align.py +5 -0
  20. bagelquant_data-0.1.0/src/bagelquant_data/finance/fields.py +30 -0
  21. bagelquant_data-0.1.0/src/bagelquant_data/finance/flows.py +24 -0
  22. bagelquant_data-0.1.0/src/bagelquant_data/finance/periods.py +11 -0
  23. bagelquant_data-0.1.0/src/bagelquant_data/finance/point_in_time.py +27 -0
  24. bagelquant_data-0.1.0/src/bagelquant_data/finance/ratios.py +31 -0
  25. bagelquant_data-0.1.0/src/bagelquant_data/finance/rolling.py +34 -0
  26. bagelquant_data-0.1.0/src/bagelquant_data/finance/shares.py +28 -0
  27. bagelquant_data-0.1.0/src/bagelquant_data/finance/stocks.py +27 -0
  28. bagelquant_data-0.1.0/src/bagelquant_data/management/__init__.py +5 -0
  29. bagelquant_data-0.1.0/src/bagelquant_data/management/datasets.py +63 -0
  30. bagelquant_data-0.1.0/src/bagelquant_data/management/lake.py +190 -0
  31. bagelquant_data-0.1.0/src/bagelquant_data/management/sources.py +57 -0
  32. bagelquant_data-0.1.0/src/bagelquant_data/management/status.py +55 -0
  33. bagelquant_data-0.1.0/src/bagelquant_data/pipeline/__init__.py +6 -0
  34. bagelquant_data-0.1.0/src/bagelquant_data/pipeline/commit.py +175 -0
  35. bagelquant_data-0.1.0/src/bagelquant_data/pipeline/ingest.py +142 -0
  36. bagelquant_data-0.1.0/src/bagelquant_data/pipeline/update.py +351 -0
  37. bagelquant_data-0.1.0/src/bagelquant_data/query/__init__.py +65 -0
  38. bagelquant_data-0.1.0/src/bagelquant_data/query/field.py +96 -0
  39. bagelquant_data-0.1.0/src/bagelquant_data/query/filters.py +18 -0
  40. bagelquant_data-0.1.0/src/bagelquant_data/query/observations.py +45 -0
  41. bagelquant_data-0.1.0/src/bagelquant_data/query/raw.py +46 -0
  42. bagelquant_data-0.1.0/src/bagelquant_data/query/records.py +17 -0
  43. bagelquant_data-0.1.0/src/bagelquant_data/query/reference.py +18 -0
  44. bagelquant_data-0.1.0/src/bagelquant_data/query/scanner.py +13 -0
  45. bagelquant_data-0.1.0/src/bagelquant_data/sources/__init__.py +1 -0
  46. bagelquant_data-0.1.0/src/bagelquant_data/sources/tushare/__init__.py +5 -0
  47. bagelquant_data-0.1.0/src/bagelquant_data/sources/tushare/authentication.py +16 -0
  48. bagelquant_data-0.1.0/src/bagelquant_data/sources/tushare/client.py +20 -0
  49. bagelquant_data-0.1.0/src/bagelquant_data/sources/tushare/source.py +121 -0
  50. bagelquant_data-0.1.0/src/bagelquant_data/storage/__init__.py +7 -0
  51. bagelquant_data-0.1.0/src/bagelquant_data/storage/atomic.py +29 -0
  52. bagelquant_data-0.1.0/src/bagelquant_data/storage/metadata.py +410 -0
  53. bagelquant_data-0.1.0/src/bagelquant_data/storage/parquet.py +68 -0
  54. bagelquant_data-0.1.0/src/bagelquant_data/storage/paths.py +48 -0
  55. bagelquant_data-0.1.0/src/bagelquant_data/storage/rejected.py +30 -0
  56. bagelquant_data-0.1.0/src/bagelquant_data/storage/staging.py +28 -0
@@ -0,0 +1,74 @@
1
+ Metadata-Version: 2.3
2
+ Name: bagelquant-data
3
+ Version: 0.1.0
4
+ Summary: Unified data layer for the BagelQuant ecosystem
5
+ Author: BagelQuant
6
+ License: Apache-2.0
7
+ Requires-Dist: polars>=1.35.0
8
+ Requires-Dist: pyarrow>=22.0.0
9
+ Requires-Dist: tqdm>=4.67.0
10
+ Requires-Dist: pandas>=3.0.0 ; extra == 'tushare'
11
+ Requires-Dist: tushare>=1.4.21 ; extra == 'tushare'
12
+ Requires-Python: >=3.13
13
+ Provides-Extra: tushare
14
+ Description-Content-Type: text/markdown
15
+
16
+ # BagelQuant Data
17
+
18
+ `bagelquant-data` is a Polars-native, source-agnostic data lake framework for
19
+ quantitative research.
20
+
21
+ - Polars is the dataframe engine.
22
+ - Parquet is the canonical analytical storage format.
23
+ - SQLite stores mutable metadata, manifests, run state, and source/dataset
24
+ registration.
25
+ - Tushare is implemented as the first source adapter under
26
+ `bagelquant_data.sources.tushare`.
27
+ - Non-reference research extraction returns one field at a time as
28
+ `time | asset_id | value`.
29
+
30
+ ```python
31
+ import polars as pl
32
+
33
+ from bagelquant_data import DataLake, DatasetSpec
34
+
35
+ lake = DataLake.open("data")
36
+ spec = DatasetSpec(
37
+ name="daily",
38
+ source="custom",
39
+ source_dataset="daily",
40
+ category="market",
41
+ field_mapping={"ts_code": "ts_code", "trade_date": "trade_date"},
42
+ required_columns=("asset_id", "time"),
43
+ primary_key=("asset_id", "time"),
44
+ asset_column="ts_code",
45
+ time_column="trade_date",
46
+ partition_strategy="year_month",
47
+ deduplication="primary_key_last",
48
+ sort_columns=("time", "asset_id"),
49
+ )
50
+ lake.ingest_frame(
51
+ spec,
52
+ pl.DataFrame(
53
+ {
54
+ "trade_date": ["2024-01-02"],
55
+ "ts_code": ["000001.SZ"],
56
+ "close": [100.0],
57
+ }
58
+ ),
59
+ )
60
+
61
+ close = lake.query.field("daily", "close", source="custom", collect=True)
62
+ print(close) # time, asset_id, close
63
+ ```
64
+
65
+ Documentation is available in two languages:
66
+
67
+ - English: `docs/en/index.md`
68
+ - Chinese: `docs/cn/index.md`
69
+
70
+ ## Development
71
+
72
+ ```bash
73
+ uv run pytest
74
+ ```
@@ -0,0 +1,59 @@
1
+ # BagelQuant Data
2
+
3
+ `bagelquant-data` is a Polars-native, source-agnostic data lake framework for
4
+ quantitative research.
5
+
6
+ - Polars is the dataframe engine.
7
+ - Parquet is the canonical analytical storage format.
8
+ - SQLite stores mutable metadata, manifests, run state, and source/dataset
9
+ registration.
10
+ - Tushare is implemented as the first source adapter under
11
+ `bagelquant_data.sources.tushare`.
12
+ - Non-reference research extraction returns one field at a time as
13
+ `time | asset_id | value`.
14
+
15
+ ```python
16
+ import polars as pl
17
+
18
+ from bagelquant_data import DataLake, DatasetSpec
19
+
20
+ lake = DataLake.open("data")
21
+ spec = DatasetSpec(
22
+ name="daily",
23
+ source="custom",
24
+ source_dataset="daily",
25
+ category="market",
26
+ field_mapping={"ts_code": "ts_code", "trade_date": "trade_date"},
27
+ required_columns=("asset_id", "time"),
28
+ primary_key=("asset_id", "time"),
29
+ asset_column="ts_code",
30
+ time_column="trade_date",
31
+ partition_strategy="year_month",
32
+ deduplication="primary_key_last",
33
+ sort_columns=("time", "asset_id"),
34
+ )
35
+ lake.ingest_frame(
36
+ spec,
37
+ pl.DataFrame(
38
+ {
39
+ "trade_date": ["2024-01-02"],
40
+ "ts_code": ["000001.SZ"],
41
+ "close": [100.0],
42
+ }
43
+ ),
44
+ )
45
+
46
+ close = lake.query.field("daily", "close", source="custom", collect=True)
47
+ print(close) # time, asset_id, close
48
+ ```
49
+
50
+ Documentation is available in two languages:
51
+
52
+ - English: `docs/en/index.md`
53
+ - Chinese: `docs/cn/index.md`
54
+
55
+ ## Development
56
+
57
+ ```bash
58
+ uv run pytest
59
+ ```
@@ -0,0 +1,38 @@
1
+ [project]
2
+ name = "bagelquant-data"
3
+ version = "0.1.0"
4
+ description = "Unified data layer for the BagelQuant ecosystem"
5
+ readme = "README.md"
6
+ requires-python = ">=3.13"
7
+ license = { text = "Apache-2.0" }
8
+ authors = [{ name = "BagelQuant" }]
9
+ dependencies = [
10
+ "polars>=1.35.0",
11
+ "pyarrow>=22.0.0",
12
+ "tqdm>=4.67.0",
13
+ ]
14
+
15
+ [project.scripts]
16
+ bagelquant-data = "bagelquant_data.cli.main:main"
17
+
18
+ [project.optional-dependencies]
19
+ tushare = [
20
+ "pandas>=3.0.0",
21
+ "tushare>=1.4.21",
22
+ ]
23
+
24
+ [dependency-groups]
25
+ dev = [
26
+ "pandas>=3.0.0",
27
+ "pre-commit>=4.5.0",
28
+ "pyright>=1.1.410",
29
+ "pytest>=9.0.0",
30
+ ]
31
+
32
+ [build-system]
33
+ requires = ["uv_build>=0.8.0,<0.9.0"]
34
+ build-backend = "uv_build"
35
+
36
+ [tool.pytest.ini_options]
37
+ pythonpath = [".", "src"]
38
+ testpaths = ["tests"]
@@ -0,0 +1,32 @@
1
+ """Source-agnostic data lake framework for BagelQuant research."""
2
+
3
+ from bagelquant_data.core import (
4
+ BagelQuantDataError,
5
+ DataSource,
6
+ DatasetNotFoundError,
7
+ DatasetSpec,
8
+ DatasetSpecError,
9
+ DuplicateResolutionError,
10
+ SourceNotFoundError,
11
+ ValidationError,
12
+ stable_bucket,
13
+ )
14
+ from bagelquant_data.finance import FinancialFieldKind, FinancialFieldSpec
15
+ from bagelquant_data.management import DataLake
16
+ from bagelquant_data.sources.tushare import TushareSource
17
+
18
+ __all__ = [
19
+ "BagelQuantDataError",
20
+ "DataLake",
21
+ "DataSource",
22
+ "DatasetNotFoundError",
23
+ "DatasetSpec",
24
+ "DatasetSpecError",
25
+ "DuplicateResolutionError",
26
+ "FinancialFieldKind",
27
+ "FinancialFieldSpec",
28
+ "SourceNotFoundError",
29
+ "TushareSource",
30
+ "ValidationError",
31
+ "stable_bucket",
32
+ ]
@@ -0,0 +1,30 @@
1
+ """Thin CLI for the Python-first data lake API."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+
7
+ from bagelquant_data import DataLake
8
+
9
+
10
+ def main(argv: list[str] | None = None) -> int:
11
+ parser = argparse.ArgumentParser(prog="bagelquant-data")
12
+ parser.add_argument("--root", default="data")
13
+ sub = parser.add_subparsers(dest="command", required=True)
14
+ sub.add_parser("status")
15
+ datasets = sub.add_parser("dataset-list")
16
+ datasets.add_argument("--source")
17
+ sub.add_parser("source-list")
18
+ args = parser.parse_args(argv)
19
+ lake = DataLake.open(args.root)
20
+ if args.command == "status":
21
+ print(lake.status.summary())
22
+ elif args.command == "dataset-list":
23
+ print(lake.datasets.list(args.source))
24
+ elif args.command == "source-list":
25
+ print(lake.sources.list())
26
+ return 0
27
+
28
+
29
+ if __name__ == "__main__":
30
+ raise SystemExit(main())
@@ -0,0 +1,37 @@
1
+ """Core source-agnostic framework primitives."""
2
+
3
+ from bagelquant_data.core.dataset import DatasetSpec
4
+ from bagelquant_data.core.exceptions import (
5
+ BagelQuantDataError,
6
+ ConfigurationError,
7
+ DatasetNotFoundError,
8
+ DatasetSpecError,
9
+ DestructiveOperationError,
10
+ DuplicateResolutionError,
11
+ SourceNotFoundError,
12
+ ValidationError,
13
+ )
14
+ from bagelquant_data.core.hashing import frame_content_hash, stable_bucket, stable_record_hash
15
+ from bagelquant_data.core.registry import FrameworkRegistries, Registry, default_registries
16
+ from bagelquant_data.core.request import RequestContext
17
+ from bagelquant_data.core.source import DataSource
18
+
19
+ __all__ = [
20
+ "BagelQuantDataError",
21
+ "ConfigurationError",
22
+ "DataSource",
23
+ "DatasetNotFoundError",
24
+ "DatasetSpec",
25
+ "DatasetSpecError",
26
+ "DestructiveOperationError",
27
+ "DuplicateResolutionError",
28
+ "FrameworkRegistries",
29
+ "Registry",
30
+ "RequestContext",
31
+ "SourceNotFoundError",
32
+ "ValidationError",
33
+ "default_registries",
34
+ "frame_content_hash",
35
+ "stable_bucket",
36
+ "stable_record_hash",
37
+ ]
@@ -0,0 +1,170 @@
1
+ """Dataset specification model."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ from bagelquant_data.core.exceptions import DatasetSpecError
10
+
11
+
12
+ @dataclass(frozen=True, slots=True)
13
+ class DatasetSpec:
14
+ """Declarative canonical dataset behavior."""
15
+
16
+ name: str
17
+ source: str
18
+ source_dataset: str
19
+ category: str
20
+ field_mapping: dict[str, str]
21
+ required_columns: tuple[str, ...]
22
+ primary_key: tuple[str, ...] | None = None
23
+ business_key: tuple[str, ...] | None = None
24
+ asset_column: str | None = None
25
+ time_column: str | None = None
26
+ period_column: str | None = None
27
+ request_planner: str = "snapshot"
28
+ request_options: dict[str, Any] = field(default_factory=dict)
29
+ normalizer: str = "standard"
30
+ deduplication: str = "exact_record_hash"
31
+ partition_strategy: str = "single_file"
32
+ partition_options: dict[str, Any] = field(default_factory=dict)
33
+ update_mode: str = "upsert"
34
+ sort_columns: tuple[str, ...] = ()
35
+ point_in_time: bool = False
36
+ reference: bool = False
37
+ enabled: bool = True
38
+
39
+ @classmethod
40
+ def from_mapping(cls, value: dict[str, Any]) -> "DatasetSpec":
41
+ """Build a specification from parsed config data."""
42
+
43
+ required = ("name", "source", "source_dataset", "category")
44
+ missing = [key for key in required if key not in value]
45
+ if missing:
46
+ raise DatasetSpecError(f"Dataset spec missing required keys: {missing}")
47
+ return cls(
48
+ name=str(value["name"]),
49
+ source=str(value["source"]),
50
+ source_dataset=str(value["source_dataset"]),
51
+ category=str(value["category"]),
52
+ field_mapping=dict(value.get("field_mapping") or {}),
53
+ required_columns=_tuple(value.get("required_columns")),
54
+ primary_key=_optional_tuple(value.get("primary_key")),
55
+ business_key=_optional_tuple(value.get("business_key")),
56
+ asset_column=_optional_str(value.get("asset_column")),
57
+ time_column=_optional_str(value.get("time_column")),
58
+ period_column=_optional_str(value.get("period_column")),
59
+ request_planner=str(value.get("request_planner") or "snapshot"),
60
+ request_options=dict(value.get("request_options") or {}),
61
+ normalizer=str(value.get("normalizer") or "standard"),
62
+ deduplication=str(value.get("deduplication") or "exact_record_hash"),
63
+ partition_strategy=str(value.get("partition_strategy") or "single_file"),
64
+ partition_options=dict(value.get("partition_options") or {}),
65
+ update_mode=str(value.get("update_mode") or "upsert"),
66
+ sort_columns=_tuple(value.get("sort_columns")),
67
+ point_in_time=bool(value.get("point_in_time", False)),
68
+ reference=bool(value.get("reference", False)),
69
+ enabled=bool(value.get("enabled", True)),
70
+ )
71
+
72
+ @classmethod
73
+ def from_yaml(cls, path: str | Path) -> "DatasetSpec":
74
+ """Load a dataset spec from a small YAML file.
75
+
76
+ The project intentionally avoids a YAML runtime dependency. This parser
77
+ supports the simple mappings and lists used by bundled dataset specs.
78
+ """
79
+
80
+ return cls.from_mapping(_parse_simple_yaml(Path(path).read_text()))
81
+
82
+ @property
83
+ def key(self) -> tuple[str, str]:
84
+ """Return the `(source, name)` lookup key."""
85
+
86
+ return (self.source, self.name)
87
+
88
+
89
+ def _tuple(value: Any) -> tuple[str, ...]:
90
+ if value is None:
91
+ return ()
92
+ if isinstance(value, str):
93
+ return (value,)
94
+ return tuple(str(item) for item in value)
95
+
96
+
97
def _optional_tuple(value: Any) -> tuple[str, ...] | None:
    """Coerce a config value like ``_tuple`` but map an empty result to ``None``."""
    items = _tuple(value)
    if not items:
        return None
    return items
100
+
101
+
102
+ def _optional_str(value: Any) -> str | None:
103
+ return None if value is None else str(value)
104
+
105
+
106
+ def _parse_simple_yaml(text: str) -> dict[str, Any]:
107
+ rows = [line.rstrip() for line in text.splitlines()]
108
+ root: dict[str, Any] = {}
109
+ stack: list[tuple[int, dict[str, Any] | list[Any]]] = [(-1, root)]
110
+ for raw in rows:
111
+ if not raw.strip() or raw.lstrip().startswith("#"):
112
+ continue
113
+ indent = len(raw) - len(raw.lstrip(" "))
114
+ line = raw.strip()
115
+ while stack and indent <= stack[-1][0]:
116
+ stack.pop()
117
+ parent = stack[-1][1]
118
+ if line.startswith("- "):
119
+ if not isinstance(parent, list):
120
+ raise DatasetSpecError(f"Unsupported YAML list location: {line}")
121
+ parent.append(_yaml_scalar(line[2:]))
122
+ continue
123
+ key, sep, value = line.partition(":")
124
+ if not sep:
125
+ raise DatasetSpecError(f"Unsupported YAML line: {line}")
126
+ key = key.strip()
127
+ value = value.strip()
128
+ if value == "":
129
+ container: dict[str, Any] | list[Any]
130
+ next_list = _next_content_is_list(rows, raw)
131
+ container = [] if next_list else {}
132
+ if isinstance(parent, dict):
133
+ parent[key] = container
134
+ else:
135
+ raise DatasetSpecError(f"Unsupported nested YAML key: {key}")
136
+ stack.append((indent, container))
137
+ elif isinstance(parent, dict):
138
+ parent[key] = _yaml_scalar(value)
139
+ return root
140
+
141
+
142
+ def _next_content_is_list(rows: list[str], current: str) -> bool:
143
+ index = rows.index(current)
144
+ current_indent = len(current) - len(current.lstrip(" "))
145
+ for row in rows[index + 1 :]:
146
+ if not row.strip() or row.lstrip().startswith("#"):
147
+ continue
148
+ indent = len(row) - len(row.lstrip(" "))
149
+ return indent > current_indent and row.strip().startswith("- ")
150
+ return False
151
+
152
+
153
+ def _yaml_scalar(value: str) -> Any:
154
+ if value in {"true", "True"}:
155
+ return True
156
+ if value in {"false", "False"}:
157
+ return False
158
+ if value in {"null", "None", "~"}:
159
+ return None
160
+ if value.startswith("[") and value.endswith("]"):
161
+ inner = value[1:-1].strip()
162
+ return [] if not inner else [_yaml_scalar(part.strip()) for part in inner.split(",")]
163
+ if (value.startswith('"') and value.endswith('"')) or (
164
+ value.startswith("'") and value.endswith("'")
165
+ ):
166
+ return value[1:-1]
167
+ try:
168
+ return int(value)
169
+ except ValueError:
170
+ return value
@@ -0,0 +1,40 @@
1
+ """Deduplication strategies."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Protocol
6
+
7
+ import polars as pl
8
+
9
+ from bagelquant_data.core.dataset import DatasetSpec
10
+
11
+
12
class DeduplicationStrategy(Protocol):
    """Structural interface for objects that deduplicate a dataset's records."""

    def apply(self, frame: pl.LazyFrame, spec: DatasetSpec) -> pl.LazyFrame:
        """Return ``frame`` with duplicates resolved for the dataset ``spec``."""
        ...
18
+
19
+
20
class NoDeduplication:
    """Strategy that performs no deduplication."""

    def apply(self, frame: pl.LazyFrame, spec: DatasetSpec) -> pl.LazyFrame:
        """Return ``frame`` exactly as received; ``spec`` is not consulted."""
        return frame
25
+
26
+
27
class ExactRecordHashDeduplication:
    """Strategy that removes rows identical across every column."""

    def apply(self, frame: pl.LazyFrame, spec: DatasetSpec) -> pl.LazyFrame:
        """Return the unique rows of ``frame`` in their original order."""
        deduplicated = frame.unique(maintain_order=True)
        return deduplicated
32
+
33
+
34
class PrimaryKeyLastDeduplication:
    """Strategy that keeps the final row seen for each primary key."""

    def apply(self, frame: pl.LazyFrame, spec: DatasetSpec) -> pl.LazyFrame:
        """Deduplicate ``frame`` on ``spec.primary_key``.

        When the spec declares no primary key, rows are deduplicated across
        all columns instead. Row order is maintained in both cases.
        """
        key_columns = list(spec.primary_key or ())
        if key_columns:
            return frame.unique(subset=key_columns, keep="last", maintain_order=True)
        return frame.unique(maintain_order=True)
@@ -0,0 +1,39 @@
1
+ """Framework exceptions."""
2
+
3
+ from __future__ import annotations
4
+
5
+
6
class BagelQuantDataError(Exception):
    """Root of the package's exception hierarchy."""


class ConfigurationError(BagelQuantDataError):
    """Raised when configuration is invalid or incomplete."""


class DatasetSpecError(ConfigurationError):
    """Raised when a dataset specification is invalid."""


class DatasetNotFoundError(BagelQuantDataError):
    """Raised when a dataset is not registered or has no canonical data."""


class SourceNotFoundError(BagelQuantDataError):
    """Raised when a requested source is not registered."""


class DataSourceError(BagelQuantDataError):
    """Raised when a source adapter fails."""


class ValidationError(BagelQuantDataError):
    """Raised when data fails validation."""


class DuplicateResolutionError(BagelQuantDataError):
    """Raised when a single-value panel needs duplicates resolved first."""


class DestructiveOperationError(BagelQuantDataError):
    """Raised when a destructive operation lacks explicit confirmation."""
@@ -0,0 +1,32 @@
1
+ """Stable hashing helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import hashlib
6
+ import json
7
+ from collections.abc import Iterable
8
+
9
+ import polars as pl
10
+
11
+
12
def stable_bucket(asset_id: str, bucket_count: int) -> int:
    """Return a deterministic asset bucket.

    The bucket is the 8-byte BLAKE2b digest of the UTF-8 encoded ``asset_id``,
    read as a big-endian integer, modulo ``bucket_count``. It therefore does
    not vary between processes the way the built-in ``hash`` of a string does.

    Args:
        asset_id: Identifier to place into a bucket.
        bucket_count: Number of buckets; must be positive.

    Returns:
        An integer in ``range(bucket_count)``.

    Raises:
        ValueError: If ``bucket_count`` is zero or negative.
    """

    # Reject counts that would raise ZeroDivisionError or yield buckets
    # outside range(bucket_count).
    if bucket_count <= 0:
        raise ValueError(f"bucket_count must be a positive integer, got {bucket_count}")
    digest = hashlib.blake2b(asset_id.encode("utf-8"), digest_size=8).digest()
    return int.from_bytes(digest, byteorder="big") % bucket_count
17
+
18
+
19
def stable_record_hash(values: dict[str, object]) -> str:
    """Hash a record using stable JSON encoding.

    The record is serialised as compact JSON with sorted keys, so key order
    does not affect the result. Values JSON cannot encode natively are
    converted with ``str``. Returns a 32-character hexadecimal BLAKE2b digest.
    """

    encoded = json.dumps(
        values,
        sort_keys=True,
        default=str,
        separators=(",", ":"),
    ).encode("utf-8")
    hasher = hashlib.blake2b(digest_size=16)
    hasher.update(encoded)
    return hasher.hexdigest()
24
+
25
+
26
def frame_content_hash(frame: pl.DataFrame, columns: Iterable[str] | None = None) -> str:
    """Hash a dataframe's contents deterministically.

    The selected columns (all columns when ``columns`` is ``None`` or empty)
    are extracted, the rows are sorted by those columns, and the rows are
    serialised as compact JSON with sorted keys before hashing. Returns a
    hexadecimal BLAKE2b digest with a 16-byte digest size.
    """

    selected = list(columns or frame.columns)
    # Sorting by the selected columns makes the digest independent of the
    # frame's incoming row order.
    ordered = frame.select(selected).sort(selected)
    payload = json.dumps(
        ordered.to_dicts(),
        sort_keys=True,
        default=str,
        separators=(",", ":"),
    )
    digest = hashlib.blake2b(payload.encode("utf-8"), digest_size=16)
    return digest.hexdigest()
@@ -0,0 +1,67 @@
1
+ """Canonical normalization."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from typing import Protocol
7
+
8
+ import polars as pl
9
+
10
+ from bagelquant_data.core.dataset import DatasetSpec
11
+
12
+
13
+ @dataclass(frozen=True, slots=True)
14
+ class NormalizeContext:
15
+ """Normalization context."""
16
+
17
+ source: str
18
+ dataset: str
19
+ run_id: str | None = None
20
+
21
+
22
+ @dataclass(frozen=True, slots=True)
23
+ class NormalizeResult:
24
+ """Accepted and rejected normalized records."""
25
+
26
+ accepted: pl.LazyFrame
27
+ rejected: pl.LazyFrame
28
+
29
+
30
class Normalizer(Protocol):
    """Structural interface for dataset normalizers."""

    def normalize(
        self,
        frame: pl.LazyFrame,
        spec: DatasetSpec,
        context: NormalizeContext,
    ) -> NormalizeResult:
        """Normalize source rows into accepted and rejected frames."""
        ...
38
+
39
+
40
class StandardNormalizer:
    """Map configured source fields into canonical columns."""

    def normalize(
        self, frame: pl.LazyFrame, spec: DatasetSpec, context: NormalizeContext
    ) -> NormalizeResult:
        """Rename source fields and derive the canonical columns.

        Columns are renamed with ``spec.field_mapping``. Literal ``source``
        and ``source_dataset`` columns are always added. ``asset_id``,
        ``time`` and ``period`` are added only when the spec names the
        matching column and that column exists after renaming.

        Args:
            frame: Source rows.
            spec: Dataset specification supplying the mapping and column names.
            context: Supplies the value written to the ``source`` column.

        Returns:
            A result whose ``accepted`` frame holds every input row with the
            added columns, and whose ``rejected`` frame has the same columns
            but no rows.
        """
        lf = frame.rename(spec.field_mapping)
        # Resolve the lazy schema once and reuse it for all three lookups
        # instead of re-resolving it per column.
        available = set(lf.collect_schema().names())
        expressions: list[pl.Expr] = [
            pl.lit(context.source).alias("source"),
            pl.lit(spec.source_dataset).alias("source_dataset"),
        ]
        if spec.asset_column and spec.asset_column in available:
            expressions.append(pl.col(spec.asset_column).cast(pl.String).alias("asset_id"))
        if spec.time_column and spec.time_column in available:
            expressions.append(_date_expr(spec.time_column).alias("time"))
        if spec.period_column and spec.period_column in available:
            expressions.append(_date_expr(spec.period_column).alias("period"))
        accepted = lf.with_columns(expressions)
        # This normalizer rejects nothing; an always-false filter yields an
        # empty frame that still carries the accepted columns.
        rejected = accepted.filter(pl.lit(False))
        return NormalizeResult(accepted=accepted, rejected=rejected)
60
+
61
+
62
def _date_expr(column: str) -> pl.Expr:
    """Build an expression that converts ``column`` to ``pl.Date``.

    Values whose string form is exactly eight characters are parsed with the
    ``%Y%m%d`` format; all other values are cast to ``pl.Date`` directly.
    Both conversions are non-strict.
    """
    source = pl.col(column)
    as_text = source.cast(pl.String)
    is_compact = as_text.str.len_chars() == 8
    parsed_compact = as_text.str.strptime(pl.Date, "%Y%m%d", strict=False)
    cast_directly = source.cast(pl.Date, strict=False)
    return pl.when(is_compact).then(parsed_compact).otherwise(cast_directly)