bagelquant-data 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. bagelquant_data/__init__.py +32 -0
  2. bagelquant_data/cli/main.py +30 -0
  3. bagelquant_data/core/__init__.py +37 -0
  4. bagelquant_data/core/dataset.py +170 -0
  5. bagelquant_data/core/deduplication.py +40 -0
  6. bagelquant_data/core/exceptions.py +39 -0
  7. bagelquant_data/core/hashing.py +32 -0
  8. bagelquant_data/core/normalization.py +67 -0
  9. bagelquant_data/core/partitioning.py +76 -0
  10. bagelquant_data/core/registry.py +67 -0
  11. bagelquant_data/core/request.py +21 -0
  12. bagelquant_data/core/source.py +38 -0
  13. bagelquant_data/core/types.py +10 -0
  14. bagelquant_data/core/validation.py +32 -0
  15. bagelquant_data/finance/__init__.py +80 -0
  16. bagelquant_data/finance/align.py +5 -0
  17. bagelquant_data/finance/fields.py +30 -0
  18. bagelquant_data/finance/flows.py +24 -0
  19. bagelquant_data/finance/periods.py +11 -0
  20. bagelquant_data/finance/point_in_time.py +27 -0
  21. bagelquant_data/finance/ratios.py +31 -0
  22. bagelquant_data/finance/rolling.py +34 -0
  23. bagelquant_data/finance/shares.py +28 -0
  24. bagelquant_data/finance/stocks.py +27 -0
  25. bagelquant_data/management/__init__.py +5 -0
  26. bagelquant_data/management/datasets.py +63 -0
  27. bagelquant_data/management/lake.py +190 -0
  28. bagelquant_data/management/sources.py +57 -0
  29. bagelquant_data/management/status.py +55 -0
  30. bagelquant_data/pipeline/__init__.py +6 -0
  31. bagelquant_data/pipeline/commit.py +175 -0
  32. bagelquant_data/pipeline/ingest.py +142 -0
  33. bagelquant_data/pipeline/update.py +351 -0
  34. bagelquant_data/query/__init__.py +65 -0
  35. bagelquant_data/query/field.py +96 -0
  36. bagelquant_data/query/filters.py +18 -0
  37. bagelquant_data/query/observations.py +45 -0
  38. bagelquant_data/query/raw.py +46 -0
  39. bagelquant_data/query/records.py +17 -0
  40. bagelquant_data/query/reference.py +18 -0
  41. bagelquant_data/query/scanner.py +13 -0
  42. bagelquant_data/sources/__init__.py +1 -0
  43. bagelquant_data/sources/tushare/__init__.py +5 -0
  44. bagelquant_data/sources/tushare/authentication.py +16 -0
  45. bagelquant_data/sources/tushare/client.py +20 -0
  46. bagelquant_data/sources/tushare/source.py +121 -0
  47. bagelquant_data/storage/__init__.py +7 -0
  48. bagelquant_data/storage/atomic.py +29 -0
  49. bagelquant_data/storage/metadata.py +410 -0
  50. bagelquant_data/storage/parquet.py +68 -0
  51. bagelquant_data/storage/paths.py +48 -0
  52. bagelquant_data/storage/rejected.py +30 -0
  53. bagelquant_data/storage/staging.py +28 -0
  54. bagelquant_data-0.1.0.dist-info/METADATA +74 -0
  55. bagelquant_data-0.1.0.dist-info/RECORD +57 -0
  56. bagelquant_data-0.1.0.dist-info/WHEEL +4 -0
  57. bagelquant_data-0.1.0.dist-info/entry_points.txt +3 -0
@@ -0,0 +1,32 @@
1
+ """Source-agnostic data lake framework for BagelQuant research."""
2
+
3
+ from bagelquant_data.core import (
4
+ BagelQuantDataError,
5
+ DataSource,
6
+ DatasetNotFoundError,
7
+ DatasetSpec,
8
+ DatasetSpecError,
9
+ DuplicateResolutionError,
10
+ SourceNotFoundError,
11
+ ValidationError,
12
+ stable_bucket,
13
+ )
14
+ from bagelquant_data.finance import FinancialFieldKind, FinancialFieldSpec
15
+ from bagelquant_data.management import DataLake
16
+ from bagelquant_data.sources.tushare import TushareSource
17
+
18
+ __all__ = [
19
+ "BagelQuantDataError",
20
+ "DataLake",
21
+ "DataSource",
22
+ "DatasetNotFoundError",
23
+ "DatasetSpec",
24
+ "DatasetSpecError",
25
+ "DuplicateResolutionError",
26
+ "FinancialFieldKind",
27
+ "FinancialFieldSpec",
28
+ "SourceNotFoundError",
29
+ "TushareSource",
30
+ "ValidationError",
31
+ "stable_bucket",
32
+ ]
@@ -0,0 +1,30 @@
1
+ """Thin CLI for the Python-first data lake API."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+
7
+ from bagelquant_data import DataLake
8
+
9
+
10
def main(argv: list[str] | None = None) -> int:
    """Run the ``bagelquant-data`` command line interface.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read the
            process arguments.

    Returns:
        ``0`` after the selected sub-command has printed its result.
        Invalid arguments make argparse exit before anything is opened.
    """
    cli = argparse.ArgumentParser(prog="bagelquant-data")
    cli.add_argument("--root", default="data")
    commands = cli.add_subparsers(dest="command", required=True)
    commands.add_parser("status")
    dataset_list = commands.add_parser("dataset-list")
    dataset_list.add_argument("--source")
    commands.add_parser("source-list")
    options = cli.parse_args(argv)

    lake = DataLake.open(options.root)
    # One lazily evaluated producer per sub-command; only the chosen one runs.
    producers = {
        "status": lambda: lake.status.summary(),
        "dataset-list": lambda: lake.datasets.list(options.source),
        "source-list": lambda: lake.sources.list(),
    }
    producer = producers.get(options.command)
    if producer is not None:
        print(producer())
    return 0
27
+
28
+
29
+ if __name__ == "__main__":
30
+ raise SystemExit(main())
@@ -0,0 +1,37 @@
1
+ """Core source-agnostic framework primitives."""
2
+
3
+ from bagelquant_data.core.dataset import DatasetSpec
4
+ from bagelquant_data.core.exceptions import (
5
+ BagelQuantDataError,
6
+ ConfigurationError,
7
+ DatasetNotFoundError,
8
+ DatasetSpecError,
9
+ DestructiveOperationError,
10
+ DuplicateResolutionError,
11
+ SourceNotFoundError,
12
+ ValidationError,
13
+ )
14
+ from bagelquant_data.core.hashing import frame_content_hash, stable_bucket, stable_record_hash
15
+ from bagelquant_data.core.registry import FrameworkRegistries, Registry, default_registries
16
+ from bagelquant_data.core.request import RequestContext
17
+ from bagelquant_data.core.source import DataSource
18
+
19
+ __all__ = [
20
+ "BagelQuantDataError",
21
+ "ConfigurationError",
22
+ "DataSource",
23
+ "DatasetNotFoundError",
24
+ "DatasetSpec",
25
+ "DatasetSpecError",
26
+ "DestructiveOperationError",
27
+ "DuplicateResolutionError",
28
+ "FrameworkRegistries",
29
+ "Registry",
30
+ "RequestContext",
31
+ "SourceNotFoundError",
32
+ "ValidationError",
33
+ "default_registries",
34
+ "frame_content_hash",
35
+ "stable_bucket",
36
+ "stable_record_hash",
37
+ ]
@@ -0,0 +1,170 @@
1
+ """Dataset specification model."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ from bagelquant_data.core.exceptions import DatasetSpecError
10
+
11
+
12
+ @dataclass(frozen=True, slots=True)
13
+ class DatasetSpec:
14
+ """Declarative canonical dataset behavior."""
15
+
16
+ name: str
17
+ source: str
18
+ source_dataset: str
19
+ category: str
20
+ field_mapping: dict[str, str]
21
+ required_columns: tuple[str, ...]
22
+ primary_key: tuple[str, ...] | None = None
23
+ business_key: tuple[str, ...] | None = None
24
+ asset_column: str | None = None
25
+ time_column: str | None = None
26
+ period_column: str | None = None
27
+ request_planner: str = "snapshot"
28
+ request_options: dict[str, Any] = field(default_factory=dict)
29
+ normalizer: str = "standard"
30
+ deduplication: str = "exact_record_hash"
31
+ partition_strategy: str = "single_file"
32
+ partition_options: dict[str, Any] = field(default_factory=dict)
33
+ update_mode: str = "upsert"
34
+ sort_columns: tuple[str, ...] = ()
35
+ point_in_time: bool = False
36
+ reference: bool = False
37
+ enabled: bool = True
38
+
39
+ @classmethod
40
+ def from_mapping(cls, value: dict[str, Any]) -> "DatasetSpec":
41
+ """Build a specification from parsed config data."""
42
+
43
+ required = ("name", "source", "source_dataset", "category")
44
+ missing = [key for key in required if key not in value]
45
+ if missing:
46
+ raise DatasetSpecError(f"Dataset spec missing required keys: {missing}")
47
+ return cls(
48
+ name=str(value["name"]),
49
+ source=str(value["source"]),
50
+ source_dataset=str(value["source_dataset"]),
51
+ category=str(value["category"]),
52
+ field_mapping=dict(value.get("field_mapping") or {}),
53
+ required_columns=_tuple(value.get("required_columns")),
54
+ primary_key=_optional_tuple(value.get("primary_key")),
55
+ business_key=_optional_tuple(value.get("business_key")),
56
+ asset_column=_optional_str(value.get("asset_column")),
57
+ time_column=_optional_str(value.get("time_column")),
58
+ period_column=_optional_str(value.get("period_column")),
59
+ request_planner=str(value.get("request_planner") or "snapshot"),
60
+ request_options=dict(value.get("request_options") or {}),
61
+ normalizer=str(value.get("normalizer") or "standard"),
62
+ deduplication=str(value.get("deduplication") or "exact_record_hash"),
63
+ partition_strategy=str(value.get("partition_strategy") or "single_file"),
64
+ partition_options=dict(value.get("partition_options") or {}),
65
+ update_mode=str(value.get("update_mode") or "upsert"),
66
+ sort_columns=_tuple(value.get("sort_columns")),
67
+ point_in_time=bool(value.get("point_in_time", False)),
68
+ reference=bool(value.get("reference", False)),
69
+ enabled=bool(value.get("enabled", True)),
70
+ )
71
+
72
+ @classmethod
73
+ def from_yaml(cls, path: str | Path) -> "DatasetSpec":
74
+ """Load a dataset spec from a small YAML file.
75
+
76
+ The project intentionally avoids a YAML runtime dependency. This parser
77
+ supports the simple mappings and lists used by bundled dataset specs.
78
+ """
79
+
80
+ return cls.from_mapping(_parse_simple_yaml(Path(path).read_text()))
81
+
82
+ @property
83
+ def key(self) -> tuple[str, str]:
84
+ """Return the `(source, name)` lookup key."""
85
+
86
+ return (self.source, self.name)
87
+
88
+
89
+ def _tuple(value: Any) -> tuple[str, ...]:
90
+ if value is None:
91
+ return ()
92
+ if isinstance(value, str):
93
+ return (value,)
94
+ return tuple(str(item) for item in value)
95
+
96
+
97
def _optional_tuple(value: Any) -> tuple[str, ...] | None:
    """Like ``_tuple`` but return ``None`` instead of an empty tuple."""
    items = _tuple(value)
    return items if items else None
100
+
101
+
102
+ def _optional_str(value: Any) -> str | None:
103
+ return None if value is None else str(value)
104
+
105
+
106
+ def _parse_simple_yaml(text: str) -> dict[str, Any]:
107
+ rows = [line.rstrip() for line in text.splitlines()]
108
+ root: dict[str, Any] = {}
109
+ stack: list[tuple[int, dict[str, Any] | list[Any]]] = [(-1, root)]
110
+ for raw in rows:
111
+ if not raw.strip() or raw.lstrip().startswith("#"):
112
+ continue
113
+ indent = len(raw) - len(raw.lstrip(" "))
114
+ line = raw.strip()
115
+ while stack and indent <= stack[-1][0]:
116
+ stack.pop()
117
+ parent = stack[-1][1]
118
+ if line.startswith("- "):
119
+ if not isinstance(parent, list):
120
+ raise DatasetSpecError(f"Unsupported YAML list location: {line}")
121
+ parent.append(_yaml_scalar(line[2:]))
122
+ continue
123
+ key, sep, value = line.partition(":")
124
+ if not sep:
125
+ raise DatasetSpecError(f"Unsupported YAML line: {line}")
126
+ key = key.strip()
127
+ value = value.strip()
128
+ if value == "":
129
+ container: dict[str, Any] | list[Any]
130
+ next_list = _next_content_is_list(rows, raw)
131
+ container = [] if next_list else {}
132
+ if isinstance(parent, dict):
133
+ parent[key] = container
134
+ else:
135
+ raise DatasetSpecError(f"Unsupported nested YAML key: {key}")
136
+ stack.append((indent, container))
137
+ elif isinstance(parent, dict):
138
+ parent[key] = _yaml_scalar(value)
139
+ return root
140
+
141
+
142
+ def _next_content_is_list(rows: list[str], current: str) -> bool:
143
+ index = rows.index(current)
144
+ current_indent = len(current) - len(current.lstrip(" "))
145
+ for row in rows[index + 1 :]:
146
+ if not row.strip() or row.lstrip().startswith("#"):
147
+ continue
148
+ indent = len(row) - len(row.lstrip(" "))
149
+ return indent > current_indent and row.strip().startswith("- ")
150
+ return False
151
+
152
+
153
+ def _yaml_scalar(value: str) -> Any:
154
+ if value in {"true", "True"}:
155
+ return True
156
+ if value in {"false", "False"}:
157
+ return False
158
+ if value in {"null", "None", "~"}:
159
+ return None
160
+ if value.startswith("[") and value.endswith("]"):
161
+ inner = value[1:-1].strip()
162
+ return [] if not inner else [_yaml_scalar(part.strip()) for part in inner.split(",")]
163
+ if (value.startswith('"') and value.endswith('"')) or (
164
+ value.startswith("'") and value.endswith("'")
165
+ ):
166
+ return value[1:-1]
167
+ try:
168
+ return int(value)
169
+ except ValueError:
170
+ return value
@@ -0,0 +1,40 @@
1
+ """Deduplication strategies."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Protocol
6
+
7
+ import polars as pl
8
+
9
+ from bagelquant_data.core.dataset import DatasetSpec
10
+
11
+
12
class DeduplicationStrategy(Protocol):
    """Structural interface for objects that deduplicate dataset records."""

    def apply(self, frame: pl.LazyFrame, spec: DatasetSpec) -> pl.LazyFrame:
        """Return ``frame`` with duplicates removed according to ``spec``."""
        ...
18
+
19
+
20
class NoDeduplication:
    """Strategy that performs no deduplication at all."""

    def apply(self, frame: pl.LazyFrame, spec: DatasetSpec) -> pl.LazyFrame:
        """Return the given frame untouched; ``spec`` is ignored."""
        unchanged = frame
        return unchanged
25
+
26
+
27
class ExactRecordHashDeduplication:
    """Drop rows that are exact duplicates across all columns."""

    def apply(self, frame: pl.LazyFrame, spec: DatasetSpec) -> pl.LazyFrame:
        """Return unique rows of ``frame`` in their original order."""
        deduplicated = frame.unique(maintain_order=True)
        return deduplicated
32
+
33
+
34
class PrimaryKeyLastDeduplication:
    """Keep the last row seen for each primary key."""

    def apply(self, frame: pl.LazyFrame, spec: DatasetSpec) -> pl.LazyFrame:
        """Deduplicate on ``spec.primary_key``, keeping the last occurrence.

        When the spec has no primary key, fall back to dropping exact
        duplicate rows. Row order is maintained in both cases.
        """
        if spec.primary_key:
            key_columns = list(spec.primary_key)
            return frame.unique(subset=key_columns, keep="last", maintain_order=True)
        return frame.unique(maintain_order=True)
@@ -0,0 +1,39 @@
1
+ """Framework exceptions."""
2
+
3
+ from __future__ import annotations
4
+
5
+
6
class BagelQuantDataError(Exception):
    """Root of every exception raised by this package."""


class ConfigurationError(BagelQuantDataError):
    """Raised when configuration is invalid or incomplete."""


class DatasetSpecError(ConfigurationError):
    """Raised when a dataset specification is invalid."""


class DatasetNotFoundError(BagelQuantDataError):
    """Raised when a dataset is not registered or has no canonical data."""


class SourceNotFoundError(BagelQuantDataError):
    """Raised when the requested source is not registered."""


class DataSourceError(BagelQuantDataError):
    """Raised when a source adapter fails."""


class ValidationError(BagelQuantDataError):
    """Raised when data does not pass validation."""


class DuplicateResolutionError(BagelQuantDataError):
    """Raised when a single-value panel needs duplicates resolved first."""


class DestructiveOperationError(BagelQuantDataError):
    """Raised when a destructive operation lacks explicit confirmation."""
@@ -0,0 +1,32 @@
1
+ """Stable hashing helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import hashlib
6
+ import json
7
+ from collections.abc import Iterable
8
+
9
+ import polars as pl
10
+
11
+
12
def stable_bucket(asset_id: str, bucket_count: int) -> int:
    """Return a deterministic asset bucket.

    The bucket is the 8-byte BLAKE2b digest of the UTF-8 encoded
    ``asset_id``, read as a big-endian integer, modulo ``bucket_count``.

    Args:
        asset_id: Identifier to place into a bucket.
        bucket_count: Number of buckets; must be positive.

    Returns:
        An integer in ``range(bucket_count)``.

    Raises:
        ValueError: If ``bucket_count`` is zero or negative. Without this
            check zero fails with ``ZeroDivisionError`` and a negative
            count yields non-positive bucket numbers.
    """

    if bucket_count <= 0:
        raise ValueError(f"bucket_count must be a positive integer, got {bucket_count}")
    digest = hashlib.blake2b(asset_id.encode("utf-8"), digest_size=8).digest()
    return int.from_bytes(digest, byteorder="big") % bucket_count
17
+
18
+
19
def stable_record_hash(values: dict[str, object]) -> str:
    """Return a 16-byte BLAKE2b hex digest of a record.

    The record is serialized as compact JSON with sorted keys, so the hash
    does not depend on key order. Values JSON cannot encode are converted
    with ``str``.
    """

    encoded = json.dumps(
        values,
        separators=(",", ":"),
        default=str,
        sort_keys=True,
    ).encode("utf-8")
    hasher = hashlib.blake2b(digest_size=16)
    hasher.update(encoded)
    return hasher.hexdigest()
24
+
25
+
26
def frame_content_hash(frame: pl.DataFrame, columns: Iterable[str] | None = None) -> str:
    """Return a 16-byte BLAKE2b hex digest of a dataframe's content.

    The selected columns (all columns when ``columns`` is falsy) are sorted
    row-wise by those same columns before hashing, so input row order does
    not affect the digest.
    """

    selected = list(columns) if columns else list(frame.columns)
    ordered = frame.select(selected).sort(selected)
    encoded = json.dumps(
        ordered.to_dicts(),
        separators=(",", ":"),
        default=str,
        sort_keys=True,
    ).encode("utf-8")
    return hashlib.blake2b(encoded, digest_size=16).hexdigest()
@@ -0,0 +1,67 @@
1
+ """Canonical normalization."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from typing import Protocol
7
+
8
+ import polars as pl
9
+
10
+ from bagelquant_data.core.dataset import DatasetSpec
11
+
12
+
13
+ @dataclass(frozen=True, slots=True)
14
+ class NormalizeContext:
15
+ """Normalization context."""
16
+
17
+ source: str
18
+ dataset: str
19
+ run_id: str | None = None
20
+
21
+
22
+ @dataclass(frozen=True, slots=True)
23
+ class NormalizeResult:
24
+ """Accepted and rejected normalized records."""
25
+
26
+ accepted: pl.LazyFrame
27
+ rejected: pl.LazyFrame
28
+
29
+
30
class Normalizer(Protocol):
    """Structural interface every dataset normalizer implements."""

    def normalize(
        self,
        frame: pl.LazyFrame,
        spec: DatasetSpec,
        context: NormalizeContext,
    ) -> NormalizeResult:
        """Turn source rows into accepted and rejected canonical rows."""
        ...
38
+
39
+
40
class StandardNormalizer:
    """Map configured source fields into canonical columns."""

    def normalize(
        self, frame: pl.LazyFrame, spec: DatasetSpec, context: NormalizeContext
    ) -> NormalizeResult:
        """Rename source columns and add the canonical columns.

        Columns are renamed with ``spec.field_mapping``. Constant ``source``
        and ``source_dataset`` columns are always added. ``asset_id``,
        ``time`` and ``period`` are added only when the matching column is
        configured on the spec and present after renaming.

        Args:
            frame: Lazy frame of source rows.
            spec: Dataset specification driving the mapping.
            context: Supplies the value of the ``source`` column.

        Returns:
            A result whose ``accepted`` frame holds every row and whose
            ``rejected`` frame has the same columns but no rows.
        """
        lf = frame.rename(spec.field_mapping)
        # Resolve the lazy schema once instead of once per optional column;
        # the renamed frame does not change between the three checks.
        available = set(lf.collect_schema().names())
        expressions: list[pl.Expr] = [
            pl.lit(context.source).alias("source"),
            pl.lit(spec.source_dataset).alias("source_dataset"),
        ]
        if spec.asset_column and spec.asset_column in available:
            expressions.append(pl.col(spec.asset_column).cast(pl.String).alias("asset_id"))
        if spec.time_column and spec.time_column in available:
            expressions.append(_date_expr(spec.time_column).alias("time"))
        if spec.period_column and spec.period_column in available:
            expressions.append(_date_expr(spec.period_column).alias("period"))
        accepted = lf.with_columns(expressions)
        # An always-false filter keeps the schema and yields zero rows.
        rejected = accepted.filter(pl.lit(False))
        return NormalizeResult(accepted=accepted, rejected=rejected)
60
+
61
+
62
def _date_expr(column: str) -> pl.Expr:
    """Build an expression converting ``column`` to a date.

    Values whose string form has exactly eight characters are parsed with
    the ``%Y%m%d`` format; all other values are cast to ``pl.Date``. Both
    conversions are non-strict.
    """
    as_text = pl.col(column).cast(pl.String)
    is_compact = as_text.str.len_chars() == 8
    parsed = as_text.str.strptime(pl.Date, "%Y%m%d", strict=False)
    fallback = pl.col(column).cast(pl.Date, strict=False)
    return pl.when(is_compact).then(parsed).otherwise(fallback)
@@ -0,0 +1,76 @@
1
+ """Partition strategies."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Protocol
7
+
8
+ import polars as pl
9
+
10
+ from bagelquant_data.core.dataset import DatasetSpec
11
+ from bagelquant_data.core.hashing import stable_bucket
12
+
13
+
14
class PartitionStrategy(Protocol):
    """Structural interface for deriving partition columns and paths."""

    def derive_columns(self, frame: pl.LazyFrame, spec: DatasetSpec) -> pl.LazyFrame:
        """Return ``frame`` with the partition columns added."""
        ...

    def paths_for_query(self, spec: DatasetSpec, query: object) -> list[Path]:
        """Return the candidate partition paths for ``query``."""
        ...

    def path_for_values(self, spec: DatasetSpec, values: dict[str, object]) -> Path:
        """Return the partition path for the given partition values."""
        ...
28
+
29
+
30
class SingleFilePartition:
    """Store each dataset in a single ``data.parquet`` file."""

    def derive_columns(self, frame: pl.LazyFrame, spec: DatasetSpec) -> pl.LazyFrame:
        """Return ``frame`` unchanged; no partition columns are needed."""
        return frame

    def paths_for_query(self, spec: DatasetSpec, query: object) -> list[Path]:
        """Return a one-element list holding the single data file path."""
        only_file = Path("data.parquet")
        return [only_file]

    def path_for_values(self, spec: DatasetSpec, values: dict[str, object]) -> Path:
        """Return the single data file path regardless of ``values``."""
        only_file = Path("data.parquet")
        return only_file
41
+
42
+
43
class YearMonthPartition:
    """Partition by the year and month of the canonical ``time`` column."""

    def derive_columns(self, frame: pl.LazyFrame, spec: DatasetSpec) -> pl.LazyFrame:
        """Add ``year`` (Int16) and ``month`` (Int8) columns taken from ``time``."""
        year_column = pl.col("time").dt.year().cast(pl.Int16).alias("year")
        month_column = pl.col("time").dt.month().cast(pl.Int8).alias("month")
        return frame.with_columns(year_column, month_column)

    def paths_for_query(self, spec: DatasetSpec, query: object) -> list[Path]:
        """Return an empty list of candidate paths."""
        return []

    def path_for_values(self, spec: DatasetSpec, values: dict[str, object]) -> Path:
        """Return ``year=<year>/month=<MM>/data.parquet`` with a zero-padded month."""
        month = int(str(values["month"]))
        return Path(f"year={values['year']}", f"month={month:02d}", "data.parquet")
57
+
58
+
59
class YearBucketPartition:
    """Partition by the year of ``time`` and a stable bucket of ``asset_id``."""

    def derive_columns(self, frame: pl.LazyFrame, spec: DatasetSpec) -> pl.LazyFrame:
        """Add ``year`` (Int16) and ``bucket`` (Int16) columns.

        The number of buckets comes from ``spec.partition_options`` under
        ``"bucket_count"`` and defaults to 32.
        """
        bucket_count = int(spec.partition_options.get("bucket_count", 32))

        def bucket_of(value):
            return stable_bucket(value, bucket_count)

        year_column = pl.col("time").dt.year().cast(pl.Int16).alias("year")
        bucket_column = (
            pl.col("asset_id")
            .cast(pl.String)
            .map_elements(bucket_of, return_dtype=pl.Int16)
            .alias("bucket")
        )
        return frame.with_columns(year_column, bucket_column)

    def paths_for_query(self, spec: DatasetSpec, query: object) -> list[Path]:
        """Return an empty list of candidate paths."""
        return []

    def path_for_values(self, spec: DatasetSpec, values: dict[str, object]) -> Path:
        """Return ``year=<year>/bucket=<BB>/data.parquet`` with a zero-padded bucket."""
        bucket = int(str(values["bucket"]))
        return Path(f"year={values['year']}", f"bucket={bucket:02d}", "data.parquet")
@@ -0,0 +1,67 @@
1
+ """Plugin registries."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import Generic, TypeVar
7
+
8
+ from bagelquant_data.core.deduplication import (
9
+ ExactRecordHashDeduplication,
10
+ NoDeduplication,
11
+ PrimaryKeyLastDeduplication,
12
+ )
13
+ from bagelquant_data.core.normalization import StandardNormalizer
14
+ from bagelquant_data.core.partitioning import (
15
+ SingleFilePartition,
16
+ YearBucketPartition,
17
+ YearMonthPartition,
18
+ )
19
+ from bagelquant_data.core.validation import FrameworkValidator
20
+
21
T = TypeVar("T")


@dataclass
class Registry(Generic[T]):
    """Registry mapping names to objects of one kind."""

    _items: dict[str, T] = field(default_factory=dict)

    def register(self, name: str, value: T) -> None:
        """Store ``value`` under ``name``, replacing any previous entry."""
        self._items[name] = value

    def get(self, name: str) -> T:
        """Return the object registered under ``name``.

        Raises:
            KeyError: With an explanatory message when ``name`` is unknown;
                the original lookup error is attached as its cause.
        """
        try:
            found = self._items[name]
        except KeyError as exc:
            raise KeyError(f"Unknown registry item: {name}") from exc
        else:
            return found

    def list(self) -> tuple[str, ...]:
        """Return all registered names in sorted order."""
        names = sorted(self._items)
        return tuple(names)
41
+
42
+
43
@dataclass
class FrameworkRegistries:
    """Bundle of every extension registry the framework uses.

    Each field defaults to its own freshly created, empty registry.
    """

    sources: Registry[object] = field(default_factory=Registry[object])
    normalizers: Registry[object] = field(default_factory=Registry[object])
    validators: Registry[object] = field(default_factory=Registry[object])
    partition_strategies: Registry[object] = field(default_factory=Registry[object])
    deduplication_strategies: Registry[object] = field(default_factory=Registry[object])
    financial_fields: Registry[object] = field(default_factory=Registry[object])
53
+
54
+
55
def default_registries() -> FrameworkRegistries:
    """Create registries pre-populated with the built-in plugins.

    Registers the ``standard`` normalizer, the ``framework`` validator,
    three partition strategies and three deduplication strategies.
    """

    registries = FrameworkRegistries()
    registries.normalizers.register("standard", StandardNormalizer())
    registries.validators.register("framework", FrameworkValidator())

    built_in_partitions = (
        ("single_file", SingleFilePartition()),
        ("year_month", YearMonthPartition()),
        ("year_bucket", YearBucketPartition()),
    )
    for label, strategy in built_in_partitions:
        registries.partition_strategies.register(label, strategy)

    built_in_deduplication = (
        ("none", NoDeduplication()),
        ("exact_record_hash", ExactRecordHashDeduplication()),
        ("primary_key_last", PrimaryKeyLastDeduplication()),
    )
    for label, strategy in built_in_deduplication:
        registries.deduplication_strategies.register(label, strategy)

    return registries
@@ -0,0 +1,21 @@
1
+ """Request planning models."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Mapping, Sequence
6
+ from dataclasses import dataclass, field
7
+ from typing import Any
8
+
9
+ from bagelquant_data.core.types import DateLike
10
+
11
+
12
+ @dataclass(frozen=True, slots=True)
13
+ class RequestContext:
14
+ """Context passed to request planners and source adapters."""
15
+
16
+ source: str
17
+ dataset: str
18
+ start: DateLike | None = None
19
+ end: DateLike | None = None
20
+ assets: Sequence[str] | None = None
21
+ options: Mapping[str, Any] = field(default_factory=dict)
@@ -0,0 +1,38 @@
1
+ """Source adapter protocol."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Iterable, Mapping
6
+ from typing import Any, Protocol
7
+
8
+ import polars as pl
9
+
10
+ from bagelquant_data.core.dataset import DatasetSpec
11
+ from bagelquant_data.core.request import RequestContext
12
+
13
+
14
class DataSource(Protocol):
    """Structural interface for an external data source adapter."""

    @property
    def name(self) -> str:
        """Name identifying this source."""
        ...

    def configure(self, **options: Any) -> None:
        """Apply credentials and runtime options."""
        ...

    def test_connection(self) -> None:
        """Raise if the source cannot be reached."""
        ...

    def fetch(self, source_dataset: str, request: Mapping[str, Any]) -> pl.DataFrame:
        """Fetch the response for a single request against ``source_dataset``."""
        ...

    def plan_requests(
        self,
        dataset: DatasetSpec,
        context: RequestContext,
    ) -> Iterable[Mapping[str, Any]]:
        """Produce the source requests needed for ``dataset``."""
        ...
@@ -0,0 +1,10 @@
1
+ """Shared type aliases."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from datetime import date, datetime
6
+ from os import PathLike
7
+ from typing import TypeAlias
8
+
9
+ DateLike: TypeAlias = str | date | datetime
10
+ PathLikeStr: TypeAlias = str | PathLike[str]