featkit 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- featkit/__init__.py +1 -0
- featkit/builders/.gitkeep +0 -0
- featkit/builders/__init__.py +0 -0
- featkit/builders/distributional_space.py +77 -0
- featkit/builders/pivot_space.py +102 -0
- featkit/builders/temporal_space.py +86 -0
- featkit/config.py +38 -0
- featkit/contracts/__init__.py +1 -0
- featkit/contracts/measurement/.gitkeep +0 -0
- featkit/contracts/measurement/__init__.py +27 -0
- featkit/contracts/measurement/base.py +47 -0
- featkit/contracts/measurement/defaults.py +117 -0
- featkit/contracts/output/.gitkeep +0 -0
- featkit/contracts/output/__init__.py +19 -0
- featkit/contracts/output/base.py +36 -0
- featkit/contracts/output/defaults.py +80 -0
- featkit/dataset/.gitkeep +0 -0
- featkit/dataset/__init__.py +0 -0
- featkit/dataset/base.py +120 -0
- featkit/enums.py +110 -0
- featkit/fields/.gitkeep +0 -0
- featkit/fields/__init__.py +9 -0
- featkit/fields/base.py +48 -0
- featkit/fields/categorical_field.py +55 -0
- featkit/fields/id_field.py +14 -0
- featkit/fields/measurement_field.py +42 -0
- featkit/fields/time_field.py +43 -0
- featkit/generators/__init__.py +0 -0
- featkit/generators/base.py +171 -0
- featkit/generators/output.py +118 -0
- featkit/generators/pyspark/.gitkeep +0 -0
- featkit/generators/pyspark/__init__.py +0 -0
- featkit/generators/pyspark/databricks.py +448 -0
- featkit/generators/sql/.gitkeep +0 -0
- featkit/generators/sql/__init__.py +0 -0
- featkit/generators/sql/base.py +496 -0
- featkit/generators/sql/databricks.py +19 -0
- featkit/generators/sql/snowflake.py +19 -0
- featkit/generators/sql/spark_sql.py +19 -0
- featkit/layer2/.gitkeep +0 -0
- featkit/layer2/__init__.py +0 -0
- featkit/layer2/base.py +86 -0
- featkit/layer2/distributional.py +51 -0
- featkit/layer2/pivoted.py +63 -0
- featkit/layer3/.gitkeep +0 -0
- featkit/layer3/__init__.py +0 -0
- featkit/layer3/temporal_feature.py +87 -0
- featkit/pipeline.py +63 -0
- featkit-0.1.0.dist-info/METADATA +140 -0
- featkit-0.1.0.dist-info/RECORD +52 -0
- featkit-0.1.0.dist-info/WHEEL +4 -0
- featkit-0.1.0.dist-info/licenses/LICENSE +21 -0
featkit/dataset/base.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
"""Dataset descriptors — pure schema metadata for the source facts table."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from abc import ABC
|
|
6
|
+
|
|
7
|
+
from featkit.enums import FieldRole
|
|
8
|
+
from featkit.fields.base import AbstractField
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class AbstractDataset(ABC):  # noqa: B024
    """Schema descriptor for an input facts table.

    Holds pure metadata — no data access, no materialisation. The role-based
    properties filter :attr:`fields` by :class:`~featkit.enums.FieldRole`;
    :meth:`validate` asserts structural consistency of the whole schema.

    Args:
        source_reference: Fully-qualified table name or SQL string that
            identifies the source of the facts.
        fields: All columns in the facts table. The sequence is copied, so
            later changes to the caller's list do not affect the dataset.
    """

    def __init__(self, source_reference: str, fields: list[AbstractField]) -> None:
        self.source_reference = source_reference
        self.fields: list[AbstractField] = list(fields)

    # ------------------------------------------------------------------
    # Shared rules
    # ------------------------------------------------------------------

    @staticmethod
    def _time_cardinality_violation(time_count: int) -> str | None:
        """Describe why *time_count* TIME fields is invalid, or return ``None``.

        Single source of truth for the "exactly one TIME field" rule, used by
        both :attr:`time_field` and :meth:`validate` so that their messages
        stay identical.
        """
        if time_count == 0:
            return "no TIME field; exactly one is required"
        if time_count > 1:
            return f"{time_count} TIME fields; exactly one is required"
        return None

    # ------------------------------------------------------------------
    # Derived field-role properties
    # ------------------------------------------------------------------

    @property
    def id_fields(self) -> list[AbstractField]:
        """All fields whose role is :attr:`~featkit.enums.FieldRole.ID`."""
        return [f for f in self.fields if f.role == FieldRole.ID]

    @property
    def time_field(self) -> AbstractField:
        """The single TIME field in this dataset.

        Raises:
            ValueError: If no TIME field exists or more than one does.
        """
        time_fields = [f for f in self.fields if f.role == FieldRole.TIME]
        violation = self._time_cardinality_violation(len(time_fields))
        if violation is not None:
            raise ValueError(violation)
        return time_fields[0]

    @property
    def categorical_fields(self) -> list[AbstractField]:
        """All fields whose role is :attr:`~featkit.enums.FieldRole.CATEGORICAL`."""
        return [f for f in self.fields if f.role == FieldRole.CATEGORICAL]

    @property
    def measurement_fields(self) -> list[AbstractField]:
        """All fields whose role is :attr:`~featkit.enums.FieldRole.MEASUREMENT`."""
        return [f for f in self.fields if f.role == FieldRole.MEASUREMENT]

    # ------------------------------------------------------------------
    # Validation
    # ------------------------------------------------------------------

    def validate(self) -> None:
        """Assert structural consistency of the dataset schema.

        Checks:
            - Exactly one TIME field
            - At least one ID field
            - At least one MEASUREMENT field
            - No duplicate field names

        All checks run before anything is raised, so a single error reports
        every problem at once.

        Raises:
            ValueError: With a description of every violation found.
        """
        violations: list[str] = []

        time_count = sum(1 for f in self.fields if f.role == FieldRole.TIME)
        time_violation = self._time_cardinality_violation(time_count)
        if time_violation is not None:
            violations.append(time_violation)

        if not any(f.role == FieldRole.ID for f in self.fields):
            violations.append("no ID field found; at least one is required")

        if not any(f.role == FieldRole.MEASUREMENT for f in self.fields):
            violations.append("no MEASUREMENT field found; at least one is required")

        # A name is a duplicate as soon as it is seen a second time; the set
        # keeps each offending name listed once however often it repeats.
        seen: set[str] = set()
        duplicates: set[str] = set()
        for f in self.fields:
            if f.name in seen:
                duplicates.add(f.name)
            seen.add(f.name)
        if duplicates:
            violations.append(f"duplicate field name(s): {', '.join(sorted(duplicates))}")

        if violations:
            raise ValueError(
                "Dataset validation failed:\n" + "\n".join(f" - {v}" for v in violations)
            )

    def __repr__(self) -> str:
        # Report the field count rather than the fields themselves to keep the
        # representation short for wide tables.
        return (
            f"{type(self).__name__}("
            f"source_reference={self.source_reference!r}, "
            f"fields={len(self.fields)})"
        )
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
class SimpleDataset(AbstractDataset):
    """Ready-to-use dataset descriptor for declaring a schema inline.

    Takes ``source_reference`` and ``fields`` as constructor arguments and
    forwards them unchanged to :class:`AbstractDataset`, so callers do not
    have to write their own subclass.
    """

    def __init__(self, source_reference: str, fields: list[AbstractField]) -> None:
        # No extra state: everything is handled by the base initialiser.
        super().__init__(source_reference=source_reference, fields=fields)
|
featkit/enums.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
"""Enumerators that form the type system of the featkit framework."""
|
|
2
|
+
|
|
3
|
+
from enum import Enum
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class FieldRole(Enum):
    """Role of a column in the source facts table.

    Each concrete field descriptor in ``featkit.fields`` reports exactly one
    of these values from its ``role`` property.
    """

    ID = "ID"  # reported by IDField
    TIME = "TIME"  # reported by TimeField
    CATEGORICAL = "CATEGORICAL"  # reported by CategoricalField
    MEASUREMENT = "MEASUREMENT"  # reported by MeasurementField
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class MeasurementType(Enum):
    """Semantic type of a measurement column.

    Governs which :class:`~featkit.enums.Layer2Aggregator` values are valid
    for that column at the Layer 1 → Layer 2 boundary.
    """

    # NOTE(review): several member names are Spanish words; the glosses below
    # are literal translations only. The precise semantics of each type are
    # defined by the measurement contracts, not in this module — confirm there.
    MONTO = "MONTO"  # Spanish: "amount"
    CANTIDAD = "CANTIDAD"  # Spanish: "quantity"
    TICKET = "TICKET"
    FLAG = "FLAG"
    FECHA = "FECHA"  # Spanish: "date"
    BALANCE = "BALANCE"
    TIME_DIFF = "TIME_DIFF"
    ESTADISTICO = "ESTADISTICO"  # Spanish: "statistic"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class TimeGranularity(Enum):
    """Temporal granularity of a time column.

    ``TimeField`` stores one value for the source column and one for the
    target; it compares the two to decide whether truncation is required.
    """

    DAILY = "DAILY"
    WEEKLY = "WEEKLY"
    MONTHLY = "MONTHLY"
    QUARTERLY = "QUARTERLY"
    YEARLY = "YEARLY"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class CategoricalTreatment(Enum):
    """How a categorical column is used when building Layer 2."""

    PIVOT = "PIVOT"
    # DISTRIBUTIONAL and BOTH make ``CategoricalField`` require a non-empty
    # list of distributional metrics at construction time.
    DISTRIBUTIONAL = "DISTRIBUTIONAL"
    BOTH = "BOTH"
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class Layer2Aggregator(Enum):
    """SQL aggregation function applied when collapsing facts into Layer 2 columns.

    Each member's value is the same string as its name.
    """

    SUM = "SUM"
    COUNT = "COUNT"
    MAX = "MAX"
    MIN = "MIN"
    AVG = "AVG"
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class DistributionalMetric(Enum):
    """Statistical metric computed per-categorical in Layer 2B.

    All metrics are implemented as pure SQL aggregate expressions within CTEs —
    no custom UDAFs or stored procedures are required.
    """

    ENTROPY = "ENTROPY"
    HHI = "HHI"  # presumably the Herfindahl–Hirschman index — TODO confirm
    DOMINANT_PROPORTION = "DOMINANT_PROPORTION"
    MODE = "MODE"
    # Same value string as Layer2Aggregator.COUNT, but a distinct member of a
    # distinct enum; the two never compare equal.
    COUNT = "COUNT"
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class Layer2OutputType(Enum):
    """Output type of any Layer 2 column.

    Governs which :class:`~featkit.enums.TemporalOperator` values are valid
    at the Layer 2 → Layer 3 boundary.
    """

    NUMERIC = "NUMERIC"
    FLAG = "FLAG"
    CATEGORICAL = "CATEGORICAL"
    TEMPORAL = "TEMPORAL"
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
class TemporalOperator(Enum):
    """Sliding-window or point-in-time operator applied in Layer 3."""

    # NOTE(review): the member names look like Spanish abbreviations (e.g.
    # PROM ~ "promedio"/average, ULT_MES ~ "último mes"/last month), but
    # neither the operators nor the ``_U`` / ``_P`` suffixes are defined in
    # this module. Confirm the exact semantics against the Layer 3
    # implementation before relying on these readings.
    PROM_U = "PROM_U"
    PROM_P = "PROM_P"
    SUM_U = "SUM_U"
    SUM_P = "SUM_P"
    ULT_MES = "ULT_MES"
    PREV_MES = "PREV_MES"
    CREC = "CREC"
    FREQ = "FREQ"
    MIN_U = "MIN_U"
    MAX_U = "MAX_U"
    REC = "REC"
    XM = "XM"
    MEDIA_ABS = "MEDIA_ABS"
    RATIO = "RATIO"
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
class TimeWindowDirection(Enum):
    """Direction of the temporal sliding window."""

    # NOTE(review): which point in time the window is anchored to is not
    # visible in this module — confirm where the direction is consumed.
    BACKWARD = "BACKWARD"
    FORWARD = "FORWARD"
|
featkit/fields/.gitkeep
ADDED
|
File without changes
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
"""Field descriptors for the source facts table columns."""
|
|
2
|
+
|
|
3
|
+
from featkit.fields.base import AbstractField
|
|
4
|
+
from featkit.fields.categorical_field import CategoricalField
|
|
5
|
+
from featkit.fields.id_field import IDField
|
|
6
|
+
from featkit.fields.measurement_field import MeasurementField
|
|
7
|
+
from featkit.fields.time_field import TimeField
|
|
8
|
+
|
|
9
|
+
# Names re-exported by this package: the abstract base descriptor plus the
# four concrete field descriptors imported above.
__all__ = ["AbstractField", "CategoricalField", "IDField", "MeasurementField", "TimeField"]
|
featkit/fields/base.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""Abstract base class for all column descriptors."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
|
|
7
|
+
from featkit.enums import FieldRole
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class AbstractField(ABC):
    """Common base for every column descriptor of the source facts table.

    A field is schema metadata only: it stores the column name and exposes a
    :attr:`role`, which each concrete subclass pins to a single
    :class:`~featkit.enums.FieldRole` member. Nothing here touches data.

    Equality and hashing are value-based: two fields are equal when they are
    of exactly the same class and their :meth:`_key` tuples match.
    """

    def __init__(self, name: str) -> None:
        self._name = name

    @property
    def name(self) -> str:
        """Name of the column in the source table (read-only)."""
        return self._name

    @property
    @abstractmethod
    def role(self) -> FieldRole:
        """Semantic role of the column; supplied by each concrete subclass."""
        ...

    def __repr__(self) -> str:
        class_name = type(self).__name__
        return f"{class_name}(name={self.name!r})"

    def _key(self) -> tuple[object, ...]:
        """Return the values that define this field's identity.

        Used by both :meth:`__eq__` and :meth:`__hash__`. Subclasses carrying
        further schema-defining metadata override this and return a longer
        tuple.
        """
        return (self.name,)

    def __eq__(self, other: object) -> bool:
        if isinstance(other, AbstractField):
            # Exact class match: a subclass instance never equals its parent.
            if type(self) is not type(other):
                return False
            return self._key() == other._key()
        # Let Python try the reflected comparison for foreign types.
        return NotImplemented

    def __hash__(self) -> int:
        # Mirror __eq__: the concrete class is part of the identity.
        identity = (type(self), self._key())
        return hash(identity)
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""CategoricalField — a column used for pivot or distributional grouping."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from featkit.enums import CategoricalTreatment, DistributionalMetric, FieldRole
|
|
6
|
+
from featkit.fields.base import AbstractField
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class CategoricalField(AbstractField):
    """Descriptor for a categorical column.

    Can participate in Layer 2A (``PIVOT``), Layer 2B (``DISTRIBUTIONAL``), or both.

    Args:
        name: Column name in the source table.
        treatment: How this column is used in Layer 2 construction.
        distributional_metrics: Metrics to compute in Layer 2B. Required when
            *treatment* includes ``DISTRIBUTIONAL``. The sequence is copied.
        allowed_values: Static list of distinct values. When set, the
            :class:`~featkit.builders.pivot_space.PivotSpaceBuilder` uses these
            directly without executing a query. When ``None``, a ``domain_resolver``
            must be supplied to the builder. The sequence is copied.

    Raises:
        ValueError: If *treatment* is ``DISTRIBUTIONAL`` or ``BOTH`` and no
            distributional metrics were given.
    """

    def __init__(
        self,
        name: str,
        treatment: CategoricalTreatment,
        distributional_metrics: list[DistributionalMetric] | None = None,
        allowed_values: list[str] | None = None,
    ) -> None:
        super().__init__(name)
        self.treatment = treatment
        # Both sequences feed _key(), and therefore __eq__/__hash__. Copy them
        # so that later mutation of the caller's lists cannot silently change
        # the identity of a field already stored in a set or dict.
        self.distributional_metrics: list[DistributionalMetric] = list(
            distributional_metrics or ()
        )
        # Keep the None / empty-list distinction: None means "resolve the
        # domain elsewhere", while an empty list is an explicit (empty) domain.
        self.allowed_values: list[str] | None = (
            None if allowed_values is None else list(allowed_values)
        )

        if (
            treatment in (CategoricalTreatment.DISTRIBUTIONAL, CategoricalTreatment.BOTH)
            and not self.distributional_metrics
        ):
            raise ValueError(
                f"CategoricalField {name!r}: distributional_metrics must not be empty "
                f"when treatment is {treatment.name}"
            )

    @property
    def role(self) -> FieldRole:
        """Always :attr:`~featkit.enums.FieldRole.CATEGORICAL`."""
        return FieldRole.CATEGORICAL

    def _key(self) -> tuple[object, ...]:
        """Return the identity tuple: name, treatment, metrics and allowed values.

        Metrics and allowed values are sorted first, so two fields that list
        the same items in a different order compare equal.
        """
        normalized_metrics = tuple(sorted(self.distributional_metrics, key=lambda m: m.value))
        normalized_values = (
            None if self.allowed_values is None else tuple(sorted(self.allowed_values))
        )
        return (self.name, self.treatment, normalized_metrics, normalized_values)
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""IDField — identifies the entity being profiled."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from featkit.enums import FieldRole
|
|
6
|
+
from featkit.fields.base import AbstractField
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class IDField(AbstractField):
    """A column that serves as a GROUP BY key identifying the entity (e.g. customer ID).

    Adds no state beyond the column name inherited from
    :class:`~featkit.fields.base.AbstractField`; it only fixes :attr:`role`.
    """

    @property
    def role(self) -> FieldRole:
        """Always :attr:`~featkit.enums.FieldRole.ID`."""
        return FieldRole.ID
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""MeasurementField — a numeric or typed measurement column."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
6
|
+
|
|
7
|
+
from featkit.enums import FieldRole, MeasurementType
|
|
8
|
+
from featkit.fields.base import AbstractField
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
# AbstractMeasurementTypeContract is defined in Plan 04
|
|
12
|
+
# (contracts/measurement/base.py). The TYPE_CHECKING guard keeps this a
|
|
13
|
+
# static-analysis-only import so the field module has no runtime dependency
|
|
14
|
+
# on the contracts package.
|
|
15
|
+
from featkit.contracts.measurement.base import AbstractMeasurementTypeContract
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class MeasurementField(AbstractField):
    """Column descriptor for a measurement.

    Stores the column's :class:`~featkit.enums.MeasurementType` together with
    an optional contract object that governs which Layer 2 aggregators are
    valid for it. The contract may be left as ``None`` while the schema is
    being defined and supplied later, typically from
    :func:`~featkit.contracts.measurement.defaults.get_default_contract`.

    Args:
        name: Column name in the source table.
        measurement_type: Semantic type of the measurement.
        contract: Optional measurement-type contract; defaults to ``None``.
    """

    def __init__(
        self,
        name: str,
        measurement_type: MeasurementType,
        contract: AbstractMeasurementTypeContract | None = None,
    ) -> None:
        super().__init__(name)
        self.contract = contract
        self.measurement_type = measurement_type

    @property
    def role(self) -> FieldRole:
        """Always :attr:`~featkit.enums.FieldRole.MEASUREMENT`."""
        return FieldRole.MEASUREMENT

    def _key(self) -> tuple[object, ...]:
        """Extend the base identity (the name) with type and contract."""
        base_identity = super()._key()
        return (*base_identity, self.measurement_type, self.contract)
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""TimeField — the temporal axis of the facts table."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from featkit.enums import FieldRole, TimeGranularity
|
|
6
|
+
from featkit.fields.base import AbstractField
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class TimeField(AbstractField):
    """Descriptor for the time column.

    Tracks source and target granularity so downstream layers know whether
    date truncation is required before grouping (e.g. daily source → monthly target).

    Args:
        name: Column name in the source table.
        source_granularity: Granularity of the values as stored in the source.
        target_granularity: Granularity the downstream layers should group by.
    """

    def __init__(
        self,
        name: str,
        source_granularity: TimeGranularity,
        target_granularity: TimeGranularity,
    ) -> None:
        super().__init__(name)
        self.source_granularity = source_granularity
        self.target_granularity = target_granularity

    @property
    def role(self) -> FieldRole:
        """Always :attr:`~featkit.enums.FieldRole.TIME`."""
        return FieldRole.TIME

    @property
    def truncation_required(self) -> bool:
        """``True`` when source and target granularities differ."""
        return self.source_granularity != self.target_granularity

    def _key(self) -> tuple[object, ...]:
        """Return the identity tuple: name plus both granularities."""
        return (self.name, self.source_granularity, self.target_granularity)

    def __repr__(self) -> str:
        # Use the runtime class name, as AbstractField.__repr__ does, so that
        # subclasses are not mislabelled as "TimeField".
        return (
            f"{type(self).__name__}(name={self.name!r}, "
            f"source={self.source_granularity.name}, "
            f"target={self.target_granularity.name})"
        )
|
|
File without changes
|
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
"""AbstractCodeGenerator — engine-agnostic base for all feature-store code generators."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from typing import TYPE_CHECKING
|
|
7
|
+
|
|
8
|
+
from featkit.generators.output import (
|
|
9
|
+
DAG,
|
|
10
|
+
CodeOutput,
|
|
11
|
+
DAGNode,
|
|
12
|
+
FeatureStoreOutput,
|
|
13
|
+
PySparkOutput,
|
|
14
|
+
SQLOutput,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING:
|
|
18
|
+
from featkit.pipeline import FeatureStorePipeline
|
|
19
|
+
|
|
20
|
+
# ---------------------------------------------------------------------------
|
|
21
|
+
# Fixed DAG node names
|
|
22
|
+
# ---------------------------------------------------------------------------
|
|
23
|
+
|
|
24
|
+
# Step names used by AbstractCodeGenerator.build_dag, both as node identifiers
# and as entries in other nodes' dependency lists.
_FACTS_TABLE = "facts_table"  # root node; depends on nothing
_MOB_TABLE = "mob_table"  # depends on facts_table
_LAYER2A_PIVOT = "layer2a_pivot"  # depends on facts_table
_LAYER2B_DIST = "layer2b_distributional_ctes"  # depends on facts_table
_LAYER2_JOIN = "layer2_join"  # depends on layer2a_pivot and layer2b_distributional_ctes
_LAYER3_TEMPORAL = "layer3_temporal"  # depends on layer2_join and mob_table
_FINAL_OUTPUT = "final_output"  # depends on layer2_join and layer3_temporal
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class AbstractCodeGenerator(ABC):
    """Engine-agnostic template for feature-store code generators.

    Concrete generators supply the five abstract ``build_*`` steps. The DAG
    description (:meth:`build_dag`), its Mermaid rendering
    (:meth:`build_mermaid`) and the orchestration (:meth:`generate`) live
    here and are shared by every engine.

    Every generator reports the same fixed DAG:

    .. code-block:: text

        facts_table → mob_table
        facts_table → layer2a_pivot
        facts_table → layer2b_distributional_ctes
        layer2a_pivot + layer2b_distributional_ctes → layer2_join
        layer2_join + mob_table → layer3_temporal
        layer2_join + layer3_temporal → final_output
    """

    # ------------------------------------------------------------------
    # Steps every engine must implement
    # ------------------------------------------------------------------

    @abstractmethod
    def build_mob_table(self, pipeline: FeatureStorePipeline) -> CodeOutput:
        """Produce the code for the Month-on-Books (MOB) reference table."""
        ...

    @abstractmethod
    def build_layer2a(self, pipeline: FeatureStorePipeline) -> CodeOutput:
        """Produce the code for the pivot aggregation (Layer 2a)."""
        ...

    @abstractmethod
    def build_layer2b(self, pipeline: FeatureStorePipeline) -> CodeOutput:
        """Produce the code for the distributional CTEs (Layer 2b)."""
        ...

    @abstractmethod
    def build_layer3(self, pipeline: FeatureStorePipeline) -> CodeOutput:
        """Produce the code for the temporal features (Layer 3)."""
        ...

    @abstractmethod
    def build_final_join(self, pipeline: FeatureStorePipeline) -> CodeOutput:
        """Produce the code for the join that assembles the output feature table."""
        ...

    # ------------------------------------------------------------------
    # Shared DAG / Mermaid rendering
    # ------------------------------------------------------------------

    def build_dag(self, pipeline: FeatureStorePipeline) -> DAG:  # noqa: ARG002
        """Return the fixed execution DAG; *pipeline* is accepted but not used."""
        # (step name, names of the steps it depends on), in declaration order.
        topology = (
            (_FACTS_TABLE, []),
            (_MOB_TABLE, [_FACTS_TABLE]),
            (_LAYER2A_PIVOT, [_FACTS_TABLE]),
            (_LAYER2B_DIST, [_FACTS_TABLE]),
            (_LAYER2_JOIN, [_LAYER2A_PIVOT, _LAYER2B_DIST]),
            (_LAYER3_TEMPORAL, [_LAYER2_JOIN, _MOB_TABLE]),
            (_FINAL_OUTPUT, [_LAYER2_JOIN, _LAYER3_TEMPORAL]),
        )
        return DAG(nodes=[DAGNode(step, parents) for step, parents in topology])

    def build_mermaid(self, dag: DAG) -> str:
        """Render *dag* as a Mermaid ``flowchart TD`` string.

        One ``dep --> step`` line is emitted per dependency, in node order.
        Nodes that take part in no edge are listed afterwards on their own
        line so that they still appear in the chart.
        """
        rendered = ["flowchart TD"]
        connected: set[str] = set()

        for node in dag.nodes:
            for parent in node.depends_on:
                rendered.append(f" {parent} --> {node.step_name}")
                connected.update((parent, node.step_name))

        rendered.extend(
            f" {node.step_name}" for node in dag.nodes if node.step_name not in connected
        )
        return "\n".join(rendered)

    # ------------------------------------------------------------------
    # Orchestration
    # ------------------------------------------------------------------

    def _combine_code(self, steps: list[CodeOutput]) -> CodeOutput:
        """Merge the per-step outputs into a single code output.

        Steps whose payload is empty are ignored. Non-empty payloads of one
        kind are joined with a blank line between them. If every step is
        empty, the result is an empty ``SQLOutput`` (carrying the dialect of
        the first ``SQLOutput`` step) when any step is a ``SQLOutput``, and an
        empty ``PySparkOutput`` otherwise.

        Raises:
            ValueError: If non-empty SQL and PySpark steps are mixed, or if
                the non-empty SQL steps use more than one dialect.
        """
        filled_sql = []
        filled_pyspark = []
        for step in steps:
            if isinstance(step, SQLOutput) and step.sql:
                filled_sql.append(step)
            if isinstance(step, PySparkOutput) and step.code:
                filled_pyspark.append(step)

        if filled_sql and filled_pyspark:
            raise ValueError(
                "Mixed non-empty code output types are not supported in "
                "_combine_code(); found both SQLOutput and PySparkOutput."
            )

        if filled_sql:
            dialects = {step.dialect for step in filled_sql}
            if len(dialects) > 1:
                raise ValueError(
                    f"Mixed SQLOutput dialects are not supported in _combine_code(); "
                    f"found dialects: {dialects}."
                )
            merged_sql = "\n\n".join(step.sql for step in filled_sql)
            return SQLOutput(sql=merged_sql, dialect=filled_sql[0].dialect)

        if filled_pyspark:
            merged_code = "\n\n".join(step.code for step in filled_pyspark)
            return PySparkOutput(code=merged_code)

        # Nothing had a payload: keep the SQL flavour (and dialect) if any
        # step was SQL, otherwise fall back to an empty PySpark output.
        first_sql = next((step for step in steps if isinstance(step, SQLOutput)), None)
        if first_sql is not None:
            return SQLOutput(sql="", dialect=first_sql.dialect)
        return PySparkOutput()

    def generate(self, pipeline: FeatureStorePipeline) -> FeatureStoreOutput:
        """Run every build step in order and bundle code, DAG and Mermaid chart."""
        mob_table = self.build_mob_table(pipeline)
        layer2a = self.build_layer2a(pipeline)
        layer2b = self.build_layer2b(pipeline)
        layer3 = self.build_layer3(pipeline)
        final_join = self.build_final_join(pipeline)

        combined = self._combine_code([mob_table, layer2a, layer2b, layer3, final_join])
        graph = self.build_dag(pipeline)
        chart = self.build_mermaid(graph)
        return FeatureStoreOutput(code=combined, dag=graph, mermaid=chart)
|