featkit 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. featkit/__init__.py +1 -0
  2. featkit/builders/.gitkeep +0 -0
  3. featkit/builders/__init__.py +0 -0
  4. featkit/builders/distributional_space.py +77 -0
  5. featkit/builders/pivot_space.py +102 -0
  6. featkit/builders/temporal_space.py +86 -0
  7. featkit/config.py +38 -0
  8. featkit/contracts/__init__.py +1 -0
  9. featkit/contracts/measurement/.gitkeep +0 -0
  10. featkit/contracts/measurement/__init__.py +27 -0
  11. featkit/contracts/measurement/base.py +47 -0
  12. featkit/contracts/measurement/defaults.py +117 -0
  13. featkit/contracts/output/.gitkeep +0 -0
  14. featkit/contracts/output/__init__.py +19 -0
  15. featkit/contracts/output/base.py +36 -0
  16. featkit/contracts/output/defaults.py +80 -0
  17. featkit/dataset/.gitkeep +0 -0
  18. featkit/dataset/__init__.py +0 -0
  19. featkit/dataset/base.py +120 -0
  20. featkit/enums.py +110 -0
  21. featkit/fields/.gitkeep +0 -0
  22. featkit/fields/__init__.py +9 -0
  23. featkit/fields/base.py +48 -0
  24. featkit/fields/categorical_field.py +55 -0
  25. featkit/fields/id_field.py +14 -0
  26. featkit/fields/measurement_field.py +42 -0
  27. featkit/fields/time_field.py +43 -0
  28. featkit/generators/__init__.py +0 -0
  29. featkit/generators/base.py +171 -0
  30. featkit/generators/output.py +118 -0
  31. featkit/generators/pyspark/.gitkeep +0 -0
  32. featkit/generators/pyspark/__init__.py +0 -0
  33. featkit/generators/pyspark/databricks.py +448 -0
  34. featkit/generators/sql/.gitkeep +0 -0
  35. featkit/generators/sql/__init__.py +0 -0
  36. featkit/generators/sql/base.py +496 -0
  37. featkit/generators/sql/databricks.py +19 -0
  38. featkit/generators/sql/snowflake.py +19 -0
  39. featkit/generators/sql/spark_sql.py +19 -0
  40. featkit/layer2/.gitkeep +0 -0
  41. featkit/layer2/__init__.py +0 -0
  42. featkit/layer2/base.py +86 -0
  43. featkit/layer2/distributional.py +51 -0
  44. featkit/layer2/pivoted.py +63 -0
  45. featkit/layer3/.gitkeep +0 -0
  46. featkit/layer3/__init__.py +0 -0
  47. featkit/layer3/temporal_feature.py +87 -0
  48. featkit/pipeline.py +63 -0
  49. featkit-0.1.0.dist-info/METADATA +140 -0
  50. featkit-0.1.0.dist-info/RECORD +52 -0
  51. featkit-0.1.0.dist-info/WHEEL +4 -0
  52. featkit-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,120 @@
1
+ """Dataset descriptors — pure schema metadata for the source facts table."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from abc import ABC
6
+
7
+ from featkit.enums import FieldRole
8
+ from featkit.fields.base import AbstractField
9
+
10
+
11
class AbstractDataset(ABC):  # noqa: B024
    """Pure-metadata description of the source facts table.

    An instance records where the facts live and which columns they have;
    it never reads or materialises data. The role-based properties are
    filtered views over :attr:`fields`, and :meth:`validate` checks that the
    declared schema is structurally sound.

    Args:
        source_reference: Fully-qualified table name or SQL string that
            identifies the source of the facts.
        fields: Descriptors for all columns in the facts table.
    """

    def __init__(self, source_reference: str, fields: list[AbstractField]) -> None:
        # Copy the list so later changes to the caller's list do not leak in.
        self.fields: list[AbstractField] = list(fields)
        self.source_reference = source_reference

    # ------------------------------------------------------------------
    # Role-filtered views over ``fields``
    # ------------------------------------------------------------------

    @property
    def id_fields(self) -> list[AbstractField]:
        """Fields with role :attr:`~featkit.enums.FieldRole.ID`, in declaration order."""
        return [field for field in self.fields if field.role == FieldRole.ID]

    @property
    def time_field(self) -> AbstractField:
        """The one and only TIME field of this dataset.

        Raises:
            ValueError: If the dataset declares zero TIME fields, or more
                than one.
        """
        candidates = [field for field in self.fields if field.role == FieldRole.TIME]
        found = len(candidates)
        if found == 1:
            return candidates[0]
        if found == 0:
            raise ValueError("no TIME field; exactly one is required")
        raise ValueError(f"{found} TIME fields; exactly one is required")

    @property
    def categorical_fields(self) -> list[AbstractField]:
        """Fields with role :attr:`~featkit.enums.FieldRole.CATEGORICAL`, in declaration order."""
        return [field for field in self.fields if field.role == FieldRole.CATEGORICAL]

    @property
    def measurement_fields(self) -> list[AbstractField]:
        """Fields with role :attr:`~featkit.enums.FieldRole.MEASUREMENT`, in declaration order."""
        return [field for field in self.fields if field.role == FieldRole.MEASUREMENT]

    # ------------------------------------------------------------------
    # Structural validation
    # ------------------------------------------------------------------

    def validate(self) -> None:
        """Check the schema and report every problem in a single error.

        The following rules are enforced:
            - exactly one TIME field
            - at least one ID field
            - at least one MEASUREMENT field
            - field names are unique

        Raises:
            ValueError: If any rule is broken. The message lists all
                violations, one per line, not only the first one found.
        """
        problems: list[str] = []

        time_total = len([field for field in self.fields if field.role == FieldRole.TIME])
        if time_total > 1:
            problems.append(f"{time_total} TIME fields; exactly one is required")
        elif time_total == 0:
            problems.append("no TIME field; exactly one is required")

        has_id = any(field.role == FieldRole.ID for field in self.fields)
        if not has_id:
            problems.append("no ID field found; at least one is required")

        has_measurement = any(field.role == FieldRole.MEASUREMENT for field in self.fields)
        if not has_measurement:
            problems.append("no MEASUREMENT field found; at least one is required")

        # A name goes to ``repeated`` from its second occurrence onwards.
        first_seen: set[str] = set()
        repeated: set[str] = set()
        for field in self.fields:
            bucket = repeated if field.name in first_seen else first_seen
            bucket.add(field.name)
        if repeated:
            problems.append(f"duplicate field name(s): {', '.join(sorted(repeated))}")

        if not problems:
            return
        bullet_list = "\n".join(f" - {problem}" for problem in problems)
        raise ValueError("Dataset validation failed:\n" + bullet_list)

    def __repr__(self) -> str:
        class_name = type(self).__name__
        field_count = len(self.fields)
        return f"{class_name}(source_reference={self.source_reference!r}, fields={field_count})"
111
+
112
+
113
class SimpleDataset(AbstractDataset):
    """Ready-to-use dataset descriptor for declaring a schema directly.

    Takes ``source_reference`` and ``fields`` as-is and forwards them to
    :class:`AbstractDataset`; it adds no state or behaviour of its own, so
    callers do not need to write their own subclass.
    """

    def __init__(self, source_reference: str, fields: list[AbstractField]) -> None:
        super().__init__(source_reference=source_reference, fields=fields)
featkit/enums.py ADDED
@@ -0,0 +1,110 @@
1
+ """Enumerators that form the type system of the featkit framework."""
2
+
3
+ from enum import Enum
4
+
5
+
6
class FieldRole(Enum):
    """Role of a column in the source facts table.

    Each member's value is the member name as a string.
    """

    # Column identifying the entity being profiled.
    ID = "ID"
    # Column holding the temporal axis of the facts table.
    TIME = "TIME"
    # Column used for pivot or distributional grouping.
    CATEGORICAL = "CATEGORICAL"
    # Numeric or otherwise typed measurement column.
    MEASUREMENT = "MEASUREMENT"
13
+
14
+
15
class MeasurementType(Enum):
    """Semantic type of a measurement column.

    Governs which :class:`~featkit.enums.Layer2Aggregator` values are valid
    for that column at the Layer 1 → Layer 2 boundary.

    Several member names are Spanish; the comments below give the literal
    English translation only.
    NOTE(review): the precise business meaning of each type is not defined
    in this module — confirm against the measurement contracts.
    """

    MONTO = "MONTO"  # Spanish: "amount"
    CANTIDAD = "CANTIDAD"  # Spanish: "quantity"
    TICKET = "TICKET"
    FLAG = "FLAG"
    FECHA = "FECHA"  # Spanish: "date"
    BALANCE = "BALANCE"
    TIME_DIFF = "TIME_DIFF"
    ESTADISTICO = "ESTADISTICO"  # Spanish: "statistic"
30
+
31
+
32
class TimeGranularity(Enum):
    """Temporal granularity of a time column.

    Members are declared from finest (``DAILY``) to coarsest (``YEARLY``).
    Each member's value is the member name as a string.
    """

    DAILY = "DAILY"
    WEEKLY = "WEEKLY"
    MONTHLY = "MONTHLY"
    QUARTERLY = "QUARTERLY"
    YEARLY = "YEARLY"
40
+
41
+
42
class CategoricalTreatment(Enum):
    """How a categorical column is used when building Layer 2."""

    # Column takes part in the pivot step of Layer 2.
    PIVOT = "PIVOT"
    # Column takes part in the distributional step of Layer 2.
    DISTRIBUTIONAL = "DISTRIBUTIONAL"
    # Column takes part in both the pivot and the distributional step.
    BOTH = "BOTH"
48
+
49
+
50
class Layer2Aggregator(Enum):
    """SQL aggregation function applied when collapsing facts into Layer 2 columns.

    Each member's value is the member name as a string.
    NOTE(review): presumably the value is emitted verbatim as the SQL
    function name by the generators — confirm against the SQL generator.
    """

    SUM = "SUM"
    COUNT = "COUNT"
    MAX = "MAX"
    MIN = "MIN"
    AVG = "AVG"
58
+
59
+
60
class DistributionalMetric(Enum):
    """Statistical metric computed per-categorical in Layer 2B.

    All metrics are implemented as pure SQL aggregate expressions within CTEs —
    no custom UDAFs or stored procedures are required.

    NOTE(review): the exact formula behind each metric lives in the
    generators, not here; the comments below are assumptions to confirm.
    """

    ENTROPY = "ENTROPY"
    HHI = "HHI"  # presumably the Herfindahl-Hirschman concentration index — confirm
    DOMINANT_PROPORTION = "DOMINANT_PROPORTION"  # presumably the share of the most frequent value — confirm
    MODE = "MODE"
    COUNT = "COUNT"
72
+
73
+
74
class Layer2OutputType(Enum):
    """Output type of any Layer 2 column.

    Governs which :class:`~featkit.enums.TemporalOperator` values are valid
    at the Layer 2 → Layer 3 boundary.

    Each member's value is the member name as a string. ``FLAG`` and
    ``CATEGORICAL`` share their spelling with members of other enums in this
    module but are distinct objects and never compare equal to them.
    """

    NUMERIC = "NUMERIC"
    FLAG = "FLAG"
    CATEGORICAL = "CATEGORICAL"
    TEMPORAL = "TEMPORAL"
85
+
86
+
87
class TemporalOperator(Enum):
    """Sliding-window or point-in-time operator applied in Layer 3.

    Member names are abbreviations, several of them apparently Spanish.
    NOTE(review): none of the meanings below are defined in this module;
    they are guesses from the names and must be confirmed against the
    Layer 3 implementation before being relied upon. The ``_U`` / ``_P``
    suffixes are not explained here.
    """

    PROM_U = "PROM_U"  # presumably "promedio" (average) — confirm
    PROM_P = "PROM_P"  # presumably "promedio" (average) — confirm
    SUM_U = "SUM_U"
    SUM_P = "SUM_P"
    ULT_MES = "ULT_MES"  # presumably "ultimo mes" (last month) — confirm
    PREV_MES = "PREV_MES"  # presumably previous month — confirm
    CREC = "CREC"  # presumably "crecimiento" (growth) — confirm
    FREQ = "FREQ"  # presumably frequency — confirm
    MIN_U = "MIN_U"
    MAX_U = "MAX_U"
    REC = "REC"  # meaning not evident from this module — TODO confirm
    XM = "XM"  # meaning not evident from this module — TODO confirm
    MEDIA_ABS = "MEDIA_ABS"  # meaning not evident from this module — TODO confirm
    RATIO = "RATIO"
104
+
105
+
106
class TimeWindowDirection(Enum):
    """Direction of the temporal sliding window.

    Each member's value is the member name as a string.
    NOTE(review): presumably BACKWARD looks at periods before the reference
    point and FORWARD at periods after it — confirm against Layer 3.
    """

    BACKWARD = "BACKWARD"
    FORWARD = "FORWARD"
File without changes
@@ -0,0 +1,9 @@
1
+ """Field descriptors for the source facts table columns."""
2
+
3
+ from featkit.fields.base import AbstractField
4
+ from featkit.fields.categorical_field import CategoricalField
5
+ from featkit.fields.id_field import IDField
6
+ from featkit.fields.measurement_field import MeasurementField
7
+ from featkit.fields.time_field import TimeField
8
+
9
+ __all__ = ["AbstractField", "CategoricalField", "IDField", "MeasurementField", "TimeField"]
featkit/fields/base.py ADDED
@@ -0,0 +1,48 @@
1
+ """Abstract base class for all column descriptors."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from abc import ABC, abstractmethod
6
+
7
+ from featkit.enums import FieldRole
8
+
9
+
10
class AbstractField(ABC):
    """Schema-only description of one column of the source facts table.

    Every concrete subclass pins :attr:`role` to a single
    :class:`~featkit.enums.FieldRole` member. Instances hold metadata only
    and never read data. Two fields are equal when they are of exactly the
    same class and their :meth:`_key` tuples match; the hash follows the
    same rule, so fields can be used in sets and as dictionary keys.
    """

    def __init__(self, name: str) -> None:
        self._name = name

    @property
    def name(self) -> str:
        """Name of the column in the source table (read-only)."""
        return self._name

    @property
    @abstractmethod
    def role(self) -> FieldRole:
        """Semantic role of this column within the framework."""
        ...

    def __repr__(self) -> str:
        class_name = type(self).__name__
        return f"{class_name}(name={self.name!r})"

    def _key(self) -> tuple[object, ...]:
        """Return the values that define this field's identity.

        The tuple drives both ``__eq__`` and ``__hash__``. Subclasses with
        extra schema-defining attributes should override this and return a
        longer tuple that includes them.
        """
        return (self.name,)

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, AbstractField):
            # Let Python try the reflected comparison on ``other``.
            return NotImplemented
        if type(self) is not type(other):
            # Different concrete classes never compare equal, even if the keys match.
            return False
        return self._key() == other._key()

    def __hash__(self) -> int:
        identity = (type(self), self._key())
        return hash(identity)
@@ -0,0 +1,55 @@
1
+ """CategoricalField — a column used for pivot or distributional grouping."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from featkit.enums import CategoricalTreatment, DistributionalMetric, FieldRole
6
+ from featkit.fields.base import AbstractField
7
+
8
+
9
class CategoricalField(AbstractField):
    """Descriptor for a categorical column.

    Can participate in Layer 2A (``PIVOT``), Layer 2B (``DISTRIBUTIONAL``), or both.

    Args:
        name: Column name in the source table.
        treatment: How this column is used in Layer 2 construction.
        distributional_metrics: Metrics to compute in Layer 2B. Required when
            *treatment* includes ``DISTRIBUTIONAL``.
        allowed_values: Static list of distinct values. When set, the
            :class:`~featkit.builders.pivot_space.PivotSpaceBuilder` uses these
            directly without executing a query. When ``None``, a ``domain_resolver``
            must be supplied to the builder.

    Raises:
        ValueError: If *treatment* is ``DISTRIBUTIONAL`` or ``BOTH`` and no
            distributional metrics were given.
    """

    def __init__(
        self,
        name: str,
        treatment: CategoricalTreatment,
        distributional_metrics: list[DistributionalMetric] | None = None,
        allowed_values: list[str] | None = None,
    ) -> None:
        super().__init__(name)
        self.treatment = treatment
        # Both collections feed _key() and therefore equality and hashing.
        # Store private copies so that a caller mutating its own list after
        # construction cannot silently change this field's identity.
        self.distributional_metrics: list[DistributionalMetric] = list(
            distributional_metrics or []
        )
        self.allowed_values: list[str] | None = (
            None if allowed_values is None else list(allowed_values)
        )

        if (
            treatment in (CategoricalTreatment.DISTRIBUTIONAL, CategoricalTreatment.BOTH)
            and not self.distributional_metrics
        ):
            raise ValueError(
                f"CategoricalField {name!r}: distributional_metrics must not be empty "
                f"when treatment is {treatment.name}"
            )

    @property
    def role(self) -> FieldRole:
        """Always :attr:`~featkit.enums.FieldRole.CATEGORICAL`."""
        return FieldRole.CATEGORICAL

    def _key(self) -> tuple[object, ...]:
        """Return the identity tuple used for equality and hashing.

        Metrics and allowed values are sorted first, so two fields that list
        the same items in a different order have the same key.
        ``allowed_values=None`` stays distinct from an empty list.
        """
        normalized_metrics = tuple(sorted(self.distributional_metrics, key=lambda m: m.value))
        normalized_values = (
            None if self.allowed_values is None else tuple(sorted(self.allowed_values))
        )
        return (self.name, self.treatment, normalized_metrics, normalized_values)
@@ -0,0 +1,14 @@
1
+ """IDField — identifies the entity being profiled."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from featkit.enums import FieldRole
6
+ from featkit.fields.base import AbstractField
7
+
8
+
9
class IDField(AbstractField):
    """A column that serves as a GROUP BY key identifying the entity (e.g. customer ID).

    Adds no attributes beyond the column name taken by the base class.
    """

    @property
    def role(self) -> FieldRole:
        """Always :attr:`~featkit.enums.FieldRole.ID`."""
        return FieldRole.ID
@@ -0,0 +1,42 @@
1
+ """MeasurementField — a numeric or typed measurement column."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING
6
+
7
+ from featkit.enums import FieldRole, MeasurementType
8
+ from featkit.fields.base import AbstractField
9
+
10
+ if TYPE_CHECKING:
11
+ # AbstractMeasurementTypeContract is defined in Plan 04
12
+ # (contracts/measurement/base.py). The TYPE_CHECKING guard keeps this a
13
+ # static-analysis-only import so the field module has no runtime dependency
14
+ # on the contracts package.
15
+ from featkit.contracts.measurement.base import AbstractMeasurementTypeContract
16
+
17
+
18
class MeasurementField(AbstractField):
    """Schema descriptor for a measurement column.

    Pairs the column name with a :class:`~featkit.enums.MeasurementType` and,
    optionally, the contract that governs which Layer 2 aggregators are valid
    for it. The contract is typically obtained from
    :func:`~featkit.contracts.measurement.defaults.get_default_contract`;
    leaving it as ``None`` is valid at schema-definition time.

    Args:
        name: Column name in the source table.
        measurement_type: Semantic type of the measurement.
        contract: Optional contract object. It is part of the field's
            identity, so when supplied it must be hashable for the field
            itself to be hashable.
    """

    def __init__(
        self,
        name: str,
        measurement_type: MeasurementType,
        contract: AbstractMeasurementTypeContract | None = None,
    ) -> None:
        super().__init__(name)
        self.contract = contract
        self.measurement_type = measurement_type

    @property
    def role(self) -> FieldRole:
        """Always :attr:`~featkit.enums.FieldRole.MEASUREMENT`."""
        return FieldRole.MEASUREMENT

    def _key(self) -> tuple[object, ...]:
        """Extend the base identity (the name) with the type and the contract."""
        return (*super()._key(), self.measurement_type, self.contract)
@@ -0,0 +1,43 @@
1
+ """TimeField — the temporal axis of the facts table."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from featkit.enums import FieldRole, TimeGranularity
6
+ from featkit.fields.base import AbstractField
7
+
8
+
9
class TimeField(AbstractField):
    """Descriptor for the time column.

    Tracks source and target granularity so downstream layers know whether
    date truncation is required before grouping (e.g. daily source → monthly target).

    Args:
        name: Column name in the source table.
        source_granularity: Granularity of the values as stored in the source.
        target_granularity: Granularity at which the data is to be grouped.
    """

    def __init__(
        self,
        name: str,
        source_granularity: TimeGranularity,
        target_granularity: TimeGranularity,
    ) -> None:
        super().__init__(name)
        self.source_granularity = source_granularity
        self.target_granularity = target_granularity

    @property
    def role(self) -> FieldRole:
        """Always :attr:`~featkit.enums.FieldRole.TIME`."""
        return FieldRole.TIME

    @property
    def truncation_required(self) -> bool:
        """``True`` when source and target granularities differ."""
        return self.source_granularity != self.target_granularity

    def _key(self) -> tuple[object, ...]:
        """Return the identity tuple: name plus both granularities."""
        return (self.name, self.source_granularity, self.target_granularity)

    def __repr__(self) -> str:
        # Use the runtime class name, as the base-class __repr__ does, so a
        # subclass is not misreported as a plain TimeField.
        return (
            f"{type(self).__name__}(name={self.name!r}, "
            f"source={self.source_granularity.name}, "
            f"target={self.target_granularity.name})"
        )
File without changes
@@ -0,0 +1,171 @@
1
+ """AbstractCodeGenerator — engine-agnostic base for all feature-store code generators."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from abc import ABC, abstractmethod
6
+ from typing import TYPE_CHECKING
7
+
8
+ from featkit.generators.output import (
9
+ DAG,
10
+ CodeOutput,
11
+ DAGNode,
12
+ FeatureStoreOutput,
13
+ PySparkOutput,
14
+ SQLOutput,
15
+ )
16
+
17
+ if TYPE_CHECKING:
18
+ from featkit.pipeline import FeatureStorePipeline
19
+
20
+ # ---------------------------------------------------------------------------
21
+ # Fixed DAG node names
22
+ # ---------------------------------------------------------------------------
23
+
24
+ _FACTS_TABLE = "facts_table"
25
+ _MOB_TABLE = "mob_table"
26
+ _LAYER2A_PIVOT = "layer2a_pivot"
27
+ _LAYER2B_DIST = "layer2b_distributional_ctes"
28
+ _LAYER2_JOIN = "layer2_join"
29
+ _LAYER3_TEMPORAL = "layer3_temporal"
30
+ _FINAL_OUTPUT = "final_output"
31
+
32
+
33
class AbstractCodeGenerator(ABC):
    """Engine-agnostic skeleton shared by every feature-store code generator.

    A concrete generator supplies the five abstract ``build_*`` steps. The
    DAG description, its Mermaid rendering and the overall orchestration in
    :meth:`generate` are implemented here once and reused by all engines.

    Every generator reports the same fixed DAG:

    .. code-block:: text

        facts_table → mob_table
        facts_table → layer2a_pivot
        facts_table → layer2b_distributional_ctes
        layer2a_pivot + layer2b_distributional_ctes → layer2_join
        layer2_join + mob_table → layer3_temporal
        layer2_join + layer3_temporal → final_output
    """

    # ------------------------------------------------------------------
    # Steps each engine must implement
    # ------------------------------------------------------------------

    @abstractmethod
    def build_mob_table(self, pipeline: FeatureStorePipeline) -> CodeOutput:
        """Produce the code for the Month-on-Books (MOB) reference table."""
        ...

    @abstractmethod
    def build_layer2a(self, pipeline: FeatureStorePipeline) -> CodeOutput:
        """Produce the code for the pivot aggregation (Layer 2a)."""
        ...

    @abstractmethod
    def build_layer2b(self, pipeline: FeatureStorePipeline) -> CodeOutput:
        """Produce the code for the distributional CTEs (Layer 2b)."""
        ...

    @abstractmethod
    def build_layer3(self, pipeline: FeatureStorePipeline) -> CodeOutput:
        """Produce the code for the temporal features (Layer 3)."""
        ...

    @abstractmethod
    def build_final_join(self, pipeline: FeatureStorePipeline) -> CodeOutput:
        """Produce the code for the final join assembling the output feature table."""
        ...

    # ------------------------------------------------------------------
    # Shared DAG / Mermaid helpers
    # ------------------------------------------------------------------

    def build_dag(self, pipeline: FeatureStorePipeline) -> DAG:  # noqa: ARG002
        """Return the fixed execution DAG; *pipeline* is currently not consulted."""
        # (step name, names of the steps it depends on), in declaration order.
        topology = (
            (_FACTS_TABLE, []),
            (_MOB_TABLE, [_FACTS_TABLE]),
            (_LAYER2A_PIVOT, [_FACTS_TABLE]),
            (_LAYER2B_DIST, [_FACTS_TABLE]),
            (_LAYER2_JOIN, [_LAYER2A_PIVOT, _LAYER2B_DIST]),
            (_LAYER3_TEMPORAL, [_LAYER2_JOIN, _MOB_TABLE]),
            (_FINAL_OUTPUT, [_LAYER2_JOIN, _LAYER3_TEMPORAL]),
        )
        return DAG(nodes=[DAGNode(step, upstream) for step, upstream in topology])

    def build_mermaid(self, dag: DAG) -> str:
        """Render *dag* as the text of a Mermaid ``flowchart TD`` diagram.

        One ``dep --> step`` line is written per dependency edge, in node
        order. Nodes that take part in no edge at all are listed on their
        own afterwards so they still appear in the diagram.
        """
        rendered = ["flowchart TD"]
        connected: set[str] = set()

        for node in dag.nodes:
            for upstream in node.depends_on:
                rendered.append(f" {upstream} --> {node.step_name}")
                connected.update((upstream, node.step_name))

        rendered.extend(
            f" {node.step_name}" for node in dag.nodes if node.step_name not in connected
        )
        return "\n".join(rendered)

    # ------------------------------------------------------------------
    # Orchestration
    # ------------------------------------------------------------------

    def _combine_code(self, steps: list[CodeOutput]) -> CodeOutput:
        """Merge the per-step outputs into a single code artefact.

        Non-empty payloads of one kind are joined with a blank line between
        them. Empty payloads are ignored. When nothing is non-empty, an empty
        ``SQLOutput`` (carrying the dialect of the first ``SQLOutput`` step)
        is returned if any step was a ``SQLOutput``, otherwise an empty
        ``PySparkOutput``.

        Raises:
            ValueError: If non-empty SQL and non-empty PySpark steps are
                mixed, or if the non-empty SQL steps use different dialects.
        """
        filled_sql = [step for step in steps if isinstance(step, SQLOutput) and step.sql]
        filled_pyspark = [step for step in steps if isinstance(step, PySparkOutput) and step.code]

        if filled_sql and filled_pyspark:
            msg = (
                "Mixed non-empty code output types are not supported in "
                "_combine_code(); found both SQLOutput and PySparkOutput."
            )
            raise ValueError(msg)

        if filled_sql:
            dialects = {step.dialect for step in filled_sql}
            if len(dialects) > 1:
                msg = (
                    f"Mixed SQLOutput dialects are not supported in _combine_code(); "
                    f"found dialects: {dialects}."
                )
                raise ValueError(msg)
            merged_sql = "\n\n".join(step.sql for step in filled_sql)
            return SQLOutput(sql=merged_sql, dialect=filled_sql[0].dialect)

        if filled_pyspark:
            merged_code = "\n\n".join(step.code for step in filled_pyspark)
            return PySparkOutput(code=merged_code)

        # Nothing had content: keep the SQL flavour (and dialect) if there was one.
        first_sql = next((step for step in steps if isinstance(step, SQLOutput)), None)
        if first_sql is not None:
            return SQLOutput(sql="", dialect=first_sql.dialect)
        return PySparkOutput()

    def generate(self, pipeline: FeatureStorePipeline) -> FeatureStoreOutput:
        """Run every build step in order and bundle code, DAG and diagram."""
        builders = (
            self.build_mob_table,
            self.build_layer2a,
            self.build_layer2b,
            self.build_layer3,
            self.build_final_join,
        )
        steps: list[CodeOutput] = [build(pipeline) for build in builders]
        code = self._combine_code(steps)
        dag = self.build_dag(pipeline)
        return FeatureStoreOutput(code=code, dag=dag, mermaid=self.build_mermaid(dag))