featkit 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- featkit/__init__.py +1 -0
- featkit/builders/.gitkeep +0 -0
- featkit/builders/__init__.py +0 -0
- featkit/builders/distributional_space.py +77 -0
- featkit/builders/pivot_space.py +102 -0
- featkit/builders/temporal_space.py +86 -0
- featkit/config.py +38 -0
- featkit/contracts/__init__.py +1 -0
- featkit/contracts/measurement/.gitkeep +0 -0
- featkit/contracts/measurement/__init__.py +27 -0
- featkit/contracts/measurement/base.py +47 -0
- featkit/contracts/measurement/defaults.py +117 -0
- featkit/contracts/output/.gitkeep +0 -0
- featkit/contracts/output/__init__.py +19 -0
- featkit/contracts/output/base.py +36 -0
- featkit/contracts/output/defaults.py +80 -0
- featkit/dataset/.gitkeep +0 -0
- featkit/dataset/__init__.py +0 -0
- featkit/dataset/base.py +120 -0
- featkit/enums.py +110 -0
- featkit/fields/.gitkeep +0 -0
- featkit/fields/__init__.py +9 -0
- featkit/fields/base.py +48 -0
- featkit/fields/categorical_field.py +55 -0
- featkit/fields/id_field.py +14 -0
- featkit/fields/measurement_field.py +42 -0
- featkit/fields/time_field.py +43 -0
- featkit/generators/__init__.py +0 -0
- featkit/generators/base.py +171 -0
- featkit/generators/output.py +118 -0
- featkit/generators/pyspark/.gitkeep +0 -0
- featkit/generators/pyspark/__init__.py +0 -0
- featkit/generators/pyspark/databricks.py +448 -0
- featkit/generators/sql/.gitkeep +0 -0
- featkit/generators/sql/__init__.py +0 -0
- featkit/generators/sql/base.py +496 -0
- featkit/generators/sql/databricks.py +19 -0
- featkit/generators/sql/snowflake.py +19 -0
- featkit/generators/sql/spark_sql.py +19 -0
- featkit/layer2/.gitkeep +0 -0
- featkit/layer2/__init__.py +0 -0
- featkit/layer2/base.py +86 -0
- featkit/layer2/distributional.py +51 -0
- featkit/layer2/pivoted.py +63 -0
- featkit/layer3/.gitkeep +0 -0
- featkit/layer3/__init__.py +0 -0
- featkit/layer3/temporal_feature.py +87 -0
- featkit/pipeline.py +63 -0
- featkit-0.1.0.dist-info/METADATA +140 -0
- featkit-0.1.0.dist-info/RECORD +52 -0
- featkit-0.1.0.dist-info/WHEEL +4 -0
- featkit-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""DistributionalColumn — one Layer 2B distributional metric column."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from featkit.enums import DistributionalMetric, Layer2Aggregator, Layer2OutputType
|
|
6
|
+
from featkit.fields.categorical_field import CategoricalField
|
|
7
|
+
from featkit.fields.measurement_field import MeasurementField
|
|
8
|
+
from featkit.layer2.base import AbstractLayer2Column
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class DistributionalColumn(AbstractLayer2Column):
    """One distributional-statistic column of the Layer 2B output table.

    Args:
        source_measurement: Measurement field that supplies the numerical context.
        layer2_aggregator: Aggregation function applied to that measurement.
        categorical: Categorical field whose value distribution is summarised.
        distributional_metric: Distributional statistic this column represents.

    Raises:
        ValueError: If ``layer2_aggregator`` is not permitted by the
            measurement's contract, or if ``categorical.name`` contains the
            column name separator (``__``).
    """

    def __init__(
        self,
        source_measurement: MeasurementField,
        layer2_aggregator: Layer2Aggregator,
        categorical: CategoricalField,
        distributional_metric: DistributionalMetric,
    ) -> None:
        # The base-class constructor receives the measurement and aggregator;
        # the categorical name is checked here because it becomes part of the
        # generated column name.
        super().__init__(source_measurement, layer2_aggregator)
        self._check_name_part(categorical.name, "categorical field name")
        self.categorical = categorical
        self.distributional_metric = distributional_metric

    @property
    def output_type(self) -> Layer2OutputType:
        """Return ``CATEGORICAL`` for the ``MODE`` metric and ``NUMERIC`` otherwise."""
        is_mode = self.distributional_metric == DistributionalMetric.MODE
        return Layer2OutputType.CATEGORICAL if is_mode else Layer2OutputType.NUMERIC

    @property
    def column_name(self) -> str:
        """Return ``<categorical>__<measurement>__<aggregator>__<metric>``."""
        segments = (
            self.categorical.name,
            self.source_measurement.name,
            self.layer2_aggregator.value,
            self.distributional_metric.value,
        )
        return "__".join(f"{segment}" for segment in segments)
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""PivotedColumn — one cell in the Layer 2A pivot table."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from featkit.enums import Layer2Aggregator, Layer2OutputType, MeasurementType
|
|
6
|
+
from featkit.fields.categorical_field import CategoricalField
|
|
7
|
+
from featkit.fields.measurement_field import MeasurementField
|
|
8
|
+
from featkit.layer2.base import AbstractLayer2Column
|
|
9
|
+
|
|
10
|
+
#: Layer 2 output type produced by a pivoted column, keyed by the measurement
#: type of its source field. Every listed measurement type is numeric except
#: the two overridden below; key order matches the listing in ``fromkeys``.
_MT_TO_OUTPUT: dict[MeasurementType, Layer2OutputType] = {
    **dict.fromkeys(
        (
            MeasurementType.MONTO,
            MeasurementType.CANTIDAD,
            MeasurementType.TICKET,
            MeasurementType.FLAG,
            MeasurementType.FECHA,
            MeasurementType.BALANCE,
            MeasurementType.TIME_DIFF,
            MeasurementType.ESTADISTICO,
        ),
        Layer2OutputType.NUMERIC,
    ),
    # Overrides: re-assigning an existing key keeps its original position.
    MeasurementType.FLAG: Layer2OutputType.FLAG,
    MeasurementType.FECHA: Layer2OutputType.TEMPORAL,
}
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class PivotedColumn(AbstractLayer2Column):
    """One cell of the Layer 2A (pivot) output table.

    Args:
        source_measurement: Measurement field being aggregated.
        layer2_aggregator: Aggregation function applied to the measurement.
        categorical_combination: Mapping of categorical field → value that
            defines the filter for this cell. A ``None`` value stands for the
            ∅ marginal (no filter on that dimension). The mapping is copied at
            construction time; omitting it yields an empty combination.

    Raises:
        ValueError: If ``layer2_aggregator`` is not permitted by the
            measurement's contract, or if any field name or categorical value
            contains the column name separator (``__``).
    """

    def __init__(
        self,
        source_measurement: MeasurementField,
        layer2_aggregator: Layer2Aggregator,
        categorical_combination: dict[CategoricalField, str | None] | None = None,
    ) -> None:
        super().__init__(source_measurement, layer2_aggregator)

        # Take a private copy so later changes to the caller's mapping cannot
        # alter this cell.
        if categorical_combination is None:
            combination = {}
        else:
            combination = dict(categorical_combination)
        self.categorical_combination: dict[CategoricalField, str | None] = combination

        # Field names and concrete values both end up inside the column name,
        # so each one is checked; ``None`` (marginal) has no value to check.
        for cat_field, cat_value in self.categorical_combination.items():
            self._check_name_part(cat_field.name, "categorical field name")
            if cat_value is None:
                continue
            self._check_name_part(cat_value, "categorical value")

    @property
    def output_type(self) -> Layer2OutputType:
        """Return the output type mapped to the source measurement's type."""
        measurement_type = self.source_measurement.measurement_type
        return _MT_TO_OUTPUT[measurement_type]

    @property
    def column_name(self) -> str:
        """Return the name: aggregator, measurement, then one part per categorical.

        Categorical parts are ordered by field name. A filtered dimension
        contributes ``<field>_<value>``; a marginal contributes ``<field>``.
        """
        ordered_items = sorted(
            self.categorical_combination.items(),
            key=lambda item: item[0].name,
        )
        pieces = [self.layer2_aggregator.value, self.source_measurement.name]
        pieces.extend(
            cat_field.name if cat_value is None else f"{cat_field.name}_{cat_value}"
            for cat_field, cat_value in ordered_items
        )
        return "__".join(pieces)
|
featkit/layer3/.gitkeep
ADDED
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""TemporalFeature — a single Layer 3 output column."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from featkit.enums import TemporalOperator, TimeWindowDirection
|
|
6
|
+
from featkit.layer2.base import AbstractLayer2Column
|
|
7
|
+
|
|
8
|
+
#: Temporal operators evaluated at a single point in time; ``TemporalFeature``
#: requires ``window_size`` to be ``None`` for these.
_POINT_IN_TIME_OPERATORS: frozenset[TemporalOperator] = frozenset(
    (
        TemporalOperator.ULT_MES,
        TemporalOperator.PREV_MES,
        TemporalOperator.REC,
    )
)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class TemporalFeature:
    """One Layer 3 output column: a temporal operator applied to a Layer 2 column.

    Args:
        source: The Layer 2 column this feature is built from.
        operator: Temporal operator to apply.
        direction: Direction of the sliding window.
        window_size: Number of periods in the window. Required for windowed
            operators; must be ``None`` for point-in-time operators
            (e.g. ``ULT_MES``, ``REC``).

    Raises:
        ValueError: If ``operator`` is not permitted by ``source.output_contract``.
        ValueError: If a point-in-time operator is given a non-``None`` ``window_size``.
        ValueError: If a windowed operator is given ``window_size=None``.
        ValueError: If ``window_size`` is provided but is not a positive integer
            (``bool`` values are rejected as well).
    """

    def __init__(
        self,
        source: AbstractLayer2Column,
        operator: TemporalOperator,
        direction: TimeWindowDirection,
        window_size: int | None = None,
    ) -> None:
        contract = source.output_contract
        if not contract.is_valid(operator):
            # List the permitted operators, ordered by value, in the message.
            permitted = sorted(contract.valid_temporal_operators, key=lambda o: o.value)
            valid = ", ".join(op.name for op in permitted)
            message = (
                f"TemporalOperator.{operator.name} is not valid for "
                f"output type Layer2OutputType.{source.output_type.name}. "
                f"Valid operators: {valid}"
            )
            raise ValueError(message)

        if operator in _POINT_IN_TIME_OPERATORS:
            # Point-in-time operators take no window at all.
            if window_size is not None:
                raise ValueError(
                    f"TemporalOperator.{operator.name} is a point-in-time operator; "
                    f"window_size must be None, got {window_size!r}"
                )
        elif window_size is None:
            raise ValueError(
                f"TemporalOperator.{operator.name} is a windowed operator; window_size is required"
            )
        elif (
            not isinstance(window_size, int)
            or isinstance(window_size, bool)  # bool is an int subclass; reject it explicitly
            or window_size < 1
        ):
            raise ValueError(f"window_size must be a positive integer, got {window_size!r}")

        self.source = source
        self.operator = operator
        self.direction = direction
        self.window_size = window_size

    @property
    def column_name(self) -> str:
        """Deterministic name for this Layer 3 feature column.

        Joins the source column name, operator value and direction value with
        ``__``; the window size is appended as a final part when it is set.
        """
        base = "__".join((self.source.column_name, self.operator.value, self.direction.value))
        if self.window_size is None:
            return base
        return "__".join((base, str(self.window_size)))

    def __repr__(self) -> str:
        """Return a debug representation naming the source column and settings."""
        described = (
            f"source={self.source.column_name!r}",
            f"operator={self.operator.name!r}",
            f"direction={self.direction.name!r}",
            f"window_size={self.window_size!r}",
        )
        return f"TemporalFeature({', '.join(described)})"
|
featkit/pipeline.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""FeatureStorePipeline — orchestrates space builders and delegates to a code generator."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
6
|
+
|
|
7
|
+
from featkit.builders.distributional_space import DistributionalSpaceBuilder
|
|
8
|
+
from featkit.builders.pivot_space import PivotSpaceBuilder
|
|
9
|
+
from featkit.builders.temporal_space import TemporalSpaceBuilder
|
|
10
|
+
from featkit.config import FeatureStoreConfig
|
|
11
|
+
from featkit.layer2.distributional import DistributionalColumn
|
|
12
|
+
from featkit.layer2.pivoted import PivotedColumn
|
|
13
|
+
from featkit.layer3.temporal_feature import TemporalFeature
|
|
14
|
+
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from featkit.generators.base import AbstractCodeGenerator
|
|
17
|
+
from featkit.generators.output import FeatureStoreOutput
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class FeatureStorePipeline:
    """Run the three space builders and hand the result to a code generator.

    The outcome of the most recent ``build()`` call is kept in three lists:
    ``layer2a`` (pivot columns), ``layer2b`` (distributional columns) and
    ``layer3`` (temporal features). Each list is empty until ``build()`` has
    been called.

    Args:
        config: The pipeline configuration.
    """

    def __init__(self, config: FeatureStoreConfig) -> None:
        self.config = config
        self.layer2a: list[PivotedColumn] = []
        self.layer2b: list[DistributionalColumn] = []
        self.layer3: list[TemporalFeature] = []

    def build(self) -> FeatureStorePipeline:
        """Run all three space builders and return ``self``.

        Idempotent: each call replaces the previously stored results with a
        fresh computation from the same configuration.
        """
        settings = self.config
        dataset = settings.dataset

        pivot_builder = PivotSpaceBuilder(
            dataset=dataset,
            include_marginals=settings.include_marginals,
            aggregators_override=settings.aggregators_override,
        )
        self.layer2a = pivot_builder.build()

        distributional_builder = DistributionalSpaceBuilder(dataset=dataset)
        self.layer2b = distributional_builder.build()

        # Layer 3 is derived from every Layer 2 column: pivot columns first,
        # then distributional columns.
        layer2_columns = list(self.layer2a)
        layer2_columns.extend(self.layer2b)
        temporal_builder = TemporalSpaceBuilder(
            layer2_columns=layer2_columns,
            time_windows=settings.time_windows,
            composed_windows=settings.composed_windows,
            operators_override=settings.operators_override,
        )
        self.layer3 = temporal_builder.build()

        return self

    def run(self, generator: AbstractCodeGenerator) -> FeatureStoreOutput:
        """Delegate code generation entirely to the supplied generator."""
        output = generator.generate(self)
        return output
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: featkit
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: featkit — automated feature store generation from relational facts tables
|
|
5
|
+
Project-URL: Repository, https://github.com/Mirkiux/featkit
|
|
6
|
+
Project-URL: Documentation, https://mirkiux.github.io/featkit
|
|
7
|
+
Project-URL: Changelog, https://github.com/Mirkiux/featkit/blob/main/CHANGELOG.md
|
|
8
|
+
Project-URL: Bug Tracker, https://github.com/Mirkiux/featkit/issues
|
|
9
|
+
Author: Mirko
|
|
10
|
+
License: MIT License
|
|
11
|
+
|
|
12
|
+
Copyright (c) 2026 Mirko
|
|
13
|
+
|
|
14
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
15
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
16
|
+
in the Software without restriction, including without limitation the rights
|
|
17
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
18
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
19
|
+
furnished to do so, subject to the following conditions:
|
|
20
|
+
|
|
21
|
+
The above copyright notice and this permission notice shall be included in all
|
|
22
|
+
copies or substantial portions of the Software.
|
|
23
|
+
|
|
24
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
25
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
26
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
27
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
28
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
29
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
30
|
+
SOFTWARE.
|
|
31
|
+
License-File: LICENSE
|
|
32
|
+
Keywords: analytics,data engineering,databricks,feature engineering,feature store,pivot,pyspark,snowflake
|
|
33
|
+
Classifier: Development Status :: 2 - Pre-Alpha
|
|
34
|
+
Classifier: Intended Audience :: Developers
|
|
35
|
+
Classifier: Intended Audience :: Science/Research
|
|
36
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
37
|
+
Classifier: Programming Language :: Python :: 3
|
|
38
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
39
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
40
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
41
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
42
|
+
Classifier: Topic :: Scientific/Engineering
|
|
43
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
44
|
+
Requires-Python: >=3.10
|
|
45
|
+
Requires-Dist: sqlglot>=23.0
|
|
46
|
+
Provides-Extra: databricks
|
|
47
|
+
Requires-Dist: databricks-sql-connector>=3.0; extra == 'databricks'
|
|
48
|
+
Provides-Extra: dev
|
|
49
|
+
Requires-Dist: build>=1.0; extra == 'dev'
|
|
50
|
+
Requires-Dist: hatch>=1.9; extra == 'dev'
|
|
51
|
+
Requires-Dist: mypy>=1.0; extra == 'dev'
|
|
52
|
+
Requires-Dist: pytest-cov>=4.0; extra == 'dev'
|
|
53
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
54
|
+
Requires-Dist: ruff>=0.4; extra == 'dev'
|
|
55
|
+
Requires-Dist: twine>=5.0; extra == 'dev'
|
|
56
|
+
Provides-Extra: docs
|
|
57
|
+
Requires-Dist: mkdocs-material>=9.5; extra == 'docs'
|
|
58
|
+
Requires-Dist: mkdocs>=1.6; extra == 'docs'
|
|
59
|
+
Requires-Dist: mkdocstrings[python]>=0.25; extra == 'docs'
|
|
60
|
+
Provides-Extra: ibis
|
|
61
|
+
Requires-Dist: ibis-framework>=9.0; extra == 'ibis'
|
|
62
|
+
Provides-Extra: spark
|
|
63
|
+
Requires-Dist: pyspark>=3.4; extra == 'spark'
|
|
64
|
+
Description-Content-Type: text/markdown
|
|
65
|
+
|
|
66
|
+
# featkit
|
|
67
|
+
|
|
68
|
+
**featkit** is a Python framework for automated feature store generation from relational facts tables.
|
|
69
|
+
|
|
70
|
+
It implements a three-layer architecture:
|
|
71
|
+
|
|
72
|
+
- **Layer 1** — input facts table with typed columns (ID, time, categorical, measurement)
|
|
73
|
+
- **Layer 2** — horizontal concept table built via pivot (2A) and distributional aggregations (2B)
|
|
74
|
+
- **Layer 3** — temporal feature table produced by sliding operators over the Layer 2 columns
|
|
75
|
+
|
|
76
|
+
The framework is engine-agnostic: the same pipeline definition produces either a standalone SQL script (Snowflake, Databricks SQL, Spark SQL) or a lazy PySpark execution plan, with the choice abstracted behind a code generator interface.
|
|
77
|
+
|
|
78
|
+
## Key concepts
|
|
79
|
+
|
|
80
|
+
| Layer | What it does |
|
|
81
|
+
|---|---|
|
|
82
|
+
| Layer 2A — Pivot | `GROUP BY (ID, time)` + `CASE WHEN` per categorical combination × measurement × aggregator |
|
|
83
|
+
| Layer 2B — Distributional | Per-categorical CTEs computing entropy, HHI, dominant proportion, mode, count |
|
|
84
|
+
| Layer 3 — Temporal | Sliding window operators (PROM_U, SUM_U, CREC, FREQ, REC, …) over all Layer 2 columns |
|
|
85
|
+
|
|
86
|
+
## Installation
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
pip install featkit
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
## Quickstart
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
from featkit import FeatureStorePipeline, FeatureStoreConfig
|
|
96
|
+
from featkit.dataset.base import SimpleDataset
|
|
97
|
+
from featkit.fields import IDField, TimeField, CategoricalField, MeasurementField
|
|
98
|
+
from featkit.enums import MeasurementType, TimeGranularity, CategoricalTreatment
|
|
99
|
+
from featkit.generators.sql.snowflake import SnowflakeSQLCodeGenerator
|
|
100
|
+
|
|
101
|
+
# Define schema
|
|
102
|
+
fields = [
|
|
103
|
+
IDField(name="ID_CLIENTE"),
|
|
104
|
+
TimeField(name="PERIODO",
|
|
105
|
+
source_granularity=TimeGranularity.MONTHLY,
|
|
106
|
+
target_granularity=TimeGranularity.MONTHLY),
|
|
107
|
+
CategoricalField(name="SECTOR", treatment=CategoricalTreatment.PIVOT,
|
|
108
|
+
allowed_values=["RETAIL", "CORP", "PYME"]),
|
|
109
|
+
CategoricalField(name="CANAL", treatment=CategoricalTreatment.PIVOT,
|
|
110
|
+
allowed_values=["DIGITAL", "PRESENCIAL", "TELEFONO"]),
|
|
111
|
+
MeasurementField(name="MTO", measurement_type=MeasurementType.MONTO),
|
|
112
|
+
MeasurementField(name="TRX", measurement_type=MeasurementType.CANTIDAD),
|
|
113
|
+
]
|
|
114
|
+
|
|
115
|
+
dataset = SimpleDataset(
|
|
116
|
+
source_reference="MY_DB.MY_SCHEMA.FACTS_TABLE",
|
|
117
|
+
fields=fields,
|
|
118
|
+
)
|
|
119
|
+
|
|
120
|
+
config = FeatureStoreConfig(
|
|
121
|
+
dataset=dataset,
|
|
122
|
+
output_schema="MY_DB.MY_SCHEMA",
|
|
123
|
+
output_table_prefix="FS",
|
|
124
|
+
time_windows=[3, 6, 9, 12],
|
|
125
|
+
)
|
|
126
|
+
|
|
127
|
+
pipeline = FeatureStorePipeline(config).build()
|
|
128
|
+
output = pipeline.run(SnowflakeSQLCodeGenerator())
|
|
129
|
+
|
|
130
|
+
output.save("./output")
|
|
131
|
+
# Writes: output/script.sql, output/dag.json, output/diagram.md
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
## Architecture
|
|
135
|
+
|
|
136
|
+
See [docs/general_plan.md](docs/general_plan.md) for the full implementation plan.
|
|
137
|
+
|
|
138
|
+
## License
|
|
139
|
+
|
|
140
|
+
MIT
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
featkit/__init__.py,sha256=Gye3QbTN1Ipd8LAxOo81dnvXOEBHFz8n2cRvvyX5NUg,83
|
|
2
|
+
featkit/config.py,sha256=jaa35QJsJ1RnxyTGBi3bgsu1TLzgKb9A4zC-bpY3a8I,1666
|
|
3
|
+
featkit/enums.py,sha256=HhXffwPPLWtUmrjdQ31AyZVmhQf8TqkGLpX-YGI78TA,2463
|
|
4
|
+
featkit/pipeline.py,sha256=JxndaErlD4PRyU8oS0h0Kumm8CmJba-4QhpO04AuQ7Y,2443
|
|
5
|
+
featkit/builders/.gitkeep,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
+
featkit/builders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
|
+
featkit/builders/distributional_space.py,sha256=KmhFT3eY-URyxvAvHLAZPW_rTGEy3S4AMo9_SXHXIWI,3274
|
|
8
|
+
featkit/builders/pivot_space.py,sha256=a5EIS8FJrHaLLCPasU1mVu_nnN8CkuBSzQamuU1tZ7I,4578
|
|
9
|
+
featkit/builders/temporal_space.py,sha256=p4xsy5H_MBRXSnv3bsisNEbEp7dsxEPYWND1PQOxWLo,3617
|
|
10
|
+
featkit/contracts/__init__.py,sha256=fyhKqo6gfJRkh-Wpdm542xX_dTkSFjirxV8XYfvluQ0,50
|
|
11
|
+
featkit/contracts/measurement/.gitkeep,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
|
+
featkit/contracts/measurement/__init__.py,sha256=z6R5h_7B64zNNyQ3UAHsK0Bp5e5y4z7KBaXGbDrKPWM,645
|
|
13
|
+
featkit/contracts/measurement/base.py,sha256=c15hjpiB-eseOvystZ-Ku5OCJuyi76JcBnYqZunQJG0,1735
|
|
14
|
+
featkit/contracts/measurement/defaults.py,sha256=UeJ2CnFlBwUBGe3p9r9pbBGynkDze3JXACe5w7jl9zc,3688
|
|
15
|
+
featkit/contracts/output/.gitkeep,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
16
|
+
featkit/contracts/output/__init__.py,sha256=NuHpmLkm0Bi8CZiKBrEFOcV_DoPmud8lznSFjAFJNf8,518
|
|
17
|
+
featkit/contracts/output/base.py,sha256=PNTvO87_JT4A9R3KtI7zdTK_BMzweAXotOM-FQR4sbw,1269
|
|
18
|
+
featkit/contracts/output/defaults.py,sha256=p9pyObMgmkZpI6eu1sYG5W0ZGJ-A7ybgy1_MC9x490M,2593
|
|
19
|
+
featkit/dataset/.gitkeep,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
20
|
+
featkit/dataset/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
21
|
+
featkit/dataset/base.py,sha256=famnNBUamzekTAVjrznLP0CD5Ho-xteMvHrb2hvIOko,4436
|
|
22
|
+
featkit/fields/.gitkeep,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
23
|
+
featkit/fields/__init__.py,sha256=9-emUyqQLjPY_MBDrqEZGo0hRBIfoZpuw3_RChnmtBU,416
|
|
24
|
+
featkit/fields/base.py,sha256=CbPb-M8NSzOYgkeVgeLT0C-ucyRRap3q4kT0Noj1xJ4,1430
|
|
25
|
+
featkit/fields/categorical_field.py,sha256=ojoYXW8Zdcs-FK2S2u_S3JoMk5RgXNm5_oaXvZvvFac,2177
|
|
26
|
+
featkit/fields/id_field.py,sha256=IHgOol2cxALly7cu1T8r27gbcr4Nc3BTJ5ODqMxkqyA,375
|
|
27
|
+
featkit/fields/measurement_field.py,sha256=vP5vP-PBr4S03zQSSlreq3NF6c2RS8d6bgSpQc-smqY,1490
|
|
28
|
+
featkit/fields/time_field.py,sha256=vYJ_Wdh3u039A2zpMcZxJQ8Hn-7poWLqj_kvKi49-hE,1337
|
|
29
|
+
featkit/generators/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
30
|
+
featkit/generators/base.py,sha256=v4SkI8hrMVCIVATQTI2oZPQAIpC9UG-sReUOd79-1Kc,6459
|
|
31
|
+
featkit/generators/output.py,sha256=JxODvBXjyoa1OOtAmwneQK_z1o_qCA4HHCw7v9xjTIQ,3275
|
|
32
|
+
featkit/generators/pyspark/.gitkeep,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
33
|
+
featkit/generators/pyspark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
34
|
+
featkit/generators/pyspark/databricks.py,sha256=5vP5Aq7C4EZkh4-dSl76EWLHN-HpMadJoE8rBdTNVeQ,18985
|
|
35
|
+
featkit/generators/sql/.gitkeep,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
36
|
+
featkit/generators/sql/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
37
|
+
featkit/generators/sql/base.py,sha256=pEgNFBqEEJgk4XNy8MVAKBO7OYcWovurr6b9F-s4LeI,20417
|
|
38
|
+
featkit/generators/sql/databricks.py,sha256=s2FDXkONXzs4iEm_IanbVLUNqqIoGFzCQvWinOZ_ybI,680
|
|
39
|
+
featkit/generators/sql/snowflake.py,sha256=ljN9qrSi62JvwVsGU9JNeaRbp8VSjBRNgvHDk-e5mr0,677
|
|
40
|
+
featkit/generators/sql/spark_sql.py,sha256=Ed-_OUXeBzohzUnzzFY5B7aeNVS-3xl_LtQ-VdodrHA,672
|
|
41
|
+
featkit/layer2/.gitkeep,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
42
|
+
featkit/layer2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
43
|
+
featkit/layer2/base.py,sha256=5Oqssix47dC__1_81vtY2o2XhTwXe2PeRW2f3JVOfMs,3266
|
|
44
|
+
featkit/layer2/distributional.py,sha256=Y44Jhju-I6O3wOqeN3RqjOCWm6f8poGWAtgbLFmoJSk,1975
|
|
45
|
+
featkit/layer2/pivoted.py,sha256=o5d5gnDfVKuzO4rLqSppTkPE4r4Vy4y-qbh3y2FET1A,2778
|
|
46
|
+
featkit/layer3/.gitkeep,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
47
|
+
featkit/layer3/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
48
|
+
featkit/layer3/temporal_feature.py,sha256=I_OIOIRxo5-l9Sm519ZzE2hWByfXB2yy8k6q_Pe0jhc,3462
|
|
49
|
+
featkit-0.1.0.dist-info/METADATA,sha256=4SlWA7N812iB3O88_l262E8OS0LYWycGEjmTJIfLdqI,5866
|
|
50
|
+
featkit-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
51
|
+
featkit-0.1.0.dist-info/licenses/LICENSE,sha256=CZwDNkl35IhZurGF12e5fdiH-dYFgHKdmm74PZDQWPI,1062
|
|
52
|
+
featkit-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Mirko
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|