featkit 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- featkit-0.1.0/.github/workflows/ci.yml +70 -0
- featkit-0.1.0/.github/workflows/docs.yml +28 -0
- featkit-0.1.0/.github/workflows/publish.yml +54 -0
- featkit-0.1.0/.gitignore +15 -0
- featkit-0.1.0/CHANGELOG.md +8 -0
- featkit-0.1.0/LICENSE +21 -0
- featkit-0.1.0/PKG-INFO +140 -0
- featkit-0.1.0/README.md +75 -0
- featkit-0.1.0/docs/.gitkeep +0 -0
- featkit-0.1.0/docs/examples.md +245 -0
- featkit-0.1.0/docs/general_plan.md +488 -0
- featkit-0.1.0/docs/index.md +5 -0
- featkit-0.1.0/docs/quickstart.md +94 -0
- featkit-0.1.0/mkdocs.yml +42 -0
- featkit-0.1.0/pyproject.toml +86 -0
- featkit-0.1.0/src/featkit/__init__.py +1 -0
- featkit-0.1.0/src/featkit/builders/.gitkeep +0 -0
- featkit-0.1.0/src/featkit/builders/__init__.py +0 -0
- featkit-0.1.0/src/featkit/builders/distributional_space.py +77 -0
- featkit-0.1.0/src/featkit/builders/pivot_space.py +102 -0
- featkit-0.1.0/src/featkit/builders/temporal_space.py +86 -0
- featkit-0.1.0/src/featkit/config.py +38 -0
- featkit-0.1.0/src/featkit/contracts/__init__.py +1 -0
- featkit-0.1.0/src/featkit/contracts/measurement/.gitkeep +0 -0
- featkit-0.1.0/src/featkit/contracts/measurement/__init__.py +27 -0
- featkit-0.1.0/src/featkit/contracts/measurement/base.py +47 -0
- featkit-0.1.0/src/featkit/contracts/measurement/defaults.py +117 -0
- featkit-0.1.0/src/featkit/contracts/output/.gitkeep +0 -0
- featkit-0.1.0/src/featkit/contracts/output/__init__.py +19 -0
- featkit-0.1.0/src/featkit/contracts/output/base.py +36 -0
- featkit-0.1.0/src/featkit/contracts/output/defaults.py +80 -0
- featkit-0.1.0/src/featkit/dataset/.gitkeep +0 -0
- featkit-0.1.0/src/featkit/dataset/__init__.py +0 -0
- featkit-0.1.0/src/featkit/dataset/base.py +120 -0
- featkit-0.1.0/src/featkit/enums.py +110 -0
- featkit-0.1.0/src/featkit/fields/.gitkeep +0 -0
- featkit-0.1.0/src/featkit/fields/__init__.py +9 -0
- featkit-0.1.0/src/featkit/fields/base.py +48 -0
- featkit-0.1.0/src/featkit/fields/categorical_field.py +55 -0
- featkit-0.1.0/src/featkit/fields/id_field.py +14 -0
- featkit-0.1.0/src/featkit/fields/measurement_field.py +42 -0
- featkit-0.1.0/src/featkit/fields/time_field.py +43 -0
- featkit-0.1.0/src/featkit/generators/__init__.py +0 -0
- featkit-0.1.0/src/featkit/generators/base.py +171 -0
- featkit-0.1.0/src/featkit/generators/output.py +118 -0
- featkit-0.1.0/src/featkit/generators/pyspark/.gitkeep +0 -0
- featkit-0.1.0/src/featkit/generators/pyspark/__init__.py +0 -0
- featkit-0.1.0/src/featkit/generators/pyspark/databricks.py +448 -0
- featkit-0.1.0/src/featkit/generators/sql/.gitkeep +0 -0
- featkit-0.1.0/src/featkit/generators/sql/__init__.py +0 -0
- featkit-0.1.0/src/featkit/generators/sql/base.py +496 -0
- featkit-0.1.0/src/featkit/generators/sql/databricks.py +19 -0
- featkit-0.1.0/src/featkit/generators/sql/snowflake.py +19 -0
- featkit-0.1.0/src/featkit/generators/sql/spark_sql.py +19 -0
- featkit-0.1.0/src/featkit/layer2/.gitkeep +0 -0
- featkit-0.1.0/src/featkit/layer2/__init__.py +0 -0
- featkit-0.1.0/src/featkit/layer2/base.py +86 -0
- featkit-0.1.0/src/featkit/layer2/distributional.py +51 -0
- featkit-0.1.0/src/featkit/layer2/pivoted.py +63 -0
- featkit-0.1.0/src/featkit/layer3/.gitkeep +0 -0
- featkit-0.1.0/src/featkit/layer3/__init__.py +0 -0
- featkit-0.1.0/src/featkit/layer3/temporal_feature.py +87 -0
- featkit-0.1.0/src/featkit/pipeline.py +63 -0
- featkit-0.1.0/tests/__init__.py +0 -0
- featkit-0.1.0/tests/test_builders.py +608 -0
- featkit-0.1.0/tests/test_contracts.py +173 -0
- featkit-0.1.0/tests/test_enums.py +138 -0
- featkit-0.1.0/tests/test_fields.py +467 -0
- featkit-0.1.0/tests/test_generators/.gitkeep +0 -0
- featkit-0.1.0/tests/test_generators/__init__.py +0 -0
- featkit-0.1.0/tests/test_generators/test_base.py +432 -0
- featkit-0.1.0/tests/test_generators/test_pyspark.py +366 -0
- featkit-0.1.0/tests/test_generators/test_sql_databricks.py +316 -0
- featkit-0.1.0/tests/test_generators/test_sql_snowflake.py +421 -0
- featkit-0.1.0/tests/test_integration.py +512 -0
- featkit-0.1.0/tests/test_layer2.py +401 -0
- featkit-0.1.0/tests/test_layer3.py +202 -0
- featkit-0.1.0/tests/test_output_contracts.py +152 -0
- featkit-0.1.0/tests/test_pipeline.py +266 -0
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
lint:
|
|
11
|
+
name: Lint
|
|
12
|
+
runs-on: ubuntu-latest
|
|
13
|
+
steps:
|
|
14
|
+
- uses: actions/checkout@v4
|
|
15
|
+
|
|
16
|
+
- uses: actions/setup-python@v5
|
|
17
|
+
with:
|
|
18
|
+
python-version: "3.12"
|
|
19
|
+
|
|
20
|
+
- name: Install lint dependencies
|
|
21
|
+
run: |
|
|
22
|
+
python -m pip install --upgrade pip
|
|
23
|
+
pip install ruff
|
|
24
|
+
|
|
25
|
+
- name: Run Ruff lint
|
|
26
|
+
run: ruff check .
|
|
27
|
+
|
|
28
|
+
- name: Check formatting
|
|
29
|
+
run: ruff format --check .
|
|
30
|
+
|
|
31
|
+
typecheck:
|
|
32
|
+
name: Type Check
|
|
33
|
+
runs-on: ubuntu-latest
|
|
34
|
+
steps:
|
|
35
|
+
- uses: actions/checkout@v4
|
|
36
|
+
|
|
37
|
+
- uses: actions/setup-python@v5
|
|
38
|
+
with:
|
|
39
|
+
python-version: "3.12"
|
|
40
|
+
|
|
41
|
+
- name: Install dependencies
|
|
42
|
+
run: |
|
|
43
|
+
python -m pip install --upgrade pip
|
|
44
|
+
pip install -e ".[dev]"
|
|
45
|
+
|
|
46
|
+
- name: Run mypy
|
|
47
|
+
run: mypy .
|
|
48
|
+
|
|
49
|
+
test:
|
|
50
|
+
name: Test (Python ${{ matrix.python-version }})
|
|
51
|
+
runs-on: ubuntu-latest
|
|
52
|
+
strategy:
|
|
53
|
+
fail-fast: false
|
|
54
|
+
matrix:
|
|
55
|
+
python-version: ["3.10", "3.11", "3.12", "3.13"]
|
|
56
|
+
|
|
57
|
+
steps:
|
|
58
|
+
- uses: actions/checkout@v4
|
|
59
|
+
|
|
60
|
+
- uses: actions/setup-python@v5
|
|
61
|
+
with:
|
|
62
|
+
python-version: ${{ matrix.python-version }}
|
|
63
|
+
|
|
64
|
+
- name: Install dependencies
|
|
65
|
+
run: |
|
|
66
|
+
python -m pip install --upgrade pip
|
|
67
|
+
pip install -e ".[dev]"
|
|
68
|
+
|
|
69
|
+
- name: Run tests
|
|
70
|
+
run: pytest
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
name: Docs
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
|
|
7
|
+
permissions:
|
|
8
|
+
contents: write
|
|
9
|
+
|
|
10
|
+
jobs:
|
|
11
|
+
deploy:
|
|
12
|
+
name: Build and deploy docs
|
|
13
|
+
runs-on: ubuntu-latest
|
|
14
|
+
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
|
|
18
|
+
- uses: actions/setup-python@v5
|
|
19
|
+
with:
|
|
20
|
+
python-version: "3.12"
|
|
21
|
+
|
|
22
|
+
- name: Install dependencies
|
|
23
|
+
run: |
|
|
24
|
+
python -m pip install --upgrade pip
|
|
25
|
+
pip install -e ".[docs]"
|
|
26
|
+
|
|
27
|
+
- name: Deploy to GitHub Pages
|
|
28
|
+
run: mkdocs gh-deploy --force
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- "v*"
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
build:
|
|
10
|
+
name: Build distribution
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
|
|
13
|
+
steps:
|
|
14
|
+
- uses: actions/checkout@v4
|
|
15
|
+
|
|
16
|
+
- uses: actions/setup-python@v5
|
|
17
|
+
with:
|
|
18
|
+
python-version: "3.12"
|
|
19
|
+
|
|
20
|
+
- name: Install build tools
|
|
21
|
+
run: |
|
|
22
|
+
python -m pip install --upgrade pip
|
|
23
|
+
pip install build twine
|
|
24
|
+
|
|
25
|
+
- name: Build wheel and sdist
|
|
26
|
+
run: python -m build
|
|
27
|
+
|
|
28
|
+
- name: Check distribution
|
|
29
|
+
run: twine check dist/*
|
|
30
|
+
|
|
31
|
+
- name: Upload distribution artifacts
|
|
32
|
+
uses: actions/upload-artifact@v4
|
|
33
|
+
with:
|
|
34
|
+
name: dist
|
|
35
|
+
path: dist/
|
|
36
|
+
|
|
37
|
+
publish:
|
|
38
|
+
name: Publish to PyPI
|
|
39
|
+
needs: build
|
|
40
|
+
runs-on: ubuntu-latest
|
|
41
|
+
environment: pypi
|
|
42
|
+
permissions:
|
|
43
|
+
id-token: write
|
|
44
|
+
contents: read
|
|
45
|
+
|
|
46
|
+
steps:
|
|
47
|
+
- name: Download distribution artifacts
|
|
48
|
+
uses: actions/download-artifact@v4
|
|
49
|
+
with:
|
|
50
|
+
name: dist
|
|
51
|
+
path: dist/
|
|
52
|
+
|
|
53
|
+
- name: Publish to PyPI
|
|
54
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
featkit-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [Unreleased]
|
featkit-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Mirko
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
featkit-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: featkit
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: featkit — automated feature store generation from relational facts tables
|
|
5
|
+
Project-URL: Repository, https://github.com/Mirkiux/featkit
|
|
6
|
+
Project-URL: Documentation, https://mirkiux.github.io/featkit
|
|
7
|
+
Project-URL: Changelog, https://github.com/Mirkiux/featkit/blob/main/CHANGELOG.md
|
|
8
|
+
Project-URL: Bug Tracker, https://github.com/Mirkiux/featkit/issues
|
|
9
|
+
Author: Mirko
|
|
10
|
+
License: MIT License
|
|
11
|
+
|
|
12
|
+
Copyright (c) 2026 Mirko
|
|
13
|
+
|
|
14
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
15
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
16
|
+
in the Software without restriction, including without limitation the rights
|
|
17
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
18
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
19
|
+
furnished to do so, subject to the following conditions:
|
|
20
|
+
|
|
21
|
+
The above copyright notice and this permission notice shall be included in all
|
|
22
|
+
copies or substantial portions of the Software.
|
|
23
|
+
|
|
24
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
25
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
26
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
27
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
28
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
29
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
30
|
+
SOFTWARE.
|
|
31
|
+
License-File: LICENSE
|
|
32
|
+
Keywords: analytics,data engineering,databricks,feature engineering,feature store,pivot,pyspark,snowflake
|
|
33
|
+
Classifier: Development Status :: 2 - Pre-Alpha
|
|
34
|
+
Classifier: Intended Audience :: Developers
|
|
35
|
+
Classifier: Intended Audience :: Science/Research
|
|
36
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
37
|
+
Classifier: Programming Language :: Python :: 3
|
|
38
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
39
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
40
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
41
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
42
|
+
Classifier: Topic :: Scientific/Engineering
|
|
43
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
44
|
+
Requires-Python: >=3.10
|
|
45
|
+
Requires-Dist: sqlglot>=23.0
|
|
46
|
+
Provides-Extra: databricks
|
|
47
|
+
Requires-Dist: databricks-sql-connector>=3.0; extra == 'databricks'
|
|
48
|
+
Provides-Extra: dev
|
|
49
|
+
Requires-Dist: build>=1.0; extra == 'dev'
|
|
50
|
+
Requires-Dist: hatch>=1.9; extra == 'dev'
|
|
51
|
+
Requires-Dist: mypy>=1.0; extra == 'dev'
|
|
52
|
+
Requires-Dist: pytest-cov>=4.0; extra == 'dev'
|
|
53
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
54
|
+
Requires-Dist: ruff>=0.4; extra == 'dev'
|
|
55
|
+
Requires-Dist: twine>=5.0; extra == 'dev'
|
|
56
|
+
Provides-Extra: docs
|
|
57
|
+
Requires-Dist: mkdocs-material>=9.5; extra == 'docs'
|
|
58
|
+
Requires-Dist: mkdocs>=1.6; extra == 'docs'
|
|
59
|
+
Requires-Dist: mkdocstrings[python]>=0.25; extra == 'docs'
|
|
60
|
+
Provides-Extra: ibis
|
|
61
|
+
Requires-Dist: ibis-framework>=9.0; extra == 'ibis'
|
|
62
|
+
Provides-Extra: spark
|
|
63
|
+
Requires-Dist: pyspark>=3.4; extra == 'spark'
|
|
64
|
+
Description-Content-Type: text/markdown
|
|
65
|
+
|
|
66
|
+
# featkit
|
|
67
|
+
|
|
68
|
+
**featkit** is a Python framework for automated feature store generation from relational facts tables.
|
|
69
|
+
|
|
70
|
+
It implements a three-layer architecture:
|
|
71
|
+
|
|
72
|
+
- **Layer 1** — input facts table with typed columns (ID, time, categorical, measurement)
|
|
73
|
+
- **Layer 2** — horizontal concept table built via pivot (2A) and distributional aggregations (2B)
|
|
74
|
+
- **Layer 3** — temporal feature table produced by sliding operators over the Layer 2 columns
|
|
75
|
+
|
|
76
|
+
The framework is engine-agnostic: the same pipeline definition produces either a standalone SQL script (Snowflake, Databricks SQL, Spark SQL) or a lazy PySpark execution plan, with the choice abstracted behind a code generator interface.
|
|
77
|
+
|
|
78
|
+
## Key concepts
|
|
79
|
+
|
|
80
|
+
| Layer | What it does |
|
|
81
|
+
|---|---|
|
|
82
|
+
| Layer 2A — Pivot | `GROUP BY (ID, time)` + `CASE WHEN` per categorical combination × measurement × aggregator |
|
|
83
|
+
| Layer 2B — Distributional | Per-categorical CTEs computing entropy, HHI, dominant proportion, mode, count |
|
|
84
|
+
| Layer 3 — Temporal | Sliding window operators (PROM_U, SUM_U, CREC, FREQ, REC, …) over all Layer 2 columns |
|
|
85
|
+
|
|
86
|
+
## Installation
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
pip install featkit
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
## Quickstart
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
from featkit.config import FeatureStoreConfig
from featkit.pipeline import FeatureStorePipeline
|
|
96
|
+
from featkit.dataset.base import SimpleDataset
|
|
97
|
+
from featkit.fields import IDField, TimeField, CategoricalField, MeasurementField
|
|
98
|
+
from featkit.enums import MeasurementType, TimeGranularity, CategoricalTreatment
|
|
99
|
+
from featkit.generators.sql.snowflake import SnowflakeSQLCodeGenerator
|
|
100
|
+
|
|
101
|
+
# Define schema
|
|
102
|
+
fields = [
|
|
103
|
+
IDField(name="ID_CLIENTE"),
|
|
104
|
+
TimeField(name="PERIODO",
|
|
105
|
+
source_granularity=TimeGranularity.MONTHLY,
|
|
106
|
+
target_granularity=TimeGranularity.MONTHLY),
|
|
107
|
+
CategoricalField(name="SECTOR", treatment=CategoricalTreatment.PIVOT,
|
|
108
|
+
allowed_values=["RETAIL", "CORP", "PYME"]),
|
|
109
|
+
CategoricalField(name="CANAL", treatment=CategoricalTreatment.PIVOT,
|
|
110
|
+
allowed_values=["DIGITAL", "PRESENCIAL", "TELEFONO"]),
|
|
111
|
+
MeasurementField(name="MTO", measurement_type=MeasurementType.MONTO),
|
|
112
|
+
MeasurementField(name="TRX", measurement_type=MeasurementType.CANTIDAD),
|
|
113
|
+
]
|
|
114
|
+
|
|
115
|
+
dataset = SimpleDataset(
|
|
116
|
+
source_reference="MY_DB.MY_SCHEMA.FACTS_TABLE",
|
|
117
|
+
fields=fields,
|
|
118
|
+
)
|
|
119
|
+
|
|
120
|
+
config = FeatureStoreConfig(
|
|
121
|
+
dataset=dataset,
|
|
122
|
+
output_schema="MY_DB.MY_SCHEMA",
|
|
123
|
+
output_table_prefix="FS",
|
|
124
|
+
time_windows=[3, 6, 9, 12],
|
|
125
|
+
)
|
|
126
|
+
|
|
127
|
+
pipeline = FeatureStorePipeline(config).build()
|
|
128
|
+
output = pipeline.run(SnowflakeSQLCodeGenerator())
|
|
129
|
+
|
|
130
|
+
output.save("./output")
|
|
131
|
+
# Writes: output/script.sql, output/dag.json, output/diagram.md
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
## Architecture
|
|
135
|
+
|
|
136
|
+
See [docs/general_plan.md](https://github.com/Mirkiux/featkit/blob/main/docs/general_plan.md) for the full implementation plan.
|
|
137
|
+
|
|
138
|
+
## License
|
|
139
|
+
|
|
140
|
+
MIT
|
featkit-0.1.0/README.md
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# featkit
|
|
2
|
+
|
|
3
|
+
**featkit** is a Python framework for automated feature store generation from relational facts tables.
|
|
4
|
+
|
|
5
|
+
It implements a three-layer architecture:
|
|
6
|
+
|
|
7
|
+
- **Layer 1** — input facts table with typed columns (ID, time, categorical, measurement)
|
|
8
|
+
- **Layer 2** — horizontal concept table built via pivot (2A) and distributional aggregations (2B)
|
|
9
|
+
- **Layer 3** — temporal feature table produced by sliding operators over the Layer 2 columns
|
|
10
|
+
|
|
11
|
+
The framework is engine-agnostic: the same pipeline definition produces either a standalone SQL script (Snowflake, Databricks SQL, Spark SQL) or a lazy PySpark execution plan, with the choice abstracted behind a code generator interface.
|
|
12
|
+
|
|
13
|
+
## Key concepts
|
|
14
|
+
|
|
15
|
+
| Layer | What it does |
|
|
16
|
+
|---|---|
|
|
17
|
+
| Layer 2A — Pivot | `GROUP BY (ID, time)` + `CASE WHEN` per categorical combination × measurement × aggregator |
|
|
18
|
+
| Layer 2B — Distributional | Per-categorical CTEs computing entropy, HHI, dominant proportion, mode, count |
|
|
19
|
+
| Layer 3 — Temporal | Sliding window operators (PROM_U, SUM_U, CREC, FREQ, REC, …) over all Layer 2 columns |
|
|
20
|
+
|
|
21
|
+
## Installation
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
pip install featkit
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Quickstart
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
from featkit.config import FeatureStoreConfig
from featkit.pipeline import FeatureStorePipeline
|
|
31
|
+
from featkit.dataset.base import SimpleDataset
|
|
32
|
+
from featkit.fields import IDField, TimeField, CategoricalField, MeasurementField
|
|
33
|
+
from featkit.enums import MeasurementType, TimeGranularity, CategoricalTreatment
|
|
34
|
+
from featkit.generators.sql.snowflake import SnowflakeSQLCodeGenerator
|
|
35
|
+
|
|
36
|
+
# Define schema
|
|
37
|
+
fields = [
|
|
38
|
+
IDField(name="ID_CLIENTE"),
|
|
39
|
+
TimeField(name="PERIODO",
|
|
40
|
+
source_granularity=TimeGranularity.MONTHLY,
|
|
41
|
+
target_granularity=TimeGranularity.MONTHLY),
|
|
42
|
+
CategoricalField(name="SECTOR", treatment=CategoricalTreatment.PIVOT,
|
|
43
|
+
allowed_values=["RETAIL", "CORP", "PYME"]),
|
|
44
|
+
CategoricalField(name="CANAL", treatment=CategoricalTreatment.PIVOT,
|
|
45
|
+
allowed_values=["DIGITAL", "PRESENCIAL", "TELEFONO"]),
|
|
46
|
+
MeasurementField(name="MTO", measurement_type=MeasurementType.MONTO),
|
|
47
|
+
MeasurementField(name="TRX", measurement_type=MeasurementType.CANTIDAD),
|
|
48
|
+
]
|
|
49
|
+
|
|
50
|
+
dataset = SimpleDataset(
|
|
51
|
+
source_reference="MY_DB.MY_SCHEMA.FACTS_TABLE",
|
|
52
|
+
fields=fields,
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
config = FeatureStoreConfig(
|
|
56
|
+
dataset=dataset,
|
|
57
|
+
output_schema="MY_DB.MY_SCHEMA",
|
|
58
|
+
output_table_prefix="FS",
|
|
59
|
+
time_windows=[3, 6, 9, 12],
|
|
60
|
+
)
|
|
61
|
+
|
|
62
|
+
pipeline = FeatureStorePipeline(config).build()
|
|
63
|
+
output = pipeline.run(SnowflakeSQLCodeGenerator())
|
|
64
|
+
|
|
65
|
+
output.save("./output")
|
|
66
|
+
# Writes: output/script.sql, output/dag.json, output/diagram.md
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Architecture
|
|
70
|
+
|
|
71
|
+
See [docs/general_plan.md](https://github.com/Mirkiux/featkit/blob/main/docs/general_plan.md) for the full implementation plan.
|
|
72
|
+
|
|
73
|
+
## License
|
|
74
|
+
|
|
75
|
+
MIT
|
|
File without changes
|
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
# Examples
|
|
2
|
+
|
|
3
|
+
## Example 1 — Pivot categoricals with marginals
|
|
4
|
+
|
|
5
|
+
Generate features for every category value plus the unconditional total (∅ marginal).
|
|
6
|
+
|
|
7
|
+
```python
|
|
8
|
+
from featkit.config import FeatureStoreConfig
|
|
9
|
+
from featkit.dataset.base import SimpleDataset
|
|
10
|
+
from featkit.enums import CategoricalTreatment, MeasurementType, TimeGranularity
|
|
11
|
+
from featkit.fields.categorical_field import CategoricalField
|
|
12
|
+
from featkit.fields.id_field import IDField
|
|
13
|
+
from featkit.fields.measurement_field import MeasurementField
|
|
14
|
+
from featkit.fields.time_field import TimeField
|
|
15
|
+
from featkit.generators.sql.snowflake import SnowflakeSQLCodeGenerator
|
|
16
|
+
from featkit.pipeline import FeatureStorePipeline
|
|
17
|
+
|
|
18
|
+
ds = SimpleDataset(
|
|
19
|
+
"mydb.silver_sales",
|
|
20
|
+
[
|
|
21
|
+
IDField("client_id"),
|
|
22
|
+
TimeField("period", TimeGranularity.MONTHLY, TimeGranularity.MONTHLY),
|
|
23
|
+
MeasurementField("amount", MeasurementType.MONTO),
|
|
24
|
+
MeasurementField("txn_count", MeasurementType.CANTIDAD),
|
|
25
|
+
CategoricalField(
|
|
26
|
+
"channel",
|
|
27
|
+
CategoricalTreatment.PIVOT,
|
|
28
|
+
allowed_values=["branch", "online", "mobile"],
|
|
29
|
+
),
|
|
30
|
+
CategoricalField(
|
|
31
|
+
"product",
|
|
32
|
+
CategoricalTreatment.PIVOT,
|
|
33
|
+
allowed_values=["current_account", "savings", "loan"],
|
|
34
|
+
),
|
|
35
|
+
],
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
cfg = FeatureStoreConfig(
|
|
39
|
+
dataset=ds,
|
|
40
|
+
output_schema="analytics",
|
|
41
|
+
output_table_prefix="feat_",
|
|
42
|
+
time_windows=[3, 6, 12],
|
|
43
|
+
include_marginals=True, # include ∅ (unconditional) totals
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
pipeline = FeatureStorePipeline(config=cfg).build()
|
|
47
|
+
print(f"Layer 2A columns : {len(pipeline.layer2a)}")
|
|
48
|
+
print(f"Layer 3 features: {len(pipeline.layer3)}")
|
|
49
|
+
|
|
50
|
+
result = SnowflakeSQLCodeGenerator().generate(pipeline)
|
|
51
|
+
result.save("output/snowflake/")
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
## Example 2 — Distributional metrics
|
|
56
|
+
|
|
57
|
+
Measure concentration, diversity, and dominant value of a categorical over time.
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
from featkit.enums import CategoricalTreatment, DistributionalMetric, MeasurementType, TimeGranularity
|
|
61
|
+
from featkit.fields.categorical_field import CategoricalField
|
|
62
|
+
|
|
63
|
+
# All five distributional metrics
|
|
64
|
+
region_field = CategoricalField(
|
|
65
|
+
"region",
|
|
66
|
+
CategoricalTreatment.DISTRIBUTIONAL,
|
|
67
|
+
distributional_metrics=[
|
|
68
|
+
DistributionalMetric.ENTROPY, # diversity index
|
|
69
|
+
DistributionalMetric.HHI, # Herfindahl–Hirschman index
|
|
70
|
+
DistributionalMetric.DOMINANT_PROPORTION, # share of the top category
|
|
71
|
+
DistributionalMetric.MODE, # most frequent category
|
|
72
|
+
DistributionalMetric.COUNT, # number of active categories
|
|
73
|
+
],
|
|
74
|
+
)
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
The generated `feat_layer2b` table will have one column per
|
|
78
|
+
`(categorical × measurement × aggregator × metric)` combination.
|
|
79
|
+
For `ENTROPY` and `HHI` the output type is `NUMERIC`; for `MODE` it is
|
|
80
|
+
`CATEGORICAL`, which restricts the set of valid temporal operators
|
|
81
|
+
(`ULT_MES`, `PREV_MES`, `REC` only).
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
## Example 3 — Mixed pivot and distributional on the same field
|
|
85
|
+
|
|
86
|
+
Use `BOTH` treatment to get pivot columns *and* distributional metrics from a
|
|
87
|
+
single categorical field.
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
from featkit.enums import CategoricalTreatment, DistributionalMetric
|
|
91
|
+
|
|
92
|
+
product_field = CategoricalField(
|
|
93
|
+
"product_type",
|
|
94
|
+
CategoricalTreatment.BOTH,
|
|
95
|
+
allowed_values=["A", "B", "C"],
|
|
96
|
+
distributional_metrics=[DistributionalMetric.ENTROPY, DistributionalMetric.HHI],
|
|
97
|
+
)
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
## Example 4 — Multiple entity keys
|
|
102
|
+
|
|
103
|
+
The pipeline supports composite primary keys. List every ID field; all generated
|
|
104
|
+
`GROUP BY` and join clauses will include all of them.
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
from featkit.fields.id_field import IDField
|
|
108
|
+
|
|
109
|
+
ds = SimpleDataset(
|
|
110
|
+
"mydb.silver_transactions",
|
|
111
|
+
[
|
|
112
|
+
IDField("country_code"),
|
|
113
|
+
IDField("client_id"),
|
|
114
|
+
# ... other fields
|
|
115
|
+
],
|
|
116
|
+
)
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
## Example 5 — Targeting Databricks SQL
|
|
121
|
+
|
|
122
|
+
```python
|
|
123
|
+
from featkit.generators.sql.databricks import DatabricksSQLCodeGenerator
|
|
124
|
+
|
|
125
|
+
result = DatabricksSQLCodeGenerator().generate(pipeline)
|
|
126
|
+
|
|
127
|
+
# Databricks uses backtick quoting; syntax is otherwise identical to Snowflake
|
|
128
|
+
print(result.code.sql[:200])
|
|
129
|
+
result.save("output/databricks/")
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
## Example 6 — Generating a PySpark script
|
|
134
|
+
|
|
135
|
+
```python
|
|
136
|
+
from featkit.generators.pyspark.databricks import PySparkCodeGenerator
|
|
137
|
+
|
|
138
|
+
result = PySparkCodeGenerator().generate(pipeline)
|
|
139
|
+
|
|
140
|
+
# result.code is a PySparkOutput; .code contains the full Python script
|
|
141
|
+
script = result.code.code
|
|
142
|
+
print(script[:500])
|
|
143
|
+
|
|
144
|
+
# Save — writes script.py instead of script.sql
|
|
145
|
+
result.save("output/pyspark/")
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
The generated script is a self-contained Python file. Execute it by submitting
|
|
149
|
+
it to a Databricks job or a `spark-submit` invocation:
|
|
150
|
+
|
|
151
|
+
```bash
|
|
152
|
+
spark-submit output/pyspark/script.py
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
## Example 7 — Operators override
|
|
157
|
+
|
|
158
|
+
Restrict which temporal operators are applied to each Layer 2 output type.
|
|
159
|
+
Useful when you only need a subset of features.
|
|
160
|
+
|
|
161
|
+
```python
|
|
162
|
+
from featkit.enums import Layer2OutputType, TemporalOperator
|
|
163
|
+
from featkit.config import FeatureStoreConfig
|
|
164
|
+
|
|
165
|
+
cfg = FeatureStoreConfig(
|
|
166
|
+
dataset=ds,
|
|
167
|
+
output_schema="analytics",
|
|
168
|
+
output_table_prefix="feat_",
|
|
169
|
+
time_windows=[3, 6],
|
|
170
|
+
operators_override={
|
|
171
|
+
# Only rolling averages, rolling sums, and latest-month snapshots for numeric columns
|
|
172
|
+
Layer2OutputType.NUMERIC: [
|
|
173
|
+
TemporalOperator.PROM_U,
|
|
174
|
+
TemporalOperator.SUM_U,
|
|
175
|
+
TemporalOperator.ULT_MES,
|
|
176
|
+
],
|
|
177
|
+
},
|
|
178
|
+
)
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
## Example 8 — Inspecting the DAG
|
|
183
|
+
|
|
184
|
+
```python
|
|
185
|
+
import json
|
|
186
|
+
|
|
187
|
+
result = SnowflakeSQLCodeGenerator().generate(pipeline)
|
|
188
|
+
|
|
189
|
+
# Programmatic access
|
|
190
|
+
for node in result.dag.nodes:
|
|
191
|
+
print(f"{node.step_name:35s} depends on: {node.depends_on}")
|
|
192
|
+
|
|
193
|
+
# JSON (suitable for CI artefacts or a lineage tool)
|
|
194
|
+
print(json.dumps(json.loads(result.dag.to_json()), indent=2))
|
|
195
|
+
|
|
196
|
+
# Mermaid diagram (paste into any Mermaid renderer)
|
|
197
|
+
print(result.mermaid)
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
Expected output:
|
|
201
|
+
|
|
202
|
+
```
|
|
203
|
+
facts_table depends on: []
|
|
204
|
+
mob_table depends on: ['facts_table']
|
|
205
|
+
layer2a_pivot depends on: ['facts_table']
|
|
206
|
+
layer2b_distributional_ctes depends on: ['facts_table']
|
|
207
|
+
layer2_join depends on: ['layer2a_pivot', 'layer2b_distributional_ctes']
|
|
208
|
+
layer3_temporal depends on: ['layer2_join', 'mob_table']
|
|
209
|
+
final_output depends on: ['layer2_join', 'layer3_temporal']
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
## Example 9 — Dynamic categorical domain resolution
|
|
214
|
+
|
|
215
|
+
When the categorical domain is not known at configuration time, supply a
|
|
216
|
+
`domain_resolver` callable. featkit will call it at `build()` time; no
|
|
217
|
+
database adapter or executor is built into the framework.
|
|
218
|
+
|
|
219
|
+
```python
|
|
220
|
+
from featkit.builders.pivot_space import PivotSpaceBuilder
|
|
221
|
+
from featkit.config import FeatureStoreConfig
|
|
222
|
+
|
|
223
|
+
# Provide your own query executor
|
|
224
|
+
def resolve_domain(field):
|
|
225
|
+
return conn.execute(
|
|
226
|
+
f"SELECT DISTINCT {field.name} FROM mydb.silver_transactions"
|
|
227
|
+
).fetchall()
|
|
228
|
+
|
|
229
|
+
ds = SimpleDataset(
|
|
230
|
+
"mydb.silver_transactions",
|
|
231
|
+
[
|
|
232
|
+
IDField("client_id"),
|
|
233
|
+
TimeField("period", TimeGranularity.MONTHLY, TimeGranularity.MONTHLY),
|
|
234
|
+
MeasurementField("amount", MeasurementType.MONTO),
|
|
235
|
+
CategoricalField("segment", CategoricalTreatment.PIVOT), # no allowed_values
|
|
236
|
+
],
|
|
237
|
+
)
|
|
238
|
+
|
|
239
|
+
# Build the pivot space manually with the resolver, then pass to the config
|
|
240
|
+
pivot_cols = PivotSpaceBuilder(
|
|
241
|
+
dataset=ds,
|
|
242
|
+
include_marginals=True,
|
|
243
|
+
domain_resolver=resolve_domain,
|
|
244
|
+
).build()
|
|
245
|
+
```
|